利用爬虫代码下载美女写真套图并转为PDF保存

该爬虫爬的不是全站链接，而是单套图的链接，也就是详情页的链接

需要单独下载的python库

requests

fake_useragent

lxml（代码中用它的 etree 做 xpath 解析）

aiohttp

Pillow（即 PIL，安装时用 pip install Pillow）

下面有图片打码是为了过审，这网站真的是正经的写真网站不是什么老司机网站

import asyncio
import os
import re
import shutil
import sys
from urllib.parse import urljoin

import aiohttp
import requests
from fake_useragent import UserAgent
from lxml import etree
from PIL import Image

# Site root used to resolve the relative image and pagination links.
BASE_URL = "https://www.xgyw01.co"
# Last link of the first pagination bar; it is the "next page" link when one exists.
PAGINATION_LAST_LINK = '//div[@class="pagination"][1]//a[last()]'
# Link texts that mark the last pagination link as "next page".
NEXT_PAGE_LABELS = ("下一页", "下页")
# Seconds to wait for a gallery page before giving up.
REQUEST_TIMEOUT = 30
# Characters that are not allowed (or are path separators) in file names.
_INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]+')


def _safe_name(raw):
    """Return *raw* with characters that are invalid in a directory name replaced."""
    cleaned = _INVALID_NAME_CHARS.sub("_", raw).strip(" .")
    return cleaned or "untitled"


def _fetch_tree(url):
    """Download *url* with the shared request headers and return the parsed HTML tree."""
    res = requests.get(url=url, headers=header, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    res.encoding = "utf-8"
    tree = etree.HTML(res.text)
    if tree is None:
        raise ValueError(f"empty or unparsable page: {url}")
    return tree


def _next_page_url(tree):
    """Return the absolute URL of the next gallery page, or None on the last page."""
    labels = tree.xpath(PAGINATION_LAST_LINK + "/text()")
    hrefs = tree.xpath(PAGINATION_LAST_LINK + "/@href")
    # A gallery that fits on one page has no pagination bar at all.
    if labels and hrefs and labels[0] in NEXT_PAGE_LABELS:
        return urljoin(BASE_URL, hrefs[0])
    return None


def _page_sort_key(name):
    """Sort key that orders "2.jpg" before "10.jpg"; non-numeric names go last."""
    stem = os.path.splitext(name)[0]
    if stem.isdigit():
        return (0, int(stem), name)
    return (1, 0, name)


async def get_xq(url):
    """Collect every image of the gallery at *url* and download them concurrently.

    Walks the gallery's pagination starting from the detail page *url*,
    gathers all image links, then downloads them into a folder named after
    the page title. Reads the module globals ``header`` (request headers) and,
    through ``down``, ``dirdz`` (save directory); stores the sanitised page
    title in the module global ``title``.

    Raises:
        requests.RequestException: a gallery page could not be fetched.
        ValueError: the page has no title or no images.
    """
    global title

    # requests is blocking, but no download task has been started yet at this
    # point, so there is nothing else for the event loop to run meanwhile.
    tree = _fetch_tree(url)

    titles = tree.xpath('//title/text()')
    if not titles:
        raise ValueError("page has no <title>")
    imgtitle = _safe_name(titles[0])
    title = imgtitle
    print("当前下载的套图标题:\n" + imgtitle)

    image_urls = []
    seen_pages = {url}
    while True:
        image_urls.extend(
            urljoin(BASE_URL, src) for src in tree.xpath('//article//img/@src')
        )
        next_url = _next_page_url(tree)
        # Stop on the last page, and also if the site ever links back to a
        # page that was already visited (would otherwise loop forever).
        if next_url is None or next_url in seen_pages:
            break
        seen_pages.add(next_url)
        print("正在获取" + next_url)
        tree = _fetch_tree(next_url)

    if not image_urls:
        raise ValueError("no images found on the gallery pages")

    print("开始异步下载图片.....")
    # One shared session for all images instead of one session per image.
    async with aiohttp.ClientSession(headers=header) as session:
        # gather() accepts coroutines directly (asyncio.wait no longer does);
        # return_exceptions keeps one failed image from aborting the others.
        results = await asyncio.gather(
            *(
                down(img_url, imgtitle, number, session)
                for number, img_url in enumerate(image_urls, start=1)
            ),
            return_exceptions=True,
        )

    failed = [r for r in results if isinstance(r, Exception)]
    if failed:
        print(f"有{len(failed)}张图片下载失败")


async def down(url, filename, fn, session=None):
    """Download one image to ``<dirdz>/<filename>/<fn>.jpg``.

    Args:
        url: absolute URL of the image.
        filename: name of the gallery folder inside the module global ``dirdz``.
        fn: 1-based sequence number, used as the file name.
        session: optional shared ``aiohttp.ClientSession``; when omitted a
            temporary session is created for this single download.

    Raises:
        aiohttp.ClientError: the request failed or returned an error status.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            return await down(url, filename, fn, own_session)

    filepath = os.path.join(dirdz, filename)
    os.makedirs(filepath, exist_ok=True)

    async with session.get(url) as res:
        res.raise_for_status()
        data = await res.read()

    # Write only after the whole body arrived, so a failed request never
    # leaves a truncated .jpg behind for the PDF step to choke on.
    with open(os.path.join(filepath, str(fn) + ".jpg"), mode="wb") as f:
        f.write(data)
    print(f"第{fn}张下载完成")


def jpg_pdf(filename):
    """Merge the downloaded JPEGs of gallery *filename* into one PDF.

    Reads ``<dirdz>/<filename>/*.jpg|*.jpeg`` in numeric page order, writes
    ``<dirdz>/<filename>.pdf`` and then deletes the image folder. Nothing is
    deleted when no PDF could be produced.

    Raises:
        ValueError: *filename* is empty (the folder to delete would then be
            the whole save directory).
    """
    if not filename:
        raise ValueError("filename must not be empty")

    folder_path = os.path.join(dirdz, filename)
    pdf_filename = os.path.join(dirdz, filename + ".pdf")

    if not os.path.isdir(folder_path):
        print("没有找到已下载的图片,跳过pdf生成")
        return

    image_names = sorted(
        (f for f in os.listdir(folder_path) if f.endswith((".jpeg", ".jpg"))),
        key=_page_sort_key,
    )

    pages = []
    try:
        for name in image_names:
            file_path = os.path.abspath(os.path.join(folder_path, name))
            try:
                # convert() returns an in-memory copy, so the file handle can
                # be closed right away; RGB is required by the PDF writer.
                with Image.open(file_path) as img:
                    pages.append(img.convert("RGB"))
            except OSError:
                print(f"跳过无法读取的图片:{name}")

        if not pages:
            print("没有找到已下载的图片,跳过pdf生成")
            return

        pages[0].save(
            pdf_filename,
            "PDF",
            resolution=100.0,
            save_all=True,
            append_images=pages[1:],
        )
    finally:
        for page in pages:
            page.close()

    print("pdf合并完成,删除缓存的的图片")
    shutil.rmtree(folder_path)
    print("清理完成")


if __name__ == '__main__':
    # Request headers shared by all page requests.
    header = {"user-agent": UserAgent().random}
    # Directory in which the gallery folder and the PDF are saved.
    dirdz = "这里输入保存的文件夹路径"
    # Gallery title; filled in by get_xq and used as folder / PDF name.
    title = ""
    # Target site: https://www.xgyw01.co/ - open a gallery and paste the URL
    # of its detail page.
    url = input("请输入套图详情页链接:").strip()
    try:
        # get_xq is a coroutine, so it has to be started with asyncio.run.
        asyncio.run(get_xq(url))
    except Exception as exc:
        print("链接详情页解析错误.........")
        print(f"{type(exc).__name__}: {exc}")
        # Do not fall through to jpg_pdf: with an empty title it would
        # operate on (and delete) the whole save directory.
        sys.exit(1)
    print(title)
    print("全部下载完成,开始生成pdf.......")
    jpg_pdf(title)