本文由 发布,转载请注明出处,如有问题请联系我们!发布时间:2021-05-11 | Python 初学:爬取百度图库图片
功能:输入一个想要爬取的关键词(比如“超级玛丽”),程序会自动分页加载并持续爬取,直到没有更多图片为止。
# Baidu image-search downloader.
#
# Requests result pages for the keyword in `name`, extracts every quoted
# image URL from the returned HTML and saves the images into a folder that
# is also called `name`. Written to run on both Python 2 and Python 3.
import os
import re

import requests

pn = 0   # offset of the first search result to request
rn = 30  # number of results requested per page
# `pn` and `rn` are the paging query parameters of the search URL; they were
# found by inspecting the site's requests in the browser developer tools.

# A Chinese keyword would give a garbled folder name, so pinyin is used.
name = "chaojimali"

SEARCH_URL = "http://image.baidu.com/search/index"
REQUEST_TIMEOUT = 10      # seconds; without it a stalled server blocks forever
MAX_PAGE_FAILURES = 3     # consecutive failed page requests before giving up

# A double-quoted http(s)/ftp(s) URL ending in a known image extension.
# The dot before the extension is escaped, and `[^"]` keeps one match from
# running across several quoted strings.
IMAGE_URL_RE = re.compile(
    r'"((?:http|ftp)s?://[^"]*?\.(?:png|jpg|jpeg|gif))"'
)


def _saveImage(imgPath):
    """Download one image URL into the `name` folder.

    Returns True when the file was written, False when the download or the
    write failed. Failures are skipped on purpose: one bad image must not
    abort the rest of the page.
    """
    try:
        image = requests.get(imgPath, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        return False
    # Only a 200 response is treated as a downloadable image.
    if image.status_code != 200:
        return False

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(imgPath)

    # Use the last URL path segment as the file name (no leading slash).
    target = os.path.join(name, imgPath.rsplit("/", 1)[-1])
    try:
        with open(target, "wb") as handle:
            handle.write(image.content)
    except (IOError, OSError):
        return False
    return True


def getImagePath(pn = 0):
    """Crawl search result pages starting at offset `pn` and save the images.

    Pages are fetched `rn` results at a time. The crawl stops when a page
    contains no image URL that has not been seen before, or after
    MAX_PAGE_FAILURES consecutive page requests have failed. A loop is used
    instead of recursion so a long crawl cannot exhaust the call stack.

    Args:
        pn: offset of the first search result to request.

    Returns:
        None.
    """
    if not os.path.isdir(name):
        os.makedirs(name)

    # The HTTP header is spelled "User-Agent"; "user_agent" is not that header.
    headers = {"User-Agent": "Mozilla/5.0"}
    seen = set()
    failures = 0

    while failures < MAX_PAGE_FAILURES:
        params = {
            "tn": "baiduimage",
            "ps": 1,
            "ct": 201326592,
            "lm": -1,
            "cl": 2,
            "nc": 1,
            "ie": "utf-8",
            "word": name,
            "pn": pn,
            "rn": rn,
        }
        try:
            response = requests.get(
                SEARCH_URL,
                params=params,
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()
        except requests.RequestException as err:
            # Skip the failing page, but only a bounded number of times.
            failures += 1
            print("page at offset %d failed: %s" % (pn, err))
            pn += rn
            continue
        failures = 0

        # `response.text` is decoded text, so the str pattern also works on
        # Python 3 (matching it against the raw bytes raises TypeError).
        fresh = []
        for link in IMAGE_URL_RE.findall(response.text):
            if link not in seen:
                seen.add(link)
                fresh.append(link)

        # No new image URLs: the result list is exhausted.
        if not fresh:
            break

        for imgPath in fresh:
            _saveImage(imgPath)
        pn += rn


# Start crawling.
getImagePath(pn)