import concurrent.futures
import os
import re
import time

import requests


def imgdata_set(save_path, word, epoch):
    """Crawl Baidu image search for *word* and save the results as JPEGs.

    Fetches `epoch` result pages (roughly 60 images per round, per the
    original prompt) and downloads each page's images concurrently with a
    thread pool.

    Args:
        save_path: Directory to save images into; created if missing. If it
            already exists the crawl is skipped entirely (treated as
            "already downloaded").
        word: Search keyword (may contain Chinese; requests URL-encodes it).
        epoch: Number of pages to crawl; accepted as int or numeric string.

    Returns:
        0 when save_path already exists and nothing was downloaded,
        otherwise None after the crawl finishes.
    """
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    else:
        # Directory already present: bail out rather than overwrite a
        # previous crawl (original behavior, kept for callers).
        return 0

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }
    page_offset = 0  # Baidu "pn" paging parameter; advances 20 per page
    image_count = 0  # running counter so file names are unique across pages

    while True:
        time.sleep(1)  # throttle page requests to be polite to the server
        print("开始爬取图片")
        url = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={}&pn={}&ct=&ic=0&lm=-1&width=0&height=0".format(word, page_offset)
        # timeout added so one hung search-page request cannot stall the
        # whole crawl (the per-image request already used timeout=10).
        response = requests.get(url, headers=headers, timeout=10)
        html = response.text
        # Image URLs are embedded in the page source as "objURL":"..."
        urls = re.findall('"objURL":"(.*?)"', html)
        print(len(urls))

        # Fan the downloads out over worker threads; the executor's context
        # manager blocks until every submitted download has finished.
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            for offset, img_url in enumerate(urls):
                # BUG FIX: number images with a counter that runs across
                # pages — the original restarted at 0 each page, so every
                # later page overwrote the previous page's files.
                executor.submit(download_image, image_count + offset,
                                headers, save_path, img_url)
        image_count += len(urls)

        page_offset += 20
        if (page_offset / 20) >= int(epoch):
            break


def download_image(a, headers, save_path, url):
    """Download one image from *url* and save it as "<a>.jpg" in save_path.

    Best-effort: any failure (timeout, bad URL, disk error) is reported and
    swallowed so a single broken image cannot abort the crawl. Never raises.

    Args:
        a: Sequence number used as the file name.
        headers: HTTP headers (browser User-Agent) to send with the request.
        save_path: Destination directory; assumed to exist.
        url: Direct image URL extracted from the search page.
    """
    try:
        print(a)  # progress indicator: the image's sequence number
        response = requests.get(url, headers=headers, timeout=10)
        with open(os.path.join(save_path, "{}.jpg".format(a)), 'wb') as f:
            f.write(response.content)
    except Exception as e:
        # BUG FIX: was a silent `pass`, which hid every failure; still
        # best-effort, but now the reason is visible.
        print("download failed {}: {}".format(url, e))


if __name__ == "__main__":
    save_path = input('你想保存的路径:')
    word = input('你想要下载什么图片?请输入:')
    epoch = input('你想要下载几轮图片?请输入(一轮为60张左右图片):')
    imgdata_set(save_path, word, epoch)