import concurrent.futures
|
import os
|
import time
|
import requests
|
import re
|
|
|
def imgdata_set(save_path, word, epoch):
    """Crawl Baidu image search results for *word* and save them under *save_path*.

    Parameters
    ----------
    save_path : str
        Directory to create and fill with downloaded ``<n>.jpg`` files.
    word : str
        Search keyword to query Baidu image search with.
    epoch : int or str
        Number of result pages to fetch (each page yields ~20 image URLs).

    Returns
    -------
    0 if *save_path* already exists (crawl is skipped to avoid clobbering a
    previous run), otherwise ``None`` after all pages have been processed.
    """
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    else:
        # Directory already exists: treat it as "already crawled" and bail out.
        return 0
    epoch = int(epoch)      # validate/convert once instead of on every loop pass
    page_offset = 0         # Baidu 'pn' paging parameter, advances 20 per page
    name_base = 0           # running image index across pages (file-name offset)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }
    while True:
        time.sleep(1)  # be polite: throttle page requests
        print("开始爬取图片")
        url = ("https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8"
               "&word={}&pn={}&ct=&ic=0&lm=-1&width=0&height=0").format(word, page_offset)
        response = requests.get(url, headers=headers)  # fetch one result page
        html = response.text
        # "objURL" entries hold the original (non-thumbnail) image links.
        urls = re.findall('"objURL":"(.*?)"', html)
        print(len(urls))
        # Download this page's images concurrently; exiting the with-block
        # waits for all submitted tasks before the next page is fetched.
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            for index, img_url in enumerate(urls):
                # BUG FIX: offset the file name by the number of images already
                # saved — previously every page restarted numbering at 0 and
                # overwrote the previous page's files.
                executor.submit(download_image, name_base + index, headers, save_path, img_url)
        name_base += len(urls)
        page_offset += 20
        if (page_offset / 20) >= epoch:
            break
|
|
|
def download_image(a, headers, save_path, url):
    """Download one image from *url* and save it as ``<a>.jpg`` in *save_path*.

    Best-effort: any network or filesystem error is reported and swallowed so
    that one bad URL cannot abort the whole crawl.

    Parameters
    ----------
    a : int
        Image index; used as the output file name.
    headers : dict
        HTTP headers (User-Agent) forwarded to the request.
    save_path : str
        Existing directory to write the image into.
    url : str
        Direct image URL extracted from the search-result page.
    """
    try:
        print(a)  # progress indicator: which image index is being fetched
        response = requests.get(url, headers=headers, timeout=10)
        # BUG FIX: an HTTP error response (404/403 page) used to be written to
        # disk as a corrupt .jpg; raise here so the handler below skips it.
        response.raise_for_status()
        with open(os.path.join(save_path, "{}.jpg".format(a)), 'wb') as f:
            f.write(response.content)
    except Exception as e:
        # Keep the crawl alive, but report the failure instead of hiding it
        # (the original silently discarded every error with `pass`).
        print("下载失败 {}: {}".format(url, e))
|
|
|
if __name__ == "__main__":
    # Gather the crawl configuration interactively, then start the crawl.
    target_dir = input('你想保存的路径:')          # destination directory
    keyword = input('你想要下载什么图片?请输入:')  # Baidu search keyword
    rounds = input('你想要下载几轮图片?请输入(一轮为60张左右图片):')  # page count
    imgdata_set(target_dir, keyword, rounds)
|