import random
import re
import uuid

import pymysql
import requests
from lxml import html

from UserAgent import user_agent_pool
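

# The spider below scrapes the herb index pages on www.zhongyoo.com, follows each herb's
# detail page, splits the detail text on its 【...】 section markers, and stores one row per
# herb in the MySQL table dry_herb_info (connection settings live in get_chinese_medicine()).
#
# Assumption (not shown in this file): UserAgent.py is a local module exposing
# user_agent_pool, a plain list of User-Agent strings, for example:
#     user_agent_pool = [
#         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
#         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
#     ]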
class ChineseMedicineSpider(object):

    def __init__(self):
        # One random User-Agent is picked per spider instance.
        self.headers = {'User-Agent': random.choice(user_agent_pool)}

    # Start the spider: read the number of index pages to crawl, then hand off to get_chinese_medicine().
    def run_spider(self):
        pages = int(input('请输入需要爬取的页数:(小于或等于45)'))
        if 0 <= pages <= 45:
            print('爬虫开始运行!!!')
            self.get_chinese_medicine(pages)
            print('爬虫运行结束!!!')
        else:
            print("输入无效!!!")
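
    # get_chinese_medicine() drives the whole pipeline: fetch an index page, pull the herb
    # names / image URLs / detail URLs out of it, fetch and parse each detail page, and
    # insert the result into MySQL.
    #
    # Assumed layout of the target table (the column types are a guess; only the column
    # names are taken from the INSERT statement below):
    #     CREATE TABLE dry_herb_info (
    #         id              VARCHAR(36) PRIMARY KEY,
    #         name            VARCHAR(255),
    #         pinyin          VARCHAR(255),
    #         alias           TEXT,
    #         english         TEXT,
    #         parts           TEXT,
    #         morphology      TEXT,
    #         origin          TEXT,
    #         harvest         TEXT,
    #         character_drug  TEXT,
    #         famt            TEXT,
    #         efficacy        TEXT,
    #         clinical        TEXT,
    #         pharmacological TEXT,
    #         bases           TEXT,
    #         usage_taboo     TEXT
    #     );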
    def get_chinese_medicine(self, pages):
        medList = []
        # Index pages are numbered from 46, so this crawls page_46.html .. page_(45 + pages).html.
        for page in range(46, pages + 46):
            url = f'http://www.zhongyoo.com/name/page_{page}.html'
            resp = requests.get(url, headers=self.headers)
            if resp.status_code == 200:
                # The site is served as GBK, not UTF-8.
                tree = html.fromstring(resp.content.decode('gbk'))
                zy_name_list = tree.xpath('//div[@class="sp"]/span/a/img/@alt')  # herb names
                zy_imge_list = tree.xpath('//div[@class="sp"]/span/a/img/@src')  # herb image URLs
                zy_info_list = tree.xpath('//div[@class="sp"]/span/a/@href')     # detail-page URLs for every herb on this page
                i = 0
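                # zy_name_list, zy_imge_list and zy_info_list are parallel lists:
                # position i refers to the same herb in each, so one counter walks all
                # three while the names are iterated.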
                for zy_name in zy_name_list:
                    chinese_medicine = {}  # raw data for this herb: 'name' plus the detail text under 'info'
                    chinese_medicine['name'] = zy_name
                    print(zy_name)
                    imge_url = zy_imge_list[i]
                    print(zy_imge_list[i])
                    # self.get_chinese_medicine_imge(imge_url, chinese_medicine)
                    info_url = zy_info_list[i]
                    self.get_chinese_medicine_info(info_url, chinese_medicine)
                    medicine_info = {}
                    i += 1
                    for key, value in chinese_medicine.items():
                        print(f'Key: {key}, Value: {value}')
                        if key == 'info':
                            # Default every field so the INSERT below never hits a missing key,
                            # then overwrite whatever the detail text actually provides.
                            medicine_info['name'] = zy_name
                            medicine_info['pinyin'] = ''
                            medicine_info['alias'] = ''
                            medicine_info['english'] = ''
                            medicine_info['harvest'] = ''
                            medicine_info['origin'] = ''
                            medicine_info['morphology'] = ''
                            medicine_info['parts'] = ''
                            medicine_info['character'] = ''
                            medicine_info['famt'] = ''
                            medicine_info['efficacy'] = ''
                            medicine_info['clinical'] = ''
                            medicine_info['pharmacological'] = ''
                            medicine_info['bases'] = ''
                            medicine_info['Usage'] = ''
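                            # The 'info' string produced by get_chinese_medicine_info() holds one
                            # 【...】-tagged section per line, roughly like this (illustrative
                            # placeholder, not real data):
                            #     【中药名】某药 mouyao
                            #     【别名】别名一、别名二。
                            #     【药用部位】...
                            # Each branch below routes one tag to its field in medicine_info.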
                            for line in value.split('\n'):
                                # 【中药名】 gives "name pinyin"; keep the name and, if present, the pinyin.
                                if "【中药名】" in line or "【正名】" in line or "【药名】" in line:
                                    new_str = line.replace("【中药名】", "").replace("【正名】", "").replace("【药名】", "")
                                    sp = new_str.split()
                                    medicine_info['name'] = sp[0]
                                    if len(sp) > 1:
                                        medicine_info['pinyin'] = sp[1]
                                    else:
                                        medicine_info['pinyin'] = ''
                                if "【别名】" in line:
                                    medicine_info['alias'] = line.replace("【别名】", "")
                                if "【英文名】" in line or "【外语名】" in line:
                                    medicine_info['english'] = line.replace("【外语名】", "").replace("【英文名】", "")
                                # For the remaining fields the 【...】 tag is intentionally kept in the stored value.
                                if "【药用部位】" in line:
                                    medicine_info['parts'] = line
                                if "【植物形态】" in line:
                                    medicine_info['morphology'] = line
                                if "【产地分布】" in line:
                                    medicine_info['origin'] = line
                                if "【采收加工】" in line:
                                    medicine_info['harvest'] = line
                                if "【药材性状】" in line:
                                    medicine_info['character'] = line
                                if "【性味归经】" in line:
                                    medicine_info['famt'] = line
                                if "【功效与作用】" in line:
                                    medicine_info['efficacy'] = line
                                if "【临床应用】" in line:
                                    medicine_info['clinical'] = line
                                if "【药理研究】" in line:
                                    medicine_info['pharmacological'] = line
                                if "【主要成分】" in line or "【化学成分】" in line:
                                    medicine_info['bases'] = line
                                if "【使用禁忌】" in line:
                                    medicine_info['Usage'] = line
                            # Connect to the MySQL server (a new connection is opened for every herb).
                            conn = pymysql.connect(host='192.168.2.6', user='root', password='123456',
                                                   database='herb')
                            cursor = conn.cursor()
                            uuid_val = uuid.uuid4()
                            try:
                                # Skip herbs that are already stored; the query is parameterised
                                # so the herb name cannot break the SQL.
                                selectSql = "SELECT * FROM dry_herb_info WHERE name = %s"
                                cursor.execute(selectSql, (medicine_info['name'],))
                                res = cursor.fetchone()
                                print(res)
                                if res is None:
                                    # Insert the parsed herb as a new row.
                                    sql = ("INSERT INTO dry_herb_info (id, name, pinyin, alias, english, parts, morphology, origin, "
                                           "harvest, character_drug, famt, efficacy, clinical, pharmacological, bases, usage_taboo) "
                                           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
                                    data = (str(uuid_val), medicine_info['name'], medicine_info['pinyin'], medicine_info['alias'],
                                            medicine_info['english'], medicine_info['parts'], medicine_info['morphology'],
                                            medicine_info['origin'], medicine_info['harvest'], medicine_info['character'],
                                            medicine_info['famt'], medicine_info['efficacy'], medicine_info['clinical'],
                                            medicine_info['pharmacological'], medicine_info['bases'], medicine_info['Usage'])
                                    # Execute the INSERT and commit the transaction.
                                    cursor.execute(sql, data)
                                    conn.commit()
                            except Exception as e:
                                print("Error occurred while inserting data into MySQL table.")
                                print(e)
                            finally:
                                # Always release the cursor and the connection.
                                cursor.close()
                                conn.close()
                            medList.append(medicine_info)
            else:
                print("响应结果为空")
    # Fetch one herb's detail page and store the cleaned body text in chinese_medicine['info'].
    def get_chinese_medicine_info(self, info_url, chinese_medicine):
        try:
            # Request the detail page; it is GBK-encoded like the rest of the site.
            resp = requests.get(info_url, headers=self.headers).content.decode('gbk')
            # The article body sits between two empty <p></p> tags; capture everything in between.
            pattern = r'<p></p>([\s\S]*?)<p></p>'
            match = re.search(pattern, resp)
            if match:
                # <[^>]+> matches any HTML tag, so this strips all markup from the captured block.
                text = re.sub(r"<[^>]+>", "", match.group(1))
                # Replace the whitespace before each 【 marker with a newline so every
                # 【...】 section starts on its own line.
                text = re.sub(r"\s+【", "\n【", text)
                # Trim leading blank space.
                text = re.sub(r"^\s*", "", text)
                # Drop the "related articles" block at the end of the page.
                text = re.sub(r"相关推荐文章.*", "", text, flags=re.DOTALL)
                # Trim trailing blank space.
                info = re.sub(r"\s*$", "", text)
                # Hand the cleaned detail text back to the caller.
                chinese_medicine['info'] = info
            else:
                chinese_medicine['info'] = "暂无详细信息,请等待管理员添加!"
        except Exception as e:
            # Report the failure instead of swallowing it; the caller simply gets no 'info'
            # key for this herb and skips the database insert.
            print(f'获取详情页失败: {info_url}', e)
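
    # get_chinese_medicine_imge() is referenced only by a commented-out call in
    # get_chinese_medicine() and is not defined in this file. The sketch below is a guess at
    # what it might do (download the image and remember where it was saved); the file naming
    # and the 'imge' dictionary key are assumptions, not the original implementation.
    def get_chinese_medicine_imge(self, imge_url, chinese_medicine):
        try:
            resp = requests.get(imge_url, headers=self.headers, timeout=10)
            if resp.status_code == 200:
                # Save the image under the working directory with a unique name (assumed behaviour).
                file_name = f'{uuid.uuid4()}.jpg'
                with open(file_name, 'wb') as f:
                    f.write(resp.content)
                chinese_medicine['imge'] = file_name
        except Exception as e:
            print(f'图片下载失败: {imge_url}', e)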


if __name__ == '__main__':
    zy = ChineseMedicineSpider()
    zy.run_spider()