import random
import re
import uuid

import pymysql
import requests
from lxml import html

from UserAgent import user_agent_pool


class ChineseMedicineSpider(object):
    """Scrape Chinese herbal medicine data from zhongyoo.com into MySQL.

    Workflow: iterate listing pages -> collect each herb's name, image URL
    and detail-page URL -> fetch and clean the detail text -> parse the
    bracketed 【...】 section labels into fields -> insert into the
    ``dry_herb_info`` table (skipping names already present).
    """

    # The site exposes at most 45 listing pages (per the input prompt below).
    MAX_PAGES = 45

    # Maps a DB field key to the bracketed label(s) that introduce it on the
    # detail page. Replaces the original long chain of near-identical `if`s.
    _FIELD_TAGS = {
        'alias': ("【别名】",),
        'english': ("【英文名】", "【外语名】"),
        'parts': ("【药用部位】",),
        'morphology': ("【植物形态】",),
        'origin': ("【产地分布】",),
        'harvest': ("【采收加工】",),
        'character': ("【药材性状】",),
        'famt': ("【性味归经】",),
        'efficacy': ("【功效与作用】",),
        'clinical': ("【临床应用】",),
        'pharmacological': ("【药理研究】",),
        'bases': ("【主要成分】", "【化学成分】"),
        'Usage': ("【使用禁忌】",),
    }

    # Labels that introduce the herb's name (name + optional pinyin).
    _NAME_TAGS = ("【中药名】", "【正名】", "【药名】")

    def __init__(self):
        # Pick one random User-Agent per spider instance to reduce blocking.
        self.headers = {'User-Agent': random.choice(user_agent_pool)}

    def run_spider(self):
        """Prompt for a page count, validate it, and start the crawl."""
        pages = int(input('请输入需要爬取的页数:(小于或等于45)'))
        if 0 <= pages <= self.MAX_PAGES:
            print('爬虫开始运行!!!')
            self.get_chinese_medicine(pages)
            print('爬虫运行结束!!!')
        else:
            print("输入无效!!!")

    def get_chinese_medicine(self, pages):
        """Crawl ``pages`` listing pages and persist every herb found.

        Returns the list of parsed field dicts (the original built this list
        but never returned it).
        """
        med_list = []
        # BUG FIX: original iterated range(46, pages + 46), contradicting the
        # 0..45 validation in run_spider; crawl pages 1..pages instead.
        for page in range(1, pages + 1):
            url = f'http://www.zhongyoo.com/name/page_{page}.html'
            # BUG FIX: original passed self.headers as the positional `params`
            # argument of requests.get, so the User-Agent was never sent.
            resp = requests.get(url, headers=self.headers)
            if resp.status_code != 200:
                print("响应结果为空")
                continue
            tree = html.fromstring(resp.content.decode('gbk'))
            # Listing page: every herb sits in a div.sp > span > a wrapper.
            names = tree.xpath('//div[@class="sp"]/span/a/img/@alt')       # herb names
            image_urls = tree.xpath('//div[@class="sp"]/span/a/img/@src')  # thumbnail URLs
            info_urls = tree.xpath('//div[@class="sp"]/span/a/@href')      # detail-page URLs
            for name, image_url, info_url in zip(names, image_urls, info_urls):
                print(name)
                print(image_url)
                chinese_medicine = {'name': name}
                # self.get_chinese_medicine_imge(image_url, chinese_medicine)
                self.get_chinese_medicine_info(info_url, chinese_medicine)
                if 'info' not in chinese_medicine:
                    # Detail fetch failed entirely; nothing to parse.
                    continue
                medicine_info = self._parse_info_text(chinese_medicine['info'])
                if not medicine_info.get('name'):
                    # BUG FIX: original raised KeyError here when the detail
                    # text carried no recognizable name label.
                    continue
                self._save_medicine(medicine_info)
                med_list.append(medicine_info)
        return med_list

    def _parse_info_text(self, info_text):
        """Parse cleaned detail-page text into a flat field dict.

        Matches the original behavior: the label is stripped from the name,
        alias and english fields only; all other fields keep the raw line
        (the original had the strip calls commented out for those).
        """
        info = {key: '' for key in self._FIELD_TAGS}
        info['pinyin'] = ''
        for line in info_text.split('\n'):
            if any(tag in line for tag in self._NAME_TAGS):
                cleaned = line
                for tag in self._NAME_TAGS:
                    cleaned = cleaned.replace(tag, "")
                parts = cleaned.split()
                if parts:  # guard: label present but no content after it
                    info['name'] = parts[0]
                    # Second token, when present, is the pinyin transcription.
                    info['pinyin'] = parts[1] if len(parts) > 1 else ''
                continue
            for key, tags in self._FIELD_TAGS.items():
                if any(tag in line for tag in tags):
                    value = line
                    if key in ('alias', 'english'):
                        for tag in tags:
                            value = value.replace(tag, "")
                    info[key] = value
                    break
        return info

    def _save_medicine(self, medicine_info):
        """Insert one herb row unless a row with the same name already exists."""
        conn = pymysql.connect(host='192.168.2.6', user='root',
                               password='123456', database='herb')
        try:
            with conn.cursor() as cursor:
                # SECURITY FIX: original concatenated the scraped name into the
                # SQL string (injection risk); use a parameterized query.
                cursor.execute("select * from dry_herb_info where name = %s",
                               (medicine_info['name'],))
                res = cursor.fetchone()
                print(res)
                if res is None:
                    sql = ("INSERT INTO dry_herb_info (id, name, pinyin, alias, english, "
                           "parts, morphology, origin, harvest, character_drug, famt, "
                           "efficacy, clinical, pharmacological, bases, usage_taboo) "
                           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
                    # str(): pymysql cannot escape a raw uuid.UUID object.
                    data = (str(uuid.uuid4()),
                            medicine_info['name'], medicine_info['pinyin'],
                            medicine_info['alias'], medicine_info['english'],
                            medicine_info['parts'], medicine_info['morphology'],
                            medicine_info['origin'], medicine_info['harvest'],
                            medicine_info['character'], medicine_info['famt'],
                            medicine_info['efficacy'], medicine_info['clinical'],
                            medicine_info['pharmacological'], medicine_info['bases'],
                            medicine_info['Usage'])
                    cursor.execute(sql, data)
                    conn.commit()
        except Exception as e:
            print("Error occurred while inserting data into MySQL table.")
            print(e)
        finally:
            conn.close()

    def get_chinese_medicine_info(self, info_url, chinese_medicine):
        """Fetch an herb's detail page and store its cleaned text under 'info'.

        On failure the 'info' key is left unset and the error is printed
        (the original swallowed every exception with a bare ``pass``).
        """
        try:
            # BUG FIX: headers must be passed by keyword (see get_chinese_medicine).
            resp = requests.get(info_url, headers=self.headers).content.decode('gbk')
            # NOTE(review): the original regex literal was garbled by text
            # extraction — its HTML delimiters were stripped, leaving blank
            # lines. It captured everything between the article-body div and
            # its closing tag; verify the class name against the live markup.
            pattern = r'<div class="text">([\s\S]*?)</div>'
            match = re.search(pattern, resp)
            if match:
                # Drop all HTML tags, keeping only their text content.
                text = re.sub(r"<[^>]+>", "", match.group(1))
                # Put each 【label】 section on its own line.
                text = re.sub(r"\s+【", "\n【", text)
                # Trim leading blank space.
                text = re.sub(r"^\s*", "", text)
                # Drop the trailing "related articles" recommendation block.
                text = re.sub(r"相关推荐文章.*", "", text, flags=re.DOTALL)
                # Trim trailing blank space and store.
                chinese_medicine['info'] = re.sub(r"\s*$", "", text)
            else:
                chinese_medicine['info'] = "暂无详细信息,请等待管理员添加!"
        except Exception as e:
            print(f"获取详情页失败: {info_url}: {e}")


if __name__ == '__main__':
    zy = ChineseMedicineSpider()
    zy.run_spider()