# baoshiwei
# 2024-03-04 e595c312581496403ac182f12f3d4939d3d00998
import re
import uuid
 
import requests
import random
from lxml import html
from UserAgent import user_agent_pool
import pymysql
 
class ChineseMedicineSpider(object):
    """Crawl herb listing pages on www.zhongyoo.com and store each herb in MySQL.

    The spider walks the paginated ``/name/page_N.html`` listings, follows
    every herb's detail link, splits the detail text into its ``【...】``
    sections and inserts one row per herb into the ``dry_herb_info`` table
    unless a row with the same name already exists.
    """

    # Seconds to wait on each HTTP request before giving up, so a stalled
    # server cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    # Keyword arguments handed to pymysql.connect().
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment instead.
    DB_CONFIG = {
        'host': '192.168.2.6',
        'user': 'root',
        'password': '123456',
        'database': 'herb',
    }

    # Text shown when a detail page has no recognisable info block.
    NO_INFO_TEXT = "暂无详细信息,请等待管理员添加!"

    # Everything between the first pair of empty <p></p> paragraphs.
    _INFO_BLOCK_RE = re.compile(r'<p></p>([\s\S]*?)<p></p>')
    # Any HTML tag.
    _TAG_RE = re.compile(r"<[^>]+>")
    # Whitespace in front of a section marker; collapsed to one newline.
    _BEFORE_MARKER_RE = re.compile(r"\s+【")
    _LEADING_WS_RE = re.compile(r"^\s*")
    # The "related articles" footer and everything after it.
    _RELATED_RE = re.compile(r"相关推荐文章.*", flags=re.DOTALL)
    _TRAILING_WS_RE = re.compile(r"\s*$")

    # Markers introducing the "<name> <pinyin>" line.
    _NAME_MARKERS = ("【中药名】", "【正名】", "【药名】")
    # Fields stored with their marker removed.
    _STRIPPED_FIELDS = (
        ('alias', ("【别名】",)),
        ('english', ("【英文名】", "【外语名】")),
    )
    # Fields stored as the whole line, marker included.
    _RAW_FIELDS = (
        ('parts', ("【药用部位】",)),
        ('morphology', ("【植物形态】",)),
        ('origin', ("【产地分布】",)),
        ('harvest', ("【采收加工】",)),
        ('character', ("【药材性状】",)),
        ('famt', ("【性味归经】",)),
        ('efficacy', ("【功效与作用】",)),
        ('clinical', ("【临床应用】",)),
        ('pharmacological', ("【药理研究】",)),
        ('bases', ("【主要成分】", "【化学成分】")),
        ('Usage', ("【使用禁忌】",)),
    )

    def __init__(self):
        # One User-Agent is picked at construction and reused for all requests.
        self.headers = {'User-Agent': random.choice(user_agent_pool)}

    def run_spider(self):
        """Ask for a page count on stdin and crawl that many listing pages.

        Non-numeric input and counts outside 0..45 are reported as invalid
        instead of raising.
        """
        try:
            pages = int(input('请输入需要爬取的页数:(小于或等于45)'))
        except ValueError:
            print("输入无效!!!")
            return
        if 0 <= pages <= 45:
            print('爬虫开始运行!!!')
            self.get_chinese_medicine(pages)
            print('爬虫运行结束!!!')
        else:
            print("输入无效!!!")

    def get_chinese_medicine(self, pages, start_page=46):
        """Crawl ``pages`` consecutive listing pages and persist every herb.

        Args:
            pages: Number of listing pages to fetch.
            start_page: Number of the first listing page. The default keeps
                the range that used to be hard-coded.
                NOTE(review): the prompt in run_spider speaks of at most 45
                pages while crawling starts at page 46 - confirm the
                intended start page.

        Returns:
            A list with one dict per herb found. The dict holds the parsed
            fields, or is empty when the detail page could not be fetched.
        """
        med_list = []
        for page in range(start_page, start_page + pages):
            url = f'http://www.zhongyoo.com/name/page_{page}.html'
            try:
                # headers must be passed by keyword: the second positional
                # argument of requests.get() is the query-string ``params``.
                resp = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(f'Request failed for {url}: {e}')
                continue
            if resp.status_code != 200:
                print("响应结果为空")
                continue

            # errors='replace' keeps a single bad byte from aborting the page.
            tree = html.fromstring(resp.content.decode('gbk', errors='replace'))
            zy_name_list = tree.xpath('//div[@class="sp"]/span/a/img/@alt')  # herb names
            zy_imge_list = tree.xpath('//div[@class="sp"]/span/a/img/@src')  # herb image urls
            zy_info_list = tree.xpath('//div[@class="sp"]/span/a/@href')  # detail page urls

            # zip() stops at the shortest list, so a mismatch between the
            # three XPath results cannot raise IndexError.
            for zy_name, imge_url, info_url in zip(zy_name_list, zy_imge_list, zy_info_list):
                chinese_medicine = {'name': zy_name}
                print(zy_name)
                print(imge_url)
                self.get_chinese_medicine_info(info_url, chinese_medicine)
                for key, value in chinese_medicine.items():
                    print(f'Key: {key}, Value: {value}')

                medicine_info = {}
                if 'info' in chinese_medicine:
                    medicine_info = self._parse_info(chinese_medicine['info'])
                    self._save_medicine(medicine_info)
                med_list.append(medicine_info)
        return med_list

    def get_chinese_medicine_info(self, info_url, chinese_medicine):
        """Fetch a herb detail page and store its text under ``'info'``.

        Args:
            info_url: URL of the detail page.
            chinese_medicine: Dict that receives the ``'info'`` entry. It is
                left untouched when the request itself fails.
        """
        try:
            resp = requests.get(info_url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # Report the failure instead of swallowing it silently.
            print(f'Request failed for {info_url}: {e}')
            return
        page = resp.content.decode('gbk', errors='replace')
        chinese_medicine['info'] = self._extract_info(page)

    @classmethod
    def _extract_info(cls, page):
        """Return the cleaned detail text of ``page`` or the placeholder text."""
        match = cls._INFO_BLOCK_RE.search(page)
        if not match:
            return cls.NO_INFO_TEXT
        # Drop the HTML tags, keeping only the text between them.
        text = cls._TAG_RE.sub("", match.group(1))
        # Put every section marker at the start of its own line.
        text = cls._BEFORE_MARKER_RE.sub("\n【", text)
        text = cls._LEADING_WS_RE.sub("", text)
        # Cut the "related articles" footer.
        text = cls._RELATED_RE.sub("", text)
        return cls._TRAILING_WS_RE.sub("", text)

    @classmethod
    def _parse_info(cls, info):
        """Split detail text into the fields stored in ``dry_herb_info``.

        Every field defaults to ``''``. ``name`` and ``pinyin`` stay empty
        when no name marker with text behind it is present.
        """
        fields = {'name': '', 'pinyin': ''}
        for key, _ in cls._STRIPPED_FIELDS + cls._RAW_FIELDS:
            fields[key] = ''

        for line in info.split('\n'):
            if any(marker in line for marker in cls._NAME_MARKERS):
                remainder = line
                for marker in cls._NAME_MARKERS:
                    remainder = remainder.replace(marker, "")
                words = remainder.split()
                # A marker with nothing behind it must not raise IndexError.
                if words:
                    fields['name'] = words[0]
                    fields['pinyin'] = words[1] if len(words) > 1 else ''
            for key, markers in cls._STRIPPED_FIELDS:
                if any(marker in line for marker in markers):
                    value = line
                    for marker in markers:
                        value = value.replace(marker, "")
                    fields[key] = value
            for key, markers in cls._RAW_FIELDS:
                if any(marker in line for marker in markers):
                    fields[key] = line
        return fields

    def _save_medicine(self, medicine_info):
        """Insert ``medicine_info`` into MySQL unless that name already exists."""
        if not medicine_info['name']:
            # Without a name the duplicate check is meaningless; skip the row
            # before opening a connection.
            print("Skipping herb without a parsed name; nothing written to MySQL.")
            return

        conn = pymysql.connect(**self.DB_CONFIG)
        try:
            with conn.cursor() as cursor:
                # Parameterized query: the name comes from scraped text and
                # must never be spliced into the SQL string.
                cursor.execute("select * from dry_herb_info where name = %s",
                               (medicine_info['name'],))
                res = cursor.fetchone()
                print(res)
                if res is None:
                    sql = "INSERT INTO dry_herb_info (id, name, pinyin, alias, english, parts, morphology, origin, harvest, character_drug, famt, efficacy, clinical, pharmacological, bases, usage_taboo) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s,%s, %s,%s,%s,%s,%s,%s)"
                    data = (
                        str(uuid.uuid4()),
                        medicine_info['name'],
                        medicine_info['pinyin'],
                        medicine_info['alias'],
                        medicine_info['english'],
                        medicine_info['parts'],
                        medicine_info['morphology'],
                        medicine_info['origin'],
                        medicine_info['harvest'],
                        medicine_info['character'],
                        medicine_info['famt'],
                        medicine_info['efficacy'],
                        medicine_info['clinical'],
                        medicine_info['pharmacological'],
                        medicine_info['bases'],
                        medicine_info['Usage'],
                    )
                    cursor.execute(sql, data)
                    conn.commit()
        except pymysql.MySQLError as e:
            print("Error occurred while inserting data into MySQL table.")
            print(e)
        finally:
            conn.close()
if __name__ == '__main__':
    # Script entry point: build the spider and start the interactive crawl.
    ChineseMedicineSpider().run_spider()