想写个脚本练习一下 Python 的正则表达式,于是今天魔改了 Forever 师傅之前爬取全国高校官网的脚本:从 hao123 教育频道爬取各高校的名称和官网地址,并保存到 Excel 表格文件(College.xlsx)中。脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""
@Author : 15h3na0
@Time : 2020/4/2 16:56
@Blog: https://15h3na0.xyz/
"""
import re
import requests
from openpyxl import workbook


def get_schools(timeout=10):
    """Fetch the hao123 education index and return the links found on it.

    Downloads ``http://www.hao123.com/edu`` and extracts the ``href`` value
    of every ``<a>`` tag that directly follows the character ``所`` in the
    page text.  Presumably each link is a listing page of schools (that is
    how ``get_urls`` consumes them) -- verify against the live page.

    Args:
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error.  Defaults to 10.

    Returns:
        A list of href strings, in the order they appear on the page.
        Empty when nothing matches.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = 'http://www.hao123.com/edu'
    # requests never times out by default; without this a stalled
    # connection would block the script forever.
    response = requests.get(url, timeout=timeout)
    return re.findall(r'所<a href="(.*?)"', response.text)


def get_urls(url, timeout=10):
    """Download every page in *url* and collect the anchor fragments on it.

    Each page is decoded as ``gb2312`` and searched for anchors wrapped in
    ``<p>    <a ...</a></p></td>``.  The captured text runs from ``href=``
    through the closing ``</a>``, so each fragment still holds both the
    link target and the link text.

    Args:
        url: Iterable of page URLs to download (despite the singular name).
        timeout: Seconds to wait for each server response before
            ``requests`` raises a timeout error.  Defaults to 10.

    Returns:
        A flat list of fragment strings from all pages, in page order.

    Raises:
        requests.RequestException: If any request fails or times out.
    """
    # Compile once; the same pattern is applied to every page.
    anchor_pattern = re.compile(r"<p>    <a (href=.*?<\/a>)</p></td>")
    res = []
    for page_url in url:
        # requests never times out by default; bound each request so one
        # dead host cannot stall the whole crawl.
        response = requests.get(page_url, timeout=timeout)
        # Override the detected encoding before .text is decoded.
        response.encoding = 'gb2312'
        res.extend(anchor_pattern.findall(response.text))
    return res


def to_table(url):
    """Write school names and official sites to ``College.xlsx``.

    Creates a new workbook, writes a header row, then appends one row per
    anchor fragment with the link text in the first column and the href in
    the second.  Fragments containing ``baike`` are skipped, as are
    fragments from which a double-quoted href or a link text cannot be
    extracted.  The workbook is saved in the current working directory.

    Args:
        url: Iterable of anchor fragments shaped like
            ``href="..." ...>name</a>``, as produced by ``get_urls``.
    """
    table = workbook.Workbook()
    data = table.active
    data.append(['学校名称', '官方网站'])
    for fragment in url:
        if 'baike' in fragment:
            continue
        hrefs = re.findall(r'href="(.*?)"', fragment)
        names = re.findall(r'>(.*?)<\/a>', fragment)
        # Skip a malformed fragment rather than abort the export with an
        # IndexError on the [0] lookups below.
        if not hrefs or not names:
            continue
        data.append([names[0], hrefs[0]])
    table.save('College.xlsx')


if __name__ == '__main__':
    # Pipeline: index page links -> anchor fragments from each linked
    # page -> rows in the Excel file.
    to_table(get_urls(get_schools()))
    print('爬取完毕!')

Forever师傅脚本:全国高校URL

GitHub地址: National-University-Spider