Python 爬取统计页面分析
需求: 实现 selenium 登录爬取页面数据,提取整合需要的内容
分析:
在手动登录页面后,通过 Network 查看返回数据,找到有可能获取数据的链接,再构造 url 发起请求
原本链接为 nav_to.do?uri=/home.do?,而筛选后发现 url 变为 sysparm_query%3Dactive%253Dtrue%255Eassignment_group%253Djavascript:getMyGroups()%255Estate%2521%253D6%255EEQ
可依此根据关键字进行构造,添加上日期和分页参数(sysparm_first_row)后修改为
/nav_to.do?uri=%2Fincident_list.do%3Fsysparm_query%3Dassignment_group%253Djavascript:getMyGroups()%255Esys_created_on%253E%253Djavascript:gs.dateGenerate(%25272022-01-01%2527%252C%252700:00:00%2527)%26sysparm_first_row%3D101%26sysparm_view%3D_stack%3Dtrue%26sysparm_userpref_module%3D70782f04db8ab20099f47bedae961972%26sysparm_query%3Dactive%253Dtrue%255Eassignment_group%253Djavascript:getMyGroups()%255Estate%2521%253D6%255EEQ
尝试手动登录并返回页面数据
用 selenium 登录网页,再创建一个 session 保持登录的 cookie,构造 header 进行 request 请求
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
|
# coding:utf-8
# 用webdriver登录并获取cookies,并用requests发送请求,以豆瓣为例
from selenium import webdriver
import requests
import time
import json
import sys
def main():
    """Log in through a Selenium-driven Chrome, then download the list pages.

    The browser is opened on ``login_url`` and the username/password fields
    are filled in. The browser's cookies are then copied into a
    ``requests.Session`` which fetches the incident list in steps of 100 rows
    (``sysparm_first_row`` = 1, 101, ..., 901). Each response body is written
    to ``<first_row>page.txt`` in the current directory.

    The browser is always closed on exit, even if a step fails.
    """
    # Credentials are hard-coded placeholders; fill them in before running.
    user_name = ''
    password = ''
    # Page opened in the browser to trigger the login form.
    # NOTE(review): the URL has no scheme/host here -- presumably stripped for
    # publication; prepend the instance base URL before running.
    login_url = '/nav_to.do?uri=%2Fincident_list.do%3Fsysparm_query%3Dassignment_group%253Djavascript:getMyGroups()%255Esys_created_on%253E%253Djavascript:gs.dateGenerate(%25272022-01-01%2527%252C%252700:00:00%2527)%26sysparm_first_row%3D1%26sysparm_view%3D_stack%3Dtrue%26sysparm_userpref_module%3D70782f04db8ab20099f47bedae961972%26sysparm_query%3Dactive%253Dtrue%255Eassignment_group%253Djavascript:getMyGroups()%255Estate%2521%253D6%255EEQ'
    opt = webdriver.ChromeOptions()
    # Uncomment to run Chrome without showing a browser window.
    # opt.set_headless()
    driver = webdriver.Chrome(chrome_options = opt)
    try:
        driver.get(login_url)
        print('opened login page...')
        # Give the login form a moment to render before filling it in.
        time.sleep(2)
        driver.find_element_by_css_selector('#username').send_keys(user_name)
        driver.find_element_by_css_selector('#password').send_keys(password)
        # The submit button is not clicked by the script. The pauses below
        # leave time to type a captcha (required after repeated logins) and,
        # presumably, to submit the form by hand.
        time.sleep(6)
        print('submited...')
        time.sleep(20)

        # Copy the browser cookies (a list of dicts) into a requests session
        # so that the following plain HTTP requests are authenticated.
        s = requests.Session()
        for cookie in driver.get_cookies():
            s.cookies.set(cookie['name'], cookie['value'])
        headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
        }

        # i is the first row of each page; page is the 1-based page counter.
        for page, i in enumerate(range(1, 912, 100), start=1):
            # NOTE(review): relative URL, see the note on login_url.
            page_url = '/incident_list.do?sysparm_query=assignment_group%3Djavascript:getMyGroups()%5Esys_created_on%3E%3Djavascript:gs.dateGenerate(%272022-01-01%27%2C%2700:00:00%27)&sysparm_first_row={}&sysparm_view='.format(i)
            resp = s.get(page_url, headers=headers)
            resp.encoding = 'utf-8'
            print('status_code = {0}'.format(resp.status_code))
            # Save the raw HTML so it can be parsed offline later.
            with open('{}page.txt'.format(i), 'w+', encoding='utf-8') as fout:
                fout.write(resp.text)
            print('现在已经进行到第{}页'.format(page))
    finally:
        # Always shut down the browser and its driver process.
        driver.quit()
    print('end')


if __name__ == '__main__':
    main()
|
将获取返回数据进行清洗
用 xpath 对返回的请求进行元素匹配,再用正则提取需要的格式,最终存到 csv 中
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
import time
import json
import sys
from lxml import etree
import re
import pandas as pd
#读取txt数据,转化成lxml模式并打印
def main():
    """Append every description cell of ``1page.txt`` to ``desscan.txt``.

    Reads the saved HTML page, locates the ``tbody`` of the element with id
    ``incident_table`` and collects the text nodes of ``td[3]/a`` and
    ``td[15]`` of its rows. The two counts are printed, and each ``td[15]``
    text is appended to ``desscan.txt`` followed by a blank line.

    Raises:
        IndexError: if the page contains no ``incident_table`` tbody.
    """
    # Read the whole page and close the file before parsing.
    with open('1page.txt', 'r', encoding='utf-8') as src:
        html = etree.HTML(src.read())
    item = html.xpath('//*[@id="incident_table"]/tbody')
    tbody = item[0]
    inc = tbody.xpath('.//tr/td[3]/a/text()')
    des = tbody.xpath('.//tr/td[15]/text()')
    # The two lengths differ when some rows have no text in td[15].
    print(len(inc), len(des))
    with open('desscan.txt', 'a', encoding='utf-8') as f:
        f.writelines(i + '\n' + '\n' for i in des)


if __name__ == '__main__':
    main()
|
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
import time
import json
import sys
from lxml import etree
import re
import pandas as pd
import numpy as np
def _extract_row(tbody, i):
    """Return the eight fields of the i-th row (1-based) as joined strings.

    A field is an empty string when the corresponding cell yields no text.
    The last field prefers the ``title`` attribute of ``td[15]`` and falls
    back to the cell text when that attribute is absent.
    """
    row = './/tr[{}]'.format(i)
    des = tbody.xpath(row + '/td[15]/@title')
    if not des:
        des = tbody.xpath(row + '/td[15]/text()')
    cells = [
        tbody.xpath(row + '/td[3]/a/text()'),
        tbody.xpath(row + '/td[5]/a/text()'),
        tbody.xpath(row + '/td[6]/div[1]/text()'),
        tbody.xpath(row + '/td[7]/text()'),
        tbody.xpath(row + '/td[9]/a/text()'),
        tbody.xpath(row + '/td[11]/text()'),
        tbody.xpath(row + '/td[14]/text()'),
        des,
    ]
    return ["".join(cell) for cell in cells]


def main():
    """Convert the saved list pages into a single ``panlis.csv``.

    Pages are read from ``1page.txt``, ``101page.txt``, ..., ``901page.txt``
    (file name = number of the first row on the page). Up to 100 rows are
    extracted from each page. Rows whose eight fields are all empty are
    dropped, so a short last page does not produce blank records. The CSV is
    written once, with a single header line, indexed by absolute row number.
    """
    titlelis = ['inc','openby','date','state','assignto','mar','shortdes','des']
    rows = []
    indexlis = []
    for num in range(10):
        # Page files are named after their first row: 1, 101, ..., 901.
        filename = '{}page.txt'.format(num * 100 + 1)
        with open(filename, 'r', encoding='utf-8') as f:
            st = f.read()
        print('open {}'.format(filename))
        html = etree.HTML(st)
        item = html.xpath('//*[@id="incident_table"]/tbody')
        if not item:
            # Nothing to extract from this page; keep processing the rest.
            print('no incident_table found in {}'.format(filename))
            continue
        for i in range(1, 101):
            fields = _extract_row(item[0], i)
            if not any(fields):
                # Row position past the end of a short page.
                continue
            rows.append(fields)
            indexlis.append(num * 100 + i)
    a = pd.DataFrame(rows, columns=titlelis, index=indexlis)
    # One write for all pages, so the header appears only once.
    a.to_csv('panlis.csv', encoding='utf-8-sig')


if __name__ == '__main__':
    main()
|
序列化并提取词汇
转化为 pandas 格式,尝试用 nltk 将高频词汇提取出来,再用 jieba 制作词云,但当时免费库还不够成熟,效果不理想
展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
import pandas as pd
import time
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
def main():
    """Tokenise and POS-tag the ``des`` column of ``panlis.csv``.

    The non-empty descriptions are lower-cased and joined with newlines, so
    that words of adjacent descriptions are not fused together. The function
    first prints the stems of two sample words under the Lancaster, Porter
    and Snowball stemmers for comparison, then prints the token list and its
    part-of-speech tags.
    """
    df = pd.read_csv('panlis.csv')
    # Empty cells are read back as NaN; drop them before joining as text.
    descriptions = df['des'].dropna().astype(str).tolist()
    text = "\n".join(descriptions).lower()

    # Side-by-side comparison of the three stemmers on sample words.
    stemmers = (
        LancasterStemmer(),
        PorterStemmer(),
        SnowballStemmer('english'),
    )
    for stemmer in stemmers:
        for word in ('maximum', 'multiply'):
            print(stemmer.stem(word))

    text2 = nltk.word_tokenize(text)
    print(text2)
    print(nltk.pos_tag(text2))


if __name__ == '__main__':
    main()
|