Selenium爬取QQ空间说说信息

知乎上看到这篇文章,就拿来改进运行下。

可能是因为不太会用Selenium,page_source在这里并不能获取到动态加载后的网页数据,所以只能用自带的一系列find_element方法。但仍然存在无法完全抓取的问题,这里就直接放上源码了,有些乱,凑合看看先…

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# encoding:utf-8

import re
import time
import pymysql
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


# 屏幕截图
def get_pic(pic_name):
    """Capture a screenshot of the current page and save it as *pic_name*.

    Relies on the module-level ``driver`` created in ``__main__``.
    """
    png_bytes = driver.get_screenshot_as_png()
    time.sleep(5)
    with open(pic_name, 'wb') as out_file:
        out_file.write(png_bytes)


def savedata(qq, post_data):
    """Persist one scraped post into the QQSpace table.

    ``post_data`` is ``[post_time, content, star, cmt]``; uses the
    module-level ``cursor`` and ``db`` opened in ``__main__``.
    """
    print(post_data)
    sql = 'INSERT INTO QQSpace(qq, postime, content, star, cmt)VALUES(%s,%s,%s,%s,%s)'
    cursor.execute(sql, (qq, *post_data[:4]))
    # Commit each row immediately so a crash mid-scrape keeps earlier rows.
    db.commit()


def getdata(qq):
    """Scrape every visible post on the current Qzone page and save each one.

    Reads the module-level ``driver``; for each post it collects the
    timestamp, the text content, and the like/comment counts parsed from
    the footer, then hands the row to ``savedata``.
    """
    print('get data...')
    try:
        # The post list lives inside this iframe; ignore failure if we are
        # already switched into it (or it is absent on this page).
        driver.switch_to.frame('app_canvas_frame')
    except Exception:
        pass

    content = driver.find_elements_by_css_selector('.content')  # div.bd pre.content
    stime = driver.find_elements_by_css_selector('.c_tx.c_tx3.goDetail')  # div.info span.c_tx3 a
    ft = driver.find_elements_by_class_name('ft')
    print(len(content), len(stime), len(ft))

    for con, sti, ft_elem in zip(content, stime, ft):
        try:
            star, cmt = _parse_ft(ft_elem.text)
        except Exception:
            # Element may have gone stale; record the post without counts.
            star, cmt = None, None

        post_data = [sti.text, con.text, star, cmt]
        try:
            savedata(qq, post_data)
        except Exception:
            # Best-effort: a bad row (e.g. DB error) must not stop the scrape.
            pass


def _parse_ft(text):
    """Parse a post-footer string into ``(star, cmt)`` counts.

    The footer mixes counts in parentheses, e.g. ``赞(3) 评论(2)``;
    normalising ``)`` to ``(`` and splitting yields the counts at odd
    indices.  Returns ``(None, None)`` when the shape is unrecognised.
    """
    parts = text.replace(')', '(').split('(')
    if '赞' in parts[0]:
        if len(parts) == 3:
            return parts[1], None
        # 5 and 7 parts parse identically: like count then comment count
        # (the original code had two duplicate branches for these).
        if len(parts) in (5, 7):
            return parts[1], parts[3]
        return None, None
    # No like section: only a comment count is present.
    return None, parts[1]



# 登录QQ space
def get_shuoshuo(qq):
driver.get('https://user.qzone.qq.com/{}/311'.format(qq))
time.sleep(7)
try:
# get_pic('pre_login.png')
driver.find_element_by_id('login_div')
a = True
except:
print('can not find login_div')
a = False

if a == True:
print('logining...')
driver.switch_to.frame('login_frame')
driver.find_element_by_id('switcher_plogin').click()
driver.find_element_by_id('u').clear() # 选择用户名框
driver.find_element_by_id('u').send_keys('QQ号')
driver.find_element_by_id('p').clear()
driver.find_element_by_id('p').send_keys('QQ密码')
driver.find_element_by_id('login_button').click()

# get_pic('aft_input.png')
print('ok!')

driver.implicitly_wait(5)
try:
driver.find_element_by_id('QM_OwnerInfo_Icon')
b = True
except:
print('can not find QM...')
b = False


if b == True:
time.sleep(7)
getdata(qq)
continue_grap = True
while continue_grap == True:
try:
driver.find_element_by_link_text('下一页').click()
# print('sleeping...')
time.sleep(7)
getdata(qq)
except:
break




if __name__ == '__main__':

    # Open the database connection; closed in the finally block below even
    # if the scrape raises, so neither the DB handle nor PhantomJS leaks.
    db = pymysql.connect(host="localhost", user="root",
                         password="数据库密码", db="SpiderData",
                         charset='utf8')
    cursor = db.cursor()

    # Configure PhantomJS with a desktop Firefox user agent so Qzone
    # serves the full (non-mobile) page.
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")
    driver = webdriver.PhantomJS('/home/shen/Downloads/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
                                 desired_capabilities=dcap)
    try:
        driver.maximize_window()
        get_shuoshuo('要抓取的好友QQ号')
    finally:
        # quit() closes every window and ends the PhantomJS process.
        driver.quit()
        db.close()

输出:

击蒙御寇