Scraping Douban Movie Reviews

Using Douban movie reviews as an example, this post abstracts a simple scraping workflow.

Environment and tools:

Ubuntu 16.04 64-bit
PyCharm
Anaconda3
bs4 (the code below also uses requests, pymysql, and lxml as the HTML parser)

Flowchart: fetch a review-list page → parse out the review titles and links → fetch each review page → extract its fields → save them to MySQL.

Source code

import re
import pymysql
import requests
from bs4 import BeautifulSoup


# Fetch the raw HTML of one review-list page
def get_data(url):
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"}
    data = requests.get(url, headers=headers).text
    return data
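# Hardening note (not in the original): requests.get() blocks indefinitely by
# default, so a stalled connection can hang the whole crawl; passing a timeout
# avoids that, e.g.
#     data = requests.get(url, headers=headers, timeout=10).text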

# Extract each review's title and link from a list page
def parse_data_bs(data):
    title_data_d = {}
    bsObj = BeautifulSoup(data, 'lxml')
    review_lists = bsObj.findAll('div', {'typeof': "v:Review"})
    for item in review_lists:
        title_data = item.find('h3', {'class': 'title'})
        # The chained replace calls could be collapsed into one regular expression
        # (omitted here; see this site's Python regex summary for details)
        title = title_data.get_text().replace('\n', '').replace(' ', '')
        title_href = title_data.find('a').get('href')
        title_data_d[title] = title_href
    return title_data_d
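# For illustration, the regex equivalent of the two chained replace calls above,
# using the re module imported at the top:
#     title = re.sub(r'[\n ]', '', title_data.get_text())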

# Fetch a review page via its title link and extract the review's fields
def get_comment_data(comment_url):
    comment_data = get_data(comment_url)
    bsObj = BeautifulSoup(comment_data, 'lxml')
    # author data
    p_name = bsObj.find('header', {'class': 'main-hd'}).find('a').find('span').get_text()
    p_url = bsObj.find('header', {'class': 'main-hd'}).find('a').get('href')
    # comment data
    c_date = bsObj.find('span', {'class': 'main-meta'}).get_text()
    # While extracting c_rank, the raw page source turned out to differ from what
    # the browser's element inspector shows; also, some users leave no rating
    try:
        c_rank = bsObj.find('span', {'class': "main-title-hide"}).get_text()
    except AttributeError:
        c_rank = None
    c_comment = bsObj.find('div', {'property': 'v:description'}).get_text()
    # "useful" / "useless" vote counts
    useful = bsObj.find('div', {'class': 'main-panel-useful'}).find_all('button')
    y_use = useful[0].get_text().replace('\n', '').split(' ')[5]
    n_use = useful[1].get_text().replace('\n', '').split(' ')[5]
    comment_data = [p_name, p_url, c_date, c_comment, c_rank, y_use, n_use]
    return comment_data
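# The split(' ')[5] indexing above depends on the exact whitespace layout of the
# vote buttons at the time of writing; a more robust sketch (an untested
# assumption, not the original approach) would pull out the first run of digits:
#     m = re.search(r'\d+', useful[0].get_text())
#     y_use = m.group() if m else '0'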




def create_table():
    # positional args: host, user, password (redacted), database
    db = pymysql.connect("localhost", "root", "密码不给看", "USpiderData", charset='utf8')
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS DouBan")
    # columns: p_name, p_url, c_date, c_comment, c_rank, y_use, n_use
    cursor.execute("CREATE TABLE DouBan(p_name VARCHAR(20), p_url VARCHAR(100), "
                   "c_date VARCHAR(20), c_comment VARCHAR(20000), c_rank VARCHAR(5), "
                   "y_use VARCHAR(10), n_use VARCHAR(10));")  # create the table
    db.commit()
    db.close()
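# Note beyond the original: MySQL's 'utf8' charset only stores characters up to
# 3 bytes, so reviews containing emoji will fail to insert; using 'utf8mb4' for
# both the connection charset and the table avoids this.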

def save_to_mysql(cursor, db, comment_data):
    sql = ('INSERT INTO DouBan(p_name, p_url, c_date, c_comment, c_rank, y_use, n_use) '
           'VALUES(%s, %s, %s, %s, %s, %s, %s)')
    # parameterized query: comment_data supplies the seven values in order
    cursor.execute(sql, comment_data)
    # commit the transaction
    db.commit()
    print(comment_data[0])  # progress marker: the author just saved


def main_spider(list_url):
    db = pymysql.connect("localhost", "root", "密码不给看", "USpiderData", charset='utf8')
    cursor = db.cursor()
    # list_url looks like https://movie.douban.com/subject/26616436/reviews
    page_data = get_data(list_url)
    titles_data = parse_data_bs(page_data)
    for comment_title, comment_url in titles_data.items():
        comment_data = get_comment_data(comment_url)
        save_to_mysql(cursor, db, comment_data)

    db.close()


# Build every list-page URL: 20 reviews per page, start = 0, 20, ..., 4280
def get_all_list_urls():
    return ['https://movie.douban.com/subject/1292052/reviews?start=%s' % v for v in range(0, 4300, 20)]


if __name__ == '__main__':
    create_table()
    list_urls = get_all_list_urls()
    error_link = []
    for list_url in list_urls:
        try:
            main_spider(list_url)
        except Exception:
            # remember the failing page and give up once too many pages fail
            error_link.append(list_url)
            if len(error_link) > 10:
                print("ERROR!!!")
                break
    print('OK!')
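
To sanity-check the scraping logic without a MySQL instance, a dry run can fetch a single list page and print the parsed fields instead of saving them. A minimal sketch using the functions above (the URL is the first page produced by get_all_list_urls):

    # dry run: no database needed, just print each review's extracted fields
    page = get_data('https://movie.douban.com/subject/1292052/reviews?start=0')
    for title, url in parse_data_bs(page).items():
        print(title, get_comment_data(url))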
