Baidu News Crawler: Multi-Process Scraping


An improvement on the previous post: the crawl now runs through a process pool.

Environment:

Ubuntu 14.04, PyCharm, Anaconda3
MySQL: mysql Ver 14.14 Distrib 5.5.53, for debian-linux-gnu (x86_64) using readline 6.3

File layout:

MyTools.py - utility functions that fetch, parse, and store the page content [same as the previous post]
processing_pool.py - the process-pool crawl implementation
globalValue.py - passes the news counter count across files
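The getdata() function in MyTools.py inserts rows into a BaiduNews table, but the post never shows the table itself. Below is a minimal sketch of a schema that matches the INSERT statement, created through pymysql; the column types and the id column are my assumptions, not from the original code.

import pymysql

# Column names mirror the INSERT in MyTools.py; types and the id column are assumptions.
db = pymysql.connect("localhost", "root", "database password", "database name", charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS BaiduNews (
        id INT AUTO_INCREMENT PRIMARY KEY,
        class VARCHAR(100),
        title VARCHAR(255),
        text LONGTEXT
    ) DEFAULT CHARSET=utf8
""")
db.close()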

MyTools.py

import re
import time
import pymysql
import chardet
import requests
from bs4 import BeautifulSoup

import globalValue

from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib



# headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"}
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"}

# Open the database connection; it gets closed after the actual crawl finishes
db = pymysql.connect("localhost", "root", "database password", "database name", charset='utf8')
cursor = db.cursor()



# Given a list of news-category URLs (class_urls), fetch and store the data
# Relies on the helper functions get_class_data and getdata defined below
def get_save_data(class_urls):
    for class_url in class_urls:
        class_data = get_class_data(class_url)
        getdata(class_data)



# For each category page, extract the individual news titles and links (class_data)
def get_class_data(class_url):
    class_data = requests.get(class_url, headers=headers)
    # Sniff the charset from the page's meta tag, if present
    pa = re.compile(r'charset=(.*?)">')
    try:
        charset = re.findall(pa, class_data.text)[0]
        class_data.encoding = charset
    except IndexError:
        pass
    # class_data.encoding = 'gbk'
    class_data = class_data.text
    soup = BeautifulSoup(class_data, 'lxml')
    data = soup.findAll('a', {'target': '_blank'})
    class_data = {}
    for i in range(len(data)):
        title = data[i].get_text()
        href = data[i].get('href')
        # Filter out noise links: very short titles and '下载' (download) links
        if len(title) > 10:
            if '下载' not in title:
                class_data[title] = href

    # Derive the category name from the URL; returning it makes saving to the database easier
    classname = class_url.split('.')[0][7:]

    return [classname, class_data]


# Given one category's info (class_data), fetch each article's text and store it in the database
# Relies on the helper function get_news_text defined below
def getdata(class_data):
    class_title = class_data[0]
    class_data = class_data[1]
    for news_title, news_url in dict(class_data).items():
        print(news_title)
        text = get_news_text(news_url)
        if text is not None:
            print("Got text data...")
        else:
            text = 'lost'
        try:
            sql = 'INSERT INTO BaiduNews(class, title, text) VALUES(%s, %s, %s)'
            cursor.execute(sql, (class_title, news_title, text))
            # Commit the transaction
            db.commit()
        except:
            print('Save fail...')
            # Roll back the failed insert
            db.rollback()


# Fetch the actual text of a single article (rough extraction)
'''
Baidu News links redirect to many different news sites, and most of them use
different page structures, so a complete text extraction is hard; for now this
is only a rough grab.

Later I plan to switch to pyquery CSS extraction (see the sketch after this
listing), and to look into dedicated text-extraction algorithms when time allows.
'''
def get_news_text(href):
    try:
        data = requests.get(href, headers=headers)
        # Detect the encoding
        char = chardet.detect(data.content)
        data.encoding = char['encoding']
        # Parse the page
        data = BeautifulSoup(data.text, 'lxml')
        # Match several possible values of the class attribute
        # data = BeautifulSoup(data.text, 'lxml').find("div", {'class': ['text', 'article', 'content']})
        data = data.find("div", {'class': re.compile(r"^(text|article|content)$")})
        text = data.get_text()
        # Bump the global news counter
        count = globalValue.get_value()
        print(count)
        count += 1
        globalValue.set_value(count)
    except:
        text = None
    return text




# Email notification
def _format_addr(s):
    name, addr = parseaddr(s)
    return formataddr((Header(name, 'utf-8').encode(), addr))


def send_ms(T):
    from_addr = "sender address (fill in your own)"
    # Authorization code for the QQ Mail SMTP service
    # See http://jingyan.baidu.com/article/4f7d5712b1ac7c1a201927da.html
    password = 'authorization code'
    to_addr = 'recipient address (SMTP service must be enabled)'
    smtp_server = 'smtp.qq.com'
    msg = MIMEText(T, 'plain', 'utf-8')
    msg['From'] = _format_addr('Anyone')
    msg['To'] = _format_addr('Echo')
    msg['Subject'] = Header('The New Report', 'utf-8').encode()
    server = smtplib.SMTP_SSL(smtp_server, 465, timeout=10)
    server.set_debuglevel(0)
    server.login(from_addr, password)
    server.sendmail(from_addr, [to_addr], msg.as_string())
    server.quit()
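The docstring in get_news_text() mentions switching to pyquery CSS selectors later. A minimal sketch of what that variant might look like, assuming the same headers dict defined above; the selector list simply mirrors the class names the BeautifulSoup version matches and is not from the original post.

import chardet
import requests
from pyquery import PyQuery as pq


def get_news_text_pq(href):
    # Hypothetical pyquery variant of get_news_text(); the selectors are assumptions
    try:
        resp = requests.get(href, headers=headers, timeout=10)
        resp.encoding = chardet.detect(resp.content)['encoding']
        doc = pq(resp.text)
        # Try a few common article containers and return the first non-empty text
        for selector in ('div.text', 'div.article', 'div.content'):
            text = doc(selector).text()
            if text:
                return text
    except Exception:
        pass
    return None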

processing_pool.py

# -*- coding: utf-8 -*-

'''
Process-pool crawl
'''
import re
import time
import pymysql
import send_email
from multiprocessing import Pool
from MyTools import *
import globalValue



if __name__ == '__main__':
    # Initialize the global news counter
    count = 0
    globalValue.set_value(count)

    start = time.time()
    s = time.ctime()

    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"}
    class_list = ['http://finance.baidu.com/', 'http://internet.baidu.com/', 'http://yule.baidu.com/',
                  'http://shipin.news.baidu.com/']

    # Fetch each category's link list in the parent process
    data_lists = []
    for class_url in class_list:
        data = get_class_data(class_url)
        data_lists.append(data)

    # Hand one category to each worker process
    pool = Pool(processes=4)
    pool.map(getdata, data_lists)
    # Note: this reads the parent process's copy of the counter; see the caveat after globalValue.py
    count = globalValue.get_value()

    pool.close()
    pool.join()

    db.close()

    end = time.time()
    e = time.ctime()

    total_time = end - start

    with open("processing_pool.txt", 'a') as f:
        f.write("\nprocessing_pool crawl\nStarted at %s, finished at %s, took %s seconds\n" % (s, e, total_time))
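One caveat with this setup: the db connection and cursor in MyTools.py are created at import time, so after the fork all four workers reuse the same MySQL connection, which can garble the protocol when inserts overlap. A minimal sketch of giving each worker its own connection through the pool's initializer hook; the init_db helper and this rewiring are my assumptions, not part of the original code.

import pymysql
import MyTools


def init_db():
    # Runs once in every worker process: give each worker its own connection/cursor
    # by rebinding the module-level db/cursor that MyTools.py created at import time.
    MyTools.db = pymysql.connect("localhost", "root", "database password", "database name", charset='utf8')
    MyTools.cursor = MyTools.db.cursor()


# Hypothetical usage in the main block:
# pool = Pool(processes=4, initializer=init_db)
# pool.map(getdata, data_lists)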

globalValue.py

def set_value(input_value):
    global value
    value = input_value


def get_value():
    return value
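A caveat about this counter: Pool workers are separate processes, so each worker gets its own copy of value after the fork, and the get_value() call in the parent never sees the workers' increments. Below is a minimal sketch of a process-safe counter using multiprocessing.Value; the init_worker/work names are placeholders, not part of the original code.

from multiprocessing import Pool, Value

counter = None


def init_worker(shared_counter):
    # Runs once in each worker process and stores the shared counter
    global counter
    counter = shared_counter


def work(item):
    # Stand-in for getdata(): increment the shared counter once per item
    with counter.get_lock():
        counter.value += 1


if __name__ == '__main__':
    shared = Value('i', 0)
    pool = Pool(processes=4, initializer=init_worker, initargs=(shared,))
    pool.map(work, range(10))
    pool.close()
    pool.join()
    print(shared.value)  # 10: increments from every worker are visible here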

Log file:

[The run took roughly 190 seconds, a clear speedup over the plain single-process crawl.]

击蒙御寇