Baidu News Crawler: Basic Crawling

A crawler for Baidu News that extracts news text by category.

Environment

Ubuntu 14.04, PyCharm, Anaconda3,
MySQL: mysql Ver 14.14 Distrib 5.5.53, for debian-linux-gnu (x86_64) using readline 6.3
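Besides the system setup above, the scripts import several third-party packages; assuming a pip-based setup (package names inferred from the imports below), something like pip install requests beautifulsoup4 lxml chardet pymysql should cover them.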

Files:

MyTools.py: utility functions for fetching, parsing, and storing page content
single_thread.py: the single-threaded crawler
globalValue.py: passes the news counter count across files (see reference)

MyTools.py

import re
import time
import pymysql
import chardet
import requests
from bs4 import BeautifulSoup

import globalValue

from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib



# headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"}
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"}

# Open the database connection; close it after the crawl finishes
db = pymysql.connect("localhost", "root", "your_password", "USpiderData", charset='utf8')
cursor = db.cursor()
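# Note: the USpiderData database and BaiduNews table are assumed to already exist.
# A minimal schema, inferred from the INSERT statement below (column sizes are an assumption):
#   CREATE TABLE IF NOT EXISTS BaiduNews (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       class VARCHAR(64),
#       title VARCHAR(255),
#       text LONGTEXT
#   ) DEFAULT CHARSET=utf8;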



# Given a list of news-category URLs (class_urls), fetch and store the data
# Helper functions used below: get_class_data, getdata
def get_save_data(class_urls):
    for class_url in class_urls:
        class_data = get_class_data(class_url)
        getdata(class_data)



# For each category page, collect the title/link pairs of individual news items
def get_class_data(class_url):
    class_data = requests.get(class_url, headers=headers)
    # Detect the encoding
    char = chardet.detect(class_data.content)
    class_data.encoding = char['encoding']
    # Parse the page
    class_data = class_data.text
    soup = BeautifulSoup(class_data, 'lxml')
    data = soup.findAll('a', {'target': '_blank'})
    class_data = {}
    for i in range(len(data)):
        title = data[i].get_text()
        href = data[i].get('href')
        # Filter out some noise links (e.g. titles containing '下载', i.e. "download")
        if len(title) > 10:
            if not '下载' in title:
                class_data[title] = href

    # e.g. 'http://finance.baidu.com/' -> 'finance'
    classname = class_url.split('.')[0][7:]

    # classname is returned mainly to make the database insert easier
    return [classname, class_data]


# Given one category's data (class_data), fetch each article's text and save it to the database
# Helper function used below: get_news_text
def getdata(class_data):
    class_title = class_data[0]
    class_data = class_data[1]
    for news_title, news_url in dict(class_data).items():

        print(news_title)
        text = get_news_text(news_url)
        if text is not None:
            print("Got text data...")
        else:
            text = 'lost'
        try:
            sql = 'INSERT INTO BaiduNews(class, title, text) VALUES(%s, %s, %s)'
            cursor.execute(sql, (class_title, news_title, text))
            # Commit the transaction
            db.commit()
        except:
            print('Save fail...')
            pass


# Fetch the full text of a single news article (rough extraction)
'''
Baidu News links redirect to many different news sites, and most of them
have different page structures, so it is hard to extract the full text
reliably; for now this is only a rough extraction.

Later I plan to try CSS-based extraction with pyquery, and, time permitting,
look into dedicated text-extraction algorithms.
'''
def get_news_text(href):
    try:
        data = requests.get(href, headers=headers)
        # Detect the encoding
        char = chardet.detect(data.content)
        data.encoding = char['encoding']
        # Parse the page
        data = BeautifulSoup(data.text, 'lxml')
        # Match several possible values of the class attribute
        # data = BeautifulSoup(data.text, 'lxml').find("div", {'class': ['text', 'article', 'content']})
        data = data.find("div", {'class': re.compile(r"^(text|article|content)$")})
        text = data.get_text()
        count = globalValue.get_value()
        print(count)
        count += 1
        globalValue.set_value(count)
    except:
        text = None
        pass
    return text




# Email notification
def _format_addr(s):
    name, addr = parseaddr(s)
    return formataddr((Header(name, 'utf-8').encode(), addr))


def send_ms(T):
    from_addr = "1021550072@qq.com"
    # Authorization code generated when enabling the SMTP service for QQ mail
    # See http://jingyan.baidu.com/article/4f7d5712b1ac7c1a201927da.html
    password = 'your_auth_code'
    to_addr = '1021550072@qq.com'
    smtp_server = 'smtp.qq.com'
    msg = MIMEText(T, 'plain', 'utf-8')
    msg['From'] = _format_addr('Anyone')
    msg['To'] = _format_addr('Echo')
    msg['Subject'] = Header('The New Report', 'utf-8').encode()
    server = smtplib.SMTP_SSL(smtp_server, 465, timeout=10)
    server.set_debuglevel(0)
    server.login(from_addr, password)
    server.sendmail(from_addr, [to_addr], msg.as_string())
    server.quit()
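As the docstring in get_news_text mentions, a later version could switch to pyquery's CSS selectors instead of the BeautifulSoup regex match. A minimal sketch of that idea, assuming the same headers as above; the selector list simply mirrors the div classes matched by the regex version and real sites will need their own selectors:

from pyquery import PyQuery as pq
import requests

def get_news_text_pq(href):
    # Sketch only: fetch the page and pull text out with a CSS selector.
    resp = requests.get(href, headers=headers)   # headers as defined in MyTools.py
    doc = pq(resp.content)
    # Mirror the div classes matched by the regex version above; adjust per target site.
    text = doc('div.text, div.article, div.content').text()
    return text or None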

single_thread.py

import re
import time
import pymysql
from MyTools import *

import globalValue


if __name__ == '__main__':
    # Initialize count, which records the number of news items crawled
    count = 0
    globalValue.set_value(count)

    s = time.ctime()
    start = time.time()

    class_lists = ['http://finance.baidu.com/', 'http://internet.baidu.com/', 'http://yule.baidu.com/',
                   'http://shipin.news.baidu.com/']

    # Start crawling
    get_save_data(class_lists)
    count = globalValue.get_value()

    # Remember to close the database connection
    db.close()

    end = time.time()
    e = time.ctime()
    total_time = end - start
    print(total_time)
    print(count)
    # Write the crawl log
    with open("single_thread.txt", 'a') as f:
        f.write("\nSingle-thread crawl\nStarted at %s, finished at %s, took %s seconds\nCrawled %s news items in total" % (s, e, total_time, count))

    # Send an email notification (optional)
    T = "\nSingle-thread crawl\nStarted at %s, finished at %s, took %s seconds\nCrawled %s news items in total" % (s, e, total_time, count)

    send_ms(T)
    print('Notification email sent, please check your inbox')

globalValue.py

def set_value(input_value):
    global value
    value = input_value


def get_value():
    return value
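A quick sketch of how the counter travels between the two files (this just restates what the code above already does):

# In single_thread.py, before crawling:
import globalValue
globalValue.set_value(0)

# In MyTools.get_news_text, after each successfully extracted article:
count = globalValue.get_value()
globalValue.set_value(count + 1)

Note that get_value() raises a NameError if set_value() was never called, so the main script must initialize the counter before starting the crawl.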

Log file and run result (an email is also sent to the specified mailbox):
(the crawl took a bit over 400 seconds)

击蒙御寇