A Simple Implementation of the Naive Bayes Algorithm

Notes on the naive Bayes algorithm, from Chapter 4 of Li Hang's 《统计学习方法》 (Statistical Learning Methods).

Algorithm Principle
Maximum Likelihood Estimation
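
For reference, the maximum likelihood estimates from Chapter 4 of the book, where I(·) is the indicator function and a_{jl} is the l-th possible value of the j-th feature:

$$P(Y = c_k) = \frac{\sum_{i=1}^{N} I(y_i = c_k)}{N}, \quad k = 1, 2, \ldots, K$$

$$P(X^{(j)} = a_{jl} \mid Y = c_k) = \frac{\sum_{i=1}^{N} I(x_i^{(j)} = a_{jl},\; y_i = c_k)}{\sum_{i=1}^{N} I(y_i = c_k)}$$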

Bayesian Estimation
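
The Bayesian estimates add a smoothing constant λ ≥ 0, where S_j is the number of possible values of the j-th feature and K is the number of classes; λ = 1 is called Laplace smoothing:

$$P_{\lambda}(X^{(j)} = a_{jl} \mid Y = c_k) = \frac{\sum_{i=1}^{N} I(x_i^{(j)} = a_{jl},\; y_i = c_k) + \lambda}{\sum_{i=1}^{N} I(y_i = c_k) + S_j \lambda}$$

$$P_{\lambda}(Y = c_k) = \frac{\sum_{i=1}^{N} I(y_i = c_k) + \lambda}{N + K\lambda}$$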

As shown above, when λ = 0 the Bayesian estimate reduces to the maximum likelihood estimate.

Python Implementation

The data used here is the handwritten-digit data set that also appeared earlier in the KNN post.

Implementing the Algorithm by Hand

'''
Naive Bayes prediction of handwritten digits
'''

import os
import time
import numpy as np


# Read one 32x32 text image and flatten it into a 1 x 1024 vector
def img2vector(filename):
    rows = 32
    cols = 32
    imgVector = np.zeros((1, rows * cols))
    with open(filename) as fileIn:
        for row in range(rows):
            lineStr = fileIn.readline()
            for col in range(cols):
                imgVector[0, row * cols + col] = int(lineStr[col])
    return imgVector


# Load the training set; each file name looks like "digit_index.txt"
def get_training_sample():
    os.chdir('/home/shen/PycharmProjects/MyPython/统计学习方法/Naive Bayes/digits/trainingDigits')
    files = os.listdir()
    numSamples = len(files)
    train_x = np.zeros((numSamples, 1024))
    train_y = []
    for i in range(numSamples):
        filename = files[i]
        yi = int(filename.split('_')[0])  # the label is the part before the underscore
        xi = img2vector(filename)

        train_x[i, :] = xi
        train_y.append(yi)

    return train_x, train_y


# Load the test set, in the same way as the training set
def get_test_sample():
    os.chdir('/home/shen/PycharmProjects/MyPython/统计学习方法/Naive Bayes/digits/testDigits')
    files = os.listdir()
    numSamples = len(files)
    test_x = np.zeros((numSamples, 1024))
    test_y = []
    for i in range(numSamples):
        filename = files[i]
        yi = int(filename.split('_')[0])
        xi = img2vector(filename)

        test_x[i, :] = xi
        test_y.append(yi)

    return test_x, test_y


# Core of training: class-conditional probabilities for class yi, i.e. the
# Bayesian estimates of P(x_j = 0 | y = yi) and P(x_j = 1 | y = yi) for every pixel j
def get_train_x_yi(train_x, train_y, yi):
    train_x_yi = train_x[np.asarray(train_y) == yi]

    # Each pixel takes S_j = 2 possible values (0 or 1), hence 2 * lamda in the denominator
    x_equal_1 = (train_x_yi.sum(axis=0) + lamda) / (len(train_x_yi) + 2 * lamda)
    x_equal_0 = 1 - x_equal_1
    x_prec = np.vstack((x_equal_0, x_equal_1))

    return x_prec


def train(train_x, train_y):
    # Build the structure the predictor needs:
    # [[P(y=0), [{0: p0, 1: p1}, ... 1024 per-pixel dicts ...]], [P(y=1), [...]], ...]
    train_result = [[0, [{} for _ in range(32 * 32)]] for _ in range(10)]

    # Prior P(y = c_k), Bayesian estimate with K = number of classes
    unique_yi, counts_yi = np.unique(train_y, return_counts=True)
    prec_yi = (counts_yi + lamda) / (len(train_y) + len(unique_yi) * lamda)
    for i in range(len(prec_yi)):
        train_result[i][0] = prec_yi[i]

    # Conditional probabilities P(x_j = 0 | y) and P(x_j = 1 | y) for every class and pixel
    for y in range(10):
        x_prec = get_train_x_yi(train_x, train_y, y)

        for xi in range(len(x_prec[0])):
            train_result[y][1][xi][0] = x_prec[0][xi]
            train_result[y][1][xi][1] = x_prec[1][xi]

    return train_result

# Evaluate on a labelled set: pick the class with the largest posterior score
def predict(test_x, test_y, train_result):
    true_pre = 0
    for i in range(len(test_x)):
        yi = test_y[i]
        xi = test_x[i]
        result_xi = []
        for classx in range(10):
            # P(y) * prod_j P(x_j | y); the product of 1024 probabilities can become very small
            P_yi = train_result[classx][0]
            for k in range(len(xi)):
                P_yi *= train_result[classx][1][k][int(xi[k])]
            result_xi.append(P_yi)

        y0 = int(np.argmax(result_xi))
        if y0 == yi:
            true_pre += 1

    print('Prediction accuracy: ', true_pre / len(test_x))


def run():
    s = time.time()
    train_x, train_y = get_training_sample()
    test_x, test_y = get_test_sample()

    train_result = train(train_x, train_y)
    predict(test_x, test_y, train_result)
    e = time.time()
    print('Total training + prediction time: ', e - s)


if __name__ == '__main__':
    # Choosing lamda:
    # lamda = 0 gives the maximum likelihood estimate
    # lamda > 0 gives the Bayesian estimate; in particular, lamda = 1 is Laplace smoothing
    lamda = 0  # lamda = 0 gave the higher accuracy here
    run()
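
A practical caveat about the product inside predict: multiplying 1024 per-pixel probabilities can underflow float64. The usual remedy is to sum log-probabilities instead. Below is a minimal sketch of that variant; it assumes lamda > 0 so that no probability is exactly zero, and reuses train_result from above.

# Log-space scoring: same argmax as predict(), but numerically safer (assumes lamda > 0)
def predict_log(test_x, test_y, train_result):
    true_pre = 0
    for i in range(len(test_x)):
        xi = test_x[i]
        scores = []
        for classx in range(10):
            # log P(y) + sum_j log P(x_j | y)
            log_p = np.log(train_result[classx][0])
            for k in range(len(xi)):
                log_p += np.log(train_result[classx][1][k][int(xi[k])])
            scores.append(log_p)
        if int(np.argmax(scores)) == test_y[i]:
            true_pre += 1
    print('Prediction accuracy: ', true_pre / len(test_x))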



Output:

Comparison with the sklearn Implementation

For comparison, here is the same task implemented with sklearn's naive Bayes models.


import os
import time
import numpy as np
# the naive Bayes models
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# The data-loading functions are the same ones defined above; only a few of them are needed,
# so they are not repeated here
if __name__ == '__main__':
    s = time.time()

    train_x, train_y = get_training_sample()
    test_x, test_y = get_test_sample()
    nb = MultinomialNB()
    # nb = GaussianNB()
    nb.fit(train_x, train_y)
    print(nb.score(test_x, test_y))

    e = time.time()
    print('Total training + prediction time: ', e - s)

Output:

nb = MultinomialNB()
0.923890063425
Total training + prediction time  1.693662405014038

nb = GaussianNB()
0.733615221987
Total training + prediction time  1.8078570365905762

As the numbers show, the sklearn implementation is clearly much faster, and with an appropriate choice of model it also reaches a fairly high accuracy.
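
Since every pixel feature here is binary (0/1), sklearn's BernoulliNB is arguably the closest counterpart to the hand-written model above. A minimal sketch, assuming the same loader functions as before:

from sklearn.naive_bayes import BernoulliNB

# Each pixel is a 0/1 feature, so a per-class Bernoulli model mirrors the
# P(x_j = 0 | y) / P(x_j = 1 | y) tables built by hand above
train_x, train_y = get_training_sample()
test_x, test_y = get_test_sample()
bnb = BernoulliNB(alpha=1.0)  # alpha plays the same role as lamda (Laplace smoothing)
bnb.fit(train_x, train_y)
print(bnb.score(test_x, test_y))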

Notes

The training result here uses a fairly involved nested structure, and at first I could not work out how to build it. It became much easier once I reasoned backwards:

When a new, unseen test vector comes in, which numbers do I actually need in order to predict its class?
How can I look those numbers up efficiently?
Starting from what the predictor needs, and building only that, made the problem much simpler.
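
Concretely, the only lookups the predictor ever performs have this shape (assuming the train_result built by train() above is in scope):

# train_result[y]        -> [prior P(y), list of 1024 per-pixel probability dicts]
# train_result[y][0]     -> P(y)
# train_result[y][1][k]  -> {0: P(x_k = 0 | y), 1: P(x_k = 1 | y)}
prior_7 = train_result[7][0]           # prior probability of digit 7
p_pixel = train_result[7][1][500][1]   # P(pixel 500 == 1 | digit == 7)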

击蒙御寇