
Logistic Regression, Decision Tree, and Naive Bayes Classification, plus Handwritten Digit Recognition

Source: 花匠小妙招 · 2024-09-14 08:23

Logistic Regression

Classifying the iris data with logistic regression:

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


def iris_type(s):
    it = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    return it[s]


if __name__ == "__main__":
    path = '8.iris.data'  # path to the data file

    # # Reading and encoding the data by hand:
    # x, y = [], []
    # with open(path) as f:
    #     for d in f:
    #         d = d.strip()
    #         if d:
    #             d = d.split(',')
    #             y.append(d[-1])
    #             x.append(list(map(float, d[:-1])))
    # print('raw X:\n', x)
    # print('raw Y:\n', y)
    # x = np.array(x)
    # print('NumPy X:\n', x)
    # y = np.array(y)
    # print('NumPy Y - 1:\n', y)
    # y[y == 'Iris-setosa'] = 0
    # y[y == 'Iris-versicolor'] = 1
    # y[y == 'Iris-virginica'] = 2
    # print('NumPy Y - 2:\n', y)
    # y = y.astype(int)
    # print('NumPy Y - 3:\n', y)

    # # Using pandas plus sklearn's preprocessing instead (the file has no header row):
    # df = pd.read_csv(path, header=None)
    # x = df.values[:, :-1]
    # y = df.values[:, -1]
    # print('x = \n', x)
    # print('y = \n', y)
    # le = preprocessing.LabelEncoder()
    # le.fit(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
    # print(le.classes_)
    # y = le.transform(y)
    # print('Last Version, y = \n', y)

    # Load comma-separated floats; column 4 (the species name) is converted
    # to an integer label by iris_type
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    print(data)
    # Columns 0-3 become x, column 4 becomes y
    x, y = np.split(data, (4,), axis=1)
    # For visualization, keep only the first two features (sepal length/width)
    x = x[:, :2]

    # x = StandardScaler().fit_transform(x)
    # lr = LogisticRegression()   # logistic regression model
    # lr.fit(x, y.ravel())        # fit the parameters to the data [x, y]

    # Equivalent pipeline form
    lr = Pipeline([('sc', StandardScaler()),
                   ('clf', LogisticRegression())])
    lr.fit(x, y.ravel())

    # Plot the decision regions
    N, M = 500, 500  # number of sampling points along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)                   # grid of sampling points
    x_test = np.stack((x1.flat, x2.flat), axis=1)  # points to classify
    # If all four features were kept, the other two dimensions would have to
    # be padded, e.g. with their column means:
    # x3 = np.ones(x1.size) * np.average(x[:, 2])
    # x4 = np.ones(x1.size) * np.average(x[:, 3])
    # x_test = np.stack((x1.flat, x2.flat, x3, x4), axis=1)
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_hat = lr.predict(x_test)                     # predictions on the grid
    y_hat = y_hat.reshape(x1.shape)                # reshape to match the grid
    plt.pcolormesh(x1, x2, y_hat, cmap=cm_light)   # decision regions
    plt.scatter(x[:, 0], x[:, 1], c=y.ravel(), edgecolors='k', s=50, cmap=cm_dark)  # samples
    plt.xlabel('sepal length')
    plt.ylabel('sepal width')
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid()
    plt.savefig('2.png')
    plt.show()

    # Predictions on the training set
    y_hat = lr.predict(x)
    y = y.reshape(-1)
    result = y_hat == y
    print(y_hat)
    print(result)
    acc = np.mean(result)
    print('Accuracy: %.2f%%' % (100 * acc))

Result: accuracy 79.33%. [Figure: logistic-regression decision regions over the two sepal features, with the training samples overlaid.]
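Only two of the four features are used here, which caps the accuracy at 79.33% even on the training data. As a rough check of what the two discarded petal features contribute, the sketch below (an aside, not part of the original script; as an assumption it uses sklearn's bundled copy of the iris data via load_iris rather than the 8.iris.data file) compares cross-validated accuracy with two features and with all four:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

x, y = load_iris(return_X_y=True)
for cols, name in [([0, 1], 'sepal features only'), ([0, 1, 2, 3], 'all four features')]:
    lr = Pipeline([('sc', StandardScaler()), ('clf', LogisticRegression())])
    scores = cross_val_score(lr, x[:, cols], y, cv=5)  # 5-fold cross-validation
    print('%s: %.2f%% mean CV accuracy' % (name, 100 * scores.mean()))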

Classifying the iris data with a decision tree:

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


def iris_type(s):
    it = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    return it[s]


# sepal length, sepal width, petal length, petal width
iris_feature = 'sepal length', 'sepal width', 'petal length', 'petal width'

if __name__ == "__main__":
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font, kept from the original
    mpl.rcParams['axes.unicode_minus'] = False
    path = '8.iris.data'  # path to the data file
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    print(data)
    x, y = np.split(data, (4,), axis=1)
    # For visualization, keep only the first two features
    x = x[:, :2]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
    model = Pipeline([
        ('ss', StandardScaler()),
        ('DTC', DecisionTreeClassifier(criterion='entropy', max_depth=3))])
    model = model.fit(x_train, y_train)
    y_test_hat = model.predict(x_test)  # predictions on the test data
    # Export the fitted tree in Graphviz .dot format
    with open('iris_tree.dot', 'w') as f:
        tree.export_graphviz(model.named_steps['DTC'], out_file=f)

    # Plot the decision regions
    N, M = 100, 100  # number of sampling points along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)                   # grid of sampling points
    x_show = np.stack((x1.flat, x2.flat), axis=1)  # points to classify
    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_show_hat = model.predict(x_show)             # predictions on the grid
    y_show_hat = y_show_hat.reshape(x1.shape)      # reshape to match the grid
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)  # decision regions
    plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test.ravel(), edgecolors='k', s=100, cmap=cm_dark, marker='o')  # test data
    plt.scatter(x[:, 0], x[:, 1], c=y.ravel(), edgecolors='k', s=40, cmap=cm_dark)  # all data
    plt.xlabel(iris_feature[0], fontsize=15)
    plt.ylabel(iris_feature[1], fontsize=15)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(True)
    plt.title('Decision tree classification of the iris data', fontsize=17)
    plt.show()

    # Results on the test set
    y_test = y_test.reshape(-1)
    print(y_test_hat)
    print(y_test)
    result = (y_test_hat == y_test)  # True = correct prediction, False = error
    acc = np.mean(result)
    print('Accuracy: %.2f%%' % (100 * acc))

    # Overfitting: test error as a function of tree depth
    depth = np.arange(1, 15)
    err_list = []
    for d in depth:
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
        clf = clf.fit(x_train, y_train)
        y_test_hat = clf.predict(x_test)  # predictions on the test data
        result = (y_test_hat == y_test)   # True = correct prediction, False = error
        err = 1 - np.mean(result)
        err_list.append(err)
        print(d, 'error rate: %.2f%%' % (100 * err))
    plt.figure(facecolor='w')
    plt.plot(depth, err_list, 'ro-', lw=2)
    plt.xlabel('tree depth', fontsize=15)
    plt.ylabel('error rate', fontsize=15)
    plt.title('Tree depth and overfitting', fontsize=17)
    plt.grid(True)
    plt.show()

Result: test-set accuracy 80.00%. [Figure: decision-tree decision regions, plus the depth-versus-error curve.]
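The script also writes the fitted tree to iris_tree.dot in Graphviz format. One way to view it (a hedged sketch, not part of the original script; it assumes both the graphviz Python package and the Graphviz system binaries are installed) is:

import graphviz

with open('iris_tree.dot') as f:
    src = graphviz.Source(f.read())   # parse the exported DOT source
src.render('iris_tree', format='png', cleanup=True)  # writes iris_tree.png

Equivalently, on the command line: dot -Tpng iris_tree.dot -o iris_tree.png.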

Handwritten digit recognition:

import os

import numpy as np
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
from sklearn import svm


def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    print(tip + ' accuracy: %.2f%%' % (100 * np.mean(acc)))


def save_image(im, i):
    im *= 15.9375   # 255 / 16: stretch the 0..16 pixel values to 0..255
    im = 255 - im   # invert so the digit is dark on a light background
    a = im.astype(np.uint8)
    output_path = 'HandWritten'
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    Image.fromarray(a).save(os.path.join(output_path, '%d.png' % i))


if __name__ == "__main__":
    print('Load Training File Start...')
    data = np.loadtxt('14.optdigits.tra', dtype=float, delimiter=',')
    x, y = np.split(data, (-1, ), axis=1)  # the last column is the label
    images = x.reshape(-1, 8, 8)           # each row is a flattened 8x8 image
    y = y.ravel().astype(int)

    print('Load Test Data Start...')
    data = np.loadtxt('14.optdigits.tes', dtype=float, delimiter=',')
    x_test, y_test = np.split(data, (-1, ), axis=1)
    images_test = x_test.reshape(-1, 8, 8)
    y_test = y_test.ravel().astype(int)
    print('Load Data OK...')

    matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font, kept from the original
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(15, 9), facecolor='w')
    for index, image in enumerate(images[:16]):
        plt.subplot(4, 8, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('train: %i' % y[index])
    for index, image in enumerate(images_test[:16]):
        plt.subplot(4, 8, index + 17)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        save_image(image.copy(), index)
        plt.title('test: %i' % y_test[index])
    plt.tight_layout()
    plt.show()

    clf = svm.SVC(C=1, kernel='rbf', gamma=0.001)  # ~ kNN
    print('Start Learning...')
    clf.fit(x, y)
    print('Learning is OK...')
    y_hat = clf.predict(x)
    show_accuracy(y, y_hat, 'training set')
    y_hat = clf.predict(x_test)
    print(y_hat)
    print(y_test)
    show_accuracy(y_test, y_hat, 'test set')

    # Show up to 12 misclassified test images
    err_images = images_test[y_test != y_hat]
    err_y_hat = y_hat[y_test != y_hat]
    err_y = y_test[y_test != y_hat]
    print(err_y_hat)
    print(err_y)
    plt.figure(figsize=(10, 8), facecolor='w')
    for index, image in enumerate(err_images):
        if index >= 12:
            break
        plt.subplot(3, 4, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('predicted: %i, actual: %i' % (err_y_hat[index], err_y[index]))
    plt.tight_layout()
    plt.show()

Results (the last two arrays are the predicted and true labels of the misclassified test images):

training set accuracy: 99.82%
[0 1 2 ... 8 9 8]
[0 1 2 ... 8 9 8]
test set accuracy: 98.27%
[9 1 1 1 9 5 9 9 9 9 9 8 9 8 1 9 9 1 3 8 9 9 9 9 1 4 8 3 5 5 1]
[5 2 2 2 7 7 5 7 7 7 7 6 7 3 8 8 8 8 9 9 3 8 8 8 8 0 4 9 9 3 8]
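A more compact view of these errors is a confusion matrix. The snippet below is a sketch meant to be appended to the script above, after the test-set show_accuracy call; it reuses that script's y_test and test-set y_hat:

from sklearn.metrics import confusion_matrix

# rows = true digit, columns = predicted digit; off-diagonal entries are errors
cm = confusion_matrix(y_test, y_hat)
print(cm)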

Naive Bayes classification:

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier


def iris_type(s):
    it = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    return it[s]


if __name__ == "__main__":
    data = np.loadtxt('8.iris.data', dtype=float, delimiter=',', converters={4: iris_type})
    print(data)
    x, y = np.split(data, (4,), axis=1)
    x = x[:, :2]  # for visualization, keep only the first two features
    print(x)
    print(y)
    gnb = Pipeline([
        ('sc', StandardScaler()),
        ('clf', GaussianNB())])
    gnb.fit(x, y.ravel())
    # Alternatives:
    # gnb = MultinomialNB().fit(x, y.ravel())
    # gnb = KNeighborsClassifier(n_neighbors=5).fit(x, y.ravel())

    # Plot the decision regions
    N, M = 500, 500  # number of sampling points along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)                   # grid of sampling points
    x_test = np.stack((x1.flat, x2.flat), axis=1)  # points to classify
    # If all four features were kept, the other two dimensions would have to
    # be padded, e.g. with their column means:
    # x3 = np.ones(x1.size) * np.average(x[:, 2])
    # x4 = np.ones(x1.size) * np.average(x[:, 3])
    # x_test = np.stack((x1.flat, x2.flat, x3, x4), axis=1)
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font, kept from the original
    mpl.rcParams['axes.unicode_minus'] = False
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_hat = gnb.predict(x_test)      # predictions on the grid
    y_hat = y_hat.reshape(x1.shape)  # reshape to match the grid
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_hat, cmap=cm_light)  # decision regions
    plt.scatter(x[:, 0], x[:, 1], c=y.ravel(), edgecolors='k', s=50, cmap=cm_dark)  # samples
    plt.xlabel('sepal length', fontsize=14)
    plt.ylabel('sepal width', fontsize=14)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title('GaussianNB classification of the iris data', fontsize=18)
    plt.grid(True)
    plt.show()

    # Predictions on the training set
    y_hat = gnb.predict(x)
    y = y.reshape(-1)
    result = y_hat == y
    print(y_hat)
    print(result)
    acc = np.mean(result)
    print('Accuracy: %.2f%%' % (100 * acc))

Result: accuracy 78.00%. [Figure: GaussianNB decision regions over the two sepal features.]
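GaussianNB fits these continuous measurements because it models each feature with a per-class normal distribution; MultinomialNB assumes count-like features, and kNN is a non-probabilistic baseline. The commented-out lines in the script swap them in one at a time; the sketch below (an aside, again using sklearn's bundled iris copy via load_iris as an assumption) compares all three with cross-validation:

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

x, y = load_iris(return_X_y=True)
x = x[:, :2]  # same two sepal features as above
for clf in (GaussianNB(), MultinomialNB(), KNeighborsClassifier(n_neighbors=5)):
    scores = cross_val_score(clf, x, y, cv=5)  # 5-fold cross-validation
    print(type(clf).__name__, '%.2f%% mean CV accuracy' % (100 * scores.mean()))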
