首页分享基于Jupyter 对鸢尾花数据集和月亮数据集，分别采用线性LDA、k

基于Jupyter 对鸢尾花数据集和月亮数据集，分别采用线性LDA、k

来源：花匠小妙招时间：2025-01-11 22:46

对鸢尾花数据集和月亮数据集，分别采用线性LDA、k-means和SVM算法进行二分类可视化分析一、线性LDA算法1.定义：线性判别分析(Linear Discriminant Analysis2.鸢尾花数据集进行二分类3.月亮数据集进行二分类二、k-means算法1.定义2.鸢尾花数据集进行二分类3.月亮数据集进行二分类三、SVM算法1.定义：2.鸢尾花数据集进行二分类3.月亮数据集进行二分类四、SVM算法的优缺点优点缺点：

一、线性LDA算法

1.定义：线性判别分析(Linear Discriminant Analysis

简称LDA)是一种经典的线性学习方法，在二分类问题上因为最早由【Fisher，1936年】提出，所以也称为“Fisher 判别分析！”
Fisher（费歇）判别思想是投影，使多维问题简化为一维问题来处理。选择一个适当的投影轴,使所有的样本点都投影到这个轴上得到一个投影值。对这个投影轴的方向的要求是：使每一类内的投影值所形成的类内离差尽可能小，而不同类间的投影值所形成的类间离差尽可能大。
在这里插入图片描述

2.鸢尾花数据集进行二分类

# 鸢尾花数据集LDA线性二分类 import numpy as np import matplotlib.pyplot as plt from sklearn.datasets.samples_generator import make_classification class LDA(): def Train(self, X, y): """X为训练数据集，y为训练label""" X1 = np.array([X[i] for i in range(len(X)) if y[i] == 0]) X2 = np.array([X[i] for i in range(len(X)) if y[i] == 1]) # 求中心点 mju1 = np.mean(X1, axis=0) # mju1是ndrray类型 mju2 = np.mean(X2, axis=0) # dot(a, b, out=None) 计算矩阵乘法 cov1 = np.dot((X1 - mju1).T, (X1 - mju1)) cov2 = np.dot((X2 - mju2).T, (X2 - mju2)) Sw = cov1 + cov2 # 计算w w = np.dot(np.mat(Sw).I, (mju1 - mju2).reshape((len(mju1), 1))) # 记录训练结果 self.mju1 = mju1 # 第1类的分类中心 self.cov1 = cov1 self.mju2 = mju2 # 第2类的分类中心 self.cov2 = cov2 self.Sw = Sw # 类内散度矩阵 self.w = w # 判别权重矩阵 def Test(self, X, y): """X为测试数据集，y为测试label""" # 分类结果 y_new = np.dot((X), self.w) # 计算fisher线性判别式 nums = len(y) c1 = np.dot((self.mju1 - self.mju2).reshape(1, (len(self.mju1))), np.mat(self.Sw).I) c2 = np.dot(c1, (self.mju1 + self.mju2).reshape((len(self.mju1), 1))) c = 1/2 * c2 # 2个分类的中心 h = y_new - c # 判别 y_hat = [] for i in range(nums): if h[i] >= 0: y_hat.append(0) else: y_hat.append(1) # 计算分类精度 count = 0 for i in range(nums): if y_hat[i] == y[i]: count += 1 precise = count / nums # 显示信息 print("测试样本数量:", nums) print("预测正确样本的数量:", count) print("测试准确度:", precise) return precise if '__main__' == __name__: # 产生分类数据 n_samples = 500 X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, n_classes=2,n_informative=1, n_clusters_per_class=1, class_sep=0.5, random_state=10) # LDA线性判别分析(二分类) lda = LDA() # 60% 用作训练，40%用作测试 Xtrain = X[:299, :] Ytrain = y[:299] Xtest = X[300:, :] Ytest = y[300:] lda.Train(Xtrain, Ytrain) precise = lda.Test(Xtest, Ytest) # 原始数据 plt.scatter(X[:, 0], X[:, 1], marker='o', c=y) plt.xlabel("x1") plt.ylabel("x2") plt.title("Test precise:" + str(precise)) plt.show()

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576

运行结果：
在这里插入图片描述

3.月亮数据集进行二分类

#月亮数据集二分类 import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import make_moons class LDA(): def Train(self, X, y): """X为训练数据集，y为训练label""" X1 = np.array([X[i] for i in range(len(X)) if y[i] == 0]) X2 = np.array([X[i] for i in range(len(X)) if y[i] == 1]) # 求中心点 mju1 = np.mean(X1, axis=0) # mju1是ndrray类型 mju2 = np.mean(X2, axis=0) # dot(a, b, out=None) 计算矩阵乘法 cov1 = np.dot((X1 - mju1).T, (X1 - mju1)) cov2 = np.dot((X2 - mju2).T, (X2 - mju2)) Sw = cov1 + cov2 # 计算w w = np.dot(np.mat(Sw).I, (mju1 - mju2).reshape((len(mju1), 1))) # 记录训练结果 self.mju1 = mju1 # 第1类的分类中心 self.cov1 = cov1 self.mju2 = mju2 # 第1类的分类中心 self.cov2 = cov2 self.Sw = Sw # 类内散度矩阵 self.w = w # 判别权重矩阵 def Test(self, X, y): """X为测试数据集，y为测试label""" # 分类结果 y_new = np.dot((X), self.w) # 计算fisher线性判别式 nums = len(y) c1 = np.dot((self.mju1 - self.mju2).reshape(1, (len(self.mju1))), np.mat(self.Sw).I) c2 = np.dot(c1, (self.mju1 + self.mju2).reshape((len(self.mju1), 1))) c = 1/2 * c2 # 2个分类的中心 h = y_new - c # 判别 y_hat = [] for i in range(nums): if h[i] >= 0: y_hat.append(0) else: y_hat.append(1) # 计算分类精度 count = 0 for i in range(nums): if y_hat[i] == y[i]: count += 1 precise = count / (nums+0.000001) # 显示信息 print("测试样本数量:", nums) print("预测正确样本的数量:", count) print("测试准确度:", precise) return precise if '__main__' == __name__: # 产生分类数据 X, y = make_moons(n_samples=100, noise=0.15, random_state=42) # LDA线性判别分析(二分类) lda = LDA() # 60% 用作训练，40%用作测试 Xtrain = X[:60, :] Ytrain = y[:60] Xtest = X[40:, :] Ytest = y[40:] lda.Train(Xtrain, Ytrain) precise = lda.Test(Xtest, Ytest) # 原始数据 plt.scatter(X[:, 0], X[:, 1], marker='o', c=y) plt.xlabel("x1") plt.ylabel("x2") plt.title("Test precise:" + str(precise)) plt.show()

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273

运行结果：
在这里插入图片描述

二、k-means算法

1.定义

K-Means算法的思想很简单，对于给定的样本集，按照样本之间的距离大小，将样本集划分为K个簇。让簇内的点尽量紧密的连在一起，而让簇间的距离尽量的大。
在数据集中根据一定策略选择K个点作为每个簇的初始中心，然后观察剩余的数据，将数据划分到距离这K个点最近的簇中，也就是说将数据划分成K个簇完成一次划分，但形成的新簇并不一定是最好的划分，因此生成的新簇中，重新计算每个簇的中心点，然后在重新进行划分，直到每次划分的结果保持不变。在实际应用中往往经过很多次迭代仍然达不到每次划分结果保持不变，甚至因为数据的关系，根本就达不到这个终止条件，实际应用中往往采用变通的方法设置一个最大迭代次数，当达到最大迭代次数时，终止计算。

2.鸢尾花数据集进行二分类

#鸢尾花数据集二分类 import matplotlib.pyplot as plt import numpy as np from sklearn.cluster import KMeans from sklearn import datasets from sklearn.datasets import load_iris iris = load_iris() X = iris.data[:] ##表示我们只取特征空间中的后两个维度 estimator = KMeans(n_clusters=5)#构造聚类器 estimator.fit(X)#聚类 label_pred = estimator.labels_ #获取聚类标签 #绘制k-means结果 x0 = X[label_pred == 0] x1 = X[label_pred == 1] x2 = X[label_pred == 2] x3 = X[label_pred == 3] plt.scatter(x0[:, 0], x0[:, 1], c = "red", marker='o', label='label0') plt.scatter(x1[:, 0], x1[:, 1], c = "green", marker='*', label='label1') #plt.scatter(x2[:, 0], x2[:, 1], c = "blue", marker='+', label='label2') #plt.scatter(x3[:, 0], x3[:, 1], c = "yellow", marker='o', label='label3') plt.xlabel('petal length') plt.ylabel('petal width') plt.legend(loc=2) plt.show()

12345678910111213141516171819202122232425

运行结果：
在这里插入图片描述

3.月亮数据集进行二分类

#月亮数据集K-Means二分类 import matplotlib.pyplot as plt import numpy as np from sklearn.cluster import KMeans from sklearn.datasets import make_moons X, y = make_moons(n_samples=100, noise=0.15, random_state=42) estimator = KMeans(n_clusters=5)#构造聚类器 estimator.fit(X)#聚类 label_pred = estimator.labels_ #获取聚类标签 #绘制k-means结果 x0 = X[label_pred == 0] x1 = X[label_pred == 1] x2 = X[label_pred == 2] x3 = X[label_pred == 3] plt.scatter(x0[:, 0], x0[:, 1], c = "red", marker='o', label='label0') plt.scatter(x1[:, 0], x1[:, 1], c = "green", marker='*', label='label1') #plt.scatter(x2[:, 0], x2[:, 1], c = "blue", marker='+', label='label2') #plt.scatter(x3[:, 0], x3[:, 1], c = "yellow", marker='o', label='label3') plt.xlabel('petal length') plt.ylabel('petal width') plt.legend(loc=2) plt.show()

1234567891011121314151617181920212223

运行结果：
在这里插入图片描述

三、SVM算法

1.定义：

SVM的全称是Support Vector Machine，即支持向量机，主要用于解决模式识别领域中的数据分类问题，属于有监督学习算法的一种。SVM要解决的问题可以用一个经典的二分类问题加以描述。如图1所示，红色和蓝色的二维数据点显然是可以被一条直线分开的，在模式识别领域称为线性可分问题。然而将两类数据点分开的直线显然不止一条。图1(b)和©分别给出了A、B两种不同的分类方案，其中黑色实线为分界线，术语称为“决策面”。每个决策面对应了一个线性分类器。虽然在目前的数据上看，这两个分类器的分类结果是一样的，但如果考虑潜在的其他数据，则两者的分类性能是有差别的。
在这里插入图片描述

2.鸢尾花数据集进行二分类

# 鸢尾花数据集SVM算法二分类 import numpy as np import matplotlib.pyplot as plt from sklearn import datasets, svm import pandas as pd from pylab import * mpl.rcParams['font.sans-serif'] = ['SimHei'] iris = datasets.load_iris() iris = datasets.load_iris() X = iris.data y = iris.target X = X[y != 0, :2] # 选择X的前两个特性 y = y[y != 0] n_sample = len(X) np.random.seed(0) order = np.random.permutation(n_sample) # 排列，置换 X = X[order] y = y[order].astype(np.float) X_train = X[:int(.9 * n_sample)] y_train = y[:int(.9 * n_sample)] X_test = X[int(.9 * n_sample):] y_test = y[int(.9 * n_sample):] #合适的模型 for fig_num, kernel in enumerate(('linear', 'rbf','poly')): # 径向基函数 (Radial Basis Function 简称 RBF),常用的是高斯基函数 clf = svm.SVC(kernel=kernel, gamma=10) # gamma是“rbf”、“poly”和“sigmoid”的核系数。 clf.fit(X_train, y_train) plt.figure(str(kernel)) plt.xlabel('x1') plt.ylabel('x2') plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired, edgecolor='k', s=20) # zorder: z方向上排列顺序，数值越大，在上方显示 # paired两个色彩相近输出(paired) # 圈出测试数据 plt.scatter(X_test[:, 0], X_test[:, 1], s=80, facecolors='none',zorder=10, edgecolor='k') plt.axis('tight') #更改 x 和 y 轴限制，以便显示所有数据 x_min = X[:, 0].min() x_max = X[:, 0].max() y_min = X[:, 1].min() y_max = X[:, 1].max() XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j] Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()]) # 样本X到分离超平面的距离 Z = Z.reshape(XX.shape) plt.contourf(XX,YY,Z>0,cmap=plt.cm.Paired) plt.contour(XX, YY, Z, colors=['r', 'k', 'b'], linestyles=['--', '-', '--'], levels=[-0.5, 0, 0.5]) # 范围 plt.title(kernel) plt.show()

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748

运行结果：
在这里插入图片描述

3.月亮数据集进行二分类

# 月亮数据集SVM二分类 import matplotlib.pyplot as plt from sklearn.pipeline import Pipeline import numpy as np import matplotlib as mpl from sklearn.datasets import make_moons from sklearn.preprocessing import PolynomialFeatures from sklearn.preprocessing import StandardScaler from sklearn.svm import LinearSVC # 为了显示中文 mpl.rcParams['font.sans-serif'] = [u'SimHei'] mpl.rcParams['axes.unicode_minus'] = False X, y = make_moons(n_samples=100, noise=0.15, random_state=42) def plot_dataset(X, y, axes): plt.plot(X[:, 0][y==0], X[:, 1][y==0], "bs") plt.plot(X[:, 0][y==1], X[:, 1][y==1], "g^") plt.axis(axes) plt.grid(True, which='both') plt.xlabel(r"$x_1$", fontsize=20) plt.ylabel(r"$x_2$", fontsize=20, rotation=0) plt.title("月亮数据",fontsize=20) plot_dataset(X, y, [-1.5, 2.5, -1, 1.5]) polynomial_svm_clf = Pipeline([ # 将源数据映射到 3阶多项式 ("poly_features", PolynomialFeatures(degree=3)), # 标准化 ("scaler", StandardScaler()), # SVC线性分类器 ("svm_clf", LinearSVC(C=10, loss="hinge", random_state=42)) ]) polynomial_svm_clf.fit(X, y) def plot_predictions(clf, axes): # 打表 x0s = np.linspace(axes[0], axes[1], 100) x1s = np.linspace(axes[2], axes[3], 100) x0, x1 = np.meshgrid(x0s, x1s) X = np.c_[x0.ravel(), x1.ravel()] y_pred = clf.predict(X).reshape(x0.shape) y_decision = clf.decision_function(X).reshape(x0.shape) # print(y_pred) # print(y_decision) plt.contourf(x0, x1, y_pred, cmap=plt.cm.brg, alpha=0.2) plt.contourf(x0, x1, y_decision, cmap=plt.cm.brg, alpha=0.1) plot_predictions(polynomial_svm_clf, [-1.5, 2.5, -1, 1.5]) plot_dataset(X, y, [-1.5, 2.5, -1, 1.5]) plt.show()

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748

运行结果：
在这里插入图片描述

from sklearn.svm import SVC gamma1, gamma2 = 0.1, 5 C1, C2 = 0.001, 1000 hyperparams = (gamma1, C1), (gamma1, C2) svm_clfs = [] for gamma, C in hyperparams: rbf_kernel_svm_clf = Pipeline([ ("scaler", StandardScaler()), ("svm_clf", SVC(kernel="rbf", gamma=gamma, C=C)) ]) rbf_kernel_svm_clf.fit(X, y) svm_clfs.append(rbf_kernel_svm_clf) plt.figure(figsize=(11, 7)) for i, svm_clf in enumerate(svm_clfs): plt.subplot(221 + i) plot_predictions(svm_clf, [-1.5, 2.5, -1, 1.5]) plot_dataset(X, y, [-1.5, 2.5, -1, 1.5]) gamma, C = hyperparams[i] plt.title(r"$gamma = {}, C = {}$".format(gamma, C), fontsize=16) plt.tight_layout() plt.show()

1234567891011121314151617181920212223

运行结果：
在这里插入图片描述