import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn import model_selection

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC

from pandas.plotting import scatter_matrix

from sklearn.datasets import load_iris

# Step 1: load the iris dataset bundled with scikit-learn.
iris = load_iris()

# Step 2: preprocessing.
# Build a DataFrame from the four measurement columns, append the class
# label as a 'Species' column, and drop any rows with missing values.
iris_d = (
    pd.DataFrame(
        iris.data,
        columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'],
    )
    .assign(Species=iris.target)
    .dropna()
)

# Bare expression: shows the frame when run in a notebook/REPL cell.
iris_d

     Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Species
0             5.1          3.5           1.4          0.2        0
1             4.9          3.0           1.4          0.2        0
2             4.7          3.2           1.3          0.2        0
3             4.6          3.1           1.5          0.2        0
4             5.0          3.6           1.4          0.2        0
..            ...          ...           ...          ...      ...
145           6.7          3.0           5.2          2.3        2
146           6.3          2.5           5.0          1.9        2
147           6.5          3.0           5.2          2.0        2
148           6.2          3.4           5.4          2.3        2
149           5.9          3.0           5.1          1.8        2

150 rows × 5 columns

# Step 3: data exploration and visualization

iris_d.groupby('Species').size()  # number of rows in each Species group (bare expression: displayed only in a notebook/REPL)

Species

0 50

1 50

2 50

dtype: int64

iris_d.describe()  # descriptive statistics (count, mean, std, min, quartiles, max) for every column

       Sepal_Length  Sepal_Width  Petal_Length  Petal_Width     Species
count    150.000000   150.000000    150.000000   150.000000  150.000000
mean       5.843333     3.057333      3.758000     1.199333    1.000000
std        0.828066     0.435866      1.765298     0.762238    0.819232
min        4.300000     2.000000      1.000000     0.100000    0.000000
25%        5.100000     2.800000      1.600000     0.300000    0.000000
50%        5.800000     3.000000      4.350000     1.300000    1.000000
75%        6.400000     3.300000      5.100000     1.800000    2.000000
max        7.900000     4.400000      6.900000     2.500000    2.000000

# Box plot of every column: shows how widely each variable is spread.
iris_d.plot.box()
plt.show()

# Histogram of every column: shows how each variable is distributed.
iris_d.hist()
plt.show()

# Pairwise scatter-plot matrix: shows the relationships between variables,
# whether linear or non-linear.
pd.plotting.scatter_matrix(iris_d)
plt.show()

# Part 4: feature engineering
# Work on the raw NumPy values: the first four columns are the input
# features and the fifth column is the class label.
array = iris_d.values
X, Y = array[:, :4], array[:, 4]

# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=7,
)

# Part 5: machine-learning models and evaluation

# K-nearest neighbours (KNN) with default hyper-parameters.
# fit() returns the estimator itself, so construction and training chain.
model = KNeighborsClassifier().fit(x_train, y_train)
predictions = model.predict(x_test)

# Evaluation: accuracy on the held-out test split (a single hold-out
# score, not cross-validation).
knn_accuracy = accuracy_score(y_test, predictions)
print(knn_accuracy)

0.9

# Machine-learning model: support vector machine (SVM), default settings.
model = SVC()
model = model.fit(x_train, y_train)  # fit() returns the same estimator
predictions = model.predict(x_test)

# Evaluation: accuracy on the held-out test split (a single hold-out
# score, not cross-validation).
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.8666666666666667

# Machine-learning model: random forest (RF).
# The forest is seeded so that the reported accuracy is reproducible from
# run to run; an unseeded forest draws different bootstrap samples and
# feature subsets each time. The seed matches the one used for the
# train/test split.
model = RandomForestClassifier(random_state=7)
model.fit(x_train, y_train)
predictions = model.predict(x_test)

# Evaluation: accuracy on the held-out test split (a single hold-out
# score, not cross-validation).
print(accuracy_score(y_test, predictions))

0.8666666666666667

# Machine-learning model: logistic regression (LR).
# With the default iteration cap the lbfgs solver stops before converging
# on these unscaled features and emits a ConvergenceWarning, so the cap is
# raised to let the optimizer finish.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)
predictions = model.predict(x_test)

# Evaluation: accuracy on the held-out test split (a single hold-out
# score, not cross-validation).
print(accuracy_score(y_test, predictions))

0.8666666666666667

d:\program files\python3.7\lib\site-packages\sklearn\linear_model\_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:

https://scikit-learn.org/stable/modules/preprocessing.html

Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

推荐链接

评论可见,请评论后查看内容,谢谢!!!评论后请刷新页面。