zl程序教程

您现在的位置是:首页 >  工具

当前栏目

南大《探索数据的奥秘》课件示例代码笔记16

笔记数据代码 探索 示例 16 奥秘 课件
2023-09-14 09:01:29 时间

Chp8-2
2019 年 12 月 23 日

# In [3]: Fit polynomials of increasing degree to 50 iris rows and compare
# training RMSE against held-out (testing) RMSE — a bias/variance demo.
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Raw string: the original 'C:\Python\...' relied on \P, \S, \m, \i not being
# recognized escapes — fragile; r'...' makes the Windows path unambiguous.
df = pd.read_csv(r'C:\Python\Scripts\my_data\iris.csv', header=None,
                 names=['sepal_length', 'sepal_width', 'petal_length',
                        'petal_width', 'target'])
my_data = df[['sepal_length', 'sepal_width']].iloc[:50]


def rmse(x, y, coefs):
    """Root-mean-square error of the polynomial `coefs` evaluated at x vs y.

    coefs is a coefficient array as returned by np.polyfit (highest degree
    first), which is exactly what np.polyval expects.
    """
    yfit = np.polyval(coefs, x)
    return np.sqrt(np.mean((y - yfit) ** 2))


xtrain, xtest, ytrain, ytest = train_test_split(
    my_data['sepal_length'], my_data['sepal_width'], test_size=0.5)

train_err = []
validation_err = []
degrees = range(1, 8)
for d in degrees:  # original used enumerate() but never used the index
    p = np.polyfit(xtrain, ytrain, d)
    train_err.append(rmse(xtrain, ytrain, p))
    validation_err.append(rmse(xtest, ytest, p))

fig, ax = plt.subplots()  # fixed transcription typo: source read "1fig,ax=..."
ax.plot(degrees, validation_err, lw=2, label='testing error')
ax.plot(degrees, train_err, lw=2, label='training error')
ax.legend(loc=0)
ax.set_xlabel('degree of polynomial')
ax.set_ylabel('RMSE')
Out[3]: Text(0,0.5,'RMSE')

# In [54]: Visualize 3-fold KFold partitions of the full iris feature pair:
# one subplot per fold, training points in red ('ro'), validation in blue ('bo').
from sklearn.model_selection import KFold

my_data = df[['sepal_length', 'sepal_width']]
nfolds = 3
fig, axes = plt.subplots(1, nfolds, figsize=(14, 4))
kf = KFold(n_splits=nfolds)
# enumerate replaces the original manual counter (i = 0 ... i = i + 1).
for i, (training, validation) in enumerate(kf.split(my_data)):
    x = my_data.iloc[training]['sepal_length']
    y = df.iloc[training]['sepal_width']
    axes[i].plot(x, y, 'ro')
    x = my_data.iloc[validation]['sepal_length']
    y = df.iloc[validation]['sepal_width']
    axes[i].plot(x, y, 'bo')
plt.tight_layout()

# In [61]: Build class labels for the 150-row iris data set:
# rows 0-49 -> 1, rows 50-99 -> 2, rows 100-149 -> 3.
my_class = []
for n in range(150):
    if n < 50:
        my_class.append(1)
    elif n < 100:
        my_class.append(2)
    else:
        my_class.append(3)
print(my_class)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

# In [65]: Contrast two evaluation strategies for a 1-NN classifier:
# scoring on the training data itself (optimistic) vs. 5-fold cross-validation.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

knn1 = KNeighborsClassifier(n_neighbors=1)
knn2 = KNeighborsClassifier(n_neighbors=1)
# Train on ALL the data, then score on that same data — inflates the score.
knn1.fit(my_data[['sepal_length', 'sepal_width']], my_class)
print('训练集测试集相同时,模型的性能得分是: ',
      knn1.score(my_data[['sepal_length', 'sepal_width']], my_class))
print('\n')
# De-fused the garbled source line "print('\n')scores=" into two statements.
scores = cross_val_score(knn2, my_data[['sepal_length', 'sepal_width']],
                         my_class, cv=5, scoring='accuracy')
print('5 折交叉验证时,模型的性能平均得分是: ',
      scores.mean())
训练集测试集相同时,模型的性能得分是: 0.9266666666666666
5 折交叉验证时,模型的性能平均得分是: 0.7266666666666667