深度学习模型融合stacking
学习 模型 深度 融合
时间:2023-09-11 14:17:15
当你的深度学习模型变得很多时,选一个确定的模型也是一个头痛的问题。或者你可以把他们都用起来,就进行模型融合。我主要使用stacking和blend方法。先把代码贴出来,大家可以看一下。
"""Ensemble a pool of sklearn classifiers via blending, stacking, or mlens SuperLearner.

Reads `input.csv`, trains seven heterogeneous base classifiers, and combines
their out-of-sample probability predictions with a gradient-boosting
meta-learner.  Evaluated with ROC-AUC on a held-out test set.

NOTE(review): `train_base_learnres` and `ensenmble_by_blend` are misspellings
kept verbatim for backward compatibility with any external callers.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

SEED = 222
np.random.seed(SEED)
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.base import clone

# Loaded at import time; the script assumes input.csv is in the working directory.
df = pd.read_csv('input.csv')


def get_train_test():
    """Build the feature matrix / binary target and split into train and test.

    Returns
    -------
    xtrain, xtest, ytrain, ytest : the 5%/95% train/test split (test_size=0.95
        deliberately keeps the training set tiny so the demo runs fast).
    """
    # Target: 1 if the candidate's party affiliation is Republican.
    y = 1 * (df.cand_pty_affiliation == "REP")
    x = df.drop(['cand_pty_affiliation'], axis=1)
    x = pd.get_dummies(x, sparse=True)
    # Drop constant (zero-variance) columns — they carry no signal.
    x.drop(x.columns[x.std() == 0], axis=1, inplace=True)
    return train_test_split(x, y, test_size=0.95, random_state=SEED)


def get_models():
    """Return a dict of seven freshly constructed, untrained base classifiers."""
    nb = GaussianNB()
    svc = SVC(C=100, probability=True)  # probability=True enables predict_proba
    knn = KNeighborsClassifier(n_neighbors=3)
    lr = LogisticRegression(C=100, random_state=SEED)
    nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
    gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
    rf = RandomForestClassifier(n_estimators=1, max_depth=3, random_state=SEED)

    models = {'svm': svc,
              'knn': knn,
              'naive bayes': nb,
              'mlp-nn': nn,
              'random forest': rf,
              'gbm': gb,
              'logistic': lr,
              }
    return models


def train_base_learnres(base_learners, inp, out, verbose=True):
    """Fit every base learner in-place on (inp, out)."""
    if verbose:
        print("fitting models.")
    for i, (name, m) in enumerate(base_learners.items()):
        if verbose:
            print("%s..." % name, end=" ")
        m.fit(inp, out)
    if verbose:
        print("done")


def predict_base_learners(pred_base_learners, inp, verbose=True):
    """Stack each learner's positive-class probability into a feature matrix.

    Returns an (n_samples, n_learners) array: column i holds learner i's
    P(class==1).  These columns are the meta-learner's input features.
    """
    p = np.zeros((inp.shape[0], len(pred_base_learners)))
    if verbose:
        print("Generating base learner predictions.")
    for i, (name, m) in enumerate(pred_base_learners.items()):
        if verbose:
            print("%s..." % name, end=" ")
        p_ = m.predict_proba(inp)
        p[:, i] = p_[:, 1]  # keep only the positive-class column
    if verbose:
        print("done")
    return p


def ensemble_predict(base_learners, meta_learner, inp, verbose=True):
    """Predict with the full ensemble: base learners feed the meta-learner.

    Returns (base_predictions, meta_positive_probabilities).
    """
    # Test data must first be mapped through the base learners.
    p_pred = predict_base_learners(base_learners, inp, verbose=verbose)
    return p_pred, meta_learner.predict_proba(p_pred)[:, 1]


def ensenmble_by_blend():
    """Blend: train base learners and meta-learner on disjoint halves of the data.

    Uses module-level globals xtrain/ytrain/xtest/ytest, base_learners and
    meta_learner set up in the __main__ block.
    """
    # Split the training data in two: one half for the base learners,
    # the other to generate the meta-learner's training features.
    xtrain_base, xpred_base, ytrain_base, ypred_base = train_test_split(
        xtrain, ytrain, test_size=0.5, random_state=SEED
    )

    train_base_learnres(base_learners, xtrain_base, ytrain_base)

    p_base = predict_base_learners(base_learners, xpred_base)
    meta_learner.fit(p_base, ypred_base)
    p_pred, p = ensemble_predict(base_learners, meta_learner, xtest)
    print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))


def stacking(base_learners, meta_learner, X, y, generator):
    """Stacking: fit the meta-learner on cross-validated base predictions.

    Parameters
    ----------
    base_learners : dict of name -> estimator (fitted on ALL of X for later use)
    meta_learner : estimator fitted on out-of-fold base predictions
    X, y : numpy arrays (integer-indexed, so pass .values of DataFrames)
    generator : CV splitter, e.g. sklearn KFold

    Returns (base_learners, meta_learner), both fitted.
    """
    # Final base learners are trained on the full training set.
    print("Fitting final base learners...", end="")
    train_base_learnres(base_learners, X, y, verbose=False)
    print("done")

    # Out-of-fold predictions: per fold, clone + fit on the fold's train part,
    # predict on its test part, so the meta-features are never in-sample.
    print("Generating cross-validated predictions...")
    cv_preds, cv_y = [], []
    for i, (train_idx, test_idx) in enumerate(generator.split(X)):
        fold_xtrain, fold_ytrain = X[train_idx, :], y[train_idx]
        fold_xtest, fold_ytest = X[test_idx, :], y[test_idx]

        fold_base_learners = {name: clone(model)
                              for name, model in base_learners.items()}
        train_base_learnres(fold_base_learners, fold_xtrain, fold_ytrain,
                            verbose=False)
        fold_P_base = predict_base_learners(fold_base_learners, fold_xtest,
                                            verbose=False)

        cv_preds.append(fold_P_base)
        cv_y.append(fold_ytest)

        print("Fold %i done" % (i + 1))
    print("CV-predictions done")
    cv_preds = np.vstack(cv_preds)
    cv_y = np.hstack(cv_y)

    print("Fitting meta learner...", end="")
    meta_learner.fit(cv_preds, cv_y)
    print("done")

    return base_learners, meta_learner


def ensemble_by_stack():
    """Run 2-fold stacking on the global train split and report test ROC-AUC."""
    from sklearn.model_selection import KFold
    cv_base_learners, cv_meta_learner = stacking(
        get_models(), clone(meta_learner), xtrain.values, ytrain.values, KFold(2))
    P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest,
                                 verbose=False)
    print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))


def plot_roc_curve(ytest, p_base_learners, p_ensemble, labels, ens_label):
    """Plot ROC curves for every base learner plus the ensemble.

    p_base_learners : (n_samples, n_learners) probability matrix
    p_ensemble : ensemble positive-class probabilities
    labels : per-column names; ens_label : legend label for the ensemble curve
    """
    plt.figure(figsize=(10, 8))
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    # One rainbow color per learner; cm[0] is reserved for the ensemble.
    cm = [plt.cm.rainbow(i)
          for i in np.linspace(0, 1.0, p_base_learners.shape[1] + 1)]
    for i in range(p_base_learners.shape[1]):
        p = p_base_learners[:, i]
        fpr, tpr, _ = roc_curve(ytest, p)
        plt.plot(fpr, tpr, label=labels[i], c=cm[i + 1])
    fpr, tpr, _ = roc_curve(ytest, p_ensemble)
    plt.plot(fpr, tpr, label=ens_label, c=cm[0])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(frameon=False)
    plt.show()


def use_pack():
    """Same ensemble via the mlens SuperLearner package (10-fold stacking)."""
    # Imported here so the script still runs without mlens installed
    # when only the hand-rolled blend/stack paths are used.
    from mlens.ensemble import SuperLearner
    sl = SuperLearner(
        folds=10, random_state=SEED, verbose=2,
        # backend="multiprocessing"
    )
    # Add the base learners and the meta learner
    sl.add(list(base_learners.values()), proba=True)
    sl.add_meta(meta_learner, proba=True)
    # Train the ensemble
    sl.fit(xtrain, ytrain)
    # Predict the test set
    p_sl = sl.predict_proba(xtest)

    print("\nSuper Learner ROC-AUC score: %.3f"
          % roc_auc_score(ytest, p_sl[:, 1]))


if __name__ == "__main__":
    xtrain, xtest, ytrain, ytest = get_train_test()
    base_learners = get_models()

    meta_learner = GradientBoostingClassifier(
        n_estimators=1000,
        loss="exponential",
        max_depth=4,
        subsample=0.5,
        learning_rate=0.005,
        random_state=SEED
    )

    # ensenmble_by_blend()  # fuse via blending
    # ensemble_by_stack()   # fuse via stacking
    use_pack()              # fuse via the mlens package
相关文章
- CPU、内存、显卡等硬件因素也影响着你的深度学习模型性能
- 2-《PyTorch深度学习实践》-线性模型
- (《机器学习》完整版系列)第14章 概率图模型——14.2 马尔可夫随机场(无向图,“团”与“极大团”,MRF的“三性”)
- 【Hibernate学习】 ——ORM(四)再次认识实体继承
- 百面机器学习之模型评估
- 深度学习模型解决超声波手势识别
- 机器学习笔记之谱聚类(三)模型的矩阵形式转化
- 机器学习笔记之概率图模型(三)贝叶斯网络之有向分离(D划分)
- 机器学习笔记之隐马尔可夫模型(六)解码问题
- 《深度学习导论及案例分析》一1.3深度学习的模型和算法
- 《深度学习导论及案例分析》一2.10概率图模型的学习
- 《深度学习导论及案例分析》一2.11概率图模型的推理
- 《深度学习导论及案例分析》一第3章 受限玻耳兹曼机3.1 受限玻耳兹曼机的标准模型
- 《深度学习导论及案例分析》一3.3受限玻耳兹曼机的变种模型
- 【深度学习之美】神经网络不胜语, M-P模型似可寻(入门系列之三)
- 深度学习模型-快速构建词典和id的映射
- 【玩转数据系列十二】PAI平台深度学习Caffe框架实现图像分类的模型训练
- 《计算机视觉:模型、学习和推理》——3.5 一元正态分布
- 【转载】 Tensorflow学习笔记-模型保存与加载
- PyTorch深度学习实战 | 基于多层感知机模型和随机森林模型的某地房价预测
- deeplearning.ai学习seq2seq模型
- 机器学习——支持向量机SVM之非线性模型(原问题和对偶问题)
- 【深度学习】——利用pytorch搭建一个完整的深度学习项目(构建模型、加载数据集、参数配置、训练、模型保存、预测)
- 【深度学习系列】基础知识、模型学习
- 【深度学习】——常见深度学习模型总结、anchor-free和anchor-based