机器学习Python数据特征选定
16lz
2021-01-22
# Feature-selection walkthrough on the Pima Indians diabetes dataset.
#
# Four techniques are applied in turn to the same feature matrix:
#   1. univariate selection (chi-squared scores via SelectKBest),
#   2. recursive feature elimination (RFE) around a logistic regression,
#   3. principal component analysis (PCA),
#   4. feature importances from an ExtraTreesClassifier.
# Only the last step prints its result; the intermediate fitted objects are
# left bound to module-level names so they can be inspected interactively.
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

# This example uses the pima_Indians dataset.
# Download: http://www.broadview.com.cn/33110
filename = 'pima_data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# The CSV is read with explicit column names (no header row is assumed).
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]  # first eight columns: the input features
Y = array[:, 8]    # ninth column ('class'): the target label

# --- Univariate feature selection -------------------------------------------
# Score every feature against the target with the chi-squared statistic and
# keep the 4 highest-scoring ones.
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
set_printoptions(precision=3)  # affects how NumPy arrays are printed below
# The per-feature scores are available as fit.scores_.
features = fit.transform(X)

# --- RFE (recursive feature elimination) ------------------------------------
# A base model is trained over several rounds; after each round the features
# with the weakest coefficients are dropped and the model is retrained on the
# remaining ones.
# NOTE(review): with default settings LogisticRegression may emit a
# ConvergenceWarning on unscaled data -- consider raising max_iter.
model = LogisticRegression()
# n_features_to_select must be passed by keyword: recent scikit-learn
# releases reject the positional form RFE(model, 3) with a TypeError.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
# Inspect fit.n_features_ (count), fit.support_ (selection mask) and
# fit.ranking_ (feature ranking) to see the outcome.

# --- Principal component analysis -------------------------------------------
# Projects the samples into a lower-dimensional space chosen to retain as much
# variance as possible; an unsupervised dimensionality-reduction technique.
pca = PCA(n_components=3)
fit = pca.fit(X)
# Inspect fit.explained_variance_ratio_ and fit.components_ for details.

# --- Feature importance ------------------------------------------------------
# Use an ExtraTreesClassifier to compute an importance score per feature.
model = ExtraTreesClassifier()
fit = model.fit(X, Y)
# Print the importance score of each feature.
print(fit.feature_importances_)