1 数据的预处理分析 from __future__ import division import pandas as pd import numpy as np churn_df = pd.read_csv('churn.csv') col_names = churn_df.columns.tolist() print "Column names:" print col_names #前六个后六个 to_show = col_names[:6] + col_names[-6:] print "\nSample data:" churn_df[to_show].head(6) 复制代码

2 数据标准化处理 churn_result = churn_df['Churn?'] y = np.where(churn_result == 'True.',1,0) # We don't need these columns to_drop = ['State','Area Code','Phone','Churn?'] churn_feat_space = churn_df.drop(to_drop,axis=1) # 'yes'/'no' has to be converted to boolean values # NumPy converts these from boolean to 1. and 0. later yes_no_cols = ["Int'l Plan","VMail Plan"] churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes' # Pull out features for future use features = churn_feat_space.columns X = churn_feat_space.as_matrix().astype(np.float) # This is important from sklearn.preprocessing import StandardScaler scaler = StandardScaler() X = scaler.fit_transform(X) print "Feature space holds %d observations and %d features" % X.shape print "Unique target labels:", np.unique(y) print X[0] print len(y[y == 0]) Feature space holds 3333 observations and 17 features Unique target labels: [0 1] [ 0.67648946 -0.32758048 1.6170861 1.23488274 1.56676695 0.47664315 1.56703625 -0.07060962 -0.05594035 -0.07042665 0.86674322 -0.46549436 0.86602851 -0.08500823 -0.60119509 -0.0856905 -0.42793202] 2850 复制代码 3 sklearn多模型封装（已废弃，学思想） from sklearn.cross_validation import KFold def run_cv(X,y,clf_class,**kwargs): # Construct a kfolds object kf = KFold(len(y),n_folds=5,shuffle=True) y_pred = y.copy() # Iterate through folds for train_index, test_index in kf: X_train, X_test = X[train_index], X[test_index] y_train = y[train_index] # Initialize a classifier with key word arguments clf = clf_class(**kwargs) clf.fit(X_train,y_train) y_pred[test_index] = clf.predict(X_test) return y_pred from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier as RF from sklearn.neighbors import KNeighborsClassifier as KNN def accuracy(y_true,y_pred): # NumPy interprets True and False as 1. and 0. return np.mean(y_true == y_pred) print "Support vector machines:" print "%.3f" % accuracy(y, run_cv(X,y,SVC)) print "Random forest:" print "%.3f" % accuracy(y, run_cv(X,y,RF)) print "K-nearest-neighbors:" print "%.3f" % accuracy(y, run_cv(X,y,KNN)) Support vector machines: 0.916 Random forest: 0.944 K-nearest-neighbors: 0.893 复制代码 4 阈值概率调整 def run_prob_cv(X, y, clf_class, **kwargs): kf = KFold(len(y), n_folds=5, shuffle=True) y_prob = np.zeros((len(y),2)) for train_index, test_index in kf: X_train, X_test = X[train_index], X[test_index] y_train = y[train_index] clf = clf_class(**kwargs) clf.fit(X_train,y_train) # Predict probabilities, not classes y_prob[test_index] = clf.predict_proba(X_test) return y_prob import warnings warnings.filterwarnings('ignore') # Use 10 estimators so predictions are all multiples of 0.1 pred_prob = run_prob_cv(X, y, RF, n_estimators=10) #print pred_prob[0] pred_churn = pred_prob[:,1] is_churn = y == 1 # Number of times a predicted probability is assigned to an observation counts = pd.value_counts(pred_churn) #print counts # calculate true probabilities true_prob = {} for prob in counts.index: true_prob[prob] = np.mean(is_churn[pred_churn == prob]) true_prob = pd.Series(true_prob) # pandas-fu counts = pd.concat([counts,true_prob], axis=1).reset_index() counts.columns = ['pred_prob', 'count', 'true_prob'] counts # 0.7以上流式率达到94%，说明阈值为0.7是合适的，低于0.7不管，高于0.7的都认为是流失的 复制代码

• ### 实践才能有财富，十年结弦，一生悬命

1.凡CodeSecTeam转载的文章,均出自其它媒体或其他官网介绍,目的在于传递更多的信息,并不代表本站赞同其观点和其真实性负责；
2.转载的文章仅代表原创作者观点,与本站无关。其原创性以及文中陈述文字和内容未经本站证实,本站对该文以及其中全部或者部分内容、文字的真实性、完整性、及时性，不作出任何保证或承若；
3.如本站转载稿涉及版权等问题,请作者及时联系本站,我们会及时处理。