泰坦尼克号是一个入门的机器学习项目,通过这个项目了解机器学习的基本方法和数据分析的方法 #首先进行数据的分析和清洗
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# 导入数据
train_data = pd.read_csv(r'C:\Users\Administrator\Desktop\taitanic\train.csv')
test_data = pd.read_csv(r'C:\Users\Administrator\Desktop\taitanic\test.csv')
# 总体数据信息
train_data.info()
test_data.info()
sum_data = train_data.append(test_data)
psg_out_id = test_data['PassengerId']
# 空余上船位置定义
#train_data['Cabin'] = train_data.Cabin.fillna('U0')
# 开始数据分析
# 总体生存比例
train_data['Survived'].value_counts().plot.pie(autopct='%0.9f%%')
# 性别和生存关系
print(train_data.groupby(['Sex','Survived'])['Survived'].count())
# 船舱,性别与生存的关系
print(train_data.groupby(['Sex','Pclass','Survived'])['Survived'].count())
# 作图 性别和生存
train_data[['Sex','Survived']].groupby(['Sex']).mean().plot.bar()
# 船舱与生存的关系
train_data[['Pclass','Survived']].groupby(['Pclass']).mean().plot.bar()
# 船舱,性别与生存的关系作图
train_data[['Sex','Pclass','Survived']].groupby(['Pclass','Sex']).mean().plot.bar()
# 其次看一下仓位因素
fig = plt.figure()
Us =train_data.Pclass[train_data.Survived == 0].value_counts()
S = train_data.Pclass[train_data.Survived == 1].value_counts()
df = pd.DataFrame({'Survived':S, 'Unsurvived':Us})
df.plot(kind='bar', stacked=True)
plt.title("The rescue of each passenger level ")
plt.xlabel("Level")
plt.ylabel("Number of people")
plt.xticks(rotation = 0)
# 仓位与年龄的关系
Us =train_data.Age[train_data.Survived == 0].value_counts()
S = train_data.Age[train_data.Survived == 1].value_counts()
df = pd.DataFrame({'Survived':S, 'Unsurvived':Us})
df.plot(kind='bar', stacked=True)
plt.title("Age to Survrive ")
plt.xlabel("Age")
plt.ylabel("Number of people")
plt.xticks(rotation = 0)
# 港口与生存的关系
sns.countplot('Embarked',hue='Survived',data=train_data)
plt.title('Embarked and Survived')
#回归森林预测年龄
from sklearn.ensemble import RandomForestRegressor
# 通过其他属性进行分析
age_df = sum_data[['Age','Title','Parch', 'SibSp','Pclass']]
age_df_notnull = age_df.loc[(sum_data['Age'].notnull())]
age_df_isnull = age_df.loc[(sum_data['Age'].isnull())]
X = age_df_notnull.values[:,1:]
Y = age_df_notnull.values[:,0]
# 算法选择随机森林,当然也可以是其他的
RFR = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
RFR.fit(X,Y)
predictAges = RFR.predict(age_df_isnull.values[:,1:])
sum_data.loc[sum_data['Age'].isnull(), ['Age']]= predictAges
# 年龄分组
主要分成四组
bins = [0,18, 30, 60, 150]
sum_data['Age_group'] =sum_data['Age']
sum_data.loc[sum_data['Age_group'] >60,'Age_group'] = -4
sum_data.loc[sum_data['Age_group'] >30,'Age_group'] = -3
sum_data.loc[sum_data['Age_group'] >18,'Age_group'] = -2
sum_data.loc[sum_data['Age_group'] >0,'Age_group'] = 1
sum_data.loc[sum_data['Age_group'] ==-2,'Age_group'] = 2
sum_data.loc[sum_data['Age_group'] ==-3,'Age_group'] = 3
sum_data.loc[sum_data['Age_group'] ==-4,'Age_group'] = 4
pd.concat([sum_data,sum_data['Age_group']])
# 亲友人数和生存的关系
n,ax=plt.subplots(1,2,figsize=(18,8))
train_data[['Parch','Survived']].groupby(['Parch']).mean().plot.bar(ax=ax[0])
ax[0].set_title('Parch and Survived')
train_data[['SibSp','Survived']].groupby(['SibSp']).mean().plot.bar(ax=ax[1])
ax[1].set_title('SibSp and Survived')
# 称呼与生存的关系
sum_data = train_data.append(test_data)
sum_data['Title'] = sum_data['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])
title_Dict = {}
# 称呼分类
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(dict.fromkeys(['Don', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs','Mlle', 'Miss'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Master','Mr', 'Sir'], 'Mr'))
title_Dict.update(dict.fromkeys(['Jonkheer'], 'Master'))
sum_data['Title'] = sum_data['Title'].map(title_Dict)
sum_data['Title'] = pd.factorize(sum_data['Title'])[0]
# 加入tittle
title_dummies_df = pd.get_dummies(sum_data['Title'], prefix=sum_data[['Title']].columns[0])
sum_data = pd.concat([sum_data, title_dummies_df], axis=1)'''
# 对仓位进行简单处理
sum_data['Cabin'].loc[(sum_data['Cabin'].notnull())] = 1
sum_data['Cabin'].loc[(sum_data['Cabin'].isnull())] = 0
train_data =sum_data[:891]
train_data[['Title','Survived']].groupby(['Title']).mean().plot.bar()
train_data[['Cabin','Survived']].groupby(['Cabin']).mean().plot.bar()
# 作图
plt.show()
搞事的分割线
通过之前的作图,对这个数据有了一定的了解了,可以开始干活了
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
train_data=pd.read_csv(r'C:\Users\Administrator\Desktop\taitanic\train.csv') #读取文件titanic_train数据集
test_data = pd.read_csv(r'C:\Users\Administrator\Desktop\taitanic\test.csv')
sum_data = train_data.append(test_data)
psg_out_id = test_data['PassengerId']
##名字开头
sum_data['Title'] = sum_data['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])
title_Dict = {}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(dict.fromkeys(['Don', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs','Mlle', 'Miss'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Master','Mr', 'Sir'], 'Mr'))
title_Dict.update(dict.fromkeys(['Jonkheer'], 'Master'))
sum_data['Title'] = sum_data['Title'].map(title_Dict)
sum_data['Title'] = pd.factorize(sum_data['Title'])[0]
#亲友
sum_data['FAF']= sum_data['SibSp'] + sum_data['Parch']
pd.concat([sum_data,sum_data['FAF']])
#性别替换
sum_data.loc[sum_data['Sex']=='male','Sex'] = 0
sum_data.loc[sum_data['Sex']=='female','Sex'] = 1
#上船位置
sum_data['Embarked'] = sum_data['Embarked'].fillna('S')
sum_data.loc[sum_data['Embarked'] == 'S','Embarked'] =0
sum_data.loc[sum_data['Embarked'] == 'C','Embarked'] = 1
sum_data.loc[sum_data['Embarked'] == 'Q','Embarked'] = 1
#船舱
sum_data['Cabin'].loc[(sum_data['Cabin'].notnull())] = 1
sum_data['Cabin'].loc[(sum_data['Cabin'].isnull())] = 0
#票
sum_data['Fare'] = sum_data[['Fare']].fillna(sum_data.groupby('Pclass').transform(np.mean))
#等级
sum_data.loc[sum_data['Pclass'] == 1,'Pclass'] =1
sum_data.loc[sum_data['Pclass'] == 2,'Pclass'] = 0
sum_data.loc[sum_data['Pclass'] == 3,'Pclass'] = -1
#回归森林预测年龄
from sklearn.ensemble import RandomForestRegressor
#choose training data to predict age
age_df = sum_data[['Age','Title','Parch', 'SibSp','Pclass']]
age_df_notnull = age_df.loc[(sum_data['Age'].notnull())]
age_df_isnull = age_df.loc[(sum_data['Age'].isnull())]
X = age_df_notnull.values[:,1:]
Y = age_df_notnull.values[:,0]
# use RandomForestRegression to train data
RFR = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
RFR.fit(X,Y)
predictAges = RFR.predict(age_df_isnull.values[:,1:])
sum_data.loc[sum_data['Age'].isnull(), ['Age']]= predictAges
#年龄分组
bins = [0, 12, 18, 65, 100]
sum_data['Age_group'] =sum_data['Age']
sum_data.loc[sum_data['Age_group'] > 65,'Age_group'] = -4
sum_data.loc[sum_data['Age_group'] >30,'Age_group'] = -3
sum_data.loc[sum_data['Age_group'] >15,'Age_group'] = -2
sum_data.loc[sum_data['Age_group'] >0,'Age_group'] = 1
sum_data.loc[sum_data['Age_group'] ==-2,'Age_group'] = 0
sum_data.loc[sum_data['Age_group'] ==-3,'Age_group'] = 0
sum_data.loc[sum_data['Age_group'] ==-4,'Age_group'] = -1
pd.concat([sum_data,sum_data['Age_group']])
#年龄归一化
from sklearn import preprocessing
assert np.size(sum_data['Age']) == 1309
# StandardScaler will subtract the mean from each value then scale to the unit variance
scaler = preprocessing.StandardScaler()
sum_data['Age_scaled'] = scaler.fit_transform(sum_data['Age'].values.reshape(-1,1))
assert np.size(sum_data['Fare']) == 1309
scaler = preprocessing.StandardScaler()
sum_data['Fare_scaled'] = scaler.fit_transform(sum_data['Fare'].values.reshape(-1,1))
#随机森林决策树
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
#两组数据分开
traindata =sum_data[:891]
testdata = sum_data[891:]
#这里后面的samples不要少写一个“s”
#n_estimators是决策树的个数,min_samples_split是根据属性划分节点时,每个划分最少的样本数。min_samples_leaf:叶子节点最少的样本数。
#交叉验证
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
#predictors = [ 'FAF','SibSp', 'Age','Embarked','Fare','Cabin', 'Pclass','Sex', 'Title'] 0.811
#predictors = [ 'FAF', 'Age','Embarked','Fare','Cabin', 'Pclass','Sex', 'Title'] 0.818
#predictors = [ 'FAF','SibSp', 'Age','Embarked','Fare','Cabin','Sex', 'Title'] 0.816
#predictors = ['Sex', 'Title', 'FAF','SibSp', 'Age','Embarked','Fare','Cabin', 'Pclass'] 0.817
#predictors = ['Sex', 'Title', 'FAF','SibSp','Embarked','Fare','Cabin', 'Pclass'] 0.804
#predictors = ['Sex', 'Title', 'FAF', 'Age','Embarked','Fare','Cabin', 'Pclass'] 0.817
#predictors = ['Sex', 'Title', 'FAF', 'Age','Embarked','Fare','Cabin'] 0.811
#predictors = ['Sex', 'Age', 'Title', 'FAF','Embarked','Fare','Cabin', 'Pclass'] 0.816
#predictors = ['Sex', 'Title', 'FAF','Cabin', 'Age','Embarked','Fare', 'Pclass'] 0.814
#predictors = ['Sex', 'Pclass', 'Title', 'FAF', 'Age','Embarked','Fare','Cabin'] 0.814
#predictors = [ 'Pclass','Sex', 'Age','Cabin','Title'] 0.795
#predictors =['Sex','Title','Age','FAF', 'Pclass','Embarked','Cabin','Fare'] 0.819
#predictors =['Sex','Title','Age','FAF', 'Pclass','Embarked','Cabin','Fare','Parch'] 0.821
predictors =['Sex','Title','Age','FAF', 'Pclass','Embarked','Cabin','Fare','Parch'] #0.825 16 1 |0.824 16 2| 0.826 6 2|0.828 |5 2 |4 2 |5 3|0.830 10 1|0.83170 9 1|0.83168 8 2|0.8328 9 2
alg = RandomForestClassifier(random_state=1,n_estimators = 100,min_samples_split=6,min_samples_leaf=2)
scores = cross_val_score(alg,traindata[predictors],traindata["Survived"],cv=10)
#生成成绩和平均分
print(scores)
print(scores.mean())
#predictors = ['Sex','Title','Age','Age_group','FAF','Embarked','Cabin','Fare']
#predictors = ['Sex','Title','Age_scaled','Age_group','FAF','Embarked','Cabin','Fare_scaled']
#predictors = ['Pclass','Age_scaled','Age_group','SibSp', 'Parch','Fare','Embarked','Title','Cabin','FAF','Sex']
predictors = ['Pclass','Age_scaled','Age_group','SibSp', 'Parch','Fare','Embarked','Title','Cabin','FAF','Sex']#选定特征
UseFlag = traindata['Survived'].values
UseFeature = traindata[predictors].values
rf = RandomForestClassifier(random_state=1,n_estimators=100,min_samples_split=9,min_samples_leaf=2)
rf.fit(UseFeature,UseFlag)#进行模型的训练
#开始预测
temp = rf.predict(UseFeature)
TestFeature = testdata[predictors].values
temp = rf.predict(TestFeature)
testdata['Survived'] = temp
outdata = testdata[['PassengerId','Survived']]#提取出需要的列
outdata.to_csv(r"C:\Users\Administrator\Desktop\taitanic\RF5.csv",index=False,header=True)#保存数据集
结果在0.78904和0.79904浮动
RF1:年龄和票取众数 0.74
RF2:年龄采用随机森林补充,票取众数,年龄归一化 0.745
RF3: 年龄采用随机森林补充,票取众数,年龄 票 归一化 0.755
RF4: 年龄采用随机森林补充,票取众数,年龄 票 归一化 船舱二分类 加入亲友合计,姓名关键提取,0.77
RF5: 年龄采用随机森林补充,票取众数,年龄 票 归一化 船舱二分类 加入亲友合计,姓名关键提取,加入年龄分段 0.799
随着加入的参数增加,结果也越来越好,但是Pclass作为一个相对关键的因素,在加入后的效果并不如预期,删除后反而准确率更高,由于在之前组中的Pclass的权重过高,因此应适当降低其权重,或者提高其他特征的占比。