# 캐글 입문 (Kaggle introduction)
# 2021년 여름방학 캐글 입문, 타이타닉 생존자 예측 (Summer 2021 — Titanic survival prediction)
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# NOTE: adjust the 'train.csv' / 'test.csv' paths if the data files live in a
# different directory than this script.
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')
train.head()  # train.tail() would show the last rows instead
train.shape, test.shape
train.info()
test.info()
train.isnull()
train
# Demo: booleans add as 0/1 integers — this is why isnull().sum() below
# counts missing values.
True + True + True
True + True + False
train.isnull().sum()  # missing values per column
# Mean age per sex, used to impute the missing ages.
sex_mean = train.groupby('Sex')['Age'].mean()
sex_mean
# Index by label rather than position: the original sex_mean[0]/sex_mean[1]
# relied on positional integer fallback on a string-indexed Series, which is
# deprecated and removed in recent pandas.
train.loc[(train["Sex"] == 'female') & (train["Age"].isnull()), "Age"] = int(sex_mean['female'])
train.loc[(train["Sex"] == 'male') & (train["Age"].isnull()), "Age"] = int(sex_mean['male'])
train['Age'] = train['Age'].apply(lambda x: round(x))  # round all ages to whole years
train.head()
train.info()
train.Cabin.value_counts()
train.Sex.value_counts()
train.Pclass.value_counts()
train['Cabin'] = train['Cabin'].fillna('N')  # fill missing cabins with 'N'
train['Cabin'] = train['Cabin'].apply(lambda x: x[0])  # keep only the leading deck letter
train.head(20)  # inspect the first 20 rows to confirm the changes
def 함수(x):
    """Return the first item of the sequence x."""
    first_item = x[0]
    return first_item

함수('hello world')
def 제곱(x):
    """Return x squared."""
    return x * x

# Squaring demo: once via the named function, once inline.
list(map(제곱, [1, 2, 3, 4, 5, 6]))
[x ** 2 for x in [1, 2, 3, 4, 5, 6]]
train['Embarked'].value_counts()
train['Embarked'].isnull().sum()
# Fill the missing embarkation ports with 'S', the most frequent value.
train['Embarked'] = train['Embarked'].fillna('S')
train['Survived'].value_counts()
train["hojun"] = 100  # scratch column from the lesson (discarded when the data is re-read later)
train
# Human-readable label for the target column.
train["Survived_label"] = train["Survived"].replace(0, "Dead").replace(1, "Survived")
train[['Survived', 'Survived_label']].head()
train['Survived_label'].value_counts()
temp = train['Survived_label'].value_counts()
# Index by label, not position: the original temp[1]/temp[0] relied on
# positional integer fallback on a string-indexed Series (deprecated/removed
# in recent pandas) and was only correct by accident of the sort order.
survival_rate = temp['Survived'] / (temp['Dead'] + temp['Survived']) * 100
print(f"생존율은 {survival_rate:.1f}% 입니다.")
train['Survived'].plot(kind='hist', bins=3)
# rot is a rotation angle in degrees — pass a number, not the string '45'.
train['Survived_label'].value_counts().plot(kind='bar', rot=45)
train['Survived_label'].value_counts().plot(kind='pie', autopct='%1.2f%%')
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Side-by-side pie chart and count plot of the survival outcome.
f, ax = plt.subplots(1, 2, figsize=(12, 6))
train['Survived_label'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.2f%%', ax=ax[0])
ax[0].set_title('Survived')
ax[0].set_ylabel('')
# Pass the column as a keyword argument: positional data arguments were
# removed from seaborn's countplot in 0.13 (also matches the keyword style
# used in the calls below).
sns.countplot(x='Survived_label', data=train, ax=ax[1])
ax[1].set_title('Survived')
plt.show()
# Survival split by sex and by passenger class.
sns.countplot(data=train, x="Sex", hue="Survived_label")
sns.countplot(data=train, x="Pclass", hue="Survived_label")
# Age distribution; trailing ';' suppresses the repr in a notebook.
train['Age'].hist(bins=20, figsize=(10, 5), grid=False, edgecolor='black', color='yellowgreen');
def 함수(x=10, y=20):
    """Return the sum of x and y; both have demo default values."""
    total = x + y
    return total

# Positional and keyword arguments mixed in one call.
함수(100, y=200)
# Survival counts split by number of siblings/spouses and parents/children aboard.
sns.countplot(data=train, x="SibSp", hue="Survived_label")
sns.countplot(data=train, x="Parch", hue="Survived_label")
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Re-read the raw data: the feature engineering below starts from scratch
# (this time PassengerId stays as a regular column).
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# NOTE(review): on pandas >= 2.0, corr() raises while object columns (Name,
# Sex, Ticket, ...) are still present — may need corr(numeric_only=True);
# confirm the pandas version in use.
train.corr()
# Extract the honorific from Name (e.g. "Mr", "Mrs"). Raw string so "\." is a
# literal escaped dot, not an invalid string escape.
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Bucket rare titles as 'Other'. NOTE: Mlle/Ms/Mme are deliberately NOT in
# this list — the original included them here, which turned the three
# normalizing replace() calls below into dead code.
train['Title'] = train['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
# Normalize the French/abbreviated forms to their English equivalents.
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')
train['Title'].value_counts()
# Integer-encode the title categories.
train['Title_label'] = train['Title'].astype('category').cat.codes
train[['Title', 'Title_label']]
# Same title extraction for the test set (raw string for the regex escape).
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# As with train: Mlle/Ms/Mme are kept out of the 'Other' bucket so the
# normalizing replace() calls below actually take effect.
test['Title'] = test['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')
test['Title'].value_counts()
# BUG FIX: the original encoded train['Title'] here (wrong frame — wrong
# length and values); encode the test set's own Title column.
test['Title_label'] = test['Title'].astype('category').cat.codes
test[['Title', 'Title_label']]
# From here on, train and test are processed together through this list
# (it holds references to the same DataFrame objects).
전체데이터 = [train, test]
for 데이터 in 전체데이터:
    # Re-extract the honorific (this overwrites the Title work above).
    # Raw string avoids the invalid "\." escape warning.
    데이터['Title'] = 데이터['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Map the four common titles to 0-3 and every rare title to 3.
mapping_data = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3, "Countess": 3, "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona": 3, "Mme": 3, "Capt": 3, "Sir": 3}
for 데이터 in 전체데이터:
    데이터['Title'] = 데이터['Title'].map(mapping_data)
# Name has been distilled into Title, and Title into its numeric mapping —
# drop both text columns from each frame in place.
for 데이터 in 전체데이터:
    데이터.drop('Name', axis=1, inplace=True)
for 데이터 in 전체데이터:
    데이터.drop('Title', axis=1, inplace=True)
train
test
# Impute missing ages with the per-sex median. Assign back instead of the
# original chained fillna(..., inplace=True): that form operates on an
# intermediate object and stops updating the frame under pandas
# copy-on-write (default in pandas 3.0).
train["Age"] = train["Age"].fillna(train.groupby("Sex")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Sex")["Age"].transform("median"))
# Bucket ages into five ordinal bands.
for 데이터 in 전체데이터:
    데이터.loc[데이터['Age'] <= 16, 'Age'] = 0
    데이터.loc[(데이터['Age'] > 16) & (데이터['Age'] <= 26), 'Age'] = 1
    데이터.loc[(데이터['Age'] > 26) & (데이터['Age'] <= 36), 'Age'] = 2
    데이터.loc[(데이터['Age'] > 36) & (데이터['Age'] <= 62), 'Age'] = 3
    데이터.loc[데이터['Age'] > 62, 'Age'] = 4
train.head()
for 데이터 in 전체데이터:
    # BUG FIX: bin each frame's own fares. The original called
    # pd.qcut(train['Fare'], 5) for both frames, which assigned a
    # train-length Series into test via index alignment (wrong values,
    # wrong length). Quantile-bin fares into 5 equal-frequency buckets.
    데이터['Fare_bin'] = pd.qcut(데이터['Fare'], 5)
    # cat.codes encodes the bins as 0-4 (missing Fare becomes -1).
    데이터['Fare_label'] = 데이터['Fare_bin'].astype('category').cat.codes
for 데이터 in 전체데이터:
    # Keep only the encoded label; drop the raw fare and the interval column.
    데이터.drop('Fare', axis=1, inplace=True)
    데이터.drop('Fare_bin', axis=1, inplace=True)
# Family size = siblings/spouses + parents/children + the passenger.
for 데이터 in 전체데이터:
    데이터["FamilySize"] = 데이터["SibSp"] + 데이터["Parch"] + 1
# Rescale family sizes 1..11 onto a 0-4 range in steps of 0.4.
mapping_data = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4}
for 데이터 in 전체데이터:
    데이터['FamilySize'] = 데이터['FamilySize'].map(mapping_data)
# Fill missing embarkation ports with 'S' and encode the three ports as ints.
mapping_data = {"S": 0, "C": 1, "Q": 2}
for 데이터 in 전체데이터:
    데이터['Embarked'] = 데이터['Embarked'].fillna('S').map(mapping_data)
    # Binary-encode Sex via its (alphabetical) category order.
    데이터['Sex'] = 데이터['Sex'].astype('category').cat.codes
train
train
# Drop identifier-like columns that carry no predictive signal.
for 데이터 in 전체데이터:
    데이터.drop('Ticket', axis=1, inplace=True)
    데이터.drop('Cabin', axis=1, inplace=True)
    데이터.drop('PassengerId', axis=1, inplace=True)
train.head()
train.corr()
# Correlation heatmap of the remaining (now all-numeric) features.
plt.figure(figsize=(15,15))
sns.heatmap(data=train.corr(), annot=True, fmt='.2f', linewidths=.5, cmap='Blues')
train.head()
test.head()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import numpy as np
# Features (X) and target (y): predict Survived from all remaining columns.
train_data = train.drop('Survived', axis=1)
target = train['Survived']
train_data.shape, target.shape
train_data
target
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Compare several classifiers by 10-fold cross-validated accuracy.
# random_state fixes the fold shuffle so runs are comparable.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
# k-nearest neighbors
clf = KNeighborsClassifier(n_neighbors = 13)
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)  # mean accuracy in percent
score  # per-fold accuracies
# decision tree
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)
# random forest
clf = RandomForestClassifier(n_estimators=13)
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)
# Gaussian naive Bayes
clf = GaussianNB()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)
# support vector classifier
clf = SVC()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100,2)
# Final model: fit an SVC on the full training data and predict the test set.
clf = SVC()
clf.fit(train_data, target)
test_data = test.copy()
prediction = clf.predict(test_data)
prediction