데이터 선택(로드) 및 결측치 처리

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# 작업파일과 다운받은 파일이 다른위치에 있을경우 'train.csv'를 수정해주셔야합니다!
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')

train.head() #train.tail()을 하게 되면, 마지막 데이터를 확인합니다.
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
train.shape, test.shape
((891, 11), (418, 10))
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
test.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB
train.isnull()
train
True + True + True
True + True + False
train.isnull().sum()
Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64
sex_mean = train.groupby('Sex')['Age'].mean()
sex_mean
Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64
train.loc[(train["Sex"] == 'female') & (train["Age"].isnull()), "Age"] = int(sex_mean[0])
train.loc[(train["Sex"] == 'male') & (train["Age"].isnull()), "Age"] = int(sex_mean[1])

train['Age'] = train['Age'].apply(lambda x:round(x))

train.head()
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 NaN S
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    int64  
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(1), int64(5), object(5)
memory usage: 83.5+ KB
train.Cabin.value_counts()
G6             4
B96 B98        4
C23 C25 C27    4
E101           3
D              3
              ..
E68            1
B71            1
D45            1
C7             1
A31            1
Name: Cabin, Length: 147, dtype: int64
train.Sex.value_counts()
male      577
female    314
Name: Sex, dtype: int64
train.Pclass.value_counts()
3    491
1    216
2    184
Name: Pclass, dtype: int64
train['Cabin'] = train['Cabin'].fillna('N') #결측치를 'N'으로 채우라는 명령
train['Cabin'] = train['Cabin'].apply(lambda x:x[0]) #해당 값을 0번째 값으로만 채우라는 명령

train.head(20) #제대로 반영이 되었는지 20개만 출력을 해봅니다.
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 N S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C C
3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 N S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C S
5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 N S
6 0 3 Moran, Mr. James male 30 0 0 330877 8.4583 N Q
7 0 1 McCarthy, Mr. Timothy J male 54 0 0 17463 51.8625 E S
8 0 3 Palsson, Master. Gosta Leonard male 2 3 1 349909 21.0750 N S
9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27 0 2 347742 11.1333 N S
10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14 1 0 237736 30.0708 N C
11 1 3 Sandstrom, Miss. Marguerite Rut female 4 1 1 PP 9549 16.7000 G S
12 1 1 Bonnell, Miss. Elizabeth female 58 0 0 113783 26.5500 C S
13 0 3 Saundercock, Mr. William Henry male 20 0 0 A/5. 2151 8.0500 N S
14 0 3 Andersson, Mr. Anders Johan male 39 1 5 347082 31.2750 N S
15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14 0 0 350406 7.8542 N S
16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55 0 0 248706 16.0000 N S
17 0 3 Rice, Master. Eugene male 2 4 1 382652 29.1250 N Q
18 1 2 Williams, Mr. Charles Eugene male 30 0 0 244373 13.0000 N S
19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31 1 0 345763 18.0000 N S
20 1 3 Masselmani, Mrs. Fatima female 27 0 0 2649 7.2250 N C
def 함수(x):
    return x[0]

함수('hello world')
'h'
def 제곱(x):
    return x**2
    
list(map(제곱, [1, 2, 3, 4, 5, 6]))
[1, 4, 9, 16, 25, 36]
list(map(lambda x:x**2, [1, 2, 3, 4, 5, 6]))
[1, 4, 9, 16, 25, 36]
train['Embarked'].value_counts()
S    644
C    168
Q     77
Name: Embarked, dtype: int64
train['Embarked'].isnull().sum()
2
train['Embarked'] = train['Embarked'].fillna('S')
train['Survived'].value_counts()
0    549
1    342
Name: Survived, dtype: int64
train["hojun"] = 100
train
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked hojun
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 N S 100
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C C 100
3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 N S 100
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C S 100
5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 N S 100
... ... ... ... ... ... ... ... ... ... ... ... ...
887 0 2 Montvila, Rev. Juozas male 27 0 0 211536 13.0000 N S 100
888 1 1 Graham, Miss. Margaret Edith female 19 0 0 112053 30.0000 B S 100
889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 27 1 2 W./C. 6607 23.4500 N S 100
890 1 1 Behr, Mr. Karl Howell male 26 0 0 111369 30.0000 C C 100
891 0 3 Dooley, Mr. Patrick male 32 0 0 370376 7.7500 N Q 100

891 rows × 12 columns

train["Survived_label"] = train["Survived"].replace(0, "Dead").replace(1, "Survived")

train[['Survived','Survived_label']].head()
Survived Survived_label
PassengerId
1 0 Dead
2 1 Survived
3 1 Survived
4 1 Survived
5 0 Dead
train['Survived_label'].value_counts()
Dead        549
Survived    342
Name: Survived_label, dtype: int64
temp = train['Survived_label'].value_counts()
survival_rate = temp[1]/(temp[0] + temp[1])*100

print(f"생존율은 {survival_rate:.1f}% 입니다.")
생존율은 38.4% 입니다.

시각화

train['Survived'].plot(kind='hist', bins=3)
<matplotlib.axes._subplots.AxesSubplot at 0x7fbcf60c0790>
train['Survived_label'].value_counts().plot(kind='bar', rot='45')
<matplotlib.axes._subplots.AxesSubplot at 0x7fbcf570ac50>
train['Survived_label'].value_counts().plot(kind='pie', autopct='%1.2f%%')
<matplotlib.axes._subplots.AxesSubplot at 0x7fbcf5243750>
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
f,ax=plt.subplots(1,2,figsize=(12,6))

train['Survived_label'].value_counts().plot.pie(explode=[0,0.1],autopct='%1.2f%%',ax=ax[0])
ax[0].set_title('Survived')
ax[0].set_ylabel('')

sns.countplot('Survived_label',data=train,ax=ax[1])
ax[1].set_title('Survived')
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning

데이터 분석

sns.countplot(data=train, x="Sex", hue="Survived_label")
<matplotlib.axes._subplots.AxesSubplot at 0x7fbcf521bd50>
sns.countplot(data=train, x="Pclass", hue="Survived_label")
<matplotlib.axes._subplots.AxesSubplot at 0x7fbcf461c4d0>
train['Age'].hist(bins=20,figsize=(10,5),grid=False,edgecolor='black',color='yellowgreen');
def 함수(x=10, y=20):
    return x + y

함수(100, y=200)
300
sns.countplot(data=train, x="SibSp", hue="Survived_label")
<matplotlib.axes._subplots.AxesSubplot at 0x7fbcf461cad0>
sns.countplot(data=train, x="Parch", hue="Survived_label")
<matplotlib.axes._subplots.AxesSubplot at 0x7fbcf45a4f50>

머신러닝

머신러닝 데이터 전처리

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.corr()
PassengerId Survived Pclass Age SibSp Parch Fare
PassengerId 1.000000 -0.005007 -0.035144 0.036847 -0.057527 -0.001652 0.012658
Survived -0.005007 1.000000 -0.338481 -0.077221 -0.035322 0.081629 0.257307
Pclass -0.035144 -0.338481 1.000000 -0.369226 0.083081 0.018443 -0.549500
Age 0.036847 -0.077221 -0.369226 1.000000 -0.308247 -0.189119 0.096067
SibSp -0.057527 -0.035322 0.083081 -0.308247 1.000000 0.414838 0.159651
Parch -0.001652 0.081629 0.018443 -0.189119 0.414838 1.000000 0.216225
Fare 0.012658 0.257307 -0.549500 0.096067 0.159651 0.216225 1.000000
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

train['Title'] = train['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Mlle', 'Ms', 'Mme'], 'Other')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')
train['Title'].value_counts()
Mr        517
Miss      182
Mrs       125
Master     40
Other      27
Name: Title, dtype: int64
train['Title_label'] = train['Title'].astype('category').cat.codes

train[['Title','Title_label']]
Title Title_label
0 Mr 2
1 Mrs 3
2 Miss 1
3 Mrs 3
4 Mr 2
... ... ...
886 Other 4
887 Miss 1
888 Miss 1
889 Mr 2
890 Mr 2

891 rows × 2 columns

test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

test['Title'] = test['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Mlle', 'Ms', 'Mme'], 'Other')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')

test['Title'].value_counts()
Mr        240
Miss       78
Mrs        72
Master     21
Other       7
Name: Title, dtype: int64
test['Title_label'] = train['Title'].astype('category').cat.codes

test[['Title','Title_label']]
Title Title_label
0 Mr 2
1 Mrs 3
2 Mr 1
3 Mr 3
4 Mrs 2
... ... ...
413 Mr 2
414 Other 2
415 Mr 3
416 Mr 3
417 Master 1

418 rows × 2 columns

전체데이터 = [train, test]
for 데이터 in 전체데이터:
    데이터['Title'] = 데이터['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
mapping_data = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,"Countess": 3, "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona" : 3, "Mme": 3,"Capt": 3,"Sir": 3 }
for 데이터 in 전체데이터:
    데이터['Title'] = 데이터['Title'].map(mapping_data)
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)
train.drop('Title', axis=1, inplace=True)
test.drop('Title', axis=1, inplace=True)
train
test
PassengerId Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked Title_label
0 892 3 male 34.5 0 0 330911 7.8292 NaN Q 2
1 893 3 female 47.0 1 0 363272 7.0000 NaN S 3
2 894 2 male 62.0 0 0 240276 9.6875 NaN Q 1
3 895 3 male 27.0 0 0 315154 8.6625 NaN S 3
4 896 3 female 22.0 1 1 3101298 12.2875 NaN S 2
... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 male NaN 0 0 A.5. 3236 8.0500 NaN S 2
414 1306 1 female 39.0 0 0 PC 17758 108.9000 C105 C 2
415 1307 3 male 38.5 0 0 SOTON/O.Q. 3101262 7.2500 NaN S 3
416 1308 3 male NaN 0 0 359309 8.0500 NaN S 3
417 1309 3 male NaN 1 1 2668 22.3583 NaN C 1

418 rows × 11 columns

train["Age"].fillna(train.groupby("Sex")["Age"].transform("median"), inplace=True)
test["Age"].fillna(test.groupby("Sex")["Age"].transform("median"), inplace=True)
for 데이터 in 전체데이터:
    데이터.loc[ 데이터['Age'] <= 16, 'Age'] = 0
    데이터.loc[(데이터['Age'] > 16) & (데이터['Age'] <= 26), 'Age'] = 1
    데이터.loc[(데이터['Age'] > 26) & (데이터['Age'] <= 36), 'Age'] = 2
    데이터.loc[(데이터['Age'] > 36) & (데이터['Age'] <= 62), 'Age'] = 3
    데이터.loc[ 데이터['Age'] > 62, 'Age'] = 4
train.head()
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin Embarked Title_label
0 1 0 3 male 1.0 1 0 A/5 21171 7.2500 NaN S 2
1 2 1 1 female 3.0 1 0 PC 17599 71.2833 C85 C 3
2 3 1 3 female 1.0 0 0 STON/O2. 3101282 7.9250 NaN S 1
3 4 1 1 female 2.0 1 0 113803 53.1000 C123 S 3
4 5 0 3 male 2.0 0 0 373450 8.0500 NaN S 2
for 데이터 in 전체데이터:
    데이터['Fare_bin'] = pd.qcut(train['Fare'], 5)
    데이터['Fare_label'] = 데이터['Fare_bin'].astype('category').cat.codes
for 데이터 in 전체데이터:
    데이터.drop('Fare', axis=1, inplace=True)
    데이터.drop('Fare_bin', axis=1, inplace=True)
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1
test["FamilySize"] = test["SibSp"] + test["Parch"] + 1
mapping_data = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4}
for 데이터 in 전체데이터:
    데이터['FamilySize'] = 데이터['FamilySize'].map(mapping_data)
for 데이터 in 전체데이터:
    데이터['Embarked'] = 데이터['Embarked'].fillna('S')
mapping_data = {"S": 0, "C": 1, "Q": 2}
for 데이터 in 전체데이터:
    데이터['Embarked'] = 데이터['Embarked'].map(mapping_data)
train['Sex'] = train['Sex'].astype('category').cat.codes
test['Sex'] = test['Sex'].astype('category').cat.codes
train
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Cabin Embarked Title_label Fare_label FamilySize
0 1 0 3 1 1.0 1 0 A/5 21171 NaN 0 2 0 0.4
1 2 1 1 0 3.0 1 0 PC 17599 C85 1 3 4 0.4
2 3 1 3 0 1.0 0 0 STON/O2. 3101282 NaN 0 1 1 0.0
3 4 1 1 0 2.0 1 0 113803 C123 0 3 4 0.4
4 5 0 3 1 2.0 0 0 373450 NaN 0 2 1 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 1 2.0 0 0 211536 NaN 0 4 2 0.0
887 888 1 1 0 1.0 0 0 112053 B42 0 1 3 0.0
888 889 0 3 0 2.0 1 2 W./C. 6607 NaN 0 1 3 1.2
889 890 1 1 1 1.0 0 0 111369 C148 1 2 3 0.0
890 891 0 3 1 2.0 0 0 370376 NaN 2 2 0 0.0

891 rows × 13 columns

for 데이터 in 전체데이터:
    데이터.drop('Ticket', axis=1, inplace=True)
    데이터.drop('Cabin', axis=1, inplace=True)
    데이터.drop('PassengerId', axis=1, inplace=True)
train.head()
Survived Pclass Sex Age SibSp Parch Embarked Title_label Fare_label FamilySize
0 0 3 1 1.0 1 0 0 2 0 0.4
1 1 1 0 3.0 1 0 1 3 4 0.4
2 1 3 0 1.0 0 0 0 1 1 0.0
3 1 1 0 2.0 1 0 0 3 4 0.4
4 0 3 1 2.0 0 0 0 2 1 0.0
train.corr()
Survived Pclass Sex Age SibSp Parch Embarked Title_label Fare_label FamilySize
Survived 1.000000 -0.338481 -0.543351 -0.072077 -0.035322 0.081629 0.106811 -0.052471 0.317783 0.016639
Pclass -0.338481 1.000000 0.131900 -0.286667 0.083081 0.018443 0.045702 -0.195910 -0.705206 0.065997
Sex -0.543351 0.131900 1.000000 0.084630 -0.114631 -0.245489 -0.116569 0.040484 -0.244943 -0.200988
Age -0.072077 -0.286667 0.084630 1.000000 -0.228235 -0.165474 0.034334 0.447788 0.087896 -0.238659
SibSp -0.035322 0.083081 -0.114631 -0.228235 1.000000 0.414838 -0.059961 -0.213887 0.354974 0.890712
Parch 0.081629 0.018443 -0.245489 -0.165474 0.414838 1.000000 -0.078665 -0.122792 0.351317 0.783111
Embarked 0.106811 0.045702 -0.116569 0.034334 -0.059961 -0.078665 1.000000 -0.081928 -0.089125 -0.080281
Title_label -0.052471 -0.195910 0.040484 0.447788 -0.213887 -0.122792 -0.081928 1.000000 0.060707 -0.207530
Fare_label 0.317783 -0.705206 -0.244943 0.087896 0.354974 0.351317 -0.089125 0.060707 1.000000 0.418125
FamilySize 0.016639 0.065997 -0.200988 -0.238659 0.890712 0.783111 -0.080281 -0.207530 0.418125 1.000000
plt.figure(figsize=(15,15))
sns.heatmap(data=train.corr(), annot=True, fmt='.2f', linewidths=.5, cmap='Blues')
<matplotlib.axes._subplots.AxesSubplot at 0x7fbcf44006d0>
train.head()
test.head()
Pclass Sex Age SibSp Parch Embarked Title_label Fare_label FamilySize
0 3 1 2.0 0 0 2 2 0 0.0
1 3 0 3.0 1 0 0 3 4 0.4
2 2 1 3.0 0 0 2 1 1 0.0
3 3 1 2.0 0 0 0 3 4 0.0
4 3 0 1.0 1 1 0 2 1 0.8
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import numpy as np
train_data = train.drop('Survived', axis=1)
target = train['Survived']

train_data.shape, target.shape
((891, 9), (891,))
train_data
target
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
clf = KNeighborsClassifier(n_neighbors = 13)
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)
81.03
score
array([0.82222222, 0.74157303, 0.84269663, 0.7752809 , 0.85393258,
       0.82022472, 0.79775281, 0.75280899, 0.85393258, 0.84269663])
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)
79.8
clf = RandomForestClassifier(n_estimators=13)
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)
80.81
clf = GaussianNB()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100, 2)
78.9
clf = SVC()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
round(np.mean(score)*100,2)
82.83

예측한 값을 캐글에 제출

clf = SVC()
clf.fit(train_data, target)
test_data = test.copy()
prediction = clf.predict(test_data)
prediction
array([0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])