728x90
반응형

titanic_train.csv
0.07MB
titanic_test.csv
0.03MB

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the train/test CSV splits, then print the training frame's
# column dtypes and non-null counts.
train_csv, test_csv = 'titanic_train.csv', 'titanic_test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     916 non-null    int64  
 1   survived   916 non-null    int64  
 2   name       916 non-null    object 
 3   sex        916 non-null    object 
 4   age        741 non-null    float64
 5   sibsp      916 non-null    int64  
 6   parch      916 non-null    int64  
 7   ticket     916 non-null    object 
 8   fare       916 non-null    float64
 9   cabin      214 non-null    object 
 10  embarked   914 non-null    object 
 11  body       85 non-null     float64
 12  home.dest  527 non-null    object 
dtypes: float64(3), int64(4), object(6)
memory usage: 93.2+ KB

 

# Remove the columns that are not used as model features from both splits.
unused_columns = ['ticket', 'body', 'home.dest']
df_train = df_train.drop(columns=unused_columns)
df_test = df_test.drop(columns=unused_columns)

# Impute missing ages with the training-set mean. The statistic is computed
# from df_train only and reused for df_test, so the test split does not
# influence the value used for imputation.
age_mean = df_train['age'].mean()
df_train['age'] = df_train['age'].fillna(age_mean)
df_test['age'] = df_test['age'].fillna(age_mean)

# Fill missing 'embarked' values in both splits with the most frequent
# value observed in the training split.
embarked_counts = df_train['embarked'].value_counts()
em_mode = embarked_counts.index[0]
for frame in (df_train, df_test):
    frame['embarked'] = frame['embarked'].fillna(em_mode)

 

# Stack train and test rows so the feature engineering below is applied to
# both at once. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported equivalent and, like append, keeps each frame's original index
# labels (so labels repeat across the two splits).
whole_df = pd.concat([df_train, df_test])
# Number of leading rows in whole_df that belong to the training split.
train_idx_num = len(df_train)
whole_df['cabin'].value_counts()

C23 C25 C27        6
G6                 5
B57 B59 B63 B66    5
D                  4
F2                 4
                  ..
A20                1
C128               1
D6                 1
C49                1
A10                1
Name: cabin, Length: 186, dtype: int64

 

# Count how many cabin entries are missing vs. present.
whole_df['cabin'].isna().value_counts()

True     1014
False     295
Name: cabin, dtype: int64

 

# Give passengers without a recorded cabin the placeholder label 'X'.
cabin_filled = whole_df['cabin'].fillna('X')
whole_df['cabin'] = cabin_filled
whole_df['cabin'].value_counts()

X                  1014
C23 C25 C27           6
G6                    5
B57 B59 B63 B66       5
F2                    4
                   ... 
A9                    1
E52                   1
C95                   1
C99                   1
A10                   1
Name: cabin, Length: 187, dtype: int64

 

# List every distinct cabin label to see how the labels are structured.
cabin_labels = whole_df['cabin']
cabin_labels.unique()

array(['X', 'E36', 'C68', 'E24', 'C22 C26', 'D38', 'B50', 'A24', 'C111',
       'F', 'C6', 'C87', 'E8', 'B45', 'C93', 'D28', 'D36', 'C125', 'B35',
       'T', 'B73', 'B57 B59 B63 B66', 'A26', 'A18', 'B96 B98', 'G6',
       'C78', 'C101', 'D9', 'D33', 'C128', 'E50', 'B26', 'B69', 'E121',
       'C123', 'B94', 'A34', 'D', 'C39', 'D43', 'E31', 'B5', 'D17', 'F33',
       'E44', 'D7', 'A21', 'D34', 'A29', 'D35', 'A11', 'B51 B53 B55',
       'D46', 'E60', 'C30', 'D26', 'E68', 'A9', 'B71', 'D37', 'F2',
       'C55 C57', 'C89', 'C124', 'C23 C25 C27', 'C126', 'E49', 'F E46',
       'E46', 'D19', 'B58 B60', 'C82', 'B52 B54 B56', 'C92', 'E45',
       'F G73', 'C65', 'E25', 'B3', 'D40', 'C91', 'B102', 'B61', 'F G63',
       'A20', 'B36', 'C7', 'B77', 'D20', 'C148', 'C105', 'E38', 'B86',
       'C132', 'C86', 'A14', 'C54', 'A5', 'B49', 'B28', 'B24', 'C2', 'F4',
       'A6', 'C83', 'B42', 'A36', 'C52', 'D56', 'C116', 'B19', 'E77',
       'F E57', 'E101', 'B18', 'C95', 'D15', 'E33', 'B30', 'D21', 'E10',
       'C130', 'D6', 'C51', 'D30', 'E67', 'C110', 'C103', 'C90', 'C118',
       'C97', 'D47', 'E34', 'B4', 'D50', 'C62 C64', 'E17', 'B41', 'C49',
       'C85', 'B20', 'C28', 'E63', 'C99', 'D49', 'A10', 'A16', 'B37',
       'C80', 'B78', 'E12', 'C104', 'A31', 'D11', 'D48', 'D10 D12', 'B38',
       'D45', 'C50', 'C31', 'B82 B84', 'A32', 'C53', 'B10', 'C70', 'A23',
       'C106', 'C46', 'E58', 'B11', 'F E69', 'B80', 'E39 E41', 'D22',
       'E40', 'A19', 'C32', 'B79', 'C45', 'B22', 'B39', 'C47', 'B101',
       'A7', 'E52', 'F38'], dtype=object)

 

# Reduce each cabin label to its first character (e.g. 'C23 C25 C27' -> 'C').
whole_df['cabin'] = whole_df['cabin'].apply(lambda label: label[0])

whole_df['cabin'].value_counts()
X    1014
C      94
B      65
D      46
E      41
A      22
F      21
G       5
T       1
Name: cabin, dtype: int64

 

# Fold the low-count 'G' and 'T' labels into the 'X' bucket.
whole_df['cabin'] = whole_df['cabin'].replace(['G', 'T'], 'X')

whole_df['cabin'].value_counts()

X    1020
C      94
B      65
D      46
E      41
A      22
F      21
Name: cabin, dtype: int64

 

# Plot passenger counts per cabin label, split by the 'survived' flag.
sns.countplot(data=whole_df, x='cabin', hue='survived')

# Show the raw name column before titles are extracted from it.
whole_df['name']

0                 Mellinger, Miss. Madeleine Violet
1                                 Wells, Miss. Joan
2                    Duran y More, Miss. Florentina
3                                Scanlan, Mr. James
4                      Bradley, Miss. Bridget Delia
                           ...                     
388               Karlsson, Mr. Julius Konrad Eugen
389    Ware, Mrs. John James (Florence Louise Long)
390                            O'Keefe, Mr. Patrick
391                                Tobin, Mr. Roger
392                            Daniels, Miss. Sarah
Name: name, Length: 1309, dtype: object

 

# Collect the distinct titles found in the names, in order of first
# appearance. The title is the text between ", " and the next ".".
all_titles = [
    full_name.split(", ")[1].split(".")[0]
    for full_name in whole_df['name']
]
n_grade = list(dict.fromkeys(all_titles))
n_grade

['Miss',
 'Mr',
 'Master',
 'Mrs',
 'Dr',
 'Mlle',
 'Col',
 'Rev',
 'Ms',
 'Mme',
 'Sir',
 'the Countess',
 'Dona',
 'Jonkheer',
 'Lady',
 'Major',
 'Don',
 'Capt']

 

# Social-status group assigned to each title (honorific) found in the names.
grade_dict = {
    'A': ['Rev', 'Col', 'Major', 'Dr', 'Capt', 'Sir'],  # honorary titles
    'B': ['Ms', 'Mme', 'Mrs', 'Dona'],  # women
    'C': ['Jonkheer', 'the Countess'],  # nobility
    'D': ['Mr', 'Don'],  # men
    'E': ['Master'],  # young men
    'F': ['Miss', 'Mlle', 'Lady'],  # young women
}

 

# Show all title groups, then the titles in group 'A' only.
for shown in (grade_dict.values(), grade_dict['A']):
    print(shown)

dict_values([['Rev', 'Col', 'Major', 'Dr', 'Capt', 'Sir'], ['Ms', 'Mme', 'Mrs', 'Dona'], ['Jonkheer', 'the Countess'], ['Mr', 'Don'], ['Master'], ['Miss', 'Mlle', 'Lady']])
['Rev', 'Col', 'Major', 'Dr', 'Capt', 'Sir']

 

def give_grade(x, grades=None):
    """Map a full passenger name to its title-group key.

    The title is the text between the first ", " and the following "."
    (e.g. "Wells, Miss. Joan" yields "Miss"); it is then looked up in the
    title groups.

    Args:
        x: Full name string in "Surname, Title. Given names" form.
        grades: Optional mapping of group key -> collection of titles.
            Defaults to the module-level ``grade_dict``.

    Returns:
        The key of the first group that contains the title, or 'G' when the
        title belongs to no group or the name has no ", " separator.
    """
    if grades is None:
        grades = grade_dict
    parts = x.split(", ")
    if len(parts) < 2:
        # No "Surname, " prefix, so there is no title to classify.
        return 'G'
    title = parts[1].split(".")[0]
    for grade, titles in grades.items():
        if title in titles:
            return grade
    return 'G'
# Replace each full name with the group key returned by give_grade.
whole_df['name'] = whole_df['name'].apply(give_grade)

 

# Count passengers per title group.
name_grades = whole_df['name']
name_grades.value_counts()

D    758
F    263
B    201
E     61
A     24
C      2
Name: name, dtype: int64

 

# Plot passenger counts per title group, split by the 'survived' flag.
name_grade = whole_df['name']
survived_flag = whole_df['survived']
sns.countplot(x=name_grade, hue=survived_flag)

# Encoding: expand the remaining non-numeric columns into indicator columns.
whole_df_encoded = pd.get_dummies(whole_df)
whole_df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 392
Data columns (total 24 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      1309 non-null   int64  
 1   survived    1309 non-null   int64  
 2   age         1309 non-null   float64
 3   sibsp       1309 non-null   int64  
 4   parch       1309 non-null   int64  
 5   fare        1309 non-null   float64
 6   name_A      1309 non-null   uint8  
 7   name_B      1309 non-null   uint8  
 8   name_C      1309 non-null   uint8  
 9   name_D      1309 non-null   uint8  
 10  name_E      1309 non-null   uint8  
 11  name_F      1309 non-null   uint8  
 12  sex_female  1309 non-null   uint8  
 13  sex_male    1309 non-null   uint8  
 14  cabin_A     1309 non-null   uint8  
 15  cabin_B     1309 non-null   uint8  
 16  cabin_C     1309 non-null   uint8  
 17  cabin_D     1309 non-null   uint8  
 18  cabin_E     1309 non-null   uint8  
 19  cabin_F     1309 non-null   uint8  
 20  cabin_X     1309 non-null   uint8  
 21  embarked_C  1309 non-null   uint8  
 22  embarked_Q  1309 non-null   uint8  
 23  embarked_S  1309 non-null   uint8  
dtypes: float64(2), int64(4), uint8(18)
memory usage: 134.6 KB

 

# Split the encoded frame back into train / test rows.
# whole_df was built as train rows followed by test rows, so the first
# train_idx_num rows are the training split. Slicing is positional (iloc)
# because the index labels repeat across the two splits. There is no +1
# offset: row train_idx_num is the first test row.
feature_cols = whole_df_encoded.columns != 'survived'

# Independent variables (x_train) and labels of the training data.
train_part = whole_df_encoded.iloc[:train_idx_num]
x_train = train_part.loc[:, feature_cols].values
y_train = train_part['survived']

test_part = whole_df_encoded.iloc[train_idx_num:]
x_test = test_part.loc[:, feature_cols].values
y_test = test_part['survived']

 

x_train.shape

# (917, 23)
# NOTE(review): df_train.info() reports 916 rows, so the 917 rows recorded
# here mean the split that produced this output put one extra row in x_train.

 

# Build the model inputs from the one-hot encoded frame. The raw df_train /
# df_test frames still hold string columns (name, sex, cabin, embarked) that
# LogisticRegression cannot fit on, and x_test has to come from the test
# rows rather than from df_train.
feature_mask = whole_df_encoded.columns != 'survived'
train_rows = whole_df_encoded.iloc[:train_idx_num]
test_rows = whole_df_encoded.iloc[train_idx_num:]

y_train = train_rows['survived'].values
x_train = train_rows.loc[:, feature_mask].values
y_test = test_rows['survived'].values
x_test = test_rows.loc[:, feature_mask].values

 

# Scratch check of the title -> group mapping on a copy of the training frame.
# Each title is looked up once in an inverted grade_dict. The previous
# per-row `ttt.name.replace(..., inplace=True)` rescanned the whole column on
# every iteration and relied on chained in-place mutation, which does not
# update the frame under pandas Copy-on-Write. A leftover debug print() that
# emitted a blank line per group-'A' row is removed.
ttt = df_train.copy()
title_to_grade = {
    title: grade
    for grade, titles in grade_dict.items()
    for title in titles
}
ttt['name'] = (
    ttt['name']
    .apply(lambda x: x.split(', ')[1].split('.')[0])
    # Titles that belong to no group fall back to 'G'.
    .map(lambda title: title_to_grade.get(title, 'G'))
)

 

# Display the test-split labels.
y_test

1      1
2      0
3      0
4      0
5      1
      ..
388    0
389    1
390    1
391    0
392    1
Name: survived, Length: 392, dtype: int64

 

# Fit a logistic-regression classifier on the training split, then predict
# labels for the test split.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0).fit(x_train, y_train)
y_pred = lr.predict(x_test)

 

# Evaluation: confusion matrix of true labels vs. predicted labels.
from sklearn.metrics import confusion_matrix
confmat = confusion_matrix(y_test, y_pred)
print(confmat)

[[208  37]
 [ 42 105]]

 

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Print each metric for the test predictions.
# F1 = 2 * (precision * recall) / (precision + recall)
report_lines = (
    ('정확도(accuracy) : %.2f', accuracy_score),
    ('정밀도(precision) : %.3f', precision_score),
    ('재현율(recall) : %.3f', recall_score),
    ('F1-score : %.3f', f1_score),
)
for template, score_fn in report_lines:
    print(template % score_fn(y_test, y_pred))

정확도(accuracy) : 0.80
정밀도(precision) : 0.739
재현율(recall) : 0.714
F1-score : 0.727

 

반응형

+ Recent posts