728x90
반응형
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the Titanic train/test splits from CSV files in the working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic_train.csv', 'titanic_test.csv')
)
# Summarize the training frame's columns, dtypes and non-null counts.
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 pclass 916 non-null int64
1 survived 916 non-null int64
2 name 916 non-null object
3 sex 916 non-null object
4 age 741 non-null float64
5 sibsp 916 non-null int64
6 parch 916 non-null int64
7 ticket 916 non-null object
8 fare 916 non-null float64
9 cabin 214 non-null object
10 embarked 914 non-null object
11 body 85 non-null float64
12 home.dest 527 non-null object
dtypes: float64(3), int64(4), object(6)
memory usage: 93.2+ KB
# Drop the columns that are not used as model features.
df_train = df_train.drop(['ticket', 'body', 'home.dest'], axis=1)
df_test = df_test.drop(['ticket', 'body', 'home.dest'], axis=1)

# Fill missing ages with the mean age of the training split.  The statistic
# is computed from df_train only and reused for df_test, so the test rows do
# not influence the imputed value.  (Series.mean skips NaN by default.)
age_mean = df_train['age'].mean()
df_train['age'] = df_train['age'].fillna(age_mean)
df_test['age'] = df_test['age'].fillna(age_mean)

# Fill missing embarkation ports with the most frequent training value
# (value_counts sorts by descending frequency, so index[0] is the mode).
em_mode = df_train['embarked'].value_counts().index[0]
df_train['embarked'] = df_train['embarked'].fillna(em_mode)
df_test['embarked'] = df_test['embarked'].fillna(em_mode)

# Stack train on top of test so the categorical features are engineered and
# encoded consistently for both.  DataFrame.append was removed in pandas 2.0;
# pd.concat is the equivalent and keeps each frame's original row labels.
whole_df = pd.concat([df_train, df_test])
# Number of leading rows of whole_df that belong to the training split.
train_idx_num = len(df_train)

# Notebook cell: show how often each cabin label occurs.
whole_df['cabin'].value_counts()
C23 C25 C27 6
G6 5
B57 B59 B63 B66 5
D 4
F2 4
..
A20 1
C128 1
D6 1
C49 1
A10 1
Name: cabin, Length: 186, dtype: int64
# Notebook cell: count missing (True) versus present (False) cabin entries.
whole_df['cabin'].isnull().value_counts()
True 1014
False 295
Name: cabin, dtype: int64
# Label passengers that have no recorded cabin with the placeholder 'X'
# (only the 'cabin' column is filled), then recount the cabin values.
whole_df = whole_df.fillna({'cabin': 'X'})
whole_df['cabin'].value_counts()
X 1014
C23 C25 C27 6
G6 5
B57 B59 B63 B66 5
F2 4
...
A9 1
E52 1
C95 1
C99 1
A10 1
Name: cabin, Length: 187, dtype: int64
# Notebook cell: list every distinct cabin label.
whole_df['cabin'].unique()
array(['X', 'E36', 'C68', 'E24', 'C22 C26', 'D38', 'B50', 'A24', 'C111',
'F', 'C6', 'C87', 'E8', 'B45', 'C93', 'D28', 'D36', 'C125', 'B35',
'T', 'B73', 'B57 B59 B63 B66', 'A26', 'A18', 'B96 B98', 'G6',
'C78', 'C101', 'D9', 'D33', 'C128', 'E50', 'B26', 'B69', 'E121',
'C123', 'B94', 'A34', 'D', 'C39', 'D43', 'E31', 'B5', 'D17', 'F33',
'E44', 'D7', 'A21', 'D34', 'A29', 'D35', 'A11', 'B51 B53 B55',
'D46', 'E60', 'C30', 'D26', 'E68', 'A9', 'B71', 'D37', 'F2',
'C55 C57', 'C89', 'C124', 'C23 C25 C27', 'C126', 'E49', 'F E46',
'E46', 'D19', 'B58 B60', 'C82', 'B52 B54 B56', 'C92', 'E45',
'F G73', 'C65', 'E25', 'B3', 'D40', 'C91', 'B102', 'B61', 'F G63',
'A20', 'B36', 'C7', 'B77', 'D20', 'C148', 'C105', 'E38', 'B86',
'C132', 'C86', 'A14', 'C54', 'A5', 'B49', 'B28', 'B24', 'C2', 'F4',
'A6', 'C83', 'B42', 'A36', 'C52', 'D56', 'C116', 'B19', 'E77',
'F E57', 'E101', 'B18', 'C95', 'D15', 'E33', 'B30', 'D21', 'E10',
'C130', 'D6', 'C51', 'D30', 'E67', 'C110', 'C103', 'C90', 'C118',
'C97', 'D47', 'E34', 'B4', 'D50', 'C62 C64', 'E17', 'B41', 'C49',
'C85', 'B20', 'C28', 'E63', 'C99', 'D49', 'A10', 'A16', 'B37',
'C80', 'B78', 'E12', 'C104', 'A31', 'D11', 'D48', 'D10 D12', 'B38',
'D45', 'C50', 'C31', 'B82 B84', 'A32', 'C53', 'B10', 'C70', 'A23',
'C106', 'C46', 'E58', 'B11', 'F E69', 'B80', 'E39 E41', 'D22',
'E40', 'A19', 'C32', 'B79', 'C45', 'B22', 'B39', 'C47', 'B101',
'A7', 'E52', 'F38'], dtype=object)
# Collapse every cabin label to its first character.  `.values` strips the
# index so the result is assigned by position, like a plain list would be.
whole_df['cabin'] = whole_df['cabin'].str[0].values
# Notebook cell: tally the resulting single-character categories.
whole_df['cabin'].value_counts()
X 1014
C 94
B 65
D 46
E 41
A 22
F 21
G 5
T 1
Name: cabin, dtype: int64
# Fold the 'G' and 'T' categories into the 'X' bucket in a single replace,
# then recount.
whole_df['cabin'] = whole_df['cabin'].replace(['G', 'T'], 'X')
whole_df['cabin'].value_counts()
X 1020
C 94
B 65
D 46
E 41
A 22
F 21
Name: cabin, dtype: int64
# Bar chart of passenger counts per cabin category, split by 'survived'.
sns.countplot(data=whole_df, hue='survived', x='cabin')
# Notebook cell: display the raw passenger names.
whole_df['name']
0 Mellinger, Miss. Madeleine Violet
1 Wells, Miss. Joan
2 Duran y More, Miss. Florentina
3 Scanlan, Mr. James
4 Bradley, Miss. Bridget Delia
...
388 Karlsson, Mr. Julius Konrad Eugen
389 Ware, Mrs. John James (Florence Louise Long)
390 O'Keefe, Mr. Patrick
391 Tobin, Mr. Roger
392 Daniels, Miss. Sarah
Name: name, Length: 1309, dtype: object
# import re
# re.compile(',')
# Extract the honorific from each name: take the second ", "-separated piece
# and keep the text before its first ".".  Then collect the distinct
# honorifics as a list.
name_titles = whole_df['name'].str.split(", ").str[1].str.split(".").str[0]
n_grade = name_titles.unique().tolist()
n_grade
# nana = [ na[na.find(',')+2 : na.find('.')] for na in whole_df['name'].values]
# nana
['Miss',
'Mr',
'Master',
'Mrs',
'Dr',
'Mlle',
'Col',
'Rev',
'Ms',
'Mme',
'Sir',
'the Countess',
'Dona',
'Jonkheer',
'Lady',
'Major',
'Don',
'Capt']
# Social-status grade assigned to each honorific found in passenger names.
grade_dict = dict(
    A=['Rev', 'Col', 'Major', 'Dr', 'Capt', 'Sir'],  # honorary titles
    B=['Ms', 'Mme', 'Mrs', 'Dona'],  # women
    C=['Jonkheer', 'the Countess'],  # nobility
    D=['Mr', 'Don'],  # men
    E=['Master'],  # young men
    F=['Miss', 'Mlle', 'Lady'],  # young women
)
print(grade_dict.values())
print(grade_dict['A'])
dict_values([['Rev', 'Col', 'Major', 'Dr', 'Capt', 'Sir'], ['Ms', 'Mme', 'Mrs', 'Dona'], ['Jonkheer', 'the Countess'], ['Mr', 'Don'], ['Master'], ['Miss', 'Mlle', 'Lady']])
['Rev', 'Col', 'Major', 'Dr', 'Capt', 'Sir']
def give_grade(x):
    """Map a passenger name to its social-status grade letter.

    The honorific is taken from the second ", "-separated piece of the name,
    up to its first "." (for example ``"Wells, Miss. Joan"`` yields
    ``"Miss"``), and is looked up in the module-level ``grade_dict``.

    Args:
        x: Full passenger name string.

    Returns:
        The ``grade_dict`` key whose list contains the honorific, or ``'G'``
        when the honorific is not listed or the name has no ", " separator.
    """
    parts = x.split(", ")
    # A name without the ", " separator has no honorific to extract; treat it
    # like an unlisted title instead of raising IndexError.
    if len(parts) < 2:
        return 'G'
    g = parts[1].split(".")[0]
    for k, v in grade_dict.items():
        if g in v:
            return k
    return 'G'
# Replace every full name with the grade letter returned by give_grade,
# then tally how many passengers fall into each grade.
whole_df['name'] = whole_df['name'].map(give_grade)
whole_df['name'].value_counts()
D 758
F 263
B 201
E 61
A 24
C 2
Name: name, dtype: int64
# Bar chart of passenger counts per name grade, split by 'survived'.
sns.countplot(data=whole_df, x='name', hue='survived')
# Encoding: turn the categorical columns of whole_df into indicator columns,
# then summarize the encoded frame.
whole_df_encoded = pd.get_dummies(data=whole_df)
whole_df_encoded.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 392
Data columns (total 24 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 pclass 1309 non-null int64
1 survived 1309 non-null int64
2 age 1309 non-null float64
3 sibsp 1309 non-null int64
4 parch 1309 non-null int64
5 fare 1309 non-null float64
6 name_A 1309 non-null uint8
7 name_B 1309 non-null uint8
8 name_C 1309 non-null uint8
9 name_D 1309 non-null uint8
10 name_E 1309 non-null uint8
11 name_F 1309 non-null uint8
12 sex_female 1309 non-null uint8
13 sex_male 1309 non-null uint8
14 cabin_A 1309 non-null uint8
15 cabin_B 1309 non-null uint8
16 cabin_C 1309 non-null uint8
17 cabin_D 1309 non-null uint8
18 cabin_E 1309 non-null uint8
19 cabin_F 1309 non-null uint8
20 cabin_X 1309 non-null uint8
21 embarked_C 1309 non-null uint8
22 embarked_Q 1309 non-null uint8
23 embarked_S 1309 non-null uint8
dtypes: float64(2), int64(4), uint8(18)
memory usage: 134.6 KB
# Split the encoded frame back into its train and test parts.  whole_df was
# built by stacking df_train on top of df_test, so the first train_idx_num
# rows are exactly the training rows and the rest are the test rows.
# Slicing at train_idx_num + 1 would move the first test row into the
# training data.
train_part = whole_df_encoded.iloc[:train_idx_num]
test_part = whole_df_encoded.iloc[train_idx_num:]

# Independent variables: every encoded column except the target.  The
# features must come from whole_df_encoded; the raw df_train / df_test frames
# still contain string columns that the classifier cannot consume.
x_train = train_part.loc[:, train_part.columns != 'survived'].values
x_test = test_part.loc[:, test_part.columns != 'survived'].values

# Dependent variable: the 'survived' column of each part.
y_train = train_part['survived']
y_test = test_part['survived']

x_train.shape
# expected: (916, 23) -- 916 training rows, 24 encoded columns minus the target
# Alternative way of grading the names, tried on a copy of df_train so the
# original frame is left untouched.
ttt = df_train.copy()
# Honorific: second ", "-separated piece of the name, up to its first ".".
ttt_titles = ttt['name'].apply(lambda x: x.split(', ')[1].split('.')[0])
# Reverse lookup built once from grade_dict: honorific -> grade letter.
title_to_grade = {
    title: grade
    for grade, grade_titles in grade_dict.items()
    for title in grade_titles
}
# Grade the whole column in one pass instead of running a full-column
# in-place replace for every row; honorifics that are not listed in
# grade_dict map to NaN and fall back to 'G'.
ttt['name'] = ttt_titles.map(title_to_grade).fillna('G')
# Notebook cell: display the test-split labels.
y_test
1 1
2 0
3 0
4 0
5 1
..
388 0
389 1
390 1
391 0
392 1
Name: survived, Length: 392, dtype: int64
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the training data (fit returns the
# estimator itself), then predict labels for the test features.
lr = LogisticRegression(random_state=0).fit(x_train, y_train)
y_pred = lr.predict(x_test)
# Evaluation: confusion matrix of the true test labels against the
# predicted labels.
from sklearn.metrics import confusion_matrix
confmat = confusion_matrix(y_test, y_pred)
print(confmat)
[[208 37]
[ 42 105]]
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Print each score with its label; every metric is called as
# metric(y_test, y_pred).
# F1 = 2 * (precision * recall) / (precision + recall)
for template, metric in (
    ('정확도(accuracy) : %.2f', accuracy_score),
    ('정밀도(precision) : %.3f', precision_score),
    ('재현율(recall) : %.3f', recall_score),
    ('F1-score : %.3f', f1_score),
):
    print(template % metric(y_test, y_pred))
정확도(accuracy) : 0.80
정밀도(precision) : 0.739
재현율(recall) : 0.714
F1-score : 0.727
반응형
'Data_Science > Data_Analysis_Py' 카테고리의 다른 글
34. 강남역 고기집 후기분석 || 맵크로울링 (0) | 2021.11.25 |
---|---|
33. white wine || GBM (0) | 2021.11.24 |
31. titanic || logistic (0) | 2021.11.24 |
30. 보스턴 주택가격정보 || 선형회귀 (0) | 2021.11.24 |
29. 비트코인 시계열 분석 || prophet (0) | 2021.11.24 |