
Classification (categorical target)

  • Explanatory variables => target variable
  • When the target variable is categorical, the model assigns each observation to one of the categories
  • Examples: disease diagnosis, spam mail filtering
  • KNN (k-nearest neighbors); a minimal sketch of the idea follows below
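KNN classifies a new point by majority vote among the k training points closest to it. A minimal from-scratch sketch of that idea with made-up 2-D data (the names and values here are purely illustrative):

import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x_new, k=3):
    # distance from the new point to every training point
    dists = np.linalg.norm(X_train - x_new, axis=1)
    # labels of the k closest training points, then majority vote
    nearest_labels = y_train[np.argsort(dists)[:k]]
    return Counter(nearest_labels).most_common(1)[0][0]

# toy data: two points of class 0, two points of class 1
X_train = np.array([[1, 1], [1, 2], [5, 5], [6, 5]])
y_train = np.array([0, 0, 1, 1])
print(knn_predict(X_train, y_train, np.array([5, 6])))  # -> 1
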
import pandas as pd
import seaborn as sns
df = sns.load_dataset('titanic')          # Titanic passenger dataset
pd.set_option('display.max_columns', 15)  # show all 15 columns when printing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB

 

rdf = df.drop(['deck', 'embark_town'], axis = 1)  # drop 'deck' (mostly missing) and 'embark_town' (duplicates 'embarked')
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    891 non-null    int64   
 1   pclass      891 non-null    int64   
 2   sex         891 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       891 non-null    int64   
 5   parch       891 non-null    int64   
 6   fare        891 non-null    float64 
 7   embarked    889 non-null    object  
 8   class       891 non-null    category
 9   who         891 non-null    object  
 10  adult_male  891 non-null    bool    
 11  alive       891 non-null    object  
 12  alone       891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 72.4+ KB

 

rdf = rdf.dropna(subset=['age'], axis = 0)  # drop rows with missing age
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    714 non-null    int64   
 1   pclass      714 non-null    int64   
 2   sex         714 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       714 non-null    int64   
 5   parch       714 non-null    int64   
 6   fare        714 non-null    float64 
 7   embarked    712 non-null    object  
 8   class       714 non-null    category
 9   who         714 non-null    object  
 10  adult_male  714 non-null    bool    
 11  alive       714 non-null    object  
 12  alone       714 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 63.6+ KB

 

# fill the 2 missing 'embarked' values with the most frequent port
most_freq = rdf['embarked'].value_counts(dropna=True).idxmax()
# rdf.groupby('embarked')['embarked'].count().idxmax()  # equivalent alternative
rdf['embarked'].fillna(most_freq, inplace = True)
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    714 non-null    int64   
 1   pclass      714 non-null    int64   
 2   sex         714 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       714 non-null    int64   
 5   parch       714 non-null    int64   
 6   fare        714 non-null    float64 
 7   embarked    714 non-null    object  
 8   class       714 non-null    category
 9   who         714 non-null    object  
 10  adult_male  714 non-null    bool    
 11  alive       714 non-null    object  
 12  alone       714 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 63.6+ KB

 

ndf = rdf[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]  # keep only the variables used in the analysis
ndf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  714 non-null    int64  
 1   pclass    714 non-null    int64  
 2   sex       714 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     714 non-null    int64  
 5   parch     714 non-null    int64  
 6   embarked  714 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 44.6+ KB

 

ndf.describe()

         survived      pclass         age       sibsp       parch
count  714.000000  714.000000  714.000000  714.000000  714.000000
mean     0.406162    2.236695   29.699118    0.512605    0.431373
std      0.491460    0.838250   14.526497    0.929783    0.853289
min      0.000000    1.000000    0.420000    0.000000    0.000000
25%      0.000000    1.000000   20.125000    0.000000    0.000000
50%      0.000000    2.000000   28.000000    0.000000    0.000000
75%      1.000000    3.000000   38.000000    1.000000    1.000000
max      1.000000    3.000000   80.000000    5.000000    6.000000

# One-hot encoding: convert categorical data into numeric dummy variables the model can work with

 

oh_set = pd.get_dummies(ndf['sex'])
oh_set.head()

   female  male
0       0     1
1       1     0
2       1     0
3       1     0
4       0     1

 

ndf = pd.concat([ndf, oh_set], axis = 1)
ndf.head()

   survived  pclass     sex   age  sibsp  parch embarked  female  male
0         0       3    male  22.0      1      0        S       0     1
1         1       1  female  38.0      1      0        C       1     0
2         1       3  female  26.0      0      0        S       1     0
3         1       1  female  35.0      1      0        S       1     0
4         0       3    male  35.0      0      0        S       0     1

 

oh_embarked = pd.get_dummies(ndf['embarked'], prefix = 'town')  # one-hot encode 'embarked' with a 'town_' prefix
ndf = pd.concat([ndf, oh_embarked], axis = 1)
ndf

     survived  pclass     sex   age  sibsp  parch embarked  female  male  town_C  town_Q  town_S
0           0       3    male  22.0      1      0        S       0     1       0       0       1
1           1       1  female  38.0      1      0        C       1     0       1       0       0
2           1       3  female  26.0      0      0        S       1     0       0       0       1
3           1       1  female  35.0      1      0        S       1     0       0       0       1
4           0       3    male  35.0      0      0        S       0     1       0       0       1
..        ...     ...     ...   ...    ...    ...      ...     ...   ...     ...     ...     ...
885         0       3  female  39.0      0      5        Q       1     0       0       1       0
886         0       2    male  27.0      0      0        S       0     1       0       0       1
887         1       1  female  19.0      0      0        S       1     0       0       0       1
889         1       1    male  26.0      0      0        C       0     1       1       0       0
890         0       3    male  32.0      0      0        Q       0     1       0       1       0

714 rows × 12 columns

 

# explanatory variables (x) and target variable (y)
x = ndf[['pclass','age','sibsp','parch','female','male','town_C','town_Q','town_S']]
y = ndf['survived']
x.head()

   pclass   age  sibsp  parch  female  male  town_C  town_Q  town_S
0       3  22.0      1      0       0     1       0       0       1
1       1  38.0      1      0       1     0       1       0       0
2       3  26.0      0      0       1     0       0       0       1
3       1  35.0      1      0       1     0       0       0       1
4       3  35.0      0      0       0     1       0       0       1

# Normalize the explanatory variables
# The scale of each variable's values affects the analysis results
# Age has a much wider range than the other variables, so standardization puts all explanatory variables on a common scale

from sklearn import preprocessing
import numpy as np
# note: the scaled array is only displayed here; it is not assigned back to x,
# so the train/test split and models below use the original (unscaled) values
preprocessing.StandardScaler().fit(x).transform(x)


array([[ 0.91123237, -0.53037664,  0.52457013, ..., -0.47180795,
        -0.20203051,  0.53307848],
       [-1.47636364,  0.57183099,  0.52457013, ...,  2.11950647,
        -0.20203051, -1.87589641],
       [ 0.91123237, -0.25482473, -0.55170307, ..., -0.47180795,
        -0.20203051,  0.53307848],
       ...,
       [-1.47636364, -0.73704057, -0.55170307, ..., -0.47180795,
        -0.20203051,  0.53307848],
       [-1.47636364, -0.25482473, -0.55170307, ...,  2.11950647,
        -0.20203051, -1.87589641],
       [ 0.91123237,  0.15850313, -0.55170307, ..., -0.47180795,
         4.94974747, -1.87589641]])
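The call above returns the standardized array without assigning it back to x. A minimal sketch of the assignment form, assuming the scaled features are meant to feed the models (x_scaled is a name introduced here; passing it to train_test_split in place of x would change the results shown below):

from sklearn import preprocessing
import pandas as pd

# fit_transform() = fit() + transform(); keep the column names by wrapping in a DataFrame
x_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(x),
                        columns=x.columns, index=x.index)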

 

from sklearn.model_selection import train_test_split
# 70% training / 30% test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=10)
x_train.shape

# (499, 9)

 

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)   # k = 5 nearest neighbors
knn.fit(x_train, y_train)                   # train on the training set
y_hat = knn.predict(x_test)                 # predict on the test set
print(y_hat[0:10])
print(y_test[0:10])

[0 0 1 0 0 1 0 1 0 0]
728    0
555    0
426    1
278    0
617    0
751    1
576    1
679    1
567    0
117    0
Name: survived, dtype: int64

 

# Performance evaluation
from sklearn import metrics
knn_matrix = metrics.confusion_matrix(y_test, y_hat)
print(knn_matrix)


# [[111  14]
#  [ 29  61]]

Precision

  • Of the cases predicted True, the proportion that are actually True (TP / (TP + FP))
  • High precision means few false positives (predicted True but actually False)

Recall

  • Of the cases that are actually True, the proportion predicted True (TP / (TP + FN))
  • High recall means few false negative (FN) errors

F1 score

  • The harmonic mean of precision and recall
  • A single metric for evaluating the model's predictive power; a check against the confusion matrix above follows below
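To make these definitions concrete, the positive-class (survived = 1) metrics can be computed directly from the confusion matrix printed above; a small check using those values:

# confusion_matrix layout for labels [0, 1]: [[TN, FP], [FN, TP]]
tn, fp, fn, tp = 111, 14, 29, 61

precision = tp / (tp + fp)                                  # 61 / 75 ≈ 0.81
recall    = tp / (tp + fn)                                  # 61 / 90 ≈ 0.68
f1        = 2 * precision * recall / (precision + recall)   # ≈ 0.74

print(precision, recall, f1)   # matches the class-1 row of the report below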
knn_report = metrics.classification_report(y_test, y_hat)
print(knn_report)

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       125
           1       0.81      0.68      0.74        90

    accuracy                           0.80       215
   macro avg       0.80      0.78      0.79       215
weighted avg       0.80      0.80      0.80       215

accuracy: overall accuracy; macro avg: simple (unweighted) average across the classes
weighted avg: average weighted by each class's support (number of samples); a quick check follows below
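Using the f1-scores and supports from the report above, macro avg is the plain mean of the per-class scores, while weighted avg weights each class by its number of test samples:

f1_0, f1_1 = 0.84, 0.74        # per-class f1-scores from the report
n0, n1 = 125, 90               # support (test samples per class)

macro_f1    = (f1_0 + f1_1) / 2                    # ≈ 0.79
weighted_f1 = (f1_0 * n0 + f1_1 * n1) / (n0 + n1)  # ≈ 0.80
print(macro_f1, weighted_f1)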

 

# SVM (support vector machine)
from sklearn import svm
# kernel='rbf' is used here
# kernel: a function that maps the data into a (higher-dimensional) vector space
# rbf = radial basis function
# other options: linear
# polynomial
# sigmoid
svm_model = svm.SVC(kernel='rbf')
svm_model.fit(x_train, y_train)
y_hat = svm_model.predict(x_test)
print(y_hat[0:10])

# [0 0 0 0 0 1 0 0 0 0]

 

from sklearn import metrics
svm_matrix = metrics.confusion_matrix(y_test, y_hat)
print(svm_matrix)

# [[118   7]
#  [ 79  11]]

 

svm_report = metrics.classification_report(y_test, y_hat)
print(svm_report)

              precision    recall  f1-score   support

           0       0.60      0.94      0.73       125
           1       0.61      0.12      0.20        90

    accuracy                           0.60       215
   macro avg       0.61      0.53      0.47       215
weighted avg       0.60      0.60      0.51       215
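RBF (and other kernel) SVMs are sensitive to feature scale, and the standardized features computed earlier were never passed to the model, which is one likely reason class 1 recall is so low here. A minimal sketch that standardizes the features and compares the kernels listed in the comments above (no results are claimed for these runs):

from sklearn import preprocessing, svm, metrics
from sklearn.model_selection import train_test_split

# standardize, then split with the same settings as before
x_std = preprocessing.StandardScaler().fit_transform(x)
x_tr, x_te, y_tr, y_te = train_test_split(x_std, y, test_size=0.3, random_state=10)

# try each kernel mentioned above
for kernel in ['rbf', 'linear', 'poly', 'sigmoid']:
    model = svm.SVC(kernel=kernel)
    model.fit(x_tr, y_tr)
    print(kernel, round(metrics.accuracy_score(y_te, model.predict(x_te)), 3))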

 

 
