
# Decision Tree
# node 분기점 : 분석되는 설명변수


from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd
import numpy as np

# Location of the raw breast-cancer-wisconsin data file on the UCI repository.
uci_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

# header=None: no row of the file is used as a header, so the columns get
# integer labels until they are named explicitly.
df = pd.read_csv(filepath_or_buffer=uci_path, header=None)

# 0	1	2	3	4	5	6	7	8	9	10
# 0	1000025	5	1	1	1	2	1	3	1	1	2
# 1	1002945	5	4	4	5	7	10	3	2	1	2
# 2	1015425	3	1	1	1	2	2	3	1	1	2
# 3	1016277	6	8	8	1	3	4	3	7	1	2
# 4	1017023	4	1	1	3	2	1	3	1	1	2

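# id : 샘플 id 번호
# clump : 덩어리 두께 (clump thickness)
# cell_size : 세포 크기의 균일성 (uniformity of cell size)
# cell_shape : 세포 모양의 균일성 (uniformity of cell shape)
# adhesion : 한계 부착력 (marginal adhesion)
# epithlial : 단일 상피세포 크기 (single epithelial cell size)
# bare_nuclei : 노출된 핵 (bare nuclei)
# chromatin : 염색질 (bland chromatin)
# normal_nucleoli : 정상 핵소체 (normal nucleoli)
# mitoses : 유사분열 (mitoses)
# class : 진단 결과 (2 = 양성(benign), 4 = 악성(malignant))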


# Give the eleven unnamed columns descriptive labels
# (sample id, nine cytology features, diagnosis class).
df.columns = [
    'id',
    'clump',
    'cell_size',
    'cell_shape',
    'adhesion',
    'epithlial',
    'bare_nuclei',
    'chromatin',
    'normal_nucleoli',
    'mitoses',
    'class',
]

# 	id	clump	cell_size	cell_shape	adhesion	epithlial	bare_nuclei	chromatin	normal_nucleoli	mitoses	class
# 0	1000025	5	1	1	1	2	1	3	1	1	2
# 1	1002945	5	4	4	5	7	10	3	2	1	2
# 2	1015425	3	1	1	1	2	2	3	1	1	2
# 3	1016277	6	8	8	1	3	4	3	7	1	2
# 4	1017023	4	1	1	3	2	1	3	1	1	2


# 2    458
# 4    241
# Name: class, dtype: int64



# array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'], dtype=object)



# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 699 entries, 0 to 698
# Data columns (total 11 columns):
#  #   Column           Non-Null Count  Dtype
# ---  ------           --------------  -----
#  0   id               699 non-null    int64
#  1   clump            699 non-null    int64
#  2   cell_size        699 non-null    int64
#  3   cell_shape       699 non-null    int64
#  4   adhesion         699 non-null    int64
#  5   epithlial        699 non-null    int64
#  6   bare_nuclei      699 non-null    object
#  7   chromatin        699 non-null    int64
#  8   normal_nucleoli  699 non-null    int64
#  9   mitoses          699 non-null    int64
#  10  class            699 non-null    int64
# dtypes: int64(10), object(1)
# memory usage: 60.2+ KB


# The raw file marks a missing 'bare_nuclei' value with '?'. Turn that marker
# into NaN with a single label-based assignment on the frame itself; a chained,
# in-place replace on the selected column is deprecated in pandas and is not
# guaranteed to write back to df.
df.loc[df['bare_nuclei'] == '?', 'bare_nuclei'] = np.nan

# Drop the rows that have no 'bare_nuclei' value, then store the column as integers.
df.dropna(subset=['bare_nuclei'], axis=0, inplace=True)
df['bare_nuclei'] = df['bare_nuclei'].astype(int)
# The resulting integer width (8-byte int64 or 4-byte int32, depending on the
# platform) does not matter here because the values only range from 1 to 10.

# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 683 entries, 0 to 698
# Data columns (total 11 columns):
#  #   Column           Non-Null Count  Dtype
# ---  ------           --------------  -----
#  0   id               683 non-null    int64
#  1   clump            683 non-null    int64
#  2   cell_size        683 non-null    int64
#  3   cell_shape       683 non-null    int64
#  4   adhesion         683 non-null    int64
#  5   epithlial        683 non-null    int64
#  6   bare_nuclei      683 non-null    int32
#  7   chromatin        683 non-null    int64
#  8   normal_nucleoli  683 non-null    int64
#  9   mitoses          683 non-null    int64
#  10  class            683 non-null    int64
# dtypes: int32(1), int64(10)
# memory usage: 61.4 KB


# Position of the last column, which holds the target ('class').
_target_pos = df.shape[1] - 1

# Explanatory variables: every column between the first one ('id') and the target.
x = df.iloc[:, 1:_target_pos]
# Target variable: the last column.
y = df.iloc[:, _target_pos]

# 0      2
# 1      2
# 2      2
# 3      2
# 4      2
# 694    2
# 695    2
# 696    4
# 697    4
# 698    4
# Name: class, Length: 683, dtype: int64


# Standardize the features (zero mean, unit variance per column).
# Split first and fit the scaler on the training rows only, so that no
# statistics of the held-out test rows leak into the preprocessing step.
# The split itself is unchanged: same test_size and random_state.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=10)
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# Keep `x` as the scaled feature matrix, as before, now using the training statistics.
x = scaler.transform(x)

# [[ 1.97177486  0.6037398   0.59763519 ...  1.4522248   2.00965299
#  [ 1.26222679  2.23617957  2.2718962  ...  2.67776377  2.33747554
#  [ 0.55267873 -0.70221201 -0.74177362 ... -0.18182716 -0.61292736
#  [ 0.19790469 -0.0492361  -0.74177362 ... -0.99885314 -0.61292736
#  [-0.51164337 -0.70221201 -0.74177362 ... -0.18182716 -0.61292736
#  [ 0.90745276 -0.37572406  0.26278299 ... -0.18182716  0.04271773


# Decision tree classifier.
# - criterion='entropy': entropy is the function used to measure impurity,
#   i.e. how mixed (not yet separated) the classes are within a node.
# - max_depth=5: maximum number of levels in the tree.
# - random_state=10: scikit-learn permutes the candidate features at every
#   split, so a fixed seed is required for the fitted tree to be repeatable
#   (the train/test split is already seeded the same way).
tm = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=10)

# Learn the tree from the training rows, then predict the class of each test row.
tm.fit(x_train, y_train)
y_hat = tm.predict(x_test)

# [4 4 4 4 4 4 2 2 4 4]


# Confusion matrix of the test-set predictions
# (rows: true class, columns: predicted class).
tmetrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_hat)

# Output recorded by the author:
# [[127   4]
#  [  2  72]]


# Text summary of precision, recall, f1-score and support for each class.
tree_report = metrics.classification_report(y_true=y_test, y_pred=y_hat)

#               precision    recall  f1-score   support
#
#            2       0.98      0.97      0.98       131
#            4       0.95      0.97      0.96        74
#
#     accuracy                           0.97       205
#    macro avg       0.97      0.97      0.97       205
# weighted avg       0.97      0.97      0.97       205


# 의사결정트리 : 학습 데이터에 따라 생성되는 트리가 달라지므로 일반화하기 어렵다.
#      데이터에 따라 성능의 변동폭이 크다.
#         => 이러한 단점을 보완하기 위한 알고리즘이 랜덤 포레스트(random forest)이다.




