728x90
반응형

titanic_test.csv
0.03MB
titanic_train.csv
0.07MB

# 분류
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Titanic train/test splits (CSVs are expected next to this script).
df_train = pd.read_csv('titanic_train.csv')
df_test = pd.read_csv('titanic_test.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     916 non-null    int64  
 1   survived   916 non-null    int64  
 2   name       916 non-null    object 
 3   sex        916 non-null    object 
 4   age        741 non-null    float64
 5   sibsp      916 non-null    int64  
 6   parch      916 non-null    int64  
 7   ticket     916 non-null    object 
 8   fare       916 non-null    float64
 9   cabin      214 non-null    object 
 10  embarked   914 non-null    object 
 11  body       85 non-null     float64
 12  home.dest  527 non-null    object 
dtypes: float64(3), int64(4), object(6)
memory usage: 93.2+ KB

 

df_train['survived'].value_counts()

0    563
1    353
Name: survived, dtype: int64

 

# Bar chart of overall survival counts (0 = died, 1 = survived).
df_train['survived'].value_counts().plot.bar()

# Survival counts broken down by passenger class.
df_train[['pclass','survived']].value_counts().sort_index().plot.bar()

 

# Same pclass/survived breakdown as a grouped seaborn countplot.
ax = sns.countplot(x='pclass', hue = 'survived', data = df_train)

# Survival counts broken down by sex.
df_train[['sex','survived']].value_counts().sort_index().plot.bar()

 

# Same sex/survived breakdown as a grouped seaborn countplot.
ax = sns.countplot(x='sex', hue = 'survived', data = df_train)

 

# 분류
# 결측치 처리
# - 삭제 : 처리가 쉽지만, 중요정보 삭제
# - 변경 : 평균값 또는 중앙값, 최빈값으로 처리

 

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     916 non-null    int64  
 1   survived   916 non-null    int64  
 2   name       916 non-null    object 
 3   sex        916 non-null    object 
 4   age        741 non-null    float64
 5   sibsp      916 non-null    int64  
 6   parch      916 non-null    int64  
 7   ticket     916 non-null    object 
 8   fare       916 non-null    float64
 9   cabin      214 non-null    object 
 10  embarked   914 non-null    object 
 11  body       85 non-null     float64
 12  home.dest  527 non-null    object 
dtypes: float64(3), int64(4), object(6)
memory usage: 93.2+ KB

 

# Mean passenger age, computed over the non-null training rows only.
age_mean = df_train['age'].mean()
age_mean

30.23144399460189

 

# Impute the 175 missing ages with the TRAIN mean; the same train-derived
# value is applied to test so no test statistics leak into preprocessing.
df_train['age'] = df_train['age'].fillna(age_mean)
df_test['age'] = df_test['age'].fillna(age_mean)
# age_mean = df_train['age'].mean(skipna = False)
df_train['age']

0      13.000000
1       4.000000
2      30.000000
3      30.231444
4      22.000000
         ...    
911     0.170000
912    30.231444
913    30.231444
914    20.000000
915    32.000000
Name: age, Length: 916, dtype: float64

 

df_train['embarked'].isnull().value_counts()

False    914
True       2
Name: embarked, dtype: int64

 

# Impute the two missing 'embarked' values with the mode (most frequent port);
# value_counts() sorts descending, so index[0] is the mode.
replace_embarked = df_train['embarked'].value_counts().index[0]

df_train['embarked'] = df_train['embarked'].fillna(replace_embarked)
df_test['embarked'] = df_test['embarked'].fillna(replace_embarked)

df_train['embarked']

0      S
1      S
2      C
3      Q
4      Q
      ..
911    S
912    S
913    Q
914    S
915    Q
Name: embarked, Length: 916, dtype: object

 

# Drop high-cardinality / mostly-null columns that won't be used as features
# (cabin: 214/916 non-null, body: 85/916, home.dest: 527/916).
df_train = df_train.drop(['name','ticket','body','cabin','home.dest'], axis=1)
df_test = df_test.drop(['name','ticket','body','cabin','home.dest'], axis=1)

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393 entries, 0 to 392
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    393 non-null    int64  
 1   survived  393 non-null    int64  
 2   sex       393 non-null    object 
 3   age       393 non-null    float64
 4   sibsp     393 non-null    int64  
 5   parch     393 non-null    int64  
 6   fare      393 non-null    float64
 7   embarked  393 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 24.7+ KB

 

# Stack train on top of test so the categorical encoding below sees every
# category exactly once (indices are preserved: 0..915 then 0..392).
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and produces the same frame here.
whole_df = pd.concat([df_train, df_test])
whole_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 392
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   sex       1309 non-null   object 
 3   age       1309 non-null   float64
 4   sibsp     1309 non-null   int64  
 5   parch     1309 non-null   int64  
 6   fare      1309 non-null   float64
 7   embarked  1309 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 92.0+ KB

 

# Remember the train row count so the combined frame can be split back later.
train_num = len(df_train)

# One-hot encode the object columns (sex, embarked) -> sex_*, embarked_* dummies.
whole_df_encoded = pd.get_dummies(whole_df)
whole_df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 392
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      1309 non-null   int64  
 1   survived    1309 non-null   int64  
 2   age         1309 non-null   float64
 3   sibsp       1309 non-null   int64  
 4   parch       1309 non-null   int64  
 5   fare        1309 non-null   float64
 6   sex_female  1309 non-null   uint8  
 7   sex_male    1309 non-null   uint8  
 8   embarked_C  1309 non-null   uint8  
 9   embarked_Q  1309 non-null   uint8  
 10  embarked_S  1309 non-null   uint8  
dtypes: float64(2), int64(4), uint8(5)
memory usage: 78.0 KB

 

# Split the encoded frame back into the original train (first 916 rows)
# and test (remaining 393 rows) portions.
df_train = whole_df_encoded[:train_num]
df_test = whole_df_encoded[train_num:]
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 916 entries, 0 to 915
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      916 non-null    int64  
 1   survived    916 non-null    int64  
 2   age         916 non-null    float64
 3   sibsp       916 non-null    int64  
 4   parch       916 non-null    int64  
 5   fare        916 non-null    float64
 6   sex_female  916 non-null    uint8  
 7   sex_male    916 non-null    uint8  
 8   embarked_C  916 non-null    uint8  
 9   embarked_Q  916 non-null    uint8  
 10  embarked_S  916 non-null    uint8  
dtypes: float64(2), int64(4), uint8(5)
memory usage: 54.6 KB

 

df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 393 entries, 0 to 392
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      393 non-null    int64  
 1   survived    393 non-null    int64  
 2   age         393 non-null    float64
 3   sibsp       393 non-null    int64  
 4   parch       393 non-null    int64  
 5   fare        393 non-null    float64
 6   sex_female  393 non-null    uint8  
 7   sex_male    393 non-null    uint8  
 8   embarked_C  393 non-null    uint8  
 9   embarked_Q  393 non-null    uint8  
 10  embarked_S  393 non-null    uint8  
dtypes: float64(2), int64(4), uint8(5)
memory usage: 23.4 KB

 

# Separate label ('survived') from the feature matrix for both splits.
y_train = df_train['survived'].values
x_train = df_train.loc[:,df_train.columns != 'survived'].values
y_test = df_test['survived'].values
# BUG FIX: the original built x_test from df_train, so the model would be
# "evaluated" on training features whose row count (916) doesn't even match
# y_test (393). Build it from df_test instead.
x_test = df_test.loc[:,df_test.columns != 'survived'].values

 

# Train a logistic-regression classifier on the encoded features.
# NOTE(review): the default max_iter=100 may warn about non-convergence on
# this data — confirm, and raise max_iter if needed.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
lr.fit(x_train, y_train)

 

y_pred = lr.predict(x_test)

 

# Evaluation: confusion matrix, laid out [[TN, FP], [FN, TP]].
from sklearn.metrics import confusion_matrix
confmat = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(confmat)

 

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('정확도(accuracy) : %.2f'% accuracy_score(y_test, y_pred))
print('정밀도(precision) : %.3f'% precision_score(y_test, y_pred))
print('재현율(recall) : %.3f'% recall_score(y_test, y_pred))
print('F1-score : %.3f'% f1_score(y_test, y_pred))
# F1 = 2 * (precision * recall) / (precision + recall)

 

 

반응형
728x90
반응형

# 보스턴 주택가격정보
# http://lib.stat.cmu.edu/datasets/boston_corrected.txt

BostonHousing2.csv
0.05MB
# --- Boston housing price regression (linear model) ------------------------
# FIX: in the original transcript the code below was fused with pasted
# notebook output onto single lines, which is not runnable Python.
# Reformatted; key printed results are kept as comments.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

housing = pd.read_csv('BostonHousing2.csv')
housing.head()

# Use the corrected median home value (CMEDV) as the target column 'y'.
housing = housing.rename(columns = {'CMEDV':'y'})
housing.info()

# Pairwise scatter plots: RM correlates positively with y;
# LSTAT and NOX correlate negatively.
cols = ['y','RM','LSTAT','NOX']
sns.pairplot(housing[cols])
plt.show()

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# 1) independent variables: all numeric predictors
x = housing[['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE',
             'DIS','RAD','TAX','PTRATIO','B','LSTAT']]
# 2) dependent variable
y = housing['y']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=33)

lr = linear_model.LinearRegression()
model = lr.fit(x_train, y_train)
print(model.score(x_train, y_train))  # R^2 on train (~0.749)
print(model.score(x_test, y_test))    # R^2 on test  (~0.701)

print(lr.coef_)  # fitted coefficient per predictor column

# RMSE on the training split (~4.672)
y_pred = lr.predict(x_train)
rmse = sqrt(mean_squared_error(y_train, y_pred))
print(rmse)

# RMSE on the held-out split (~4.615)
y_pred = lr.predict(x_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

y[:10]  # first ten target values
반응형
728x90
반응형
# Bitcoin daily-price forecasting with Facebook Prophet, using a saturating
# (logistic) trend bounded above by a fixed cap.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fbprophet import Prophet
file_path = 'market-price.csv'
bitcoin_df = pd.read_csv(file_path, names = ['ds', 'y'], header=0)
# upper bound (carrying capacity) required by logistic growth
bitcoin_df['cap'] = 20000
# lower bound (disabled here)
# bitcoin_df['floor'] = 2000
# growth='logistic' switches the trend to the saturating, non-linear form
prophet = Prophet(seasonality_mode = 'multiplicative',
                  growth = 'logistic', # needed when cap/floor bounds are set
                 yearly_seasonality = True, # yearly component
                 weekly_seasonality = True, # weekly component
                 daily_seasonality = True, # daily component
                 changepoint_prior_scale = 0.5) # trend flexibility (overfit guard)
prophet.fit(bitcoin_df) # train
bitcoin_df.head()


ds	y	cap
0	2017-08-27 00:00:00	4354.308333	20000
1	2017-08-28 00:00:00	4391.673517	20000
2	2017-08-29 00:00:00	4607.985450	20000
3	2017-08-30 00:00:00	4594.987850	20000
4	2017-08-31 00:00:00	4748.255000	20000

 

# Same logistic-capped Prophet setup as the previous cell, re-created here.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fbprophet import Prophet
file_path = 'market-price.csv'
bitcoin_df = pd.read_csv(file_path, names = ['ds', 'y'], header=0)
# upper bound (carrying capacity) required by logistic growth
bitcoin_df['cap'] = 20000
# lower bound (disabled here)
# bitcoin_df['floor'] = 2000
prophet = Prophet(seasonality_mode = 'multiplicative',
                  growth = 'logistic', # needed when cap/floor bounds are set
                 yearly_seasonality = True, # yearly component
                 weekly_seasonality = True, # weekly component
                 daily_seasonality = True, # daily component
                 changepoint_prior_scale = 0.5) # trend flexibility (overfit guard)
# FIX: the transcript had Prophet's default-argument signature pasted here as
# a bare parenthesised block — a SyntaxError. Kept below as a reference
# comment listing Prophet's constructor defaults:
#   growth='linear', changepoints=None, n_changepoints=25,
#   changepoint_range=0.8, yearly_seasonality='auto',
#   weekly_seasonality='auto', daily_seasonality='auto', holidays=None,
#   seasonality_mode='additive', seasonality_prior_scale=10.0,
#   holidays_prior_scale=10.0, changepoint_prior_scale=0.05, mcmc_samples=0,
#   interval_width=0.8, uncertainty_samples=1000, stan_backend=None

 

# Forecast 5 days past the end of the training data.
future_data = prophet.make_future_dataframe(periods=5, freq='d')
# future rows also need the logistic cap column
future_data['cap'] = 20000
# future_data['floor'] = 2000
# predict
forecast_data = prophet.predict(future_data)
forecast_data

	ds	trend	cap	yhat_lower	yhat_upper	trend_lower	trend_upper	daily	daily_lower	daily_upper	...	weekly	weekly_lower	weekly_upper	yearly	yearly_lower	yearly_upper	additive_terms	additive_terms_lower	additive_terms_upper	yhat
0	2017-08-27	5621.085431	20000	4008.821488	5757.019304	5621.085431	5621.085431	0.311474	0.311474	0.311474	...	0.002289	0.002289	0.002289	-0.440095	-0.440095	-0.440095	0.0	0.0	0.0	4910.962354
1	2017-08-28	5626.023045	20000	3955.585468	5723.896533	5626.023045	5626.023045	0.311474	0.311474	0.311474	...	-0.000562	-0.000562	-0.000562	-0.449330	-0.449330	-0.449330	0.0	0.0	0.0	4847.280361
2	2017-08-29	5630.963297	20000	3911.998721	5680.326522	5630.963297	5630.963297	0.311474	0.311474	0.311474	...	-0.000493	-0.000493	-0.000493	-0.459435	-0.459435	-0.459435	0.0	0.0	0.0	4795.023355
3	2017-08-30	5635.906186	20000	3875.520852	5590.533721	5635.906186	5635.906186	0.311474	0.311474	0.311474	...	-0.007356	-0.007356	-0.007356	-0.470318	-0.470318	-0.470318	0.0	0.0	0.0	4699.215406
4	2017-08-31	5640.851711	20000	3810.130956	5423.302109	5640.851711	5640.851711	0.311474	0.311474	0.311474	...	-0.005606	-0.005606	-0.005606	-0.481860	-0.481860	-0.481860	0.0	0.0	0.0	4648.105600
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
365	2018-08-27	7108.210766	20000	5313.384266	7048.183238	7108.210766	7108.210766	0.311474	0.311474	0.311474	...	-0.000562	-0.000562	-0.000562	-0.437930	-0.437930	-0.437930	0.0	0.0	0.0	6205.334670
366	2018-08-28	7111.966217	20000	5257.614559	7014.891617	7111.966217	7111.966217	0.311474	0.311474	0.311474	...	-0.000493	-0.000493	-0.000493	-0.446936	-0.446936	-0.446936	0.0	0.0	0.0	6145.052289
367	2018-08-29	7115.722558	20000	5183.980838	6862.843499	7115.722558	7115.722558	0.311474	0.311474	0.311474	...	-0.007356	-0.007356	-0.007356	-0.456831	-0.456831	-0.456831	0.0	0.0	0.0	6029.053882
368	2018-08-30	7119.479786	20000	5142.817816	6755.514784	7119.464525	7119.496318	0.311474	0.311474	0.311474	...	-0.005606	-0.005606	-0.005606	-0.467530	-0.467530	-0.467530	0.0	0.0	0.0	5968.527240
369	2018-08-31	7123.237902	20000	5101.560913	6845.461477	7123.183774	7123.294421	0.311474	0.311474	0.311474	...	0.000310	0.000310	0.000310	-0.478920	-0.478920	-0.478920	0.0	0.0	0.0	5932.681373
370 rows × 23 columns

 

# Plot the forecast.
fig = prophet.plot(forecast_data)

 

# Compare against the actual data.
# Point forecasts for the 5 future days (last 5 forecast rows).
pred_y = forecast_data.yhat.values[-5:]
pred_y 

# array([6205.33466998, 6145.05228906, 6029.05388165, 5968.52723998,
#        5932.68137291])

 

# Actual prices for the test period.
test_file_path = 'market-price-test.csv'
bitcoin_test_df = pd.read_csv(test_file_path, names = ['ds', 'y'], header=0)
test_y = bitcoin_test_df.y.values

 

# Lower bound of the prediction interval.
pred_y_lower = forecast_data.yhat_lower.values[-5:]

# Upper bound of the prediction interval.
pred_y_upper = forecast_data.yhat_upper.values[-5:]

 

plt.plot(pred_y, color = 'gold') # model's point forecast
plt.plot(pred_y_lower, color = 'red') # forecast lower bound
plt.plot(pred_y_upper, color = 'blue') # forecast upper bound
plt.plot(test_y, color = 'green') # actual prices

# Outlier removal: blank out prices above 18000 (turns 3 rows into NaN;
# NOTE(review): Prophet is expected to skip NaN targets when fitting — confirm).
bitcoin_df = pd.read_csv(file_path, names = ['ds', 'y'], header=0)
bitcoin_df.loc[bitcoin_df['y'] > 18000, 'y'] = None
bitcoin_df.info()
# 3 rows removed (y: 365 -> 362 non-null)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ds      365 non-null    object 
 1   y       362 non-null    float64
dtypes: float64(1), object(1)
memory usage: 5.8+ KB

 

# Refit Prophet on the outlier-cleaned series (default linear growth, no cap).
prophet = Prophet(seasonality_mode = 'multiplicative',
                 yearly_seasonality = True, # yearly component
                 weekly_seasonality = True, # weekly component
                 daily_seasonality = True, # daily component
                 changepoint_prior_scale = 0.5) # trend flexibility (overfit guard)
prophet.fit(bitcoin_df) # train

 

# Forecast 5 days ahead (no cap needed without logistic growth).
future_data = prophet.make_future_dataframe(periods=5, freq='d')
# # cap disabled for the linear-growth model
# future_data['cap'] = 20000
# predict
forecast_data = prophet.predict(future_data)
forecast_data

	ds	trend	yhat_lower	yhat_upper	trend_lower	trend_upper	daily	daily_lower	daily_upper	multiplicative_terms	...	weekly	weekly_lower	weekly_upper	yearly	yearly_lower	yearly_upper	additive_terms	additive_terms_lower	additive_terms_upper	yhat
0	2017-08-27	528.085585	3766.698129	5075.496722	528.085585	528.085585	9.711762	9.711762	9.711762	7.371717	...	-0.109233	-0.109233	-0.109233	-2.230812	-2.230812	-2.230812	0.0	0.0	0.0	4420.983246
1	2017-08-28	529.776373	3961.128742	5121.531273	529.776373	529.776373	9.711762	9.711762	9.711762	7.479669	...	-0.054572	-0.054572	-0.054572	-2.177521	-2.177521	-2.177521	0.0	0.0	0.0	4492.328178
2	2017-08-29	531.467162	3978.374754	5172.827221	531.467162	531.467162	9.711762	9.711762	9.711762	7.642893	...	0.067545	0.067545	0.067545	-2.136414	-2.136414	-2.136414	0.0	0.0	0.0	4593.413651
3	2017-08-30	533.157950	4013.934349	5196.960495	533.157950	533.157950	9.711762	9.711762	9.711762	7.611159	...	0.009555	0.009555	0.009555	-2.110158	-2.110158	-2.110158	0.0	0.0	0.0	4591.108025
4	2017-08-31	534.848738	4008.172911	5178.939144	534.848738	534.848738	9.711762	9.711762	9.711762	7.647029	...	0.036189	0.036189	0.036189	-2.100922	-2.100922	-2.100922	0.0	0.0	0.0	4624.852670
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
365	2018-08-27	818.078684	6243.626452	7506.907764	818.078684	818.078684	9.711762	9.711762	9.711762	7.411506	...	-0.054572	-0.054572	-0.054572	-2.245684	-2.245684	-2.245684	0.0	0.0	0.0	6881.273946
366	2018-08-28	822.999668	6453.325229	7704.060396	822.999668	822.999668	9.711762	9.711762	9.711762	7.589491	...	0.067545	0.067545	0.067545	-2.189816	-2.189816	-2.189816	0.0	0.0	0.0	7069.148295
367	2018-08-29	827.920652	6419.908313	7727.754695	825.682414	827.920652	9.711762	9.711762	9.711762	7.575922	...	0.009555	0.009555	0.009555	-2.145395	-2.145395	-2.145395	0.0	0.0	0.0	7100.183131
368	2018-08-30	832.841636	6536.153516	7786.172133	823.752948	834.318783	9.711762	9.711762	9.711762	7.632749	...	0.036189	0.036189	0.036189	-2.115202	-2.115202	-2.115202	0.0	0.0	0.0	7189.713129
369	2018-08-31	837.762619	6550.207854	7952.370851	816.207036	850.803956	9.711762	9.711762	9.711762	7.688080	...	0.077855	0.077855	0.077855	-2.101537	-2.101537	-2.101537	0.0	0.0	0.0	7278.549033
370 rows × 22 columns

 

# Plot the forecast for the cleaned series.
fig = prophet.plot(forecast_data)

# Compare against the actual data.
# Point forecasts for the 5 future days (last 5 forecast rows).
pred_y = forecast_data.yhat.values[-5:]
pred_y 

# array([6881.2739463 , 7069.14829491, 7100.18313111, 7189.71312892,
#        7278.54903332])

 

# Actual prices for the test period.
test_file_path = 'market-price-test.csv'
bitcoin_test_df = pd.read_csv(test_file_path, names = ['ds', 'y'], header=0)
test_y = bitcoin_test_df.y.values

# Lower bound of the prediction interval.
pred_y_lower = forecast_data.yhat_lower.values[-5:]

# Upper bound of the prediction interval.
pred_y_upper = forecast_data.yhat_upper.values[-5:]

 

plt.plot(pred_y, color = 'gold') # model's point forecast
plt.plot(pred_y_lower, color = 'red') # forecast lower bound
plt.plot(pred_y_upper, color = 'blue') # forecast upper bound
plt.plot(test_y, color = 'green') # actual prices

 

# Third variant: logistic growth bounded BOTH above (cap) and below (floor).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fbprophet import Prophet
file_path = 'market-price.csv'
bitcoin_df = pd.read_csv(file_path, names = ['ds', 'y'], header=0)
# upper bound
bitcoin_df['cap'] = 20000
# lower bound
bitcoin_df['floor'] = 2000
# growth='logistic' is required when cap/floor bounds are set
prophet = Prophet(seasonality_mode = 'multiplicative',
                  growth = 'logistic', # saturating trend between floor and cap
                 yearly_seasonality = True, # yearly component
                 weekly_seasonality = True, # weekly component
                 daily_seasonality = True, # daily component
                 changepoint_prior_scale = 0.5) # trend flexibility (overfit guard)
prophet.fit(bitcoin_df) # train

 

# Forecast 5 days ahead; future rows need the same cap/floor columns.
future_data = prophet.make_future_dataframe(periods=5, freq='d')
# bounds for the logistic trend
future_data['cap'] = 20000
future_data['floor'] = 2000
# predict
forecast_data = prophet.predict(future_data)
forecast_data


ds	trend	cap	floor	yhat_lower	yhat_upper	trend_lower	trend_upper	daily	daily_lower	...	weekly	weekly_lower	weekly_upper	yearly	yearly_lower	yearly_upper	additive_terms	additive_terms_lower	additive_terms_upper	yhat
0	2017-08-27	5703.063125	20000	2000	3715.115421	5497.027367	5703.063125	5703.063125	0.426516	0.426516	...	0.003555	0.003555	0.003555	-0.626876	-0.626876	-0.626876	0.0	0.0	0.0	4580.676685
1	2017-08-28	5708.250950	20000	2000	3665.713289	5396.548303	5708.250950	5708.250950	0.426516	0.426516	...	-0.000994	-0.000994	-0.000994	-0.642812	-0.642812	-0.642812	0.0	0.0	0.0	4467.904518
2	2017-08-29	5713.444155	20000	2000	3501.918583	5273.509727	5713.444155	5713.444155	0.426516	0.426516	...	-0.000734	-0.000734	-0.000734	-0.659989	-0.659989	-0.659989	0.0	0.0	0.0	4375.317434
3	2017-08-30	5718.642741	20000	2000	3377.490794	5126.427715	5718.642741	5718.642741	0.426516	0.426516	...	-0.010622	-0.010622	-0.010622	-0.678244	-0.678244	-0.678244	0.0	0.0	0.0	4218.353837
4	2017-08-31	5723.846707	20000	2000	3284.264456	5033.597019	5723.846707	5723.846707	0.426516	0.426516	...	-0.008040	-0.008040	-0.008040	-0.697376	-0.697376	-0.697376	0.0	0.0	0.0	4127.470590
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
365	2018-08-27	7102.808324	20000	2000	4866.755803	6547.711110	7102.808324	7102.808324	0.426516	0.426516	...	-0.000994	-0.000994	-0.000994	-0.623101	-0.623101	-0.623101	0.0	0.0	0.0	5699.443947
366	2018-08-28	7106.076367	20000	2000	4743.519630	6436.151223	7106.076367	7106.076367	0.426516	0.426516	...	-0.000734	-0.000734	-0.000734	-0.638706	-0.638706	-0.638706	0.0	0.0	0.0	5593.025888
367	2018-08-29	7109.345673	20000	2000	4533.463676	6332.987745	7109.336045	7109.345673	0.426516	0.426516	...	-0.010622	-0.010622	-0.010622	-0.655587	-0.655587	-0.655587	0.0	0.0	0.0	5405.285348
368	2018-08-30	7112.616243	20000	2000	4408.976493	6155.807154	7112.547092	7112.630877	0.426516	0.426516	...	-0.008040	-0.008040	-0.008040	-0.673590	-0.673590	-0.673590	0.0	0.0	0.0	5298.091380
369	2018-08-31	7115.888076	20000	2000	4405.810985	6115.617608	7115.744311	7115.953331	0.426516	0.426516	...	0.000391	0.000391	0.000391	-0.692523	-0.692523	-0.692523	0.0	0.0	0.0	5225.796784
370 rows × 24 columns

 

# Plot the capped/floored forecast.
fig = prophet.plot(forecast_data)

 

# Compare against the actual data.
# Point forecasts for the 5 future days (last 5 forecast rows).
pred_y = forecast_data.yhat.values[-5:]
pred_y 

# array([5699.44394662, 5593.02588819, 5405.28534766, 5298.09137955,
#        5225.79678411])

 

# Actual prices for the test period.
test_file_path = 'market-price-test.csv'
bitcoin_test_df = pd.read_csv(test_file_path, names = ['ds', 'y'], header=0)
test_y = bitcoin_test_df.y.values

# Lower bound of the prediction interval.
pred_y_lower = forecast_data.yhat_lower.values[-5:]

# Upper bound of the prediction interval.
pred_y_upper = forecast_data.yhat_upper.values[-5:]

 

plt.plot(pred_y, color = 'gold') # model's point forecast
plt.plot(pred_y_lower, color = 'red') # forecast lower bound
plt.plot(pred_y_upper, color = 'blue') # forecast upper bound
plt.plot(test_y, color = 'green') # actual prices

 

 

반응형
728x90
반응형

market-price.csv
0.01MB

# Time-series data: observations measured at successive points in time.
# ARIMA model => provided by the statsmodels package
# AR: regresses the series on its own past values
# MA: models the current value from past forecast errors

 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
file_path = 'market-price.csv'
bitcoin_df = pd.read_csv(file_path)
bitcoin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Timestamp     365 non-null    object 
 1   market-price  365 non-null    float64
dtypes: float64(1), object(1)
memory usage: 5.8+ KB

 

bitcoin_df.head()

	Timestamp	market-price
0	2017-08-27 00:00:00	4354.308333
1	2017-08-28 00:00:00	4391.673517
2	2017-08-29 00:00:00	4607.985450
3	2017-08-30 00:00:00	4594.987850
4	2017-08-31 00:00:00	4748.255000

 

# Re-read the same file with friendlier column names (day, price).
bitcoin_df = pd.read_csv('market-price.csv', names = ['day','price'], header=0)
bitcoin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   day     365 non-null    object 
 1   price   365 non-null    float64
dtypes: float64(1), object(1)
memory usage: 5.8+ KB

 

bitcoin_df.head()

	day	price
0	2017-08-27 00:00:00	4354.308333
1	2017-08-28 00:00:00	4391.673517
2	2017-08-29 00:00:00	4607.985450
3	2017-08-30 00:00:00	4594.987850
4	2017-08-31 00:00:00	4748.255000

 

bitcoin_df.shape

# (365, 2)

 

# Parse the date strings into datetime64 so pandas treats them as timestamps.
bitcoin_df['day'] = pd.to_datetime(bitcoin_df['day'])
bitcoin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   day     365 non-null    datetime64[ns]
 1   price   365 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 5.8 KB

 

bitcoin_df.describe()

	price
count	365.000000
mean	8395.863578
std	3239.804756
min	3319.630000
25%	6396.772500
50%	7685.633333
75%	9630.136277
max	19498.683333

 

# Quick line plot of the daily price series.
bitcoin_df.plot()
plt.show()

 

# Fit an ARIMA model with order=(2, 1, 2):
#   p=2 -> AR term uses the two previous observations
#   d=1 -> first-order differencing (current value minus previous value),
#          to tame the series' irregularity — e.g. bitcoin ^^
#   q=2 -> MA term uses the two previous forecast errors

 

# NOTE(review): statsmodels.tsa.arima_model.ARIMA was deprecated and removed
# in statsmodels >= 0.13; newer code should use
# statsmodels.tsa.arima.model.ARIMA (whose .fit() takes no trend/disp args).
from statsmodels.tsa.arima_model import ARIMA
model = ARIMA(bitcoin_df.price.values, order=(2,1,2))
model_fit = model.fit(trend='c', full_output=True, disp=True)
fig = model_fit.plot_predict() # in-sample predictions over the training data
residuals = pd.DataFrame(model_fit.resid) # FIX: local was misspelled 'risiduals'
residuals.plot() # visualize the residual fluctuations

# Compare with actual data: forecast the next 5 days.
forecast_data = model_fit.forecast(steps=5)
forecast_data
# Returns a tuple of three arrays:
#   [0] point forecasts for the 5 days
#   [1] standard errors for the 5 days
#   [2] five [lower, upper] confidence-interval pairs

(array([6676.91689529, 6685.04884511, 6690.29837254, 6697.35159419,
        6703.26452567]),
 array([ 512.41529746,  753.50414112,  914.97749885, 1061.45286959,
        1184.4382798 ]),
 array([[5672.60136715, 7681.23242343],
        [5208.20786632, 8161.8898239 ],
        [4896.97542813, 8483.62131695],
        [4616.94219851, 8777.76098987],
        [4381.80815535, 9024.720896  ]]))

 

# Load the actual test-period prices.
test_file_path = 'market-price-test.csv'
bitcoin_test_df = pd.read_csv(test_file_path, names = ['ds','y'], header=0)

 

# Point forecasts as a plain list (forecast() returned a tuple of arrays).
pred_y = forecast_data[0].tolist()
pred_y

[6676.9168952924865,
 6685.048845109902,
 6690.298372539306,
 6697.35159419041,
 6703.2645256732285]

 

# Actual values for the test window.
test_y = bitcoin_test_df['y'].values
test_y
pred_y_lower = [] # forecast lower bounds
pred_y_upper = [] # forecast upper bounds

 

# forecast_data[2] holds five [lower, upper] confidence-interval pairs.
for low_up in forecast_data[2] :
    pred_y_lower.append(low_up[0])
    pred_y_upper.append(low_up[1])
    
pred_y_lower
[5672.601367152579,
 5208.207866318599,
 4896.975428126821,
 4616.942198505993,
 4381.808155348637]

 

pred_y_upper
[7681.232423432394,
 8161.889823901204,
 8483.62131695179,
 8777.760989874827,
 9024.72089599782]

 

# Visualize forecast vs. reality.
plt.plot(pred_y, color='gold') # point forecast
plt.plot(test_y, color='green') # actual prices (fairly volatile)
plt.plot(pred_y_lower, color='red') # interval lower bound
plt.plot(pred_y_upper, color='blue') # interval upper bound

 

# 시계열 데이터 분석을 위한 모델

# ar (자기회귀분석모델)

그냥시간이 들어가서 연속적인 => 주가분석 등
현재 자신과 과거의 자신을 비교, 관계
ar(n) => n 이전의 시점

 

# ma 이동평균모델

과거와 현재의 오차의 관계

 

# 합쳐서 자기회귀 이동모델

arma(자기회귀 이동평균 모델)
현재시점의 나와 과거시점의 나를 비교
현재시점의 차이를 비교

 

# arima 자기회귀 누적 이동평균 모델

ma 누적차수 ar 누적차수 동시에
현재와 추세간의 관계 정의 
arma는 규칙적인 시계열데이터는 가능하지만 불규칙적인 시계열에 불리, 한계
보완하기위한 모델이다

 

# arima(p, d, q)
p : ar 모형 차수
d : 차분 : 차이
q : ma 모형 차수
    # p+q가 짝수인게 좋다

 

 

# Facebook's time-series forecasting library: Prophet

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fbprophet import Prophet
file_path = 'market-price.csv'
# Prophet expects columns named 'ds' (date) and 'y' (value).
bitcoin_df = pd.read_csv(file_path, names = ['ds', 'y'], header=0)
bitcoin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ds      365 non-null    object 
 1   y       365 non-null    float64
dtypes: float64(1), object(1)
memory usage: 5.8+ KB

 

 

# Multiplicative-seasonality Prophet model on the raw series (linear growth).
prophet = Prophet(seasonality_mode = 'multiplicative',
                 yearly_seasonality = True, # yearly component
                 weekly_seasonality = True, # weekly component
                 daily_seasonality = True, # daily component
                 changepoint_prior_scale = 0.5) # trend flexibility (overfit guard)
prophet.fit(bitcoin_df) # train

# FIX: this shell command was pasted as bare Python (a SyntaxError);
# run it from a terminal instead:
# pip install pystan --upgrade

 

# Forecast 5 days past the training window.
future_data = prophet.make_future_dataframe(periods=5, freq='d')
# predict
forecast_data = prophet.predict(future_data)
forecast_data

	ds	trend	yhat_lower	yhat_upper	trend_lower	trend_upper	daily	daily_lower	daily_upper	multiplicative_terms	...	weekly	weekly_lower	weekly_upper	yearly	yearly_lower	yearly_upper	additive_terms	additive_terms_lower	additive_terms_upper	yhat
0	2017-08-27	473.569120	3776.764014	5104.491150	473.569120	473.569120	9.563964	9.563964	9.563964	8.356854	...	-0.038472	-0.038472	-0.038472	-1.168637	-1.168637	-1.168637	0.0	0.0	0.0	4431.117317
1	2017-08-28	476.933144	3833.197375	5183.393019	476.933144	476.933144	9.563964	9.563964	9.563964	8.436224	...	-0.006602	-0.006602	-0.006602	-1.121138	-1.121138	-1.121138	0.0	0.0	0.0	4500.447825
2	2017-08-29	480.297167	3877.729283	5211.107968	480.297167	480.297167	9.563964	9.563964	9.563964	8.494301	...	0.019974	0.019974	0.019974	-1.089637	-1.089637	-1.089637	0.0	0.0	0.0	4560.085805
3	2017-08-30	483.661190	3954.539662	5206.571586	483.661190	483.661190	9.563964	9.563964	9.563964	8.440425	...	-0.046634	-0.046634	-0.046634	-1.076905	-1.076905	-1.076905	0.0	0.0	0.0	4565.966993
4	2017-08-31	487.025213	3931.103385	5269.222802	487.025213	487.025213	9.563964	9.563964	9.563964	8.461194	...	-0.017649	-0.017649	-0.017649	-1.085122	-1.085122	-1.085122	0.0	0.0	0.0	4607.839822
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
365	2018-08-27	738.543896	6218.124910	7629.182104	738.543896	738.543896	9.563964	9.563964	9.563964	8.374726	...	-0.006602	-0.006602	-0.006602	-1.182636	-1.182636	-1.182636	0.0	0.0	0.0	6923.647020
366	2018-08-28	742.612648	6338.876532	7721.930490	742.612648	742.612648	9.563964	9.563964	9.563964	8.452304	...	0.019974	0.019974	0.019974	-1.131634	-1.131634	-1.131634	0.0	0.0	0.0	7019.400574
367	2018-08-29	746.681400	6371.510730	7768.115586	746.681400	752.202325	9.563964	9.563964	9.563964	8.421478	...	-0.046634	-0.046634	-0.046634	-1.095851	-1.095851	-1.095851	0.0	0.0	0.0	7034.842537
368	2018-08-30	750.750152	6374.620387	7883.157582	748.285679	770.190606	9.563964	9.563964	9.563964	8.468117	...	-0.017649	-0.017649	-0.017649	-1.078198	-1.078198	-1.078198	0.0	0.0	0.0	7108.190099
369	2018-08-31	754.818904	6440.287682	7941.401382	742.825041	785.906885	9.563964	9.563964	9.563964	8.518827	...	0.035872	0.035872	0.035872	-1.081008	-1.081008	-1.081008	0.0	0.0	0.0	7184.990775
370 rows × 22 columns

 

forecast_data.shape

# (370, 22)

 

# Date, point estimate, and interval bounds for the 5 predicted days.
forecast_data[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(5)

	ds	yhat	yhat_lower	yhat_upper
365	2018-08-27	6923.647020	6218.124910	7629.182104
366	2018-08-28	7019.400574	6338.876532	7721.930490
367	2018-08-29	7034.842537	6371.510730	7768.115586
368	2018-08-30	7108.190099	6374.620387	7883.157582
369	2018-08-31	7184.990775	6440.287682	7941.401382

 

# Visualize the results.
fig1 = prophet.plot(forecast_data)
# black dots : observed data
# blue line  : forecast

fig2 = prophet.plot_components(forecast_data)
# four component panels:
# trend
# weekly seasonality
# yearly seasonality
# daily seasonality

# Forecast quality: compare actuals against in-sample predictions.
# NOTE(review): the slices assume the observed days align with forecast rows
# 5..364 — confirm the 5-day offset is intentional.
y = bitcoin_df.y.values[5:] # actual data, first 5 days excluded
y_pred = forecast_data.yhat.values[5:-5] # forecasts minus first 5 and last (future) 5

 

# Score the in-sample fit with R^2 and RMSE.
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
r2 = r2_score(y, y_pred)
r2
# 0.9737786665877044

 

rmse = sqrt(mean_squared_error(y, y_pred))
rmse

# 522.2899311292591

 

# 실데이터와 비교
test_file_path = 'market-price-test.csv'
bitcoin_test_df = pd.read_csv(test_file_path, names = ['ds','y'], header=0)
bitcoin_test_df

	ds	y
0	2018-08-27 00:00:00	6719.266154
1	2018-08-28 00:00:00	7000.040000
2	2018-08-29 00:00:00	7054.276429
3	2018-08-30 00:00:00	6932.662500
4	2018-08-31 00:00:00	6981.946154

 

y = bitcoin_test_df.y.values
y
array([6719.26615385, 7000.04      , 7054.27642857, 6932.6625    ,
       6981.94615385])

 

y_pred = forecast_data.yhat.values[-5:]
y_pred 

array([6923.64702007, 7019.40057427, 7034.84253693, 7108.19009905,
       7184.99077545])

 

plt.plot(y_pred, color = 'gold')
plt.plot(y, color = 'green')

 

반응형
728x90
반응형

picher_stats_2017.csv
0.01MB
batter_stats_2017.csv
0.02MB

# 프로야구 연봉 예측
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
picher_file_path = 'picher_stats_2017.csv'
batter_file_path = 'batter_stats_2017.csv'
picher = pd.read_csv(picher_file_path)
batter = pd.read_csv(batter_file_path)
batter.columns

Index(['선수명', '팀명', '경기', '타석', '타수', '안타', '홈런', '득점', '타점', '볼넷', '삼진', '도루',
       'BABIP', '타율', '출루율', '장타율', 'OPS', 'wOBA', 'WAR', '연봉(2018)',
       '연봉(2017)'],
      dtype='object')

 

pi_fea_df = picher[['승','패','세','홀드','블론','경기','선발','이닝','삼진/9',
                    '볼넷/9','홈런/9','BABIP','LOB%','ERA','RA9-WAR','FIP','kFIP','WAR','연봉(2018)','연봉(2017)']]
pi_fea_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 20 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   승         152 non-null    int64  
 1   패         152 non-null    int64  
 2   세         152 non-null    int64  
 3   홀드        152 non-null    int64  
 4   블론        152 non-null    int64  
 5   경기        152 non-null    int64  
 6   선발        152 non-null    int64  
 7   이닝        152 non-null    float64
 8   삼진/9      152 non-null    float64
 9   볼넷/9      152 non-null    float64
 10  홈런/9      152 non-null    float64
 11  BABIP     152 non-null    float64
 12  LOB%      152 non-null    float64
 13  ERA       152 non-null    float64
 14  RA9-WAR   152 non-null    float64
 15  FIP       152 non-null    float64
 16  kFIP      152 non-null    float64
 17  WAR       152 non-null    float64
 18  연봉(2018)  152 non-null    int64  
 19  연봉(2017)  152 non-null    int64  
dtypes: float64(11), int64(9)
memory usage: 23.9 KB

 

picher = picher.rename(columns = {'연봉(2018)':'y'})

team_encoding = pd.get_dummies(picher['팀명'])
team_encoding.head()

KIA	KT	LG	NC	SK	두산	롯데	삼성	한화
0	0	0	0	0	1	0	0	0	0
1	0	0	1	0	0	0	0	0	0
2	1	0	0	0	0	0	0	0	0
3	0	0	1	0	0	0	0	0	0
4	0	0	0	0	0	0	1	0	0

 

picher = pd.concat([picher, team_encoding], axis=1)
picher.head()

	선수명	팀명	승	패	세	홀드	블론	경기	선발	이닝	...	연봉(2017)	KIA	KT	LG	NC	SK	두산	롯데	삼성	한화
0	켈리	SK	16	7	0	0	0	30	30	190.0	...	85000	0	0	0	0	1	0	0	0	0
1	소사	LG	11	11	1	0	0	30	29	185.1	...	50000	0	0	1	0	0	0	0	0	0
2	양현종	KIA	20	6	0	0	0	31	31	193.1	...	150000	1	0	0	0	0	0	0	0	0
3	차우찬	LG	10	7	0	0	0	28	28	175.2	...	100000	0	0	1	0	0	0	0	0	0
4	레일리	롯데	13	7	0	0	0	30	30	187.1	...	85000	0	0	0	0	0	0	1	0	0
5 rows × 31 columns

 

picher = picher.drop('팀명', axis=1)

x = picher[picher.columns.difference({'선수명','y'})]
y = picher['y']

 

from sklearn import preprocessing as pp
x = pp.StandardScaler().fit(x).transform(x)

 

# Silence pandas' SettingWithCopyWarning: we intentionally overwrite columns
# of a frame that may be a slice of another frame.
pd.options.mode.chained_assignment = None

def standard_scaling(df, scale_columns):
    """Standardize the given columns of *df* in place (z-score scaling).

    Each listed column is replaced by ``(value - mean) / std``, using the
    sample standard deviation (``ddof=1``, the pandas default) — the same
    result the original per-element ``apply``/lambda produced, but computed
    with vectorized column arithmetic.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are modified in place (and also returned).
    scale_columns : iterable of str
        Names of the numeric columns to standardize.

    Returns
    -------
    pandas.DataFrame
        The same *df* object, with the listed columns standardized.
    """
    for col in scale_columns:
        # Vectorized arithmetic instead of Series.apply(lambda ...):
        # identical values, far faster on wide/long frames.
        df[col] = (df[col] - df[col].mean()) / df[col].std()
    return df

 

pi_fea_df = picher[['승','패','세','홀드','블론','경기','선발','이닝','삼진/9',
                    '볼넷/9','홈런/9','BABIP','LOB%','ERA','RA9-WAR','FIP','kFIP','WAR','연봉(2017)']]
pi_fea_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   승         152 non-null    float64
 1   패         152 non-null    float64
 2   세         152 non-null    float64
 3   홀드        152 non-null    float64
 4   블론        152 non-null    float64
 5   경기        152 non-null    float64
 6   선발        152 non-null    float64
 7   이닝        152 non-null    float64
 8   삼진/9      152 non-null    float64
 9   볼넷/9      152 non-null    float64
 10  홈런/9      152 non-null    float64
 11  BABIP     152 non-null    float64
 12  LOB%      152 non-null    float64
 13  ERA       152 non-null    float64
 14  RA9-WAR   152 non-null    float64
 15  FIP       152 non-null    float64
 16  kFIP      152 non-null    float64
 17  WAR       152 non-null    float64
 18  연봉(2017)  152 non-null    int64  
dtypes: float64(18), int64(1)
memory usage: 22.7 KB

 

picher_df = standard_scaling(picher,pi_fea_df )

 

# 정규화된 x
x = picher[picher_df.columns.difference({'선수명','y'})]
x.head()

	BABIP	ERA	FIP	KIA	KT	LG	LOB%	NC	RA9-WAR	SK	...	삼진/9	선발	세	승	연봉(2017)	이닝	패	한화	홀드	홈런/9
0	0.016783	-0.587056	-0.971030	0	0	0	0.446615	0	3.174630	1	...	0.672099	2.452068	-0.306452	3.313623	2.734705	2.645175	1.227145	0	-0.585705	-0.442382
1	-0.241686	-0.519855	-1.061888	0	0	1	-0.122764	0	3.114968	0	...	0.134531	2.349505	-0.098502	2.019505	1.337303	2.547755	2.504721	0	-0.585705	-0.668521
2	-0.095595	-0.625456	-0.837415	1	0	0	0.308584	0	2.973948	0	...	0.109775	2.554632	-0.306452	4.348918	5.329881	2.706808	0.907751	0	-0.585705	-0.412886
3	-0.477680	-0.627856	-0.698455	0	0	1	0.558765	0	2.740722	0	...	0.350266	2.246942	-0.306452	1.760682	3.333592	2.350927	1.227145	0	-0.585705	-0.186746
4	-0.196735	-0.539055	-0.612941	0	0	0	0.481122	0	2.751570	0	...	0.155751	2.452068	-0.306452	2.537153	2.734705	2.587518	1.227145	0	-0.585705	-0.294900
5 rows × 28 columns

 

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=19)

picher.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 30 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   선수명       152 non-null    object 
 1   승         152 non-null    float64
 2   패         152 non-null    float64
 3   세         152 non-null    float64
 4   홀드        152 non-null    float64
 5   블론        152 non-null    float64
 6   경기        152 non-null    float64
 7   선발        152 non-null    float64
 8   이닝        152 non-null    float64
 9   삼진/9      152 non-null    float64
 10  볼넷/9      152 non-null    float64
 11  홈런/9      152 non-null    float64
 12  BABIP     152 non-null    float64
 13  LOB%      152 non-null    float64
 14  ERA       152 non-null    float64
 15  RA9-WAR   152 non-null    float64
 16  FIP       152 non-null    float64
 17  kFIP      152 non-null    float64
 18  WAR       152 non-null    float64
 19  y         152 non-null    int64  
 20  연봉(2017)  152 non-null    float64
 21  KIA       152 non-null    uint8  
 22  KT        152 non-null    uint8  
 23  LG        152 non-null    uint8  
 24  NC        152 non-null    uint8  
 25  SK        152 non-null    uint8  
 26  두산        152 non-null    uint8  
 27  롯데        152 non-null    uint8  
 28  삼성        152 non-null    uint8  
 29  한화        152 non-null    uint8  
dtypes: float64(19), int64(1), object(1), uint8(9)
memory usage: 26.4+ KB

 

# ols
import statsmodels.api as sm

x_train = sm.add_constant(x_train)
model = sm.OLS(y_train, x_train).fit()
model.summary()

OLS Regression Results
Dep. Variable:	y	R-squared:	0.928
Model:	OLS	Adj. R-squared:	0.907
Method:	Least Squares	F-statistic:	44.19
Date:	Tue, 20 Jul 2021	Prob (F-statistic):	7.70e-42
Time:	10:11:19	Log-Likelihood:	-1247.8
No. Observations:	121	AIC:	2552.
Df Residuals:	93	BIC:	2630.
Df Model:	27		
Covariance Type:	nonrobust		
coef	std err	t	P>|t|	[0.025	0.975]
const	1.872e+04	775.412	24.136	0.000	1.72e+04	2.03e+04
x1	-1476.1375	1289.136	-1.145	0.255	-4036.106	1083.831
x2	-415.3144	2314.750	-0.179	0.858	-5011.949	4181.320
x3	-9.383e+04	9.4e+04	-0.998	0.321	-2.8e+05	9.28e+04
x4	-485.0276	671.883	-0.722	0.472	-1819.254	849.199
x5	498.2459	695.803	0.716	0.476	-883.480	1879.972
x6	-262.5237	769.196	-0.341	0.734	-1789.995	1264.948
x7	-1371.0060	1559.650	-0.879	0.382	-4468.162	1726.150
x8	-164.7210	760.933	-0.216	0.829	-1675.784	1346.342
x9	3946.0617	2921.829	1.351	0.180	-1856.111	9748.235
x10	269.1233	721.020	0.373	0.710	-1162.679	1700.926
x11	1.024e+04	2523.966	4.057	0.000	5226.545	1.53e+04
x12	7.742e+04	7.93e+04	0.977	0.331	-8e+04	2.35e+05
x13	-2426.3684	2943.799	-0.824	0.412	-8272.169	3419.432
x14	-285.5830	781.560	-0.365	0.716	-1837.606	1266.440
x15	111.1761	758.548	0.147	0.884	-1395.150	1617.502
x16	7587.0753	6254.661	1.213	0.228	-4833.443	2e+04
x17	1266.8570	1238.036	1.023	0.309	-1191.636	3725.350
x18	-972.1837	817.114	-1.190	0.237	-2594.810	650.443
x19	5379.1903	7262.214	0.741	0.461	-9042.128	1.98e+04
x20	-4781.4961	5471.265	-0.874	0.384	-1.56e+04	6083.352
x21	-249.8717	1291.108	-0.194	0.847	-2813.757	2314.014
x22	235.2476	2207.965	0.107	0.915	-4149.333	4619.828
x23	1.907e+04	1266.567	15.055	0.000	1.66e+04	2.16e+04
x24	851.2121	6602.114	0.129	0.898	-1.23e+04	1.4e+04
x25	1297.3310	1929.556	0.672	0.503	-2534.385	5129.047
x26	1199.4709	720.099	1.666	0.099	-230.503	2629.444
x27	-931.9918	1632.526	-0.571	0.569	-4173.865	2309.882
x28	1.808e+04	1.67e+04	1.082	0.282	-1.51e+04	5.13e+04
Omnibus:	28.069	Durbin-Watson:	2.025
Prob(Omnibus):	0.000	Jarque-Bera (JB):	194.274
Skew:	-0.405	Prob(JB):	6.52e-43
Kurtosis:	9.155	Cond. No.	1.23e+16


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 5.36e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

 

# r_squared 결정계수
독립변수의 변동량으로 설명되는 종속변수의 변동량
상관계수의 제곱과 같다


# adj 수정결정계수
독립변수가 많아지는 경우 결정계수값이 커질수있어, 표본의 크기와 독립변수의 수를 고려하여
다중회귀분석을 수행하는 경우
p>|t| 각피처의 검정통계량(f statistics )이 유의미한지를 나타내는 pvalue 값
p value < 0.05 이면 피처가 회귀분석에 유의미한 피처다
이 분석에서는 WAR, 연봉(2017), 한화 3개가 0.05 미만
=> 회귀분석에서 유의미한 피처들

 

x_train = sm.add_constant(x_train)
model = sm.OLS(y_train, x_train).fit()
model.summary()

OLS Regression Results
Dep. Variable:	y	R-squared:	0.928
Model:	OLS	Adj. R-squared:	0.907
Method:	Least Squares	F-statistic:	44.19
Date:	Tue, 20 Jul 2021	Prob (F-statistic):	7.70e-42
Time:	10:53:15	Log-Likelihood:	-1247.8
No. Observations:	121	AIC:	2552.
Df Residuals:	93	BIC:	2630.
Df Model:	27		
Covariance Type:	nonrobust		
coef	std err	t	P>|t|	[0.025	0.975]
const	1.678e+04	697.967	24.036	0.000	1.54e+04	1.82e+04
BABIP	-1481.0173	1293.397	-1.145	0.255	-4049.448	1087.414
ERA	-416.6874	2322.402	-0.179	0.858	-5028.517	4195.143
FIP	-9.414e+04	9.43e+04	-0.998	0.321	-2.81e+05	9.31e+04
KIA	303.1852	2222.099	0.136	0.892	-4109.462	4715.833
KT	3436.0520	2133.084	1.611	0.111	-799.831	7671.935
LG	1116.9978	2403.317	0.465	0.643	-3655.513	5889.509
LOB%	-1375.5383	1564.806	-0.879	0.382	-4482.933	1731.857
NC	1340.5004	2660.966	0.504	0.616	-3943.651	6624.652
RA9-WAR	3959.1065	2931.488	1.351	0.180	-1862.247	9780.460
SK	2762.4237	2243.540	1.231	0.221	-1692.803	7217.650
WAR	1.027e+04	2532.309	4.057	0.000	5243.823	1.53e+04
kFIP	7.767e+04	7.95e+04	0.977	0.331	-8.03e+04	2.36e+05
경기	-2434.3895	2953.530	-0.824	0.412	-8299.515	3430.736
두산	971.9293	2589.849	0.375	0.708	-4170.998	6114.857
롯데	2313.9585	2566.009	0.902	0.370	-2781.627	7409.544
볼넷/9	7612.1566	6275.338	1.213	0.228	-4849.421	2.01e+04
블론	1271.0450	1242.128	1.023	0.309	-1195.576	3737.666
삼성	-946.5092	2482.257	-0.381	0.704	-5875.780	3982.762
삼진/9	5396.9728	7286.221	0.741	0.461	-9072.019	1.99e+04
선발	-4797.3028	5489.352	-0.874	0.384	-1.57e+04	6103.463
세	-250.6977	1295.377	-0.194	0.847	-2823.059	2321.663
승	236.0253	2215.264	0.107	0.915	-4163.049	4635.100
연봉(2017)	1.913e+04	1270.754	15.055	0.000	1.66e+04	2.17e+04
이닝	854.0260	6623.940	0.129	0.898	-1.23e+04	1.4e+04
패	1301.6197	1935.935	0.672	0.503	-2542.763	5146.003
한화	5477.8879	2184.273	2.508	0.014	1140.355	9815.421
홀드	-935.0728	1637.923	-0.571	0.569	-4187.663	2317.518
홈런/9	1.814e+04	1.68e+04	1.082	0.282	-1.52e+04	5.14e+04
Omnibus:	28.069	Durbin-Watson:	2.025
Prob(Omnibus):	0.000	Jarque-Bera (JB):	194.274
Skew:	-0.405	Prob(JB):	6.52e-43
Kurtosis:	9.155	Cond. No.	3.63e+16


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.04e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

 

plt.rcParams['figure.figsize'] = [20, 16]
plt.rc('font', family = 'Malgun Gothic')
coefs = model.params.tolist()
coefs_series = pd.Series(coefs)
x_labels = model.params.index.tolist()
ax = coefs_series.plot(kind = 'bar')
ax.set_title('feature_coef_graph')
ax.set_xlabel('x_feature')
ax.set_ylabel('coef')
ax.set_xticklabels(x_labels)

[Text(0, 0, 'const'),
 Text(1, 0, 'BABIP'),
 Text(2, 0, 'ERA'),
 Text(3, 0, 'FIP'),
 Text(4, 0, 'KIA'),
 Text(5, 0, 'KT'),
 Text(6, 0, 'LG'),
 Text(7, 0, 'LOB%'),
 Text(8, 0, 'NC'),
 Text(9, 0, 'RA9-WAR'),
 Text(10, 0, 'SK'),
 Text(11, 0, 'WAR'),
 Text(12, 0, 'kFIP'),
 Text(13, 0, '경기'),
 Text(14, 0, '두산'),
 Text(15, 0, '롯데'),
 Text(16, 0, '볼넷/9'),
 Text(17, 0, '블론'),
 Text(18, 0, '삼성'),
 Text(19, 0, '삼진/9'),
 Text(20, 0, '선발'),
 Text(21, 0, '세'),
 Text(22, 0, '승'),
 Text(23, 0, '연봉(2017)'),
 Text(24, 0, '이닝'),
 Text(25, 0, '패'),
 Text(26, 0, '한화'),
 Text(27, 0, '홀드'),
 Text(28, 0, '홈런/9')]

다중공선성이 높으면 상관성이 너무 높은 것
안정적인 분석을 위해서 안써야함

 

# Variance inflation factor (VIF) per feature column — diagnoses how much
# each predictor is linearly explained by the others (multicollinearity).
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_scores = [variance_inflation_factor(x.values, idx) for idx in range(x.shape[1])]
vif = pd.DataFrame({'VIF Factor': vif_scores, 'features': x.columns})
vif.round(1)

	VIF Factor	features
0	3.2	BABIP
1	10.6	ERA
2	14238.3	FIP
3	1.1	KIA
4	1.1	KT
5	1.1	LG
6	4.3	LOB%
7	1.1	NC
8	13.6	RA9-WAR
9	1.1	SK
10	10.4	WAR
11	10264.1	kFIP
12	14.6	경기
13	1.2	두산
14	1.1	롯데
15	57.8	볼넷/9
16	3.0	블론
17	1.2	삼성
18	89.5	삼진/9
19	39.6	선발
20	3.1	세
21	8.0	승
22	2.5	연봉(2017)
23	63.8	이닝
24	5.9	패
25	1.1	한화
26	3.8	홀드
27	425.6	홈런/9

 

변수간 상관관계가 높아서 분석에 부정적인 영향을 미침
vif 평가 : 분산팽창요인 
    보통 10~15 정도를 넘으면 다중공선성에 문제가 있다고 판단
    홈런, 이닝, 선발, 삼진, 볼넷, 경기, kfip,fip 
    특히 kFIP와 FIP 둘은 너무 유사해서 상승효과가 생김, 그래서 하나는 빼버려야함
1. vif 계수 높은 피처 제거, 유사피처중 한개만 제거
2. 다시모델을 실행해서 공선성 검증
3. 분석결과에서 p-value값이 유의미한 피처들을 선정

 

# 적절한 피처를 선정해서 다시 학습하기
# 피처간 상관계수를 그래프로 작성
scale_columns = ['승','패','세','홀드','블론','경기','선발','이닝','삼진/9',
                    '볼넷/9','홈런/9','BABIP','LOB%','ERA','RA9-WAR','FIP','kFIP','WAR','연봉(2017)']
picher_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 30 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   선수명       152 non-null    object 
 1   승         152 non-null    float64
 2   패         152 non-null    float64
 3   세         152 non-null    float64
 4   홀드        152 non-null    float64
 5   블론        152 non-null    float64
 6   경기        152 non-null    float64
 7   선발        152 non-null    float64
 8   이닝        152 non-null    float64
 9   삼진/9      152 non-null    float64
 10  볼넷/9      152 non-null    float64
 11  홈런/9      152 non-null    float64
 12  BABIP     152 non-null    float64
 13  LOB%      152 non-null    float64
 14  ERA       152 non-null    float64
 15  RA9-WAR   152 non-null    float64
 16  FIP       152 non-null    float64
 17  kFIP      152 non-null    float64
 18  WAR       152 non-null    float64
 19  y         152 non-null    int64  
 20  연봉(2017)  152 non-null    float64
 21  KIA       152 non-null    uint8  
 22  KT        152 non-null    uint8  
 23  LG        152 non-null    uint8  
 24  NC        152 non-null    uint8  
 25  SK        152 non-null    uint8  
 26  두산        152 non-null    uint8  
 27  롯데        152 non-null    uint8  
 28  삼성        152 non-null    uint8  
 29  한화        152 non-null    uint8  
dtypes: float64(19), int64(1), object(1), uint8(9)
memory usage: 26.4+ KB

 

corr = picher_df[scale_columns].corr(method='pearson')
corr

	승	패	세	홀드	블론	경기	선발	이닝	삼진/9	볼넷/9	홈런/9	BABIP	LOB%	ERA	RA9-WAR	FIP	kFIP	WAR	연봉(2017)
승	1.000000	0.710749	0.053747	0.092872	0.105281	0.397074	0.773560	0.906093	0.078377	-0.404710	-0.116147	-0.171111	0.131178	-0.271086	0.851350	-0.303133	-0.314159	0.821420	0.629710
패	0.710749	1.000000	0.066256	0.098617	0.121283	0.343147	0.771395	0.829018	0.031755	-0.386313	-0.064467	-0.133354	-0.020994	-0.188036	0.595989	-0.233416	-0.238688	0.625641	0.429227
세	0.053747	0.066256	1.000000	0.112716	0.605229	0.434290	-0.177069	0.020278	0.170436	-0.131394	-0.073111	-0.089212	0.167557	-0.150348	0.167669	-0.199746	-0.225259	0.084151	0.262664
홀드	0.092872	0.098617	0.112716	1.000000	0.490076	0.715527	-0.285204	0.024631	0.186790	-0.146806	-0.076475	-0.104307	0.048123	-0.155712	0.003526	-0.211515	-0.237353	-0.038613	-0.001213
블론	0.105281	0.121283	0.605229	0.490076	1.000000	0.630526	-0.264160	0.014176	0.188423	-0.137019	-0.064804	-0.112480	0.100633	-0.160761	0.008766	-0.209014	-0.237815	-0.058213	0.146584
경기	0.397074	0.343147	0.434290	0.715527	0.630526	1.000000	-0.037443	0.376378	0.192487	-0.364293	-0.113545	-0.241608	0.105762	-0.320177	0.281595	-0.345351	-0.373777	0.197836	0.225357
선발	0.773560	0.771395	-0.177069	-0.285204	-0.264160	-0.037443	1.000000	0.894018	-0.055364	-0.312935	-0.058120	-0.098909	0.041819	-0.157775	0.742258	-0.151040	-0.142685	0.758846	0.488559
이닝	0.906093	0.829018	0.020278	0.024631	0.014176	0.376378	0.894018	1.000000	0.037343	-0.451101	-0.107063	-0.191514	0.103369	-0.285392	0.853354	-0.296768	-0.302288	0.832609	0.586874
삼진/9	0.078377	0.031755	0.170436	0.186790	0.188423	0.192487	-0.055364	0.037343	1.000000	0.109345	0.216017	0.457523	-0.071284	0.256840	0.102963	-0.154857	-0.317594	0.151791	0.104948
볼넷/9	-0.404710	-0.386313	-0.131394	-0.146806	-0.137019	-0.364293	-0.312935	-0.451101	0.109345	1.000000	0.302251	0.276009	-0.150837	0.521039	-0.398586	0.629833	0.605008	-0.394131	-0.332379
홈런/9	-0.116147	-0.064467	-0.073111	-0.076475	-0.064804	-0.113545	-0.058120	-0.107063	0.216017	0.302251	1.000000	0.362614	-0.274543	0.629912	-0.187210	0.831042	0.743623	-0.205014	-0.100896
BABIP	-0.171111	-0.133354	-0.089212	-0.104307	-0.112480	-0.241608	-0.098909	-0.191514	0.457523	0.276009	0.362614	1.000000	-0.505478	0.733109	-0.187058	0.251126	0.166910	-0.082995	-0.088754
LOB%	0.131178	-0.020994	0.167557	0.048123	0.100633	0.105762	0.041819	0.103369	-0.071284	-0.150837	-0.274543	-0.505478	1.000000	-0.720091	0.286893	-0.288050	-0.269536	0.144191	0.110424
ERA	-0.271086	-0.188036	-0.150348	-0.155712	-0.160761	-0.320177	-0.157775	-0.285392	0.256840	0.521039	0.629912	0.733109	-0.720091	1.000000	-0.335584	0.648004	0.582057	-0.261508	-0.203305
RA9-WAR	0.851350	0.595989	0.167669	0.003526	0.008766	0.281595	0.742258	0.853354	0.102963	-0.398586	-0.187210	-0.187058	0.286893	-0.335584	1.000000	-0.366308	-0.377679	0.917299	0.643375
FIP	-0.303133	-0.233416	-0.199746	-0.211515	-0.209014	-0.345351	-0.151040	-0.296768	-0.154857	0.629833	0.831042	0.251126	-0.288050	0.648004	-0.366308	1.000000	0.984924	-0.391414	-0.268005
kFIP	-0.314159	-0.238688	-0.225259	-0.237353	-0.237815	-0.373777	-0.142685	-0.302288	-0.317594	0.605008	0.743623	0.166910	-0.269536	0.582057	-0.377679	0.984924	1.000000	-0.408283	-0.282666
WAR	0.821420	0.625641	0.084151	-0.038613	-0.058213	0.197836	0.758846	0.832609	0.151791	-0.394131	-0.205014	-0.082995	0.144191	-0.261508	0.917299	-0.391414	-0.408283	1.000000	0.675794
연봉(2017)	0.629710	0.429227	0.262664	-0.001213	0.146584	0.225357	0.488559	0.586874	0.104948	-0.332379	-0.100896	-0.088754	0.110424	-0.203305	0.643375	-0.268005	-0.282666	0.675794	1.000000

 

# 히트맵 시각화
import seaborn as sns
show_cols = ['win', 'lose','save','hold','blon','match','start','inning','strike3',
            'ball4','homerun','BABIP','LOB','ERA','RA9-WAR','FIP','kFIP','WAR','2017']
plt.rc('font', family = 'Nanum Gothic')
sns.set(font_scale=0.8)
hm = sns.heatmap(corr.values,
                cbar = True,
                annot = True,
                square = True,
                fmt = '.2f',
                annot_kws={'size':15},
                yticklabels = show_cols,
                xticklabels = show_cols)
plt.tight_layout()
plt.show()

x = picher_df[['FIP','WAR','볼넷/9','삼진/9','연봉(2017)']]
y = picher_df['y']

 

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=19)

 

# Fit an ordinary least-squares linear regression on the training split.
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so `model` and `lr` are the same object,
# exactly as in the two-statement form.
lr = LinearRegression().fit(x_train, y_train)
model = lr

 

# r2
print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

# 0.9150591192570362
# 0.9038759653889865

 

# RMSE (root mean squared error) on both splits, to compare train vs. test
# error and gauge over-fitting.
from math import sqrt
from sklearn.metrics import mean_squared_error

for features, target in ((x_train, y_train), (x_test, y_test)):
    y_pred = lr.predict(features)
    print(sqrt(mean_squared_error(target, y_pred)))
# After the loop, y_pred holds the test-set predictions, matching the
# original statement order.

# 7893.462873347693
# 13141.86606359108

 

# 피처별 vif 공분산
from statsmodels.stats.outliers_influence import variance_inflation_factor
x = picher_df[['FIP','WAR','볼넷/9','삼진/9','연봉(2017)']]
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif['features'] = x.columns
vif.round(1)


	VIF Factor	features
0	1.9	FIP
1	2.1	WAR
2	1.9	볼넷/9
3	1.1	삼진/9
4	1.9	연봉(2017)

 

# 시각화\ 비교
# 모든 데이터 검증
# lr 학습이 완료된 객체
x = picher_df[['FIP','WAR','볼넷/9','삼진/9','연봉(2017)']]
predict_2018_salary = lr.predict(x)
predict_2018_salary[:5]
picher_df['예측연봉(2018)'] = pd.Series(predict_2018_salary)

 

picher = pd.read_csv(picher_file_path)
picher = picher[['선수명','연봉(2017)']]

 

# 2018년 연봉 내림차순
result_df = picher_df.sort_values(by=['y'], ascending = False)
# 연봉2017 삭제, 정규화된 데이터, 실제데이터가 아님
result_df.drop(['연봉(2017)'], axis=1, inplace=True, errors='ignore')
# 연봉 2017의 실제데이터로 컬럼 변경
result_df = result_df.merge(picher, on=['선수명'], how='left')
result_df = result_df[['선수명', 'y','예측연봉(2018)','연봉(2017)']]
result_df.columns = ['선수명','실제연봉(2018)','예측연봉(2018)','작년연봉(2017)']

 

result_df
	선수명	실제연봉(2018)	예측연봉(2018)	작년연봉(2017)
0	양현종	230000	163930.148696	150000
1	켈리	140000	120122.822204	85000
2	소사	120000	88127.019455	50000
3	정우람	120000	108489.464585	120000
4	레일리	111000	102253.697589	85000
...	...	...	...	...
147	장지훈	2800	249.850641	2700
148	차재용	2800	900.811527	2800
149	성영훈	2700	5003.619609	2700
150	정동윤	2700	2686.350884	2700
151	장민익	2700	3543.781665	2700
152 rows × 4 columns

 

result_df = result_df.iloc[:10,:]
plt.rc('font', family = 'Malgun Gothic')
result_df.plot(x='선수명', y=['작년연봉(2017)','예측연봉(2018)','실제연봉(2018)'], kind='bar')

 

# 2017연봉과 2018년 연봉이 다른 선수들만 
result_df = result_df[result_df['작년연봉(2017)'] != result_df['예측연봉(2018)']]
result_df.head()
	선수명	실제연봉(2018)	예측연봉(2018)	작년연봉(2017)
0	양현종	230000	163930.148696	150000
1	켈리	140000	120122.822204	85000
2	소사	120000	88127.019455	50000
3	정우람	120000	108489.464585	120000
4	레일리	111000	102253.697589	85000

 

result_df = result_df.reset_index()
result_df.head()

	index	선수명	실제연봉(2018)	예측연봉(2018)	작년연봉(2017)
0	0	양현종	230000	163930.148696	150000
1	1	켈리	140000	120122.822204	85000
2	2	소사	120000	88127.019455	50000
3	3	정우람	120000	108489.464585	120000
4	4	레일리	111000	102253.697589	85000

 

result_df = result_df.iloc[:10, :]
result_df.plot(x='선수명', y=['작년연봉(2017)','예측연봉(2018)','실제연봉(2018)'],kind = 'bar')

 

반응형
728x90
반응형

# dbscan density based clustering => 데이터 위치로부터 공간밀집도중심 클러스터 구분
noise 처리

2016_middle_shcool_graduates_report.xlsx
0.06MB

import pandas as pd
import numpy as np
import folium
file_path = '2016_middle_shcool_graduates_report.xlsx'
df = pd.read_excel(file_path, engine='openpyxl', header = 0,)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_colwidth', 20)

 

df.columns.values

array(['지역', '학교명', '코드', '유형', '주야', '남학생수', '여학생수', '일반고', '특성화고',
       '과학고', '외고_국제고', '예고_체고', '마이스터고', '자사고', '자공고', '기타진학', '취업',
       '미상', '위도', '경도'], dtype=object)

 

df.head()

	지역	학교명	코드	유형	주야	...	기타진학	취업	미상	위도	경도
0	성북구	서울대학교사범대학부설중학교	3	국립	주간	...	0.004	0	0.000	37.594942	127.038909
1	종로구	서울대학교사범대학부설여자중학교	3	국립	주간	...	0.031	0	0.000	37.577473	127.003857
2	강남구	개원중학교	3	공립	주간	...	0.009	0	0.003	37.491637	127.071744
3	강남구	개포중학교	3	공립	주간	...	0.019	0	0.000	37.480439	127.062201
4	서초구	경원중학교	3	공립	주간	...	0.010	0	0.000	37.510750	127.008900

 

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   지역      415 non-null    object 
 1   학교명     415 non-null    object 
 2   코드      415 non-null    int64  
 3   유형      415 non-null    object 
 4   주야      415 non-null    object 
 5   남학생수    415 non-null    int64  
 6   여학생수    415 non-null    int64  
 7   일반고     415 non-null    float64
 8   특성화고    415 non-null    float64
 9   과학고     415 non-null    float64
 10  외고_국제고  415 non-null    float64
 11  예고_체고   415 non-null    float64
 12  마이스터고   415 non-null    float64
 13  자사고     415 non-null    float64
 14  자공고     415 non-null    float64
 15  기타진학    415 non-null    float64
 16  취업      415 non-null    int64  
 17  미상      415 non-null    float64
 18  위도      415 non-null    float64
 19  경도      415 non-null    float64
dtypes: float64(12), int64(4), object(4)
memory usage: 65.0+ KB

 

# Plot every middle school on a Seoul-centered map as a circle marker.
import folium
import json

mschool_map = folium.Map(location=[37.55, 126.98], zoom_start=12)
for name, lat, lng in zip(df.학교명, df.위도, df.경도):
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        color='brown',
        fill=True,
        fill_color='coral',
        fill_opacity=0.7,
        popup=name,       # shown on click
        tooltip=name,     # shown on hover
    )
    marker.add_to(mschool_map)
mschool_map.save('./seoul_mschool_loca.html')

seoul_mschool_loca.html
0.48MB

# 전처리 : 지역, 유형, 주야 컬럼 원핫인코디변환
df['코드'].unique()

# array([3, 5, 9], dtype=int64)

 

from sklearn import preprocessing as pp
label_encoder = pp.LabelEncoder()
# 문자열 => 수치형, 숫자의 크기저오는 의미없음, 단순 종류표시
label_location = label_encoder.fit_transform(df['지역']) 
label_code = label_encoder.fit_transform(df['코드']) 
label_type = label_encoder.fit_transform(df['유형']) 
label_day = label_encoder.fit_transform(df['주야'])

 

# onehot_encoder = pp.OneHotEncoder()
df['location'] = label_location

 

df['location'] = label_location
df['type'] = label_type
df['code'] = label_code
df['day'] = label_day
df.head()
	지역	학교명	코드	유형	주야	...	경도	location	type	code	day
0	성북구	서울대학교사범대학부설중학교	3	국립	주간	...	127.038909	16	1	0	0
1	종로구	서울대학교사범대학부설여자중학교	3	국립	주간	...	127.003857	22	1	0	0
2	강남구	개원중학교	3	공립	주간	...	127.071744	0	0	0	0
3	강남구	개포중학교	3	공립	주간	...	127.062201	0	0	0	0
4	서초구	경원중학교	3	공립	주간	...	127.008900	14	0	0	0
5 rows × 24 columns

 

label_location
array([16, 22,  0,  0, 14,  0,  0,  0,  0,  0,  0,  0,  0, 14, 14, 14,  0,
       14, 14, 14, 14, 14,  0,  0,  0, 14, 14,  0, 14,  0,  0,  0, 14, 14,
        0, 14,  0,  0,  0,  0, 17, 17,  1, 17,  1,  1,  1,  1,  1, 17, 17,
       17, 17,  1, 17, 17,  1, 17,  1,  1, 17, 17,  1,  1, 17, 17, 17, 17,
       17, 17, 17, 17, 17, 17,  1,  1, 17, 17,  1,  1, 18,  3,  3,  3, 18,
        3,  3,  3,  3,  3,  3, 18, 18,  3,  3,  3, 18,  3,  3,  3, 18, 18,
       18, 18, 18,  3, 18, 18, 18, 18, 18, 18,  3, 18, 18,  3,  3,  7,  6,
        6,  6,  6,  6,  7, 19, 19, 19, 19,  7, 19,  7,  7,  7,  7,  6,  7,
       19, 19, 19, 19,  6,  6, 19,  6,  6,  6,  6, 19,  7, 10, 10, 10, 10,
       10, 24, 24, 24, 24, 10, 24, 10, 24, 24, 24, 24, 24, 10, 10, 10, 10,
       10, 24, 24, 10, 24, 24, 10, 10, 11, 11,  4,  4, 11,  4,  4,  4, 11,
        4, 11, 11, 11, 11,  4,  4,  4, 11, 11, 11,  4, 11,  4,  4,  4,  4,
       11,  4, 11, 11,  8,  8,  9,  8,  8,  8,  9,  9,  9,  9,  8,  8,  8,
        8,  8,  8,  9,  8,  9,  9,  8,  8,  8,  8,  8,  8,  9,  8,  8,  8,
        9,  9,  9,  8,  8,  8,  8, 12, 12, 21, 21, 21, 12, 13, 13, 21, 21,
       13, 12, 21, 21, 12, 12, 12, 12, 21, 12, 13, 12, 13, 21, 21, 21, 13,
       21, 21, 21, 13, 13, 13, 12, 13, 21, 21, 13, 13, 12,  5, 15,  5,  5,
        5,  5, 15,  5,  5, 15,  5, 15, 15, 15,  5, 15,  5,  5, 15, 15,  2,
       16, 16, 16, 16,  2, 16, 16,  2, 16, 16,  2,  2,  2,  2,  2, 16, 16,
        2, 16, 16,  2, 16, 16,  2, 22, 23, 23, 22, 22, 23, 22, 20, 22, 20,
       22, 20, 20, 11, 20, 20, 20, 20, 23, 23, 22, 23, 22, 20, 23, 23, 17,
        2,  8,  4, 15, 15, 16,  5,  3,  9, 12,  3, 21, 18,  2, 13, 17,  1,
        1, 21, 12,  6, 13, 16,  3, 16,  0, 17, 22, 12, 22,  3, 14,  0,  4,
        5,  8, 11,  2,  9,  4,  8,  2,  6,  6, 13,  0,  1,  1, 17,  2, 21,
       22, 16,  0,  7,  5, 23,  8])

 

#
from sklearn import cluster
#분석에 사용할 속성을 선택( 과고, 외고, 자사고)
columns_list = [9,10,13]
x = df.iloc[:,columns_list]
x = pp.StandardScaler().fit(x).transform(x)
print(x[:5])

[[ 2.02375287 -0.57972902  1.84751715]
 [-0.65047921  1.84782097 -0.48039958]
 [ 0.68663683 -0.14623795  0.11423133]
 [ 1.28091062 -0.05953974 -0.20206171]
 [ 0.38949993 -0.31963438  2.54336183]]

 

# DBSCAN model: density-based clustering.
# eps is the neighbourhood radius; min_samples=5 means a point needs at
# least 5 neighbours within eps to seed a cluster (-1 marks noise).
dbm = cluster.DBSCAN(eps=0.2, min_samples=5)
# fit() returns the estimator itself, so chain straight to labels_.
cluster_label = dbm.fit(x).labels_
print(cluster_label)

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1 -1 -1 -1  2 -1  0 -1
 -1 -1 -1 -1  0 -1 -1 -1 -1 -1  0  3 -1 -1 -1 -1 -1 -1 -1  0 -1 -1  1  0
 -1 -1 -1  0 -1 -1 -1 -1  0 -1  0  0 -1 -1  0 -1 -1 -1  0  0 -1 -1  0 -1
 -1 -1  0 -1 -1 -1  0  2  0  0  0  0  0 -1 -1 -1  0 -1  0 -1 -1  0 -1  0
 -1  0  0 -1 -1 -1 -1  1  0 -1  0  0 -1 -1 -1  0 -1 -1 -1 -1 -1  0  1 -1
 -1  0  2  0 -1 -1  1 -1 -1 -1  0  0  0 -1 -1  0 -1 -1 -1  0  0 -1 -1 -1
 -1  0 -1 -1 -1  0 -1 -1 -1  0 -1  0  0 -1 -1 -1 -1 -1  0 -1  0  0 -1 -1
 -1 -1 -1  0 -1 -1 -1  1  0  3  1 -1  0  0 -1  0 -1 -1  0  0  2 -1 -1  3
  0  0 -1 -1 -1 -1  0 -1  0  0 -1  0  0  0 -1 -1  0 -1 -1 -1 -1 -1  2  0
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1  0 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1  0  0 -1 -1  0 -1  3  0  2 -1 -1
 -1 -1  0 -1 -1 -1  0 -1  0  0 -1 -1 -1 -1 -1  1 -1  0  1 -1  0  0  1 -1
  2 -1  0 -1 -1 -1 -1  0 -1 -1  1  0 -1  0 -1 -1  0  3  0 -1 -1 -1  2 -1
 -1 -1 -1  0  0  0  1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1  0 -1  0 -1 -1  0  0
 -1 -1 -1  0 -1  0 -1 -1  0 -1 -1 -1  0  1 -1 -1 -1  0  1  1  1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  0 -1 -1 -1  0 -1  0
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  0]

 

# Attach each school's DBSCAN label as a new 'Cluster' column
# (-1 = noise: not assigned to any dense cluster).
df['Cluster'] = cluster_label

df.head()

	지역	학교명	코드	유형	주야	...	location	type	code	day	Cluster
0	성북구	서울대학교사범대학부설중학교	3	국립	주간	...	16	1	0	0	-1
1	종로구	서울대학교사범대학부설여자중학교	3	국립	주간	...	22	1	0	0	-1
2	강남구	개원중학교	3	공립	주간	...	0	0	0	0	-1
3	강남구	개포중학교	3	공립	주간	...	0	0	0	0	-1
4	서초구	경원중학교	3	공립	주간	...	14	0	0	0	-1

 

# Group schools by their DBSCAN cluster label.
# Cluster -1 is the noise group: points that belong to no cluster.
grouped = df.groupby('Cluster')
# Numeric column totals per cluster for a quick size/contrast overview.
grouped.sum()

	코드	남학생수	여학생수	일반고	특성화고	...	경도	location	type	code	day
Cluster											
-1	765	38505	30866	170.996	35.234	...	32395.479457	2877	142	0	0
0	312	10314	13927	69.275	21.253	...	12956.408362	1124	64	2	0
1	211	1790	1891	9.968	3.613	...	5715.272378	489	53	34	0
2	24	1174	1069	5.268	1.157	...	1016.535379	60	2	0	0
3	15	728	459	3.071	0.862	...	634.912972	49	0	0	0
5 rows × 20 columns

 

# Inspect each cluster: its label, its size, and a sample of the
# descriptive columns plus the three features used for clustering.
preview_cols = [0, 1, 3, 9, 10, 13]
for label, members in grouped:
    print("* key :", label)
    print("* g :", len(members))
    print(members.iloc[:, preview_cols].head())
    print('\n')
    
* key : -1
* g : 255
    지역               학교명  유형    과학고  외고_국제고    자사고
0  성북구    서울대학교사범대학부설중학교  국립  0.018   0.007  0.227
1  종로구  서울대학교사범대학부설여자중학교  국립  0.000   0.035  0.043
2  강남구             개원중학교  공립  0.009   0.012  0.090
3  강남구             개포중학교  공립  0.013   0.013  0.065
4  서초구             경원중학교  공립  0.007   0.010  0.282


* key : 0
* g : 102
     지역      학교명  유형  과학고  외고_국제고    자사고
13  서초구  동덕여자중학교  사립  0.0   0.022  0.038
22  강남구    수서중학교  공립  0.0   0.019  0.044
28  서초구    언남중학교  공립  0.0   0.015  0.050
34  강남구    은성중학교  사립  0.0   0.016  0.065
43  송파구    거원중학교  공립  0.0   0.021  0.054


* key : 1
* g : 45
       지역      학교명  유형  과학고  외고_국제고    자사고
46    강동구    동신중학교  사립  0.0     0.0  0.044
103   양천구    신원중학교  공립  0.0     0.0  0.006
118   구로구    개봉중학교  공립  0.0     0.0  0.012
126  영등포구    대림중학교  공립  0.0     0.0  0.050
175   중랑구  혜원여자중학교  사립  0.0     0.0  0.004


* key : 2
* g : 8
      지역    학교명  유형    과학고  외고_국제고    자사고
20   서초구  서초중학교  공립  0.003   0.013  0.085
79   강동구  한영중학교  사립  0.004   0.011  0.077
122  구로구  구일중학교  공립  0.004   0.012  0.079
188  동작구  대방중학교  공립  0.003   0.015  0.076
214  도봉구  도봉중학교  공립  0.004   0.011  0.072


* key : 3
* g : 5
       지역    학교명  유형  과학고  외고_국제고    자사고
35    서초구  이수중학교  공립  0.0   0.004  0.100
177  동대문구  휘경중학교  공립  0.0   0.004  0.094
191   동작구  문창중학교  공립  0.0   0.004  0.084
259   마포구  성사중학교  공립  0.0   0.004  0.078
305   강북구  강북중학교  공립  0.0   0.004  0.088

 

# Map colouring: one colour per DBSCAN cluster label (-1 = noise, gray).
# BUG FIX: 'brick' is not a valid CSS colour name, so a cluster labelled 7
# would render with an invalid colour; 'firebrick' is the valid keyword.
colors = {-1 : 'gray', 0 : 'coral', 1 : 'blue', 2 : 'green', 3 : 'red',
          4 : 'purple', 5 : 'orange', 6 : 'brown', 7 : 'firebrick',
          8 : 'yellow', 9 : 'magenta', 10 : 'cyan', 11 : 'tan'}
# Draw every school on a Seoul-centred map, coloured by its cluster.
cluster_map = folium.Map(location=[37.55, 126.98], zoom_start=12)
for name, lat, lng, clus in zip(df.학교명, df.위도, df.경도, df.Cluster) :
    folium.CircleMarker([lat, lng],
                       radius = 5,
                       color=colors[clus],
                       fill=True,
                       fill_color=colors[clus],
                       fill_opacity = 0.7,
                        popup = name,      # shown on click
                        tooltip=name).add_to(cluster_map)
cluster_map.save('seoul_school_cluster.html')

seoul_school_cluster.html
0.48MB

 

# Second DBSCAN run: the same admission-ratio features plus the encoded
# school-type column ('code', integer position 22).
col_list2 = [9,10,13,22]
x2 = df.iloc[:, col_list2]
print(x2[:5])
# fit_transform is the one-step equivalent of fit(...) then transform(...).
x2 = pp.StandardScaler().fit_transform(x2)
dbm2 = cluster.DBSCAN(eps=0.2, min_samples=5)
df['Cluster2'] = dbm2.fit(x2).labels_
grouped2_cols = [0,1,3] + col_list2
grouped2_cols

     과학고  외고_국제고    자사고  code
0  0.018   0.007  0.227     0
1  0.000   0.035  0.043     0
2  0.009   0.012  0.090     0
3  0.013   0.013  0.065     0
4  0.007   0.010  0.282     0

[0, 1, 3, 9, 10, 13, 22]

 

df['Cluster2'].value_counts()

-1    260
 0    101
 4     26
 1     15
 2      8
 3      5
Name: Cluster2, dtype: int64

 

# Preview each cluster from the second run: label, size, sample rows.
grouped2 = df.groupby('Cluster2')
show_cols = [0, 1, 3, 9, 10, 13]
for key, group in grouped2:
    print("* key :", key)
    print("* g :", len(group))
    print(group.iloc[:, show_cols].head())
    print('\n')
    
* key : -1
* g : 260
    지역               학교명  유형    과학고  외고_국제고    자사고
0  성북구    서울대학교사범대학부설중학교  국립  0.018   0.007  0.227
1  종로구  서울대학교사범대학부설여자중학교  국립  0.000   0.035  0.043
2  강남구             개원중학교  공립  0.009   0.012  0.090
3  강남구             개포중학교  공립  0.013   0.013  0.065
4  서초구             경원중학교  공립  0.007   0.010  0.282


* key : 0
* g : 101
     지역      학교명  유형  과학고  외고_국제고    자사고
13  서초구  동덕여자중학교  사립  0.0   0.022  0.038
22  강남구    수서중학교  공립  0.0   0.019  0.044
28  서초구    언남중학교  공립  0.0   0.015  0.050
34  강남구    은성중학교  사립  0.0   0.016  0.065
43  송파구    거원중학교  공립  0.0   0.021  0.054


* key : 1
* g : 15
       지역      학교명  유형  과학고  외고_국제고    자사고
46    강동구    동신중학교  사립  0.0     0.0  0.044
103   양천구    신원중학교  공립  0.0     0.0  0.006
118   구로구    개봉중학교  공립  0.0     0.0  0.012
126  영등포구    대림중학교  공립  0.0     0.0  0.050
175   중랑구  혜원여자중학교  사립  0.0     0.0  0.004


* key : 2
* g : 8
      지역    학교명  유형    과학고  외고_국제고    자사고
20   서초구  서초중학교  공립  0.003   0.013  0.085
79   강동구  한영중학교  사립  0.004   0.011  0.077
122  구로구  구일중학교  공립  0.004   0.012  0.079
188  동작구  대방중학교  공립  0.003   0.015  0.076
214  도봉구  도봉중학교  공립  0.004   0.011  0.072


* key : 3
* g : 5
       지역    학교명  유형  과학고  외고_국제고    자사고
35    서초구  이수중학교  공립  0.0   0.004  0.100
177  동대문구  휘경중학교  공립  0.0   0.004  0.094
191   동작구  문창중학교  공립  0.0   0.004  0.084
259   마포구  성사중학교  공립  0.0   0.004  0.078
305   강북구  강북중학교  공립  0.0   0.004  0.088


* key : 4
* g : 26
      지역     학교명  유형  과학고  외고_국제고  자사고
384  종로구   서울농학교  국립  0.0     0.0  0.0
385  마포구  한국우진학교  국립  0.0     0.0  0.0
386  종로구   서울맹학교  국립  0.0     0.0  0.0
387  강서구    교남학교  사립  0.0     0.0  0.0
388  서초구   다니엘학교  사립  0.0     0.0  0.0

 

# Plot the second clustering (Cluster2) on its own Seoul map.
cluster2_map = folium.Map(location=[37.55, 126.98], zoom_start=12)
for name, lat, lng, clus in zip(df.학교명, df.위도, df.경도, df.Cluster2):
    marker = folium.CircleMarker([lat, lng],
                                 radius=5,
                                 color=colors[clus],
                                 fill=True,
                                 fill_color=colors[clus],
                                 fill_opacity=0.7,
                                 popup=name,
                                 tooltip=name)
    marker.add_to(cluster2_map)
cluster2_map.save('./seoul_mschool_cluster2.html')

seoul_mschool_cluster2.html
0.48MB

 

 

반응형
728x90
반응형

# 비지도 학습 : 데이터셋 내부에 정답 없음
관측값을 몇개의 집단으로 나눔. 정답이 없는 상태에서 데이터의 유사성으로 판단 => 군집
군집 : kmeans : 데이터간의 유사성을 측정하는 기준으로 클러스터의 중심까지의 거리 이용.

import pandas as pd
import matplotlib.pyplot as plt
# UCI "Wholesale customers" data: each row is a customer's annual
# spending broken down by product category.
uci_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/\
00292/Wholesale%20customers%20data.csv'
df = pd.read_csv(uci_path, header = 0)
df.head()

	Channel	Region	Fresh	Milk	Grocery	Frozen	Detergents_Paper	Delicassen
0	2	3	12669	9656	7561	214	2674	1338
1	2	3	7057	9810	9568	1762	3293	1776
2	2	3	6353	8808	7684	2405	3516	7844
3	1	3	13265	1196	4221	6404	507	1788
4	2	3	22615	5410	7198	3915	1777	5185

 

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int64
 1   Region            440 non-null    int64
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Delicassen        440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB

 

# Use every column (including Channel and Region) as clustering input.
x = df.iloc[:,:]
# Standardize: z-score each feature so k-means distances are comparable.
from sklearn import preprocessing
x = preprocessing.StandardScaler().fit(x).transform(x)
x[:5]

array([[ 1.44865163,  0.59066829,  0.05293319,  0.52356777, -0.04111489,
        -0.58936716, -0.04356873, -0.06633906],
       [ 1.44865163,  0.59066829, -0.39130197,  0.54445767,  0.17031835,
        -0.27013618,  0.08640684,  0.08915105],
       [ 1.44865163,  0.59066829, -0.44702926,  0.40853771, -0.0281571 ,
        -0.13753572,  0.13323164,  2.24329255],
       [-0.69029709,  0.59066829,  0.10011141, -0.62401993, -0.3929769 ,
         0.6871443 , -0.49858822,  0.09341105],
       [ 1.44865163,  0.59066829,  0.84023948, -0.05239645, -0.07935618,
         0.17385884, -0.23191782,  1.29934689]])

 

from sklearn import cluster
kms = cluster.KMeans(init = 'k-means++', n_clusters=5, n_init=10)
# init='k-means++': spread-out initial centroids, no manual seeding.
# n_clusters=5: partition the data into 5 clusters.
# n_init=10: run the algorithm 10 times from different initial centroids
# and keep the best result (lowest inertia).
kms.fit(x)
cluster_label= kms.labels_
print(cluster_label)

[3 3 3 1 3 3 3 3 1 3 3 3 3 3 3 1 3 1 3 1 3 1 1 2 3 3 1 1 3 1 1 1 1 1 1 3 1
 3 3 1 1 1 3 3 3 3 3 4 3 3 1 1 3 3 1 1 4 3 1 1 3 4 3 3 1 4 1 3 1 1 1 2 1 3
 3 1 1 3 1 1 1 3 3 1 3 4 4 2 1 1 1 1 4 1 3 1 3 1 1 1 3 3 3 1 1 1 3 3 3 3 1
 3 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1
 1 1 1 1 1 1 1 3 3 1 3 3 3 1 1 3 3 3 3 1 1 1 3 3 1 3 1 3 1 1 1 1 1 2 1 2 1
 1 1 1 3 3 1 1 1 3 1 1 0 3 0 0 3 3 0 0 0 3 0 0 0 3 0 4 0 0 3 0 3 0 3 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 4 0 0 0 0 0 0 0
 0 0 0 0 0 3 0 3 0 3 0 0 0 0 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 1 1 1 1 1 3 0 3
 0 3 3 0 3 3 3 3 3 3 3 0 0 3 0 0 3 0 0 3 0 0 0 3 0 0 0 0 0 2 0 0 0 0 0 3 0
 4 0 3 0 0 0 0 3 3 1 3 1 1 3 3 1 3 1 3 1 3 1 1 1 3 1 1 1 1 1 1 1 3 1 1 1 1
 3 1 1 3 1 1 3 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1
 3 3 1 1 1 1 1 1 3 3 1 3 1 1 3 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1]

 

# Record each customer's k-means cluster label on the DataFrame.
df['Cluster'] = cluster_label
df.head()

	Channel	Region	Fresh	Milk	Grocery	Frozen	Detergents_Paper	Delicassen	Cluster
0	2	3	12669	9656	7561	214	2674	1338	3
1	2	3	7057	9810	9568	1762	3293	1776	3
2	2	3	6353	8808	7684	2405	3516	7844	3
3	1	3	13265	1196	4221	6404	507	1788	1
4	2	3	22615	5410	7198	3915	1777	5185	3

 

# Visualize the clusters on two feature pairs, coloured by cluster label.
scatter_opts = dict(kind='scatter', c='Cluster', cmap='Set1', figsize=(10, 10))
df.plot(x='Grocery', y='Frozen', colorbar=False, **scatter_opts)
df.plot(x='Milk', y='Delicassen', colorbar=True, **scatter_opts)
plt.show()

 

반응형
728x90
반응형

# Decision Tree
# node 분기점 : 분석되는 설명변수

 

from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd
import numpy as np

# UCI Breast Cancer Wisconsin dataset; the raw file has no header row.
uci_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
df = pd.read_csv(uci_path, header=None)
df.head()


0	1	2	3	4	5	6	7	8	9	10
0	1000025	5	1	1	1	2	1	3	1	1	2
1	1002945	5	4	4	5	7	10	3	2	1	2
2	1015425	3	1	1	1	2	2	3	1	1	2
3	1016277	6	8	8	1	3	4	3	7	1	2
4	1017023	4	1	1	3	2	1	3	1	1	2

id : id번호
clump : 덩어리 두께
cell_size : 암세포 크기
cell_shape : 세포모양
adhesion : 한계
epithlial : 상피세포크기
bare_nuclei : 베어핵
chromatin : 염색질
normal_nucleoli : 정상세포
mitoses : 유사분열
class : 양성 음성

 

# Assign descriptive column names (the source file is headerless);
# meanings are listed in the notes above this cell.
df.columns = ['id','clump', 'cell_size', 'cell_shape', 'adhesion', 'epithlial', \
              'bare_nuclei','chromatin', 'normal_nucleoli', 'mitoses', 'class']
df.head()

	id	clump	cell_size	cell_shape	adhesion	epithlial	bare_nuclei	chromatin	normal_nucleoli	mitoses	class
0	1000025	5	1	1	1	2	1	3	1	1	2
1	1002945	5	4	4	5	7	10	3	2	1	2
2	1015425	3	1	1	1	2	2	3	1	1	2
3	1016277	6	8	8	1	3	4	3	7	1	2
4	1017023	4	1	1	3	2	1	3	1	1	2

 

df['class'].value_counts()
2    458
4    241
Name: class, dtype: int64

 

df['bare_nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

 

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               699 non-null    int64 
 1   clump            699 non-null    int64 
 2   cell_size        699 non-null    int64 
 3   cell_shape       699 non-null    int64 
 4   adhesion         699 non-null    int64 
 5   epithlial        699 non-null    int64 
 6   bare_nuclei      699 non-null    object
 7   chromatin        699 non-null    int64 
 8   normal_nucleoli  699 non-null    int64 
 9   mitoses          699 non-null    int64 
 10  class            699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB

 

# Convert the '?' placeholder in bare_nuclei to NaN.
# NOTE(review): redundant — the replace() call in the next cell performs
# the same substitution again.
df.loc[df['bare_nuclei'] == '?', 'bare_nuclei'] = np.nan

 

# Drop rows with unknown bare_nuclei, then make the column numeric.
# FIX: assign the result instead of calling replace(..., inplace=True) on
# the column — chained in-place mutation of a selected column is
# deprecated under pandas copy-on-write and may not modify df at all.
df['bare_nuclei'] = df['bare_nuclei'].replace('?', np.nan)
df.dropna(subset=['bare_nuclei'], axis=0, inplace =True)
df['bare_nuclei'] = df['bare_nuclei'].astype(int)
df.info()
# Values are 1-10, so int32 vs int64 width makes no practical difference.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   id               683 non-null    int64
 1   clump            683 non-null    int64
 2   cell_size        683 non-null    int64
 3   cell_shape       683 non-null    int64
 4   adhesion         683 non-null    int64
 5   epithlial        683 non-null    int64
 6   bare_nuclei      683 non-null    int32
 7   chromatin        683 non-null    int64
 8   normal_nucleoli  683 non-null    int64
 9   mitoses          683 non-null    int64
 10  class            683 non-null    int64
dtypes: int32(1), int64(10)
memory usage: 61.4 KB

 

# Features: all diagnostic columns (drop 'id' and the 'class' target).
x = df.iloc[:,1:-1]
# Target: the last column 'class' (values 2 and 4 per value_counts above).
y = df.iloc[:,-1]
y

0      2
1      2
2      2
3      2
4      2
      ..
694    2
695    2
696    4
697    4
698    4
Name: class, Length: 683, dtype: int64

 

# Standardize features, then hold out 30% of rows for testing.
x = preprocessing.StandardScaler().fit(x).transform(x)
# random_state fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size =0.3, random_state=10)
print(x_train.shape)

[[ 1.97177486  0.6037398   0.59763519 ...  1.4522248   2.00965299
   0.22916583]
 [ 1.26222679  2.23617957  2.2718962  ...  2.67776377  2.33747554
  -0.34839971]
 [ 0.55267873 -0.70221201 -0.74177362 ... -0.18182716 -0.61292736
  -0.34839971]
 ...
 [ 0.19790469 -0.0492361  -0.74177362 ... -0.99885314 -0.61292736
  -0.34839971]
 [-0.51164337 -0.70221201 -0.74177362 ... -0.18182716 -0.61292736
  -0.34839971]
 [ 0.90745276 -0.37572406  0.26278299 ... -0.18182716  0.04271773
  -0.34839971]]

 

tm = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth=5)
# max_depth=5 caps the tree at five levels to limit overfitting.
# Impurity = how mixed the classes are at a node;
# 'entropy' is the impurity measure used to choose the splits.
tm.fit(x_train, y_train)
y_hat = tm.predict(x_test)
print(y_hat[:10])

[4 4 4 4 4 4 2 2 4 4]

 

# Confusion matrix for the decision tree (rows=actual, cols=predicted).
tmetrix = metrics.confusion_matrix(y_test, y_hat)
print(tmetrix)

# [[127   4]
#  [  2  72]]

 

# Per-class precision/recall/F1 summary for the decision tree.
tree_report = metrics.classification_report(y_test, y_hat)
print(tree_report)

              precision    recall  f1-score   support

           2       0.98      0.97      0.98       131
           4       0.95      0.97      0.96        74

    accuracy                           0.97       205
   macro avg       0.97      0.97      0.97       205
weighted avg       0.97      0.97      0.97       205

 

의사결정트리 : 학습데이터에 따라서 생성되는 트리(모형)가 달라지므로 일반화하기 어렵다.
     데이터에 따라 성능의 변동폭이 크다.
        => 이 단점을 보완하기 위한 알고리즘이 랜덤 포레스트(random forest)

 

 

반응형
728x90
반응형

범주형

  • 설명변수 => 목표변수
  • 목표변수가 범주형인 경우 한값에 분류하여 예측
  • 질병진단, 스팸메일 필터링
  • knn k nearest neighbors
import pandas as pd
import seaborn as sns
# Load seaborn's bundled Titanic dataset.
df = sns.load_dataset('titanic')
# Show up to 15 columns when printing frames.
pd.set_option('display.max_columns', 15)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB

 

# Drop 'deck' (only 203/891 non-null per info above) and 'embark_town'.
rdf = df.drop(['deck', 'embark_town'], axis = 1)
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    891 non-null    int64   
 1   pclass      891 non-null    int64   
 2   sex         891 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       891 non-null    int64   
 5   parch       891 non-null    int64   
 6   fare        891 non-null    float64 
 7   embarked    889 non-null    object  
 8   class       891 non-null    category
 9   who         891 non-null    object  
 10  adult_male  891 non-null    bool    
 11  alive       891 non-null    object  
 12  alone       891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 72.4+ KB

 

# Drop rows with missing age (891 -> 714 rows per the info output).
rdf = rdf.dropna(subset=['age'], axis = 0)
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    714 non-null    int64   
 1   pclass      714 non-null    int64   
 2   sex         714 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       714 non-null    int64   
 5   parch       714 non-null    int64   
 6   fare        714 non-null    float64 
 7   embarked    712 non-null    object  
 8   class       714 non-null    category
 9   who         714 non-null    object  
 10  adult_male  714 non-null    bool    
 11  alive       714 non-null    object  
 12  alone       714 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 63.6+ KB

 

# Impute the two missing 'embarked' values with the most frequent port.
most_freq = rdf['embarked'].value_counts(dropna=True).idxmax()
# Equivalent: rdf.groupby('embarked')['embarked'].count().idxmax()
# FIX: assign the result instead of fillna(..., inplace=True) on the
# selected column — chained in-place mutation is deprecated under pandas
# copy-on-write and may silently fail to modify rdf.
rdf['embarked'] = rdf['embarked'].fillna(most_freq)
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    714 non-null    int64   
 1   pclass      714 non-null    int64   
 2   sex         714 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       714 non-null    int64   
 5   parch       714 non-null    int64   
 6   fare        714 non-null    float64 
 7   embarked    714 non-null    object  
 8   class       714 non-null    category
 9   who         714 non-null    object  
 10  adult_male  714 non-null    bool    
 11  alive       714 non-null    object  
 12  alone       714 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 63.6+ KB

 

# Keep only the columns used for modelling.
ndf = rdf[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]
ndf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  714 non-null    int64  
 1   pclass    714 non-null    int64  
 2   sex       714 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     714 non-null    int64  
 5   parch     714 non-null    int64  
 6   embarked  714 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 44.6+ KB

 

ndf.describe()

	survived	pclass	age	sibsp	parch
count	714.000000	714.000000	714.000000	714.000000	714.000000
mean	0.406162	2.236695	29.699118	0.512605	0.431373
std	0.491460	0.838250	14.526497	0.929783	0.853289
min	0.000000	1.000000	0.420000	0.000000	0.000000
25%	0.000000	1.000000	20.125000	0.000000	0.000000
50%	0.000000	2.000000	28.000000	0.000000	0.000000
75%	1.000000	3.000000	38.000000	1.000000	1.000000
max	1.000000	3.000000	80.000000	5.000000	6.000000

# 원핫인코딩 # 범주형데이터를 모형이 인식할 수 있게 숫자형으로 변환

 

# One-hot encode 'sex' into female/male indicator columns.
oh_set = pd.get_dummies(ndf['sex'])
oh_set.head()

	female	male
0	0	1
1	1	0
2	1	0
3	1	0
4	0	1

 

# Append the indicator columns; the original 'sex' column is kept.
ndf = pd.concat([ndf, oh_set], axis = 1)
ndf.head()

	survived	pclass	sex	age	sibsp	parch	embarked	female	male
0	0	3	male	22.0	1	0	S	0	1
1	1	1	female	38.0	1	0	C	1	0
2	1	3	female	26.0	0	0	S	1	0
3	1	1	female	35.0	1	0	S	1	0
4	0	3	male	35.0	0	0	S	0	1

 

# One-hot encode 'embarked' with a 'town_' prefix and append.
oh_embarked = pd.get_dummies(ndf['embarked'], prefix = 'town')
ndf = pd.concat([ndf, oh_embarked], axis = 1)
ndf

	survived	pclass	sex	age	sibsp	parch	embarked	female	male	town_C	town_Q	town_S
0	0	3	male	22.0	1	0	S	0	1	0	0	1
1	1	1	female	38.0	1	0	C	1	0	1	0	0
2	1	3	female	26.0	0	0	S	1	0	0	0	1
3	1	1	female	35.0	1	0	S	1	0	0	0	1
4	0	3	male	35.0	0	0	S	0	1	0	0	1
...	...	...	...	...	...	...	...	...	...	...	...	...
885	0	3	female	39.0	0	5	Q	1	0	0	1	0
886	0	2	male	27.0	0	0	S	0	1	0	0	1
887	1	1	female	19.0	0	0	S	1	0	0	0	1
889	1	1	male	26.0	0	0	C	0	1	1	0	0
890	0	3	male	32.0	0	0	Q	0	1	0	1	0
714 rows × 12 columns

 

# Feature matrix: numeric columns plus the one-hot indicator columns.
x =ndf[['pclass','age','sibsp','parch','female','male','town_C','town_Q','town_S']]
# Target: survival flag (0/1).
y = ndf['survived']
x.head()

	pclass	age	sibsp	parch	female	male	town_C	town_Q	town_S
0	3	22.0	1	0	0	1	0	0	1
1	1	38.0	1	0	1	0	1	0	0
2	3	26.0	0	0	1	0	0	0	1
3	1	35.0	1	0	1	0	0	0	1
4	3	35.0	0	0	0	1	0	0	1

# Normalize the explanatory variables: feature magnitude affects
# distance-based models, and 'age' has a much larger range than the
# one-hot columns, so bring everything onto a common z-score scale.

from sklearn import preprocessing
import numpy as np
# BUG FIX: the original computed the scaled array but never assigned it,
# so the KNN/SVM models downstream trained on unscaled features despite
# the stated intent. Keep the result in x.
x = preprocessing.StandardScaler().fit(x).transform(x)
x[:5]


array([[ 0.91123237, -0.53037664,  0.52457013, ..., -0.47180795,
        -0.20203051,  0.53307848],
       [-1.47636364,  0.57183099,  0.52457013, ...,  2.11950647,
        -0.20203051, -1.87589641],
       [ 0.91123237, -0.25482473, -0.55170307, ..., -0.47180795,
        -0.20203051,  0.53307848],
       ...,
       [-1.47636364, -0.73704057, -0.55170307, ..., -0.47180795,
        -0.20203051,  0.53307848],
       [-1.47636364, -0.25482473, -0.55170307, ...,  2.11950647,
        -0.20203051, -1.87589641],
       [ 0.91123237,  0.15850313, -0.55170307, ..., -0.47180795,
         4.94974747, -1.87589641]])

 

from sklearn.model_selection import train_test_split
# 70/30 split; random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=10)
x_train.shape

# (499, 9)

 

# k-nearest-neighbours classifier: each test sample is predicted by a
# majority vote of its 5 closest training samples.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
y_hat = knn.fit(x_train, y_train).predict(x_test)
# Compare the first few predictions with the true labels.
print(y_hat[:10])
print(y_test[:10])

[0 0 1 0 0 1 0 1 0 0]
728    0
555    0
426    1
278    0
617    0
751    1
576    1
679    1
567    0
117    0
Name: survived, dtype: int64

 

# Performance evaluation: confusion matrix
# (rows = actual class, columns = predicted class).
from sklearn import metrics
knn_matrix = metrics.confusion_matrix(y_test, y_hat)
print(knn_matrix)


# [[111  14]
#  [ 29  61]]

precision 정밀도

  • True로 예측한 것 중 실제 True인 TP의 비율
  • 정밀도가 높은 것은 예측 True, 실제 False인 FP 오류가 작은 경우

    Recall 재현율

  • 실제값이 True인 분석대상 중 True로 예측한 비율
  • 재현율이 높은 것은 FN 오류가 낮은 경우

    F1 score

  • 정밀도와 재현율의 조화 평균을 계산한 값
  • 모형의 예측력을 평가하는 지표
# Per-class precision/recall/F1 summary for the KNN model.
knn_report = metrics.classification_report(y_test, y_hat)
print(knn_report)

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       125
           1       0.81      0.68      0.74        90

    accuracy                           0.80       215
   macro avg       0.80      0.78      0.79       215
weighted avg       0.80      0.80      0.80       215

accuracy : 정확도 macro avg : 단순평균
weighted avg : 가중평균, 표본의 갯수로 가중평균

 

# SVM: support vector machine classifier.
from sklearn import svm
# kernel='rbf' is applied here.
# A kernel maps samples into a higher-dimensional vector space;
# available kernels include:
#   rbf = radial basis function
#   linear
#   polynomial ('poly')
#   sigmoid
svm_model = svm.SVC(kernel='rbf')
svm_model.fit(x_train, y_train)
y_hat = svm_model.predict(x_test)
print(y_hat[0:10])

# [0 0 0 0 0 1 0 0 0 0]

 

# Confusion matrix for the SVM predictions.
from sklearn import metrics
svm_matrix = metrics.confusion_matrix(y_test, y_hat)
print(svm_matrix)

# [[118   7]
#  [ 79  11]]

 

# Per-class precision/recall/F1 summary for the SVM model.
svm_report = metrics.classification_report(y_test, y_hat)
print(svm_report)

              precision    recall  f1-score   support

           0       0.60      0.94      0.73       125
           1       0.61      0.12      0.20        90

    accuracy                           0.60       215
   macro avg       0.61      0.53      0.47       215
weighted avg       0.60      0.60      0.51       215

 

 

반응형
728x90
반응형

기계학습 각각변수들의 관계를 찾는 과정

  • 예측:회귀분석
  • 분류:knn
  • 군집:Kmeans
  • 머신러닝 프로세스 -> 데이터 분리 -> 알고리즘 준비-> 모형학습 -> 예측 -> 평가 -> 활용

# 회귀분석 : 가격, 매출, 주가 등 연속성 데이터 예측 알고리즘 import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns df = pd.read_csv('auto-mpg.csv', header = None) df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower','weight','acceleration', 'model year', 'origin', 'name'] df mpg cylinders displacement horsepower weight acceleration model year origin name 0 18.0 8 307.0 130.0 3504.0 12.0 70 1 chevrolet chevelle malibu 1 15.0 8 350.0 165.0 3693.0 11.5 70 1 buick skylark 320 2 18.0 8 318.0 150.0 3436.0 11.0 70 1 plymouth satellite 3 16.0 8 304.0 150.0 3433.0 12.0 70 1 amc rebel sst 4 17.0 8 302.0 140.0 3449.0 10.5 70 1 ford torino ... ... ... ... ... ... ... ... ... ... 393 27.0 4 140.0 86.00 2790.0 15.6 82 1 ford mustang gl 394 44.0 4 97.0 52.00 2130.0 24.6 82 2 vw pickup 395 32.0 4 135.0 84.00 2295.0 11.6 82 1 dodge rampage 396 28.0 4 120.0 79.00 2625.0 18.6 82 1 ford ranger 397 31.0 4 119.0 82.00 2720.0 19.4 82 1 chevy s-10 398 rows × 9 columns

print(df.info()) <class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 398 non-null float64 1 cylinders 398 non-null int64 2 displacement 398 non-null float64 3 horsepower 398 non-null object 4 weight 398 non-null float64 5 acceleration 398 non-null float64 6 model year 398 non-null int64 7 origin 398 non-null int64 8 name 398 non-null object dtypes: float64(4), int64(3), object(2) memory usage: 28.1+ KB None

print(df.describe()) mpg cylinders displacement weight acceleration \ count 398.000000 398.000000 398.000000 398.000000 398.000000 mean 23.514573 5.454774 193.425879 2970.424623 15.568090 std 7.815984 1.701004 104.269838 846.841774 2.757689 min 9.000000 3.000000 68.000000 1613.000000 8.000000 25% 17.500000 4.000000 104.250000 2223.750000 13.825000 50% 23.000000 4.000000 148.500000 2803.500000 15.500000 75% 29.000000 8.000000 262.000000 3608.000000 17.175000 max 46.600000 8.000000 455.000000 5140.000000 24.800000 model year origin count 398.000000 398.000000 mean 76.010050 1.572864 std 3.697627 0.802055 min 70.000000 1.000000 25% 73.000000 1.000000 50% 76.000000 1.000000 75% 79.000000 2.000000 max 82.000000 3.000000

print(df.horsepower.unique()) ['130.0' '165.0' '150.0' '140.0' '198.0' '220.0' '215.0' '225.0' '190.0' '170.0' '160.0' '95.00' '97.00' '85.00' '88.00' '46.00' '87.00' '90.00' '113.0' '200.0' '210.0' '193.0' '?' '100.0' '105.0' '175.0' '153.0' '180.0' '110.0' '72.00' '86.00' '70.00' '76.00' '65.00' '69.00' '60.00' '80.00' '54.00' '208.0' '155.0' '112.0' '92.00' '145.0' '137.0' '158.0' '167.0' '94.00' '107.0' '230.0' '49.00' '75.00' '91.00' '122.0' '67.00' '83.00' '78.00' '52.00' '61.00' '93.00' '148.0' '129.0' '96.00' '71.00' '98.00' '115.0' '53.00' '81.00' '79.00' '120.0' '152.0' '102.0' '108.0' '68.00' '58.00' '149.0' '89.00' '63.00' '48.00' '66.00' '139.0' '103.0' '125.0' '133.0' '138.0' '135.0' '142.0' '77.00' '62.00' '132.0' '84.00' '64.00' '74.00' '116.0' '82.00']

# df.loc[df['horsepower']=='?','horsepower'] df['horsepower'].replace('?', np.nan, inplace = True) print(df.horsepower.unique()) ['130.0' '165.0' '150.0' '140.0' '198.0' '220.0' '215.0' '225.0' '190.0' '170.0' '160.0' '95.00' '97.00' '85.00' '88.00' '46.00' '87.00' '90.00' '113.0' '200.0' '210.0' '193.0' nan '100.0' '105.0' '175.0' '153.0' '180.0' '110.0' '72.00' '86.00' '70.00' '76.00' '65.00' '69.00' '60.00' '80.00' '54.00' '208.0' '155.0' '112.0' '92.00' '145.0' '137.0' '158.0' '167.0' '94.00' '107.0' '230.0' '49.00' '75.00' '91.00' '122.0' '67.00' '83.00' '78.00' '52.00' '61.00' '93.00' '148.0' '129.0' '96.00' '71.00' '98.00' '115.0' '53.00' '81.00' '79.00' '120.0' '152.0' '102.0' '108.0' '68.00' '58.00' '149.0' '89.00' '63.00' '48.00' '66.00' '139.0' '103.0' '125.0' '133.0' '138.0' '135.0' '142.0' '77.00' '62.00' '132.0' '84.00' '64.00' '74.00' '116.0' '82.00']

# 누락 삭제 df['horsepower'].isnull().sum() df.dropna( subset = ['horsepower'], axis = 0, inplace=True) df.info() <class 'pandas.core.frame.DataFrame'> Int64Index: 392 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 392 non-null float64 1 cylinders 392 non-null int64 2 displacement 392 non-null float64 3 horsepower 392 non-null object 4 weight 392 non-null float64 5 acceleration 392 non-null float64 6 model year 392 non-null int64 7 origin 392 non-null int64 8 name 392 non-null object dtypes: float64(4), int64(3), object(2) memory usage: 30.6+ KB

pd.set_option('display.max_columns', 10) print(df.describe()) mpg cylinders displacement weight acceleration \ count 392.000000 392.000000 392.000000 392.000000 392.000000 mean 23.445918 5.471939 194.411990 2977.584184 15.541327 std 7.805007 1.705783 104.644004 849.402560 2.758864 min 9.000000 3.000000 68.000000 1613.000000 8.000000 25% 17.000000 4.000000 105.000000 2225.250000 13.775000 50% 22.750000 4.000000 151.000000 2803.500000 15.500000 75% 29.000000 8.000000 275.750000 3614.750000 17.025000 max 46.600000 8.000000 455.000000 5140.000000 24.800000 model year origin count 392.000000 392.000000 mean 75.979592 1.576531 std 3.683737 0.805518 min 70.000000 1.000000 25% 73.000000 1.000000 50% 76.000000 1.000000 75% 79.000000 2.000000 max 82.000000 3.000000

# 문자열 실수형으로 변환 df['horsepower'] = df['horsepower'].astype(float) # 분석에 활용할 속성 선택, 연비 , 실린더 ndf = df[['mpg','cylinders','horsepower','weight']] ndf.info() <class 'pandas.core.frame.DataFrame'> Int64Index: 392 entries, 0 to 397 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 392 non-null float64 1 cylinders 392 non-null int64 2 horsepower 392 non-null float64 3 weight 392 non-null float64 dtypes: float64(3), int64(1) memory usage: 15.3 KB

# plt 산점도 ndf.plot(kind='scatter', x = 'weight', y='mpg', c= 'coral', s=10, figsize= (10,5)) plt.show()

# sns 산점도 fig = plt.figure(figsize = (10,5)) ax1 = fig.add_subplot(1,2,1) ax2 = fig.add_subplot(1,2,2) sns.regplot(x='weight', y='mpg', data =ndf, ax=ax1) sns.regplot(x='weight', y='mpg', data =ndf, ax=ax2, fit_reg=False) # 회귀선 미표시 plt.show()

# joinplot sns.jointplot(x='weight', y='mpg', data =ndf) sns.jointplot(x='weight', y='mpg', kind = 'reg', data =ndf) plt.show()

# seaborn pairplot sns.pairplot(ndf, kind = 'reg') plt.show()

# --- Simple linear regression: mpg ~ weight ---

# The independent variable must be 2-D (DataFrame), the dependent 1-D (Series).
x = ndf[['weight']]
print(type(x))  # <class 'pandas.core.frame.DataFrame'>
y = ndf['mpg']
print(type(y))  # <class 'pandas.core.series.Series'>

from sklearn.model_selection import train_test_split
# 70/30 split; fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=10)
print(len(x_train))  # 274
print(len(y_train))  # 274

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# Train on the independent variable and its answers (dependent variable).
lr.fit(x_train, y_train)

# Coefficient of determination (R^2) on the held-out test set.
r_square = lr.score(x_test, y_test)
print(r_square)  # 0.6822458558299325

print('기울기 a', lr.coef_)        # slope:     [-0.00775343]
print('절편 b', lr.intercept_)     # intercept: 46.7103662572801

# Compare the distribution of actual vs predicted mpg over the full data.
y_hat = lr.predict(x)
plt.figure(figsize=(10, 5))
ax1 = sns.kdeplot(y, label='y')
ax2 = sns.kdeplot(y_hat, label='y_hat', ax=ax1)
plt.legend()
plt.show()

# Simple regression fits a straight line between two variables;
# polynomial regression fits a curve for potentially higher accuracy.

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
# fit_transform on TRAINING data only: learns the feature layout and
# expands [x] into [1, x, x^2].
x_train_poly = poly.fit_transform(x_train)
print('원데이터 :', x_train.shape)              # (274, 1)
print('2차항 변환 데이터 :', x_train_poly.shape)  # (274, 3)

pr = LinearRegression()
pr.fit(x_train_poly, y_train)
# FIX: use transform (not fit_transform) on the test set, so the
# transformer fitted on the training data is reused rather than refitted.
x_test_poly = poly.transform(x_test)
r_square = pr.score(x_test_poly, y_test)
y_hat_test = pr.predict(x_test_poly)
print('기울기 a', pr.coef_)
print('절편 b', pr.intercept_)
# 기울기 a [ 0.00000000e+00 -1.85768289e-02  1.70491223e-06]
# 절편 b 62.58071221576951

# Scatter: training data vs polynomial predictions on the test set.
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(1, 1, 1)
ax.plot(x_train, y_train, 'o', label='Train Data')
ax.plot(x_test, y_hat_test, 'r+', label='Predicted Value')
ax.legend(loc='best')
plt.xlabel('weight')
plt.ylabel('mpg')
plt.show()

# Distribution of actual vs predicted mpg over the full data set.
x_poly = poly.transform(x)  # FIX: transform, not fit_transform (see above)
y_hat = pr.predict(x_poly)
plt.figure(figsize=(10, 5))
ax1 = sns.kdeplot(y, label='y')
ax2 = sns.kdeplot(y_hat, label='y_hat', ax=ax1)
plt.legend()
plt.show()

# Simple regression: one independent variable.
# Multiple regression: several independent variables.
#   y = b + a1*x1 + a2*x2 + ... + an*xn

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
x = ndf[['cylinders', 'horsepower', 'weight']]  # multiple regressors
y = ndf['mpg']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=10)

lr.fit(x_train, y_train)
r_square = lr.score(x_test, y_test)
print('결정계수 :', r_square)  # 0.6939048496695597  (R^2 on the test set)

print('기울기 a', lr.coef_)        # [-0.60691288 -0.03714088 -0.00522268]
print('절편 b', lr.intercept_)     # 46.41435126963407

# Actual vs predicted distributions on the test split...
y_hat = lr.predict(x_test)
plt.figure(figsize=(10, 5))
ax1 = sns.kdeplot(y_test, label='y_test')
ax2 = sns.kdeplot(y_hat, label='y_hat', ax=ax1)
plt.legend()
plt.show()

# ...and over the full data set.
y_hat = lr.predict(x)
plt.figure(figsize=(10, 5))
ax1 = sns.kdeplot(y, label='y')
ax2 = sns.kdeplot(y_hat, label='y_hat', ax=ax1)
plt.legend()
plt.show()
x = [[10], [5], [9], [7]] y = [100, 50, 90, 77] lr = LinearRegression() lr.fit(x,y) result =lr.predict([[7]]) plt.figure(figsize = (10,5)) ax1 = sns.kdeplot(y, label='y')


반응형

+ Recent posts