27. 프로야구 연봉 예측 분석 || OLS, Heatmap

2021. 11. 24. 15:05

728x90

picher_stats_2017.csv

batter_stats_2017.csv

# 프로야구 연봉 예측
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
picher_file_path = 'picher_stats_2017.csv'
batter_file_path = 'batter_stats_2017.csv'
picher = pd.read_csv(picher_file_path)
batter = pd.read_csv(batter_file_path)
batter.columns

Index(['선수명', '팀명', '경기', '타석', '타수', '안타', '홈런', '득점', '타점', '볼넷', '삼진', '도루',
       'BABIP', '타율', '출루율', '장타율', 'OPS', 'wOBA', 'WAR', '연봉(2018)',
       '연봉(2017)'],
      dtype='object')

pi_fea_df = picher[['승','패','세','홀드','블론','경기','선발','이닝','삼진/9',
                    '볼넷/9','홈런/9','BABIP','LOB%','ERA','RA9-WAR','FIP','kFIP','WAR','연봉(2018)','연봉(2017)']]
pi_fea_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 20 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   승         152 non-null    int64  
 1   패         152 non-null    int64  
 2   세         152 non-null    int64  
 3   홀드        152 non-null    int64  
 4   블론        152 non-null    int64  
 5   경기        152 non-null    int64  
 6   선발        152 non-null    int64  
 7   이닝        152 non-null    float64
 8   삼진/9      152 non-null    float64
 9   볼넷/9      152 non-null    float64
 10  홈런/9      152 non-null    float64
 11  BABIP     152 non-null    float64
 12  LOB%      152 non-null    float64
 13  ERA       152 non-null    float64
 14  RA9-WAR   152 non-null    float64
 15  FIP       152 non-null    float64
 16  kFIP      152 non-null    float64
 17  WAR       152 non-null    float64
 18  연봉(2018)  152 non-null    int64  
 19  연봉(2017)  152 non-null    int64  
dtypes: float64(11), int64(9)
memory usage: 23.9 KB

picher = picher.rename(columns = {'연봉(2018)':'y'})

team_encoding = pd.get_dummies(picher['팀명'])
team_encoding.head()

KIA	KT	LG	NC	SK	두산	롯데	삼성	한화
0	0	0	0	0	1	0	0	0	0
1	0	0	1	0	0	0	0	0	0
2	1	0	0	0	0	0	0	0	0
3	0	0	1	0	0	0	0	0	0
4	0	0	0	0	0	0	1	0	0

picher = pd.concat([picher, team_encoding], axis=1)
picher.head()

	선수명	팀명	승	패	세	홀드	블론	경기	선발	이닝	...	연봉(2017)	KIA	KT	LG	NC	SK	두산	롯데	삼성	한화
0	켈리	SK	16	7	0	0	0	30	30	190.0	...	85000	0	0	0	0	1	0	0	0	0
1	소사	LG	11	11	1	0	0	30	29	185.1	...	50000	0	0	1	0	0	0	0	0	0
2	양현종	KIA	20	6	0	0	0	31	31	193.1	...	150000	1	0	0	0	0	0	0	0	0
3	차우찬	LG	10	7	0	0	0	28	28	175.2	...	100000	0	0	1	0	0	0	0	0	0
4	레일리	롯데	13	7	0	0	0	30	30	187.1	...	85000	0	0	0	0	0	0	1	0	0
5 rows × 31 columns

picher = picher.drop('팀명', axis=1)

x = picher[picher.columns.difference({'선수명','y'})]
y = picher['y']

from sklearn import preprocessing as pp
x = pp.StandardScaler().fit(x).transform(x)

pd.options.mode.chained_assignment = None # 과학적표기방법 안씀
# 정규화 함수
def standard_scaling(df, scale_columns) :
    for col in scale_columns :
        s_mean = df[col].mean()
        s_std = df[col].std()
        df[col] =  df[col].apply(lambda x : (x - s_mean)/s_std)
    return df

pi_fea_df = picher[['승','패','세','홀드','블론','경기','선발','이닝','삼진/9',
                    '볼넷/9','홈런/9','BABIP','LOB%','ERA','RA9-WAR','FIP','kFIP','WAR','연봉(2017)']]
pi_fea_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   승         152 non-null    float64
 1   패         152 non-null    float64
 2   세         152 non-null    float64
 3   홀드        152 non-null    float64
 4   블론        152 non-null    float64
 5   경기        152 non-null    float64
 6   선발        152 non-null    float64
 7   이닝        152 non-null    float64
 8   삼진/9      152 non-null    float64
 9   볼넷/9      152 non-null    float64
 10  홈런/9      152 non-null    float64
 11  BABIP     152 non-null    float64
 12  LOB%      152 non-null    float64
 13  ERA       152 non-null    float64
 14  RA9-WAR   152 non-null    float64
 15  FIP       152 non-null    float64
 16  kFIP      152 non-null    float64
 17  WAR       152 non-null    float64
 18  연봉(2017)  152 non-null    int64  
dtypes: float64(18), int64(1)
memory usage: 22.7 KB

picher_df = standard_scaling(picher,pi_fea_df )

# 정규화된 x
x = picher[picher_df.columns.difference({'선수명','y'})]
x.head()

	BABIP	ERA	FIP	KIA	KT	LG	LOB%	NC	RA9-WAR	SK	...	삼진/9	선발	세	승	연봉(2017)	이닝	패	한화	홀드	홈런/9
0	0.016783	-0.587056	-0.971030	0	0	0	0.446615	0	3.174630	1	...	0.672099	2.452068	-0.306452	3.313623	2.734705	2.645175	1.227145	0	-0.585705	-0.442382
1	-0.241686	-0.519855	-1.061888	0	0	1	-0.122764	0	3.114968	0	...	0.134531	2.349505	-0.098502	2.019505	1.337303	2.547755	2.504721	0	-0.585705	-0.668521
2	-0.095595	-0.625456	-0.837415	1	0	0	0.308584	0	2.973948	0	...	0.109775	2.554632	-0.306452	4.348918	5.329881	2.706808	0.907751	0	-0.585705	-0.412886
3	-0.477680	-0.627856	-0.698455	0	0	1	0.558765	0	2.740722	0	...	0.350266	2.246942	-0.306452	1.760682	3.333592	2.350927	1.227145	0	-0.585705	-0.186746
4	-0.196735	-0.539055	-0.612941	0	0	0	0.481122	0	2.751570	0	...	0.155751	2.452068	-0.306452	2.537153	2.734705	2.587518	1.227145	0	-0.585705	-0.294900
5 rows × 28 columns

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=19)

picher.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 30 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   선수명       152 non-null    object 
 1   승         152 non-null    float64
 2   패         152 non-null    float64
 3   세         152 non-null    float64
 4   홀드        152 non-null    float64
 5   블론        152 non-null    float64
 6   경기        152 non-null    float64
 7   선발        152 non-null    float64
 8   이닝        152 non-null    float64
 9   삼진/9      152 non-null    float64
 10  볼넷/9      152 non-null    float64
 11  홈런/9      152 non-null    float64
 12  BABIP     152 non-null    float64
 13  LOB%      152 non-null    float64
 14  ERA       152 non-null    float64
 15  RA9-WAR   152 non-null    float64
 16  FIP       152 non-null    float64
 17  kFIP      152 non-null    float64
 18  WAR       152 non-null    float64
 19  y         152 non-null    int64  
 20  연봉(2017)  152 non-null    float64
 21  KIA       152 non-null    uint8  
 22  KT        152 non-null    uint8  
 23  LG        152 non-null    uint8  
 24  NC        152 non-null    uint8  
 25  SK        152 non-null    uint8  
 26  두산        152 non-null    uint8  
 27  롯데        152 non-null    uint8  
 28  삼성        152 non-null    uint8  
 29  한화        152 non-null    uint8  
dtypes: float64(19), int64(1), object(1), uint8(9)
memory usage: 26.4+ KB

# ols
import statsmodels.api as sm

x_train = sm.add_constant(x_train)
model = sm.OLS(y_train, x_train).fit()
model.summary()

OLS Regression Results
Dep. Variable:	y	R-squared:	0.928
Model:	OLS	Adj. R-squared:	0.907
Method:	Least Squares	F-statistic:	44.19
Date:	Tue, 20 Jul 2021	Prob (F-statistic):	7.70e-42
Time:	10:11:19	Log-Likelihood:	-1247.8
No. Observations:	121	AIC:	2552.
Df Residuals:	93	BIC:	2630.
Df Model:	27		
Covariance Type:	nonrobust		
coef	std err	t	P>|t|	[0.025	0.975]
const	1.872e+04	775.412	24.136	0.000	1.72e+04	2.03e+04
x1	-1476.1375	1289.136	-1.145	0.255	-4036.106	1083.831
x2	-415.3144	2314.750	-0.179	0.858	-5011.949	4181.320
x3	-9.383e+04	9.4e+04	-0.998	0.321	-2.8e+05	9.28e+04
x4	-485.0276	671.883	-0.722	0.472	-1819.254	849.199
x5	498.2459	695.803	0.716	0.476	-883.480	1879.972
x6	-262.5237	769.196	-0.341	0.734	-1789.995	1264.948
x7	-1371.0060	1559.650	-0.879	0.382	-4468.162	1726.150
x8	-164.7210	760.933	-0.216	0.829	-1675.784	1346.342
x9	3946.0617	2921.829	1.351	0.180	-1856.111	9748.235
x10	269.1233	721.020	0.373	0.710	-1162.679	1700.926
x11	1.024e+04	2523.966	4.057	0.000	5226.545	1.53e+04
x12	7.742e+04	7.93e+04	0.977	0.331	-8e+04	2.35e+05
x13	-2426.3684	2943.799	-0.824	0.412	-8272.169	3419.432
x14	-285.5830	781.560	-0.365	0.716	-1837.606	1266.440
x15	111.1761	758.548	0.147	0.884	-1395.150	1617.502
x16	7587.0753	6254.661	1.213	0.228	-4833.443	2e+04
x17	1266.8570	1238.036	1.023	0.309	-1191.636	3725.350
x18	-972.1837	817.114	-1.190	0.237	-2594.810	650.443
x19	5379.1903	7262.214	0.741	0.461	-9042.128	1.98e+04
x20	-4781.4961	5471.265	-0.874	0.384	-1.56e+04	6083.352
x21	-249.8717	1291.108	-0.194	0.847	-2813.757	2314.014
x22	235.2476	2207.965	0.107	0.915	-4149.333	4619.828
x23	1.907e+04	1266.567	15.055	0.000	1.66e+04	2.16e+04
x24	851.2121	6602.114	0.129	0.898	-1.23e+04	1.4e+04
x25	1297.3310	1929.556	0.672	0.503	-2534.385	5129.047
x26	1199.4709	720.099	1.666	0.099	-230.503	2629.444
x27	-931.9918	1632.526	-0.571	0.569	-4173.865	2309.882
x28	1.808e+04	1.67e+04	1.082	0.282	-1.51e+04	5.13e+04
Omnibus:	28.069	Durbin-Watson:	2.025
Prob(Omnibus):	0.000	Jarque-Bera (JB):	194.274
Skew:	-0.405	Prob(JB):	6.52e-43
Kurtosis:	9.155	Cond. No.	1.23e+16


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 5.36e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

# r_squared 결정계수
독립변수의 변동량으로 설명되는 종속변수의 변동량
상관계수의 제곱과 가다

# adj 수정결정계수
독립변수가 많아지는 경우 결정계수값이 커질수있어, 표본의 크기와 독립변수의 수를 고려하여
다중회귀분석을 수행하는 경우
p>|t| 각피처의 검정통계량(f statistics )이 유의미한지를 나타내는 pvalue 값
p value < 0.05 이면 피처가 회귀분석에 유의미한 피처다
이분석에서는 war 연복2017 한화 3개가 0.05미만
=> 회귀분석에서 유의미한 피처들

x_train = sm.add_constant(x_train)
model = sm.OLS(y_train, x_train).fit()
model.summary()

OLS Regression Results
Dep. Variable:	y	R-squared:	0.928
Model:	OLS	Adj. R-squared:	0.907
Method:	Least Squares	F-statistic:	44.19
Date:	Tue, 20 Jul 2021	Prob (F-statistic):	7.70e-42
Time:	10:53:15	Log-Likelihood:	-1247.8
No. Observations:	121	AIC:	2552.
Df Residuals:	93	BIC:	2630.
Df Model:	27		
Covariance Type:	nonrobust		
coef	std err	t	P>|t|	[0.025	0.975]
const	1.678e+04	697.967	24.036	0.000	1.54e+04	1.82e+04
BABIP	-1481.0173	1293.397	-1.145	0.255	-4049.448	1087.414
ERA	-416.6874	2322.402	-0.179	0.858	-5028.517	4195.143
FIP	-9.414e+04	9.43e+04	-0.998	0.321	-2.81e+05	9.31e+04
KIA	303.1852	2222.099	0.136	0.892	-4109.462	4715.833
KT	3436.0520	2133.084	1.611	0.111	-799.831	7671.935
LG	1116.9978	2403.317	0.465	0.643	-3655.513	5889.509
LOB%	-1375.5383	1564.806	-0.879	0.382	-4482.933	1731.857
NC	1340.5004	2660.966	0.504	0.616	-3943.651	6624.652
RA9-WAR	3959.1065	2931.488	1.351	0.180	-1862.247	9780.460
SK	2762.4237	2243.540	1.231	0.221	-1692.803	7217.650
WAR	1.027e+04	2532.309	4.057	0.000	5243.823	1.53e+04
kFIP	7.767e+04	7.95e+04	0.977	0.331	-8.03e+04	2.36e+05
경기	-2434.3895	2953.530	-0.824	0.412	-8299.515	3430.736
두산	971.9293	2589.849	0.375	0.708	-4170.998	6114.857
롯데	2313.9585	2566.009	0.902	0.370	-2781.627	7409.544
볼넷/9	7612.1566	6275.338	1.213	0.228	-4849.421	2.01e+04
블론	1271.0450	1242.128	1.023	0.309	-1195.576	3737.666
삼성	-946.5092	2482.257	-0.381	0.704	-5875.780	3982.762
삼진/9	5396.9728	7286.221	0.741	0.461	-9072.019	1.99e+04
선발	-4797.3028	5489.352	-0.874	0.384	-1.57e+04	6103.463
세	-250.6977	1295.377	-0.194	0.847	-2823.059	2321.663
승	236.0253	2215.264	0.107	0.915	-4163.049	4635.100
연봉(2017)	1.913e+04	1270.754	15.055	0.000	1.66e+04	2.17e+04
이닝	854.0260	6623.940	0.129	0.898	-1.23e+04	1.4e+04
패	1301.6197	1935.935	0.672	0.503	-2542.763	5146.003
한화	5477.8879	2184.273	2.508	0.014	1140.355	9815.421
홀드	-935.0728	1637.923	-0.571	0.569	-4187.663	2317.518
홈런/9	1.814e+04	1.68e+04	1.082	0.282	-1.52e+04	5.14e+04
Omnibus:	28.069	Durbin-Watson:	2.025
Prob(Omnibus):	0.000	Jarque-Bera (JB):	194.274
Skew:	-0.405	Prob(JB):	6.52e-43
Kurtosis:	9.155	Cond. No.	3.63e+16


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.04e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

plt.rcParams['figure.figsize'] = [20, 16]
plt.rc('font', family = 'Malgun Gothic')
coefs = model.params.tolist()
coefs_series = pd.Series(coefs)
x_labels = model.params.index.tolist()
ax = coefs_series.plot(kind = 'bar')
ax.set_title('feature_coef_graph')
ax.set_xlabel('x_feature')
ax.set_ylabel('coef')
ax.set_xticklabels(x_labels)

[Text(0, 0, 'const'),
 Text(1, 0, 'BABIP'),
 Text(2, 0, 'ERA'),
 Text(3, 0, 'FIP'),
 Text(4, 0, 'KIA'),
 Text(5, 0, 'KT'),
 Text(6, 0, 'LG'),
 Text(7, 0, 'LOB%'),
 Text(8, 0, 'NC'),
 Text(9, 0, 'RA9-WAR'),
 Text(10, 0, 'SK'),
 Text(11, 0, 'WAR'),
 Text(12, 0, 'kFIP'),
 Text(13, 0, '경기'),
 Text(14, 0, '두산'),
 Text(15, 0, '롯데'),
 Text(16, 0, '볼넷/9'),
 Text(17, 0, '블론'),
 Text(18, 0, '삼성'),
 Text(19, 0, '삼진/9'),
 Text(20, 0, '선발'),
 Text(21, 0, '세'),
 Text(22, 0, '승'),
 Text(23, 0, '연봉(2017)'),
 Text(24, 0, '이닝'),
 Text(25, 0, '패'),
 Text(26, 0, '한화'),
 Text(27, 0, '홀드'),
 Text(28, 0, '홈런/9')]

다중공선성이 높으면 상관성이 너무 높은 것
안정적인 분석을 위해서 안써야함

from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif['features'] = x.columns
vif.round(1)

	VIF Factor	features
0	3.2	BABIP
1	10.6	ERA
2	14238.3	FIP
3	1.1	KIA
4	1.1	KT
5	1.1	LG
6	4.3	LOB%
7	1.1	NC
8	13.6	RA9-WAR
9	1.1	SK
10	10.4	WAR
11	10264.1	kFIP
12	14.6	경기
13	1.2	두산
14	1.1	롯데
15	57.8	볼넷/9
16	3.0	블론
17	1.2	삼성
18	89.5	삼진/9
19	39.6	선발
20	3.1	세
21	8.0	승
22	2.5	연봉(2017)
23	63.8	이닝
24	5.9	패
25	1.1	한화
26	3.8	홀드
27	425.6	홈런/9

변수간 상관관계가 높아서 분석에 부정적인 영향을 미침
vif 평가 : 분산팽창요인
    보통 10~15 정도를 넘으면 다중공선성에 문제가 있다고 판단
    홈런, 이닝, 선발, 삼진, 볼넷, 경기, kfip,fip
    특히 이둘은 너무 유사해서 상승효과가 생김, 그래서 하나는 빼버려야함
1. vif 계수 높은 피처 제거, 유사피처중 한개만 제거
2. 다시모델을 실행해서 공선성 검증
3. 분석결과에서 p-value값이 유의미한 피처들을 선정

# 적절한 피처를 선정해서 다시 학습하기
# 피처간 상관계수를 그래프로 작성
scale_columns = ['승','패','세','홀드','블론','경기','선발','이닝','삼진/9',
                    '볼넷/9','홈런/9','BABIP','LOB%','ERA','RA9-WAR','FIP','kFIP','WAR','연봉(2017)']
picher_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 30 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   선수명       152 non-null    object 
 1   승         152 non-null    float64
 2   패         152 non-null    float64
 3   세         152 non-null    float64
 4   홀드        152 non-null    float64
 5   블론        152 non-null    float64
 6   경기        152 non-null    float64
 7   선발        152 non-null    float64
 8   이닝        152 non-null    float64
 9   삼진/9      152 non-null    float64
 10  볼넷/9      152 non-null    float64
 11  홈런/9      152 non-null    float64
 12  BABIP     152 non-null    float64
 13  LOB%      152 non-null    float64
 14  ERA       152 non-null    float64
 15  RA9-WAR   152 non-null    float64
 16  FIP       152 non-null    float64
 17  kFIP      152 non-null    float64
 18  WAR       152 non-null    float64
 19  y         152 non-null    int64  
 20  연봉(2017)  152 non-null    float64
 21  KIA       152 non-null    uint8  
 22  KT        152 non-null    uint8  
 23  LG        152 non-null    uint8  
 24  NC        152 non-null    uint8  
 25  SK        152 non-null    uint8  
 26  두산        152 non-null    uint8  
 27  롯데        152 non-null    uint8  
 28  삼성        152 non-null    uint8  
 29  한화        152 non-null    uint8  
dtypes: float64(19), int64(1), object(1), uint8(9)
memory usage: 26.4+ KB

corr = picher_df[scale_columns].corr(method='pearson')
corr

	승	패	세	홀드	블론	경기	선발	이닝	삼진/9	볼넷/9	홈런/9	BABIP	LOB%	ERA	RA9-WAR	FIP	kFIP	WAR	연봉(2017)
승	1.000000	0.710749	0.053747	0.092872	0.105281	0.397074	0.773560	0.906093	0.078377	-0.404710	-0.116147	-0.171111	0.131178	-0.271086	0.851350	-0.303133	-0.314159	0.821420	0.629710
패	0.710749	1.000000	0.066256	0.098617	0.121283	0.343147	0.771395	0.829018	0.031755	-0.386313	-0.064467	-0.133354	-0.020994	-0.188036	0.595989	-0.233416	-0.238688	0.625641	0.429227
세	0.053747	0.066256	1.000000	0.112716	0.605229	0.434290	-0.177069	0.020278	0.170436	-0.131394	-0.073111	-0.089212	0.167557	-0.150348	0.167669	-0.199746	-0.225259	0.084151	0.262664
홀드	0.092872	0.098617	0.112716	1.000000	0.490076	0.715527	-0.285204	0.024631	0.186790	-0.146806	-0.076475	-0.104307	0.048123	-0.155712	0.003526	-0.211515	-0.237353	-0.038613	-0.001213
블론	0.105281	0.121283	0.605229	0.490076	1.000000	0.630526	-0.264160	0.014176	0.188423	-0.137019	-0.064804	-0.112480	0.100633	-0.160761	0.008766	-0.209014	-0.237815	-0.058213	0.146584
경기	0.397074	0.343147	0.434290	0.715527	0.630526	1.000000	-0.037443	0.376378	0.192487	-0.364293	-0.113545	-0.241608	0.105762	-0.320177	0.281595	-0.345351	-0.373777	0.197836	0.225357
선발	0.773560	0.771395	-0.177069	-0.285204	-0.264160	-0.037443	1.000000	0.894018	-0.055364	-0.312935	-0.058120	-0.098909	0.041819	-0.157775	0.742258	-0.151040	-0.142685	0.758846	0.488559
이닝	0.906093	0.829018	0.020278	0.024631	0.014176	0.376378	0.894018	1.000000	0.037343	-0.451101	-0.107063	-0.191514	0.103369	-0.285392	0.853354	-0.296768	-0.302288	0.832609	0.586874
삼진/9	0.078377	0.031755	0.170436	0.186790	0.188423	0.192487	-0.055364	0.037343	1.000000	0.109345	0.216017	0.457523	-0.071284	0.256840	0.102963	-0.154857	-0.317594	0.151791	0.104948
볼넷/9	-0.404710	-0.386313	-0.131394	-0.146806	-0.137019	-0.364293	-0.312935	-0.451101	0.109345	1.000000	0.302251	0.276009	-0.150837	0.521039	-0.398586	0.629833	0.605008	-0.394131	-0.332379
홈런/9	-0.116147	-0.064467	-0.073111	-0.076475	-0.064804	-0.113545	-0.058120	-0.107063	0.216017	0.302251	1.000000	0.362614	-0.274543	0.629912	-0.187210	0.831042	0.743623	-0.205014	-0.100896
BABIP	-0.171111	-0.133354	-0.089212	-0.104307	-0.112480	-0.241608	-0.098909	-0.191514	0.457523	0.276009	0.362614	1.000000	-0.505478	0.733109	-0.187058	0.251126	0.166910	-0.082995	-0.088754
LOB%	0.131178	-0.020994	0.167557	0.048123	0.100633	0.105762	0.041819	0.103369	-0.071284	-0.150837	-0.274543	-0.505478	1.000000	-0.720091	0.286893	-0.288050	-0.269536	0.144191	0.110424
ERA	-0.271086	-0.188036	-0.150348	-0.155712	-0.160761	-0.320177	-0.157775	-0.285392	0.256840	0.521039	0.629912	0.733109	-0.720091	1.000000	-0.335584	0.648004	0.582057	-0.261508	-0.203305
RA9-WAR	0.851350	0.595989	0.167669	0.003526	0.008766	0.281595	0.742258	0.853354	0.102963	-0.398586	-0.187210	-0.187058	0.286893	-0.335584	1.000000	-0.366308	-0.377679	0.917299	0.643375
FIP	-0.303133	-0.233416	-0.199746	-0.211515	-0.209014	-0.345351	-0.151040	-0.296768	-0.154857	0.629833	0.831042	0.251126	-0.288050	0.648004	-0.366308	1.000000	0.984924	-0.391414	-0.268005
kFIP	-0.314159	-0.238688	-0.225259	-0.237353	-0.237815	-0.373777	-0.142685	-0.302288	-0.317594	0.605008	0.743623	0.166910	-0.269536	0.582057	-0.377679	0.984924	1.000000	-0.408283	-0.282666
WAR	0.821420	0.625641	0.084151	-0.038613	-0.058213	0.197836	0.758846	0.832609	0.151791	-0.394131	-0.205014	-0.082995	0.144191	-0.261508	0.917299	-0.391414	-0.408283	1.000000	0.675794
연봉(2017)	0.629710	0.429227	0.262664	-0.001213	0.146584	0.225357	0.488559	0.586874	0.104948	-0.332379	-0.100896	-0.088754	0.110424	-0.203305	0.643375	-0.268005	-0.282666	0.675794	1.000000

# 히트맵 시각화
import seaborn as sns
show_cols = ['win', 'lose','save','hold','blon','match','start','inning','strike3',
            'ball4','homerun','BABIP','LOB','ERA','RA9-WAR','FIP','kFIP','WAR','2017']
plt.rc('font', family = 'Nanum Gothic')
sns.set(font_scale=0.8)
hm = sns.heatmap(corr.values,
                cbar = True,
                annot = True,
                square = True,
                fmt = '.2f',
                annot_kws={'size':15},
                yticklabels = show_cols,
                xticklabels = show_cols)
plt.tight_layout()
plt.show()

x = picher_df[['FIP','WAR','볼넷/9','삼진/9','연봉(2017)']]
y = picher_df['y']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=19)

# 모델학습하기
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
model = lr.fit(x_train, y_train)

# r2
print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

# 0.9150591192570362
# 0.9038759653889865

# rmse 평가
# mse 평균제곱오차
from math import sqrt
from sklearn.metrics import mean_squared_error
y_pred = lr.predict(x_train)
print(sqrt(mean_squared_error(y_train, y_pred)))
y_pred = lr.predict(x_test)
print(sqrt(mean_squared_error(y_test, y_pred)))

# 7893.462873347693
# 13141.86606359108

# 피처별 vif 공분산
from statsmodels.stats.outliers_influence import variance_inflation_factor
x = picher_df[['FIP','WAR','볼넷/9','삼진/9','연봉(2017)']]
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif['features'] = x.columns
vif.round(1)


	VIF Factor	features
0	1.9	FIP
1	2.1	WAR
2	1.9	볼넷/9
3	1.1	삼진/9
4	1.9	연봉(2017)

# 시각화\ 비교
# 모든 데이터 검증
# lr 학습이 완료된 객체
x = picher_df[['FIP','WAR','볼넷/9','삼진/9','연봉(2017)']]
predict_2018_salary = lr.predict(x)
predict_2018_salary[:5]
picher_df['예측연봉(2018)'] = pd.Series(predict_2018_salary)

picher = pd.read_csv(picher_file_path)
picher = picher[['선수명','연봉(2017)']]

# 2018년 연봉 내림차순
result_df = picher_df.sort_values(by=['y'], ascending = False)
# 연봉2017 삭제, 정규화된 데이터, 실제데이터가 아님
result_df.drop(['연봉(2017)'], axis=1, inplace=True, errors='ignore')
# 연봉 2017의 실제데이터로 컬럼 변경
result_df = result_df.merge(picher, on=['선수명'], how='left')
result_df = result_df[['선수명', 'y','예측연봉(2018)','연봉(2017)']]
result_df.columns = ['선수명','실제연봉(2018)','예측연봉(2018)','작년연봉(2017)']

result_df
	선수명	실제연봉(2018)	예측연봉(2018)	작년연봉(2017)
0	양현종	230000	163930.148696	150000
1	켈리	140000	120122.822204	85000
2	소사	120000	88127.019455	50000
3	정우람	120000	108489.464585	120000
4	레일리	111000	102253.697589	85000
...	...	...	...	...
147	장지훈	2800	249.850641	2700
148	차재용	2800	900.811527	2800
149	성영훈	2700	5003.619609	2700
150	정동윤	2700	2686.350884	2700
151	장민익	2700	3543.781665	2700
152 rows × 4 columns

result_df = result_df.iloc[:10,:]
plt.rc('font', family = 'Malgun Gothic')
result_df.plot(x='선수명', y=['작년연봉(2017)','예측연봉(2018)','실제연봉(2018)'], kind='bar')

# 2017연봉과 2018년 연봉이 다른 선수들만 
result_df = result_df[result_df['작년연봉(2017)'] != result_df['예측연봉(2018)']]
result_df.head()
	선수명	실제연봉(2018)	예측연봉(2018)	작년연봉(2017)
0	양현종	230000	163930.148696	150000
1	켈리	140000	120122.822204	85000
2	소사	120000	88127.019455	50000
3	정우람	120000	108489.464585	120000
4	레일리	111000	102253.697589	85000

result_df = result_df.reset_index()
result_df.head()

	index	선수명	실제연봉(2018)	예측연봉(2018)	작년연봉(2017)
0	0	양현종	230000	163930.148696	150000
1	1	켈리	140000	120122.822204	85000
2	2	소사	120000	88127.019455	50000
3	3	정우람	120000	108489.464585	120000
4	4	레일리	111000	102253.697589	85000

result_df = result_df.iloc[:10, :]
result_df.plot(x='선수명', y=['작년연봉(2017)','예측연봉(2018)','실제연봉(2018)'],kind = 'bar')

저작자표시 비영리 (새창열림)

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

29. 비트코인 시계열 분석 \|\| prophet (0)	2021.11.24
28. 비트코인 가격 시계열 분석 \|\| Arima, fbProphet (0)	2021.11.24
26. 서울 중학교 졸업자 분석 \|\| dbscan, folium (0)	2021.11.24
25. 판매 데이터 분석 \|\| kmeans (0)	2021.11.24
24. 위스콘신 유방안데이터 분석 \|\| DT (0)	2021.11.24

My_Flow

27. 프로야구 연봉 예측 분석 || OLS, Heatmap

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

+ Recent posts

티스토리툴바