728x90
반응형
# White wine analysis: fetch the UCI white-wine quality data and load it.
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
savefile = "winequality-white.csv"
# Download only when no local copy exists so re-running the cell does not hit
# the network again; delete the file to force a fresh download.
if not Path(savefile).exists():
    urlretrieve(url, savefile)
# urlretrieve returns (filename, headers), e.g.
# ('winequality-white.csv', <http.client.HTTPMessage at 0x214ffabed90>)
# The file is semicolon-delimited; read the path from `savefile` so the
# download target and the file being read cannot drift apart.
df = pd.read_csv(savefile, sep=';', encoding = 'utf-8')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 4898 non-null float64
1 volatile acidity 4898 non-null float64
2 citric acid 4898 non-null float64
3 residual sugar 4898 non-null float64
4 chlorides 4898 non-null float64
5 free sulfur dioxide 4898 non-null float64
6 total sulfur dioxide 4898 non-null float64
7 density 4898 non-null float64
8 pH 4898 non-null float64
9 sulphates 4898 non-null float64
10 alcohol 4898 non-null float64
11 quality 4898 non-null int64
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000 4898.000000
mean 6.854788 0.278241 0.334192 6.391415 0.045772 35.308085 138.360657 0.994027 3.188267 0.489847 10.514267 5.877909
std 0.843868 0.100795 0.121020 5.072058 0.021848 17.007137 42.498065 0.002991 0.151001 0.114126 1.230621 0.885639
min 3.800000 0.080000 0.000000 0.600000 0.009000 2.000000 9.000000 0.987110 2.720000 0.220000 8.000000 3.000000
25% 6.300000 0.210000 0.270000 1.700000 0.036000 23.000000 108.000000 0.991723 3.090000 0.410000 9.500000 5.000000
50% 6.800000 0.260000 0.320000 5.200000 0.043000 34.000000 134.000000 0.993740 3.180000 0.470000 10.400000 6.000000
75% 7.300000 0.320000 0.390000 9.900000 0.050000 46.000000 167.000000 0.996100 3.280000 0.550000 11.400000 6.000000
max 14.200000 1.100000 1.660000 65.800000 0.346000 289.000000 440.000000 1.038980 3.820000 1.080000 14.200000 9.000000
# Distribution of the quality scores.
# Pass the column as `x=`: seaborn 0.11 warns on a positional argument here and
# later releases accept only `data` positionally, so a bare Series is no
# longer read as the x variable.
sns.countplot(x=df['quality'])
# NOTE: with the default 10 bins over the integer scores 3-9, some bins fall
# between two scores and stay empty (see the zero counts in the output below).
plt.hist(df['quality'])
(array([ 20., 163., 0., 1457., 0., 2198., 880., 0., 175.,
5.]),
array([3. , 3.6, 4.2, 4.8, 5.4, 6. , 6.6, 7.2, 7.8, 8.4, 9. ]),
<BarContainer object of 10 artists>)
# df['quality'].value_counts() # in descending order of count
df.groupby('quality')['quality'].count() # as-is: ordered by quality score
quality
3 20
4 163
5 1457
6 2198
7 880
8 175
9 5
Name: quality, dtype: int64
# Line plot of the number of wines per quality score.
plt.plot(df.groupby('quality')['quality'].count())
# GradientBoostingClassifier: predict the raw quality score from all other
# columns of the data set.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

x = df.drop('quality', axis = 1)
y = df['quality']
# Hold out 20% of the rows for evaluation; the split is seeded.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=10)
# Seed the model as well as the split. Features are randomly permuted at each
# tree split, so without random_state ties between equally good splits can be
# broken differently from run to run.
clf = GradientBoostingClassifier(random_state=10)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
y_pred[:10]
# Output recorded from the original, unseeded run (may differ slightly):
# array([6, 5, 4, 5, 6, 6, 6, 6, 5, 6], dtype=int64)
# Evaluation
from sklearn.metrics import confusion_matrix
# Rows are the true quality scores, columns the predicted ones.
confmat = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(confmat)
[[ 0 1 0 0 1 0 0]
[ 0 2 18 9 0 0 0]
[ 0 6 159 109 5 1 0]
[ 1 6 73 337 33 0 0]
[ 0 0 6 104 68 2 0]
[ 0 0 0 19 8 10 1]
[ 0 0 0 1 0 0 0]]
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('정확도(accuracy) : %.2f'% accuracy_score(y_test, y_pred))
# The target has more than two classes, so precision/recall/F1 need an
# explicit `average`; the default average='binary' raises ValueError on
# multiclass targets. Macro averaging weights every quality score equally,
# which exposes how poorly the rare scores are predicted.
# zero_division=0 scores a class with an empty denominator as 0 without a warning.
print('정밀도(precision, macro) : %.3f'% precision_score(y_test, y_pred, average='macro', zero_division=0))
print('재현율(recall, macro) : %.3f'% recall_score(y_test, y_pred, average='macro', zero_division=0))
print('F1-score (macro) : %.3f'% f1_score(y_test, y_pred, average='macro', zero_division=0))
# Per class: F1 = 2 * (precision * recall) / (precision + recall);
# the macro score is the unweighted mean of the per-class values.
정확도(accuracy) : 0.59
# y takes the values 3 to 9
# regroup them into 3 grades
df.groupby('quality')['quality'].count()
quality
3 20
4 163
5 1457
6 2198
7 880
8 175
9 5
Name: quality, dtype: int64
# Collapse the quality scores into three grades:
#   0 = low (score <= 4), 1 = medium (score 5-7), 2 = high (score above 7).
y = df['quality']
# One pass over the scores; the result stays a plain Python list, as before.
newlist = [0 if score <= 4 else 1 if score <= 7 else 2 for score in y]
y = newlist
y[:10]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# Refit the classifier on the current `y` labels with the same features `x`.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Same 80/20 seeded split as before.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=10)
# Seed the model as well as the split. Features are randomly permuted at each
# tree split, so without random_state ties between equally good splits can be
# broken differently from run to run.
clf = GradientBoostingClassifier(random_state=10)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
y_pred[:10]
array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1])
# Evaluate the refitted model.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Rows are the true grades, columns the predicted ones.
confmat = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(confmat)
print('정확도(accuracy) : %.2f'% accuracy_score(y_test, y_pred))
# Accuracy alone is misleading here. In the run recorded below, 910 of the
# 980 test wines are grade 1, so always predicting grade 1 already scores
# about 0.93. Macro-averaged metrics weight the three grades equally and show
# how well the rare grades 0 and 2 are actually recognised.
# zero_division=0 scores a class with an empty denominator as 0 without a warning.
print('정밀도(precision, macro) : %.3f'% precision_score(y_test, y_pred, average='macro', zero_division=0))
print('재현율(recall, macro) : %.3f'% recall_score(y_test, y_pred, average='macro', zero_division=0))
print('F1-score (macro) : %.3f'% f1_score(y_test, y_pred, average='macro', zero_division=0))
[[ 3 28 0]
[ 8 899 3]
[ 0 30 9]]
정확도(accuracy) : 0.93
반응형
'Data_Science > Data_Analysis_Py' 카테고리의 다른 글
35. 강남역 고기집 감성분석 || 감성분석, TF-IDF (0) | 2021.11.25 |
---|---|
34. 강남역 고기집 후기분석 || 맵크로울링 (0) | 2021.11.25 |
32. titanic || GBM (0) | 2021.11.24 |
31. titanic || logistic (0) | 2021.11.24 |
30. 보스턴 주택가격정보 || 선형회귀 (0) | 2021.11.24 |