728x90
반응형
# White wine quality analysis: download the dataset to a local CSV file.
from urllib.request import urlretrieve

savefile = "winequality-white.csv"
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# urlretrieve returns a (local_filename, headers) tuple.
urlretrieve(url, savefile)

# ('winequality-white.csv', <http.client.HTTPMessage at 0x214ffabed90>)

winequality-white.csv
0.25MB

 

# Load the semicolon-delimited CSV into a DataFrame and print a column summary.
import pandas as pd  # `pd` is not imported anywhere earlier in this listing

df = pd.read_csv('winequality-white.csv', sep=';', encoding='utf-8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB

 

df.describe()

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality
count	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000	4898.000000
mean	6.854788	0.278241	0.334192	6.391415	0.045772	35.308085	138.360657	0.994027	3.188267	0.489847	10.514267	5.877909
std	0.843868	0.100795	0.121020	5.072058	0.021848	17.007137	42.498065	0.002991	0.151001	0.114126	1.230621	0.885639
min	3.800000	0.080000	0.000000	0.600000	0.009000	2.000000	9.000000	0.987110	2.720000	0.220000	8.000000	3.000000
25%	6.300000	0.210000	0.270000	1.700000	0.036000	23.000000	108.000000	0.991723	3.090000	0.410000	9.500000	5.000000
50%	6.800000	0.260000	0.320000	5.200000	0.043000	34.000000	134.000000	0.993740	3.180000	0.470000	10.400000	6.000000
75%	7.300000	0.320000	0.390000	9.900000	0.050000	46.000000	167.000000	0.996100	3.280000	0.550000	11.400000	6.000000
max	14.200000	1.100000	1.660000	65.800000	0.346000	289.000000	440.000000	1.038980	3.820000	1.080000	14.200000	9.000000

 

# Bar chart of how many wines received each quality score.
# The column is passed by keyword: a bare positional Series is deprecated in
# seaborn 0.11 and is bound to `data` (not `x`) from seaborn 0.12 onwards.
sns.countplot(x='quality', data=df)

# Histogram of the quality scores using matplotlib's default binning.
quality_scores = df['quality']
plt.hist(quality_scores)

(array([  20.,  163.,    0., 1457.,    0., 2198.,  880.,    0.,  175.,
           5.]),
 array([3. , 3.6, 4.2, 4.8, 5.4, 6. , 6.6, 7.2, 7.8, 8.4, 9. ]),
 <BarContainer object of 10 artists>)

# Number of wines per quality score, listed in score order.
# (df['quality'].value_counts() gives the same counts ordered largest first.)
scores_by_quality = df.groupby('quality')['quality']
scores_by_quality.count()

quality
3      20
4     163
5    1457
6    2198
7     880
8     175
9       5
Name: quality, dtype: int64

 

 

# Line plot of the number of wines per quality score.
quality_counts = df.groupby('quality')['quality'].count()
plt.plot(quality_counts)

 

 

# Gradient boosting classifier: separate the target from the features.
y = df['quality']                # target: the quality score column
x = df.drop(columns='quality')   # features: every remaining column

 

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)

 

from sklearn.ensemble import GradientBoostingClassifier

# Train with default hyperparameters, then predict the held-out rows.
# fit() returns the estimator itself, so construction and fitting are chained.
# NOTE: no random_state is given to the classifier, so repeated runs are not
# guaranteed to produce identical predictions.
clf = GradientBoostingClassifier().fit(x_train, y_train)
y_pred = clf.predict(x_test)
y_pred[:10]

# array([6, 5, 4, 5, 6, 6, 6, 6, 5, 6], dtype=int64)

 

# Evaluation
from sklearn.metrics import confusion_matrix

# Rows correspond to the true labels, columns to the predicted labels.
confmat = confusion_matrix(y_test, y_pred)
print(confmat)

[[  0   1   0   0   1   0   0]
 [  0   2  18   9   0   0   0]
 [  0   6 159 109   5   1   0]
 [  1   6  73 337  33   0   0]
 [  0   0   6 104  68   2   0]
 [  0   0   0  19   8  10   1]
 [  0   0   0   1   0   0   0]]

 

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Share of test rows whose predicted quality equals the true quality.
accuracy = accuracy_score(y_test, y_pred)
print('정확도(accuracy) : %.2f' % accuracy)

# Precision, recall and F1 (= 2 * precision * recall / (precision + recall))
# are not reported here. NOTE: calling precision_score / recall_score /
# f1_score as-is on this multiclass target fails, because their default
# average='binary' only supports two classes; pass an explicit `average=`
# (for example 'macro' or 'weighted') to enable them.

정확도(accuracy) : 0.59

 

# The target y takes the quality scores 3 through 9.
# Goal: collapse them into 3 grades.
counts_per_score = df.groupby('quality')['quality'].count()
counts_per_score

quality
3      20
4     163
5    1457
6    2198
7     880
8     175
9       5
Name: quality, dtype: int64

 

def _quality_grade(score):
    """Map a raw quality score to one of three coarse grades.

    Scores up to 4 map to 0, scores above 4 and up to 7 map to 1, and
    anything above 7 maps to 2.
    """
    if score <= 4:
        return 0
    if score <= 7:
        return 1
    return 2


# Relabel every wine with its grade. The result is a plain list in the same
# row order as df['quality'].
y = [_quality_grade(score) for score in df['quality']]
y[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# 80/20 split with a fixed seed, using whatever y currently holds as the target.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=10
)

# Fit a default gradient boosting classifier and predict the held-out rows.
clf = GradientBoostingClassifier().fit(x_train, y_train)
y_pred = clf.predict(x_test)
y_pred[:10]

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1])

 

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Print the confusion matrix first, then the overall accuracy.
confmat = confusion_matrix(y_test, y_pred)
print(confmat)
accuracy = accuracy_score(y_test, y_pred)
print('정확도(accuracy) : %.2f' % accuracy)

[[  3  28   0]
 [  8 899   3]
 [  0  30   9]]
정확도(accuracy) : 0.93

 

 

 

 

반응형

+ Recent posts