728x90
반응형
01. CCTV_in_Seoul
0.00MB
import pandas as pd cctv_seoul = pd.read_csv('01. CCTV_in_Seoul.csv', encoding = 'utf-8') cctv_seoul.rename(columns ={'기관명':'구별'}, inplace = True) cctv_seoul 구별 소계 2013년도 이전 2014년 2015년 2016년 0 강남구 2780 1292 430 584 932 1 강동구 773 379 99 155 377 2 강북구 748 369 120 138 204 3 강서구 884 388 258 184 81 4 관악구 1496 846 260 390 613 5 광진구 707 573 78 53 174 6 구로구 1561 1142 173 246 323 7 금천구 1015 674 51 269 354 8 노원구 1265 542 57 451 516 9 도봉구 485 238 159 42 386 10 동대문구 1294 1070 23 198 579 11 동작구 1091 544 341 103 314 12 마포구 574 314 118 169 379 13 서대문구 962 844 50 68 292 14 서초구 1930 1406 157 336 398 15 성동구 1062 730 91 241 265 16 성북구 1464 1009 78 360 204 17 송파구 618 529 21 68 463 18 양천구 2034 1843 142 30 467 19 영등포구 904 495 214 195 373 20 용산구 1624 1368 218 112 398 21 은평구 1873 1138 224 278 468 22 종로구 1002 464 314 211 630 23 중구 671 413 190 72 348 24 중랑구 660 509 121 177 109

pop_seoul = pd.read_excel('01. population_in_Seoul.xls', header = 2, usecols = 'B, D, G, J, N') pop_seoul.rename(columns = {pop_seoul.columns[0]:'구별', pop_seoul.columns[1]:'인구수', pop_seoul.columns[2]:'한국인', pop_seoul.columns[3]:'외국인', pop_seoul.columns[4]:'고령자'}, inplace = True) pop_seoul.head() 구별 인구수 한국인 외국인 고령자 0 합계 10197604.0 9926968.0 270636.0 1321458.0 1 종로구 162820.0 153589.0 9231.0 25425.0 2 중구 133240.0 124312.0 8928.0 20764.0 3 용산구 244203.0 229456.0 14747.0 36231.0 4 성동구 311244.0 303380.0 7864.0 39997.0

pop_seoul.info() <class 'pandas.core.frame.DataFrame'> RangeIndex: 27 entries, 0 to 26 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 구별 26 non-null object 1 인구수 26 non-null float64 2 한국인 26 non-null float64 3 외국인 26 non-null float64 4 고령자 26 non-null float64 dtypes: float64(4), object(1) memory usage: 1.2+ KB

cctv_seoul['최근증가율'] = (cctv_seoul['2014년'] + cctv_seoul['2015년'] + cctv_seoul['2016년']) / \ cctv_seoul['2013년도 이전'] * 100 cctv_seoul.sort_values(by='최근증가율', ascending = False).head() 구별 소계 2013년도 이전 2014년 2015년 2016년 최근증가율 22 종로구 1002 464 314 211 630 248.922414 9 도봉구 485 238 159 42 386 246.638655 12 마포구 574 314 118 169 379 212.101911 8 노원구 1265 542 57 451 516 188.929889 1 강동구 773 379 99 155 377 166.490765

pop_seoul['외국인비율'] = pop_seoul['외국인'] /pop_seoul['인구수'] pop_seoul['고령자비율'] = pop_seoul['고령자'] /pop_seoul['인구수'] pop_seoul.head() 구별 인구수 한국인 외국인 고령자 외국인비율 고령자비율 0 합계 10197604.0 9926968.0 270636.0 1321458.0 0.026539 0.129585 1 종로구 162820.0 153589.0 9231.0 25425.0 0.056695 0.156154 2 중구 133240.0 124312.0 8928.0 20764.0 0.067007 0.155839 3 용산구 244203.0 229456.0 14747.0 36231.0 0.060388 0.148364 4 성동구 311244.0 303380.0 7864.0 39997.0 0.025266 0.128507

# 외국인비율 상위 5개 pop_seoul.sort_values(by = '외국인비율', ascending = False).head() 구별 인구수 한국인 외국인 고령자 외국인비율 고령자비율 19 영등포구 402985.0 368072.0 34913.0 52413.0 0.086636 0.130062 18 금천구 255082.0 236353.0 18729.0 32970.0 0.073423 0.129253 17 구로구 447874.0 416487.0 31387.0 56833.0 0.070080 0.126895 2 중구 133240.0 124312.0 8928.0 20764.0 0.067007 0.155839 3 용산구 244203.0 229456.0 14747.0 36231.0 0.060388 0.148364

data_result = pd.merge(cctv_seoul, pop_seoul, on = '구별') data_result.head() 구별 소계 2013년도 이전 2014년 2015년 2016년 최근증가율 인구수 한국인 외국인 고령자 외국인비율 고령자비율 0 강남구 2780 1292 430 584 932 150.619195 570500.0 565550.0 4950.0 63167.0 0.008677 0.110722 1 강동구 773 379 99 155 377 166.490765 453233.0 449019.0 4214.0 54622.0 0.009298 0.120516 2 강북구 748 369 120 138 204 125.203252 330192.0 326686.0 3506.0 54813.0 0.010618 0.166003 3 강서구 884 388 258 184 81 134.793814 603772.0 597248.0 6524.0 72548.0 0.010805 0.120158 4 관악구 1496 846 260 390 613 149.290780 525515.0 507203.0 18312.0 68082.0 0.034846 0.129553

del data_result['2013년도 이전'] del data_result['2014년'] del data_result['2015년'] del data_result['2016년'] data_result.head() 구별 소계 최근증가율 인구수 한국인 외국인 고령자 외국인비율 고령자비율 0 강남구 2780 150.619195 570500.0 565550.0 4950.0 63167.0 0.008677 0.110722 1 강동구 773 166.490765 453233.0 449019.0 4214.0 54622.0 0.009298 0.120516 2 강북구 748 125.203252 330192.0 326686.0 3506.0 54813.0 0.010618 0.166003 3 강서구 884 134.793814 603772.0 597248.0 6524.0 72548.0 0.010805 0.120158 4 관악구 1496 149.290780 525515.0 507203.0 18312.0 68082.0 0.034846 0.129553

data_result.set_index('구별', inplace = True) data_result.head() 소계 최근증가율 인구수 한국인 외국인 고령자 외국인비율 고령자비율 구별 강남구 2780 150.619195 570500.0 565550.0 4950.0 63167.0 0.008677 0.110722 강동구 773 166.490765 453233.0 449019.0 4214.0 54622.0 0.009298 0.120516 강북구 748 125.203252 330192.0 326686.0 3506.0 54813.0 0.010618 0.166003 강서구 884 134.793814 603772.0 597248.0 6524.0 72548.0 0.010805 0.120158 관악구 1496 149.290780 525515.0 507203.0 18312.0 68082.0 0.034846 0.129553

data_result.corr(method='pearson') 소계 최근증가율 인구수 한국인 외국인 고령자 외국인비율 고령자비율 소계 1.000000 -0.343016 0.306342 0.304287 -0.023786 0.255196 -0.136074 -0.280786 최근증가율 -0.343016 1.000000 -0.093068 -0.082511 -0.150463 -0.070969 -0.044042 0.185089 인구수 0.306342 -0.093068 1.000000 0.998061 -0.153371 0.932667 -0.591939 -0.669462 한국인 0.304287 -0.082511 0.998061 1.000000 -0.214576 0.931636 -0.637911 -0.660812 외국인 -0.023786 -0.150463 -0.153371 -0.214576 1.000000 -0.155381 0.838904 -0.014055 고령자 0.255196 -0.070969 0.932667 0.931636 -0.155381 1.000000 -0.606088 -0.380468 외국인비율 -0.136074 -0.044042 -0.591939 -0.637911 0.838904 -0.606088 1.000000 0.267348 고령자비율 -0.280786 0.185089 -0.669462 -0.660812 -0.014055 -0.380468 0.267348 1.000000

import numpy as np np.corrcoef(data_result['고령자비율'], data_result['소계']) array([[ 1. , -0.28078554], [-0.28078554, 1. ]]) np.corrcoef(data_result['외국인비율'], data_result['소계']) array([[ 1. , -0.13607433], [-0.13607433, 1. ]]) np.corrcoef(data_result['인구수'], data_result['소계']) array([[1. , 0.30634228], [0.30634228, 1. ]])

#matplot import matplotlib.pyplot as plt from matplotlib import font_manager, rc plt.figure() plt.rc('font', family = 'Malgun Gothic') data_result['소계'].sort_values(ascending = True).plot(kind='barh', grid=True, figsize = (10,10)) plt.show()
# 인구대비 cctv 비율컬럼 data_result['cctv비율'] = data_result['소계'] / data_result['인구수'] * 100 data_result.head() 소계 최근증가율 인구수 한국인 외국인 고령자 외국인비율 고령자비율 cctv비율 구별 강남구 2780 150.619195 570500.0 565550.0 4950.0 63167.0 0.008677 0.110722 0.487292 강동구 773 166.490765 453233.0 449019.0 4214.0 54622.0 0.009298 0.120516 0.170552 강북구 748 125.203252 330192.0 326686.0 3506.0 54813.0 0.010618 0.166003 0.226535 강서구 884 134.793814 603772.0 597248.0 6524.0 72548.0 0.010805 0.120158 0.146413 관악구 1496 149.290780 525515.0 507203.0 18312.0 68082.0 0.034846 0.129553 0.284673

data_result['cctv비율'].sort_values().plot(kind='barh', grid = True, figsize = (10,10)) plt.show()
# 산점도 plt.figure(figsize = (6,6)) plt.scatter(data_result['인구수'], data_result['소계'], s= 50) plt.xlabel('인구수') plt.ylabel('cctv갯수') plt.grid() plt.show()
# 인구수와 소계 산점도, 회귀선 작성 # polyfit 최소제곱법을 이용한 상수값, 1 : 차수 fpl = np.polyfit(data_result['인구수'], data_result['소계'], 1) # ㅣ 직선 f1 = np.poly1d(fpl) fx = np.linspace(100000, 700000, 100) plt.figure(figsize = (10, 10)) plt.scatter(data_result['인구수'], data_result['소계'], s=50) plt.plot(fx, f1(fx), ls='dashed', lw=3, color='g') plt.xlabel('인구수') plt.ylabel('cctv') plt.grid() plt.show()

# 인구수와 소계 산점도, 회귀선 작성 # polyfit 최소제곱법을 이용한 상수값, 1 : 차수 fpl = np.polyfit(data_result['인구수'], data_result['소계'], 4) # ㅣ 직선 f1 = np.poly1d(fpl) # 인구수에 맞는 y값 fx = np.linspace(100000, 700000, 100) plt.figure(figsize = (10, 10)) plt.scatter(data_result['인구수'], data_result['소계'], s=50) plt.plot(fx, f1(fx), ls='dashed', lw=3, color='g') plt.xlabel('인구수') plt.ylabel('cctv') plt.grid() plt.show()
import numpy as np x = np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.]) y = np.array([4.23620563, 6.18696492, 2.83930821, 5.00923197, 11.51299327, 12.91581993, 14.51838241, 14.34881875, 18.13566499, 20.1408104, 21.9872241]) fit1 = np.polyfit(x, y, 1) # 2개 상수 (+ 절편) fit2 = np.polyfit(x, y, 2) # 3개 상수 (+ 절편) fit3 = np.polyfit(x, y, 3) # 4개 상수 (+ 절편) print(fit1) print(fit2) print(fit3) # [1.92858279 2.34176099] # [0.05915413 1.33704154 3.22907288] # [-0.02808825 0.48047788 -0.26960637 4.24024989]

num = len(x) for i in range(num) : fit1 = 1.92858279*x + 2.34176099 fit2 = 0.05915413*x**2 + 1.33704154*x + 3.22907288 fit3 = - 0.02808825*x**3 + 0.48047788*x**2 - 0.26960637*x + 4.24024989 print(fit3) # [ 4.24024989 4.42303315 5.39824267 6.99734895 9.05182249 11.39313379 # 13.85275335 16.26215167 18.45279925 20.25616659 21.50372419]

# xy 산점도 와 회귀선 plt.scatter(x, y) plt.plot(x, y) plt.plot(x, fit1) plt.show()
# xy 산점도 와 회귀선 plt.scatter(x, y) plt.plot(x, y) plt.plot(x, fit3) plt.show()
# 산점도 + 회귀선, 산점도에 색상을 회귀선과의 거리로 표시 # 회귀선을 위한 상수 fpl = np.polyfit(data_result['인구수'], data_result['소계'], 2) # fpl상수값을 이용하여 y값을 계산하기 위한 함수 f1 = np.poly1d(fpl) # X축값, 10만 ~ 70만 까지 100등분 fx = np.linspace(100000, 700000, 100) # data_result 인구수에 맞는 회귀선의 y값 # 절대값 data_result['오차'] = np.abs(data_result['소계'] - f1(data_result['인구수'])) df_sort = data_result.sort_values(by = '오차', ascending = False) # 그래프 작성 plt.figure(figsize = (14,10)) plt.scatter(data_result['인구수'], data_result['소계'], c =data_result['오차'], s=50) plt.plot(fx, f1(fx), ls = 'dashed', lw = 3, color = 'g') # 점에 구 이름 표시 for n in range(10) : # 라벨링 // 절대값 오차가 많은 구10개 정보 표시 plt.text(df_sort['인구수'][n]*1.02, df_sort['소계'][n]*0.98, # 약간 밑으로 df_sort.index[n], fontsize = 15) plt.xlabel('인구수') plt.ylabel('cctv갯수') plt.colorbar() plt.grid() plt.show()



반응형

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

22. auto-mpg || 회귀분석  (0) 2021.11.24
21. 서울시 범죄율 분석 || MinMaxscalimg  (0) 2021.11.24
19. 세계음주데이터2  (0) 2021.11.23
18. 세계음주 데이터 분석  (0) 2021.11.03
17. 서울 기온 분석  (0) 2021.11.02
728x90
반응형

drinks.csv
0.00MB

from scipy import stats
import pandas as pd
drinks = pd.read_csv('drinks.csv')
drinks['continent'] = drinks['continent'].fillna('OT')
drinks.info

<bound method DataFrame.info of          country  beer_servings  spirit_servings  wine_servings  \
0    Afghanistan              0                0              0   
1        Albania             89              132             54   
2        Algeria             25                0             14   
3        Andorra            245              138            312   
4         Angola            217               57             45   
..           ...            ...              ...            ...   
188    Venezuela            333              100              3   
189      Vietnam            111                2              1   
190        Yemen              6                0              0   
191       Zambia             32               19              4   
192     Zimbabwe             64               18              4   

     total_litres_of_pure_alcohol continent  
0                             0.0        AS  
1                             4.9        EU  
2                             0.7        AF  
3                            12.4        EU  
4                             5.9        AF  
..                            ...       ...  
188                           7.7        SA  
189                           2.0        AS  
190                           0.1        AS  
191                           2.5        AF  
192                           4.7        AF  

[193 rows x 6 columns]>

 

africa = drinks.loc[drinks['continent']=='AF']
europe = drinks.loc[drinks['continent']=='EU']
# 두집단간 평균의 차이
tTestResult = stats.ttest_ind(africa['beer_servings'], europe['beer_servings'])
tTestResultDiffVar = stats.ttest_ind(africa['beer_servings'], europe['beer_servings'], equal_var = False)

 

# 두집단의 분산이 같다 가설
print(tTestResult)

# Ttest_indResult(statistic=-7.267986335644365, pvalue=9.719556422442453e-11)

 

 

# 두집단의 분산이 다르다 가설
print(tTestResultDiffVar)

# Ttest_indResult(statistic=-7.143520192189803, pvalue=2.9837787864303205e-10)

- t-statistic : 평균차이, 음수 : 뒤쪽 데이터의 평균 큰 경우, 검정 통계
- p-value : 유의확률, 결과가 0, 두집단의 평균이 같지 않다. => 귀무가설이 기각, 맞다틀리다
- 귀무가설 : 현재가설이 맞지 않다를 증명 // 예상되는 가설
- 대립가설 : 귀무가설의 반대되는 가설,
- 아프리카와 유럽의 맥주소비량의 차이는 확률적으로 다르다
- => 통계적으로 유의미하다

 

# 대한민국은 얼마나 술을 독하게 마실까?
drinks['total_servings'] =  drinks['beer_servings'] + drinks['spirit_servings']+drinks['wine_servings']
drinks.head()

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent	total_servings
0	Afghanistan	0	0	0	0.0	AS	0
1	Albania	89	132	54	4.9	EU	275
2	Algeria	25	0	14	0.7	AF	39
3	Andorra	245	138	312	12.4	EU	695
4	Angola	217	57	45	5.9	AF	319

 

drinks['alcohol_rate'] = drinks['total_litres_of_pure_alcohol'] / drinks['total_servings']
# alcohol rate , 분모가 0이면 결측값이 생김
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
 6   total_servings                193 non-null    int64  
 7   alcohol_rate                  180 non-null    float64
dtypes: float64(2), int64(4), object(2)
memory usage: 12.2+ KB

 

drinks['alcohol_rate'].fillna(0, inplace = True)
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
 6   total_servings                193 non-null    int64  
 7   alcohol_rate                  193 non-null    float64
dtypes: float64(2), int64(4), object(2)
memory usage: 12.2+ KB

 

country_alcohol_rank = drinks[['country', 'alcohol_rate']]
country_alcohol_rank = country_alcohol_rank.sort_values(by = ['alcohol_rate'], ascending = False)
country_alcohol_rank.head()

	country	alcohol_rate
63	Gambia	0.266667
153	Sierra Leone	0.223333
124	Nigeria	0.185714
179	Uganda	0.153704
142	Rwanda	0.151111

 

import numpy as np
import matplotlib.pyplot as plt
country_list = country_alcohol_rank.country.tolist()
x_pos = np.arange(len(country_list))
rank = country_alcohol_rank.alcohol_rate.tolist()
country_list.index("South Korea")

bar_list = plt.bar(x_pos, rank)
bar_list[country_list.index('South Korea')].set_color('r')
plt.ylabel('alcohol rate')
plt.title('liquor drink rank by country')
plt.axis([0, 200, 0, 0.3])

korea_rank = country_list.index('South Korea')
korea_alc_rate = country_alcohol_rank[country_alcohol_rank['country'] == 'South Korea']['alcohol_rate'].values[0]
plt.annotate('South korea :' + str(korea_rank + 1), xy = (korea_rank, korea_alc_rate), 
            xytext = (korea_rank + 10, korea_alc_rate + 0.05),
            arrowprops = dict(facecolor = 'red', shrink = 0.05))
plt.show()

#  전체 소비량을 막대그래프로 작성
country_serving_rank = drinks[['country','total_servings']]
country_serving_rank = country_serving_rank.sort_values(by=['total_servings'], ascending=0)
country_serving_rank.head()

	country	total_servings
3	Andorra	695
68	Grenada	665
45	Czech Republic	665
61	France	648
141	Russian Federation	646

 

# 그래프 작성하기 
country_list = country_serving_rank.country.tolist()
x_pos = np.arange(len(country_list))
rank = country_serving_rank.total_servings.tolist()

bar_list = plt.bar(x_pos, rank)
bar_list[country_list.index('South Korea')].set_color('r')
plt.ylabel('alcohol rate')
plt.title('liquor drink rank by country')
plt.axis([0, 200, 0, 700])

korea_rank = country_list.index('South Korea')
korea_serving_rate = country_serving_rank[country_serving_rank['country'] == 'South Korea']['total_servings'].values[0]
plt.annotate('South korea :' + str(korea_rank + 1), xy = (korea_rank, korea_serving_rate), 
            xytext = (korea_rank + 10, korea_serving_rate + 0.05),
            arrowprops = dict(facecolor = 'red', shrink = 0.05))
plt.show()

 

반응형
728x90
반응형

drinks.csv
0.00MB

import pandas as pd
drinks = pd.read_csv('drinks.csv')
drinks.info

<bound method DataFrame.info of          country  beer_servings  spirit_servings  wine_servings  \
0    Afghanistan              0                0              0   
1        Albania             89              132             54   
2        Algeria             25                0             14   
3        Andorra            245              138            312   
4         Angola            217               57             45   
..           ...            ...              ...            ...   
188    Venezuela            333              100              3   
189      Vietnam            111                2              1   
190        Yemen              6                0              0   
191       Zambia             32               19              4   
192     Zimbabwe             64               18              4   

     total_litres_of_pure_alcohol continent  
0                             0.0        AS  
1                             4.9        EU  
2                             0.7        AF  
3                            12.4        EU  
4                             5.9        AF  
..                            ...       ...  
188                           7.7        SA  
189                           2.0        AS  
190                           0.1        AS  
191                           2.5        AF  
192                           4.7        AF  

[193 rows x 6 columns]>
drinks.head

<bound method NDFrame.head of          country  beer_servings  spirit_servings  wine_servings  \
0    Afghanistan              0                0              0   
1        Albania             89              132             54   
2        Algeria             25                0             14   
3        Andorra            245              138            312   
4         Angola            217               57             45   
..           ...            ...              ...            ...   
188    Venezuela            333              100              3   
189      Vietnam            111                2              1   
190        Yemen              6                0              0   
191       Zambia             32               19              4   
192     Zimbabwe             64               18              4   

     total_litres_of_pure_alcohol continent  
0                             0.0        AS  
1                             4.9        EU  
2                             0.7        AF  
3                            12.4        EU  
4                             5.9        AF  
..                            ...       ...  
188                           7.7        SA  
189                           2.0        AS  
190                           0.1        AS  
191                           2.5        AF  
192                           4.7        AF  

[193 rows x 6 columns]>
# 피처 상관관계
# 피어스 상관계수 
# 'beer_serving', 'wine_servings'
corr = drinks[['beer_servings', 'wine_servings']].corr(method = 'pearson')
corr


	beer_servings	wine_servings
beer_servings	1.000000	0.527172
wine_servings	0.527172	1.000000
corr = drinks[['beer_servings','spirit_servings', 'wine_servings','total_litres_of_pure_alcohol']].corr(method = 'pearson')
corr

	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol
beer_servings	1.000000	0.458819	0.527172	0.835839
spirit_servings	0.458819	1.000000	0.194797	0.654968
wine_servings	0.527172	0.194797	1.000000	0.667598
total_litres_of_pure_alcohol	0.835839	0.654968	0.667598	1.000000
# 상관계수 시각화
import matplotlib.pyplot as plt
import seaborn as sns
cols_view = ['beer','spirit', 'wine', 'alcohol']
sns.set(font_scale = 1.5)
hm = sns.heatmap(corr.values, cbar = True, annot = True, square=True,
                fmt = '.2f', annot_kws = {'size':15},
                yticklabels = cols_view, xticklabels = cols_view)
plt.show()

hm = sns.pairplot(drinks)
plt.show()

 

drinks.isnull().sum()
# drinks.info()

country                          0
beer_servings                    0
spirit_servings                  0
wine_servings                    0
total_litres_of_pure_alcohol     0
continent                       23
dtype: int64
drinks['continent'] = drinks['continent'].fillna('OT')

drinks['continent'].value_counts()

AF    53
EU    45
AS    44
OT    23
OC    16
SA    12
Name: continent, dtype: int64
# 대륙별 국가수 출력
print(drinks.groupby('continent').count()['country'])

continent
AF    53
AS    44
EU    45
OC    16
OT    23
SA    12
Name: country, dtype: int64
plt.pie(drinks['continent'].value_counts(),
        labels = drinks['continent'].value_counts().index.tolist(),
       autopct='%.0f%%',
       explode = (0,0,0,0.2,0,0),
       shadow=True)
plt.title('null data to "ot"')
plt.show()

drinks.groupby('continent')['spirit_servings'].max()

continent
AF    152
AS    326
EU    373
OC    254
OT    438
SA    302
Name: spirit_servings, dtype: int64
drinks.groupby('continent')['spirit_servings'].mean()

continent
AF     16.339623
AS     60.840909
EU    132.555556
OC     58.437500
OT    165.739130
SA    114.750000
Name: spirit_servings, dtype: float64
drinks.groupby('continent')['spirit_servings'].agg(['mean','min','max','sum'])

	mean	min	max	sum
continent				
AF	16.339623	0	152	866
AS	60.840909	0	326	2677
EU	132.555556	0	373	5965
OC	58.437500	0	254	935
OT	165.739130	68	438	3812
SA	114.750000	25	302	1377
dm = drinks['total_litres_of_pure_alcohol'].mean()
con_mean = drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
con_mean[con_mean >= dm]

continent
EU    8.617778
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64
dmax = drinks.groupby('continent')['beer_servings'].mean()
dmax[dmax == dmax.max()]

continent
EU    193.777778
Name: beer_servings, dtype: float64
drinks.groupby('continent')['beer_servings'].mean().idxmax()

# 'EU'
drinks.groupby('continent')['beer_servings'].mean().idxmin()

# 'AS'
result

	mean	min	max	sum
continent				
AF	16.339623	0	152	866
AS	60.840909	0	326	2677
EU	132.555556	0	373	5965
OC	58.437500	0	254	935
OT	165.739130	68	438	3812
SA	114.750000	25	302	1377
result.index
# Index(['AF', 'AS', 'EU', 'OC', 'OT', 'SA'], dtype='object', name='continent')
import numpy as np
# result = drinks.groupby('continent')['beer_servings'].agg(['mean', 'min', 'max', 'sum']
means = result['mean'].tolist() 
mins = result['min'].tolist() 
maxs = result['max'].tolist()  
sums = result['sum'].tolist()                                                            
index = np.arange(len(result.index))
bar_width = 0.1
rects1 = plt.bar(index, means, bar_width, color = 'r', label = 'Mean')
rects2 = plt.bar(index, mins, bar_width, color = 'g', label = 'Min')
rects3 = plt.bar(index, maxs, bar_width, color = 'b', label = 'Max')
rects4 = plt.bar(index, sums, bar_width, color = 'y', label = 'Sum')
plt.xticks(index, result.index.tolist())
plt.legend(loc="best")
plt.show()

# 대륙별 total_litres_of_pure_alcohol 섭취량 평균을 시각화
import numpy as np
continent_mean = drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
total_mean = drinks.total_litres_of_pure_alcohol.mean()

continents = continent_mean.index.tolist()
continents.append('Mean')

x_pos = np.arange(len(continents))
alcohol = continent_mean.tolist()
alcohol.append(total_mean)

bar_list = plt.bar(x_pos, alcohol, align = 'center', alpha = 0.5)
bar_list[len(continents)-1].set_color('r')
plt.plot([0., 6], [total_mean, total_mean], "k--")
plt.xticks(x_pos, continents)
plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by continent')
plt.show()

# 대륙별 beer_serving 합계를 막대그래프로 시각화
# eu 막대의 색상을 빨강색으로 변경하기
# 전체 맥주 소비량 합계의 평균을 구해서 막대 그래프에 추가
# 평균선을 출력하기, 막대 색상은 노랑색
# 평균 선은 검정색("k--")
beer_sum = drinks.groupby('continent')['beer_servings'].sum()
beer_sum

continent
AF    3258
AS    1630
EU    8720
OC    1435
OT    3345
SA    2101
Name: beer_servings, dtype: int64
beer_mean = beer_sum.mean()
beer_mean
# 3414.8333333333335
continents = beer_sum.index.tolist()
continents.append("Mean")
continents

# ['AF', 'AS', 'EU', 'OC', 'OT', 'SA', 'Mean']
x_pos = np.arange(len(continents))
alcohol = beer_sum.tolist()
alcohol.append(beer_mean)
alcohol

[3258, 1630, 8720, 1435, 3345, 2101, 3414.8333333333335]

 

bar_list = plt.bar(x_pos, alcohol, align='center', alpha = 0.5)
bar_list[2].set_color("r")

 

반응형

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

20. 서울시 인구분석 || 다중회귀  (0) 2021.11.23
19. 세계음주데이터2  (0) 2021.11.23
17. 서울 기온 분석  (0) 2021.11.02
16. EDA, 멕시코식당 주문 CHIPOTLE  (0) 2021.10.28
15. 스크래핑  (0) 2021.10.28
728x90
반응형

seoul_5.csv
1.13MB

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('seoul_5.csv', encoding = 'cp949')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40221 entries, 0 to 40220
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   날짜       40221 non-null  object 
 1   지점       40221 non-null  int64  
 2   평균기온(℃)  39465 non-null  float64
 3   최저기온(℃)  39464 non-null  float64
 4   최고기온(℃)  39463 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 1.5+ MB
df.describe()

	지점	평균기온(℃)	최저기온(℃)	최고기온(℃)
count	40221.0	39465.000000	39464.000000	39463.000000
mean	108.0	11.704019	7.406393	16.716083
std	0.0	10.668056	10.891154	10.998383
min	108.0	-19.200000	-23.100000	-16.300000
25%	108.0	2.600000	-1.500000	7.200000
50%	108.0	12.900000	8.000000	18.600000
75%	108.0	21.200000	17.000000	26.200000
max	108.0	33.700000	30.300000	39.600000
df

	날짜	지점	평균기온(℃)	최저기온(℃)	최고기온(℃)
0	1907-10-01	108	13.5	7.9	20.7
1	1907-10-02	108	16.2	7.9	22.0
2	1907-10-03	108	16.2	13.1	21.3
3	1907-10-04	108	16.5	11.2	22.0
4	1907-10-05	108	17.6	10.9	25.4
...	...	...	...	...	...
40216	2019-01-13	108	1.2	-3.0	7.6
40217	2019-01-14	108	1.4	-2.4	5.3
40218	2019-01-15	108	-1.7	-7.2	2.6
40219	2019-01-16	108	-5.2	-10.1	-1.1
40220	2019-01-17	108	-0.3	-3.2	4.0
# 결측치 제거
# 날짜 dateTime => 월일 잘라내기
df['날짜'] = pd.to_datetime(df['날짜'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40221 entries, 0 to 40220
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   날짜       40221 non-null  datetime64[ns]
 1   지점       40221 non-null  int64         
 2   평균기온(℃)  39465 non-null  float64       
 3   최저기온(℃)  39464 non-null  float64       
 4   최고기온(℃)  39463 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 1.5 MB
df['Year'] = df['날짜'].dt.year
df['Month'] = df['날짜'].dt.month
df['Day'] = df['날짜'].dt.day
df.head()


날짜	지점	평균기온(℃)	최저기온(℃)	최고기온(℃)	Year	Month	Day
0	1907-10-01	108	13.5	7.9	20.7	1907	10	1
1	1907-10-02	108	16.2	7.9	22.0	1907	10	2
2	1907-10-03	108	16.2	13.1	21.3	1907	10	3
3	1907-10-04	108	16.5	11.2	22.0	1907	10	4
4	1907-10-05	108	17.6	10.9	25.4	1907	10	5
df.columns = ['날짜','지점','평균기온','최저기온','최고기온','Year','Month','Day']
df.head()

	날짜	지점	평균기온	최저기온	최고기온	Year	Month	Day
0	1907-10-01	108	13.5	7.9	20.7	1907	10	1
1	1907-10-02	108	16.2	7.9	22.0	1907	10	2
2	1907-10-03	108	16.2	13.1	21.3	1907	10	3
3	1907-10-04	108	16.5	11.2	22.0	1907	10	4
4	1907-10-05	108	17.6	10.9	25.4	1907	10	5
df_0214 = df[(df['Month'] == 2)&(df['Day'] == 14)]
df_0214

	날짜	지점	평균기온	최저기온	최고기온	Year	Month	Day
136	1908-02-14	108	-3.3	-7.5	2.3	1908	2	14
502	1909-02-14	108	2.6	-4.5	8.8	1909	2	14
867	1910-02-14	108	-3.1	-10.1	2.8	1910	2	14
1232	1911-02-14	108	0.8	0.0	3.5	1911	2	14
1597	1912-02-14	108	6.3	0.9	11.2	1912	2	14
...	...	...	...	...	...	...	...	...
38422	2014-02-14	108	2.7	-0.7	7.6	2014	2	14
38787	2015-02-14	108	2.0	-3.1	6.6	2015	2	14
39152	2016-02-14	108	-2.6	-6.8	5.8	2016	2	14
39518	2017-02-14	108	0.3	-4.0	6.5	2017	2	14
39883	2018-02-14	108	3.5	-0.7	8.7	2018	2	14
110 rows × 8 columns
df_year = df_0214['Year']
df_high = df_0214['최고기온']
df_low = df_0214['최저기온']
plt.style.use('ggplot')
fig = plt.figure(figsize=(20,5))
ax = fig.add_subplot(1,1,1) 

ax.plot(df_year, df_0214['최고기온'], label = df['날짜'])
ax.plot(df_year, df_0214['최저기온'])
ax.set_title('2월 14일 기온 변화')
ax.set_xticklabels(df['Year'], rotation = 90)
ax.set_xlabel('year')
ax.set_ylabel('temperture')
plt.show()

df.head()

	날짜	지점	평균기온	최저기온	최고기온	Year	Month	Day
0	1907-10-01	108	13.5	7.9	20.7	1907	10	1
1	1907-10-02	108	16.2	7.9	22.0	1907	10	2
2	1907-10-03	108	16.2	13.1	21.3	1907	10	3
3	1907-10-04	108	16.5	11.2	22.0	1907	10	4
4	1907-10-05	108	17.6	10.9	25.4	1907	10	5
plt.style.use('ggplot')
plt.hist(df['평균기온'], bins = 100, color = 'r')
plt.show()

df.isnull().sum()

날짜         0
지점         0
평균기온     756
최저기온     757
최고기온     758
Year       0
Month      0
Day        0
dtype: int64
df = df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
df.isnull().sum()

날짜       0
지점       0
평균기온     0
최저기온     0
최고기온     0
Year     0
Month    0
Day      0
dtype: int64
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39463 entries, 0 to 40220
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   날짜      39463 non-null  datetime64[ns]
 1   지점      39463 non-null  int64         
 2   평균기온    39463 non-null  float64       
 3   최저기온    39463 non-null  float64       
 4   최고기온    39463 non-null  float64       
 5   Year    39463 non-null  int64         
 6   Month   39463 non-null  int64         
 7   Day     39463 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(4)
memory usage: 2.7 MB
plt.style.use('ggplot')
plt.boxplot(df['평균기온'])
plt.title("1907년부터 2019년까지 서울의 평균기온")

# Text(0.5, 1.0, '1907년부터 2019년까지 서울의 평균기온')

폰트 설정하면 글자 깨지는 것 보완가능

 

 

반응형

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

19. 세계음주데이터2  (0) 2021.11.23
18. 세계음주 데이터 분석  (0) 2021.11.03
16. EDA, 멕시코식당 주문 CHIPOTLE  (0) 2021.10.28
15. 스크래핑  (0) 2021.10.28
14. Stockprice (2  (0) 2021.10.26
728x90
반응형

멕시코식당 주문 CHIPOTLE

1. 탐색 : 기본정보 살펴보기

  • 데이터셋의 기본 정보
  • 데이터셋의 행과열, 데이터
  • 데이터셋의 수치적 특징 파악
  • tsv 셀구분 \t tab 구분 sep = \t설정
import pandas as pd
df = pd.read_csv('chipotle.tsv', sep ='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB
df.shape
# (4622, 5)

df.head()
	order_id	quantity	item_name	choice_description	item_price
0	1	1	Chips and Fresh Tomato Salsa	NaN	$2.39
1	1	1	Izze	[Clementine]	$3.39
2	1	1	Nantucket Nectar	[Apple]	$3.39
3	1	1	Chips and Tomatillo-Green Chili Salsa	NaN	$2.39
4	2	2	Chicken Bowl	[Tomatillo-Red Chili Salsa (Hot), [Black Beans...	$16.98
df.columns
Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')
      
df.index
# RangeIndex(start=0, stop=4622, step=1)

수치적 특징 파악

df.describe()
	order_id	quantity
count	4622.000000	4622.000000
mean	927.254868	1.075725
std	528.890796	0.410186
min	1.000000	1.000000
25%	477.250000	1.000000
50%	926.000000	1.000000
75%	1393.000000	1.000000
max	1834.000000	15.000000
# df['order_id'] = str(df['order_id'])
df['order_id'] = df['order_id'].astype(str)

df.describe()
	quantity
count	4622.000000
mean	1.075725
std	0.410186
min	1.000000
25%	1.000000
50%	1.000000
75%	1.000000
max	15.000000
# item_name 종류 출력하기
df['item_name'].unique()
print(len(df['item_name'].unique()))
# 50

탐색과 시각화

# item별 주문 수량 조회 가장많이 주문 top10개 출력
df['item_name'].value_counts()[:10]

Chicken Bowl                    726
Chicken Burrito                 553
Chips and Guacamole             479
Steak Burrito                   368
Canned Soft Drink               301
Chips                           211
Steak Bowl                      211
Bottled Water                   162
Chicken Soft Tacos              115
Chips and Fresh Tomato Salsa    110
Name: item_name, dtype: int64
# 아이템당 주문 갯수 조회
print(df.groupby('item_name')['order_id'].count())
item_name
6 Pack Soft Drink                         54
Barbacoa Bowl                             66
Barbacoa Burrito                          91
Barbacoa Crispy Tacos                     11
Barbacoa Salad Bowl                       10
Barbacoa Soft Tacos                       25
Bottled Water                            162
Bowl                                       2
Burrito                                    6
Canned Soda                              104
Canned Soft Drink                        301
Carnitas Bowl                             68
Carnitas Burrito                          59
Carnitas Crispy Tacos                      7
Carnitas Salad                             1
Carnitas Salad Bowl                        6
Carnitas Soft Tacos                       40
Chicken Bowl                             726
Chicken Burrito                          553
Chicken Crispy Tacos                      47
Chicken Salad                              9
Chicken Salad Bowl                       110
Chicken Soft Tacos                       115
Chips                                    211
Chips and Fresh Tomato Salsa             110
Chips and Guacamole                      479
Chips and Mild Fresh Tomato Salsa          1
Chips and Roasted Chili Corn Salsa        22
Chips and Roasted Chili-Corn Salsa        18
Chips and Tomatillo Green Chili Salsa     43
Chips and Tomatillo Red Chili Salsa       48
Chips and Tomatillo-Green Chili Salsa     31
Chips and Tomatillo-Red Chili Salsa       20
Crispy Tacos                               2
Izze                                      20
Nantucket Nectar                          27
Salad                                      2
Side of Chips                            101
Steak Bowl                               211
Steak Burrito                            368
Steak Crispy Tacos                        35
Steak Salad                                4
Steak Salad Bowl                          29
Steak Soft Tacos                          55
Veggie Bowl                               85
Veggie Burrito                            95
Veggie Crispy Tacos                        1
Veggie Salad                               6
Veggie Salad Bowl                         18
Veggie Soft Tacos                          7
Name: order_id, dtype: int64
print(df['order_id'].unique())
print(len(df['order_id'].unique()))
# ['1' '2' '3' ... '1832' '1833' '1834']
# 1834
print(df['order_id'].tail())
# ['1' '2' '3' ... '1832' '1833' '1834']
# 1834
print(df['order_id'].tail())

4617    1833
4618    1833
4619    1834
4620    1834
4621    1834
Name: order_id, dtype: object
# 간단한 시각화
# item당 주문 개수를 시각화로 출력하기
import matplotlib.pyplot as plt
order_count =df.groupby('item_name')['order_id'].count()
order_count = order_count.sort_values(ascending=False)
print(order_count)

item_name
Chicken Bowl                             726
Chicken Burrito                          553
Chips and Guacamole                      479
Steak Burrito                            368
Canned Soft Drink                        301
Steak Bowl                               211
Chips                                    211
Bottled Water                            162
Chicken Soft Tacos                       115
Chicken Salad Bowl                       110
Chips and Fresh Tomato Salsa             110
Canned Soda                              104
Side of Chips                            101
Veggie Burrito                            95
Barbacoa Burrito                          91
Veggie Bowl                               85
Carnitas Bowl                             68
Barbacoa Bowl                             66
Carnitas Burrito                          59
Steak Soft Tacos                          55
6 Pack Soft Drink                         54
Chips and Tomatillo Red Chili Salsa       48
Chicken Crispy Tacos                      47
Chips and Tomatillo Green Chili Salsa     43
Carnitas Soft Tacos                       40
Steak Crispy Tacos                        35
Chips and Tomatillo-Green Chili Salsa     31
Steak Salad Bowl                          29
Nantucket Nectar                          27
Barbacoa Soft Tacos                       25
Chips and Roasted Chili Corn Salsa        22
Chips and Tomatillo-Red Chili Salsa       20
Izze                                      20
Chips and Roasted Chili-Corn Salsa        18
Veggie Salad Bowl                         18
Barbacoa Crispy Tacos                     11
Barbacoa Salad Bowl                       10
Chicken Salad                              9
Veggie Soft Tacos                          7
Carnitas Crispy Tacos                      7
Burrito                                    6
Carnitas Salad Bowl                        6
Veggie Salad                               6
Steak Salad                                4
Salad                                      2
Bowl                                       2
Crispy Tacos                               2
Chips and Mild Fresh Tomato Salsa          1
Veggie Crispy Tacos                        1
Carnitas Salad                             1
Name: order_id, dtype: int64
item_name_list = order_count.index.tolist()
order_cnt = order_count.values.tolist()
plt.style.use('ggplot')
fig = plt.figure(figsize = (20,5))
ax = fig.add_subplot(1,1,1)
ax.bar(item_name_list, order_cnt, align = 'center')
plt.ylabel('ordered_item_count')
plt.xlabel('item Name')
ax.set_xticklabels(item_name_list, rotation = 45)
plt.title('Distribution of all ordered item')
plt.show()

item_qty = df.groupby('item_name')['quantity'].sum()
item_qty = item_qty.sort_values(ascending=False)[:10]

item_name_list = item_qty.index.tolist()
sell_cnt = item_qty.values.tolist()


plt.style.use('ggplot')
fig = plt.figure(figsize = (20,5))
ax = fig.add_subplot(1,1,1)

ax.bar(item_name_list, sell_cnt, align = 'center')
plt.ylabel('selled_item_count')
plt.xlabel('item Name')
ax.set_xticklabels(item_name_list, rotation = 45)
plt.title('Distribution of all sell item')
plt.show()

df['item_price']

0        $2.39 
1        $3.39 
2        $3.39 
3        $2.39 
4       $16.98 
         ...   
4617    $11.75 
4618    $11.75 
4619    $11.25 
4620     $8.75 
4621     $8.75 
Name: item_price, Length: 4622, dtype: object
# 숫자형으로 바꾸려면

df['item_price'] = df['item_price'].apply(lambda x : float(x[1:]))
df['item_price'

0     2.39
1     3.39
2     3.39
3     2.39
4    16.98
Name: item_price, dtype: float64
df['item_price'].describe()

count    4622.000000
mean        7.464336
std         4.245557
min         1.090000
25%         3.390000
50%         8.750000
75%         9.250000
max        44.250000
Name: item_price, dtype: float64
# 주문당 평균 계산금액
df[['order_id','item_price']].head()

	order_id	item_price
0	1	2.39
1	1	3.39
2	1	3.39
3	1	2.39
4	2	16.98
df.groupby('order_id')['item_price'].sum().mean()
# 18.811428571428717
df.groupby('order_id')['item_price'].sum().describe()

count    1834.000000
mean       18.811429
std        11.652512
min        10.080000
25%        12.572500
50%        16.200000
75%        21.960000
max       205.250000
Name: item_price, dtype: float64
order_group_tot = df.groupby('order_id').sum()
order_group_tot

	quantity	item_price
order_id		
1	4	11.56
10	2	13.20
100	2	10.08
1000	2	20.50
1001	2	10.08
...	...	...
995	3	24.95
996	4	43.00
997	2	22.50
998	2	10.88
999	5	29.25

 

results = order_group_tot[order_group_tot.item_price >= 50]
results
	quantity	item_price
order_id		
1006	8	71.40
1051	7	59.35
1360	6	52.18
1443	35	160.74
1449	11	95.39
1454	10	85.24
1483	14	139.00
1559	16	82.44
1586	6	51.20
1660	19	70.25
1764	7	55.55
1786	20	114.30
1825	6	66.50
195	8	81.00
205	12	109.90
343	7	54.05
488	6	50.90
491	10	102.00
511	17	104.59
561	10	52.45
577	11	71.15
691	11	118.25
759	18	86.30
818	9	70.85
848	7	55.45
916	8	52.45
926	23	205.25
953	8	81.14
print(results.index.values)

['1006' '1051' '1360' '1443' '1449' '1454' '1483' '1559' '1586' '1660'
 '1764' '1786' '1825' '195' '205' '343' '488' '491' '511' '561' '577'
 '691' '759' '818' '848' '916' '926' '953']
df[df['order_id'] == '1006']

	order_id	quantity	item_name	choice_description	item_price
2529	1006	1	Chicken Soft Tacos	[Fresh Tomato Salsa, [Sour Cream, Lettuce]]	8.75
2530	1006	1	Veggie Bowl	[Fresh Tomato Salsa, [Rice, Black Beans, Chees...	11.25
2531	1006	1	Steak Bowl	[Fresh Tomato Salsa, [Rice, Cheese, Sour Cream...	9.25
2532	1006	1	Chicken Burrito	[Fresh Tomato Salsa, [Rice, Cheese, Sour Cream...	11.25
2533	1006	1	Chicken Bowl	[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...	8.75
2534	1006	1	Chicken Burrito	[Fresh Tomato Salsa, [Rice, Black Beans, Chees...	8.75
2535	1006	1	Chicken Burrito	[Fresh Tomato Salsa, [Rice, Black Beans, Chees...	11.25
2536	1006	1	Chips	NaN	2.15
df.loc[df['order_id'].isin(results.index.values)]

	order_id	quantity	item_name	choice_description	item_price
448	195	1	Chicken Burrito	[Fresh Tomato Salsa, [Rice, Cheese, Sour Cream...	8.75
449	195	1	Chicken Bowl	[Fresh Tomato Salsa, [Rice, Black Beans, Chees...	8.75
450	195	1	Barbacoa Burrito	[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...	11.75
451	195	1	Chicken Burrito	[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...	8.75
452	195	1	Chicken Burrito	[Fresh Tomato Salsa, [Rice, Black Beans, Chees...	11.25
...	...	...	...	...	...
4590	1825	1	Chicken Bowl	[Roasted Chili Corn Salsa, [Rice, Black Beans,...	11.25
4591	1825	1	Chicken Bowl	[Tomatillo Red Chili Salsa, [Rice, Black Beans...	8.75
4592	1825	1	Barbacoa Burrito	[Tomatillo Red Chili Salsa, [Rice, Fajita Vege...	11.75
4593	1825	1	Carnitas Bowl	[Roasted Chili Corn Salsa, [Rice, Sour Cream, ...	11.75
4594	1825	1	Barbacoa Bowl	[Roasted Chili Corn Salsa, [Pinto Beans, Sour ...	11.75
  • 6명이 해야할 것,
  • 마스크 살수있는 것
  • 세부적으로, 화장실 위치,
  • 내가 정말 필요할까?,
  • 출산율이 떨어지니깐 각지역별 출산율 > 각정보가지고 분석
  • 왜 특이점이 생겼는가? 하나가 아니라 관련된 것 모두 가져오기
  • 얼굴인식, 딥러닝
# 각 단가 구하기
item_two_price = df[df.quantity == 2]
item_two_price[['quantity','item_name','item_price']]

	quantity	item_name	item_price
4	2	Chicken Bowl	16.98
18	2	Canned Soda	2.18
51	2	Canned Soda	2.18
135	2	Chicken Salad Bowl	22.50
148	2	Steak Burrito	17.98
...	...	...	...
4435	2	Chicken Bowl	17.50
4499	2	Canned Soft Drink	2.50
4560	2	Canned Soft Drink	2.50
4561	2	Chicken Salad Bowl	17.50
4582	2	Bottled Water	3.00
price_one_item  = df.groupby('item_name').min()
price_one_item = price_one_item['item_price']
price_one_item

item_name
6 Pack Soft Drink                        6.49
Barbacoa Bowl                            8.69
Barbacoa Burrito                         8.69
Barbacoa Crispy Tacos                    8.99
Barbacoa Salad Bowl                      9.39
Barbacoa Soft Tacos                      8.99
Bottled Water                            1.09
Bowl                                     7.40
Burrito                                  7.40
Canned Soda                              1.09
Canned Soft Drink                        1.25
Carnitas Bowl                            8.99
Carnitas Burrito                         8.69
Carnitas Crispy Tacos                    8.99
Carnitas Salad                           8.99
Carnitas Salad Bowl                      9.39
Carnitas Soft Tacos                      8.99
Chicken Bowl                             8.19
Chicken Burrito                          8.19
Chicken Crispy Tacos                     8.49
Chicken Salad                            8.19
Chicken Salad Bowl                       8.75
Chicken Soft Tacos                       8.49
Chips                                    1.99
Chips and Fresh Tomato Salsa             2.29
Chips and Guacamole                      3.89
Chips and Mild Fresh Tomato Salsa        3.00
Chips and Roasted Chili Corn Salsa       2.95
Chips and Roasted Chili-Corn Salsa       2.39
Chips and Tomatillo Green Chili Salsa    2.95
Chips and Tomatillo Red Chili Salsa      2.95
Chips and Tomatillo-Green Chili Salsa    2.39
Chips and Tomatillo-Red Chili Salsa      2.39
Crispy Tacos                             7.40
Izze                                     3.39
Nantucket Nectar                         3.39
Salad                                    7.40
Side of Chips                            1.69
Steak Bowl                               8.69
Steak Burrito                            8.69
Steak Crispy Tacos                       8.69
Steak Salad                              8.69
Steak Salad Bowl                         9.39
Steak Soft Tacos                         8.99
Veggie Bowl                              8.49
Veggie Burrito                           8.49
Veggie Crispy Tacos                      8.49
Veggie Salad                             8.49
Veggie Salad Bowl                        8.75
Veggie Soft Tacos                        8.49
Name: item_price, dtype: float64
plt.hist(price_one_item)
plt.ylabel('counts')
plt.title('Histogram of item price')
plt.show()

 

price_top10 = price_one_item.sort_values(ascending = False)
price_top10

item_name
Carnitas Salad Bowl                      9.39
Steak Salad Bowl                         9.39
Barbacoa Salad Bowl                      9.39
Barbacoa Soft Tacos                      8.99
Carnitas Salad                           8.99
Carnitas Crispy Tacos                    8.99
Steak Soft Tacos                         8.99
Carnitas Bowl                            8.99
Carnitas Soft Tacos                      8.99
Barbacoa Crispy Tacos                    8.99
Veggie Salad Bowl                        8.75
Chicken Salad Bowl                       8.75
Carnitas Burrito                         8.69
Barbacoa Burrito                         8.69
Barbacoa Bowl                            8.69
Steak Bowl                               8.69
Steak Burrito                            8.69
Steak Salad                              8.69
Steak Crispy Tacos                       8.69
Veggie Soft Tacos                        8.49
Chicken Crispy Tacos                     8.49
Veggie Bowl                              8.49
Veggie Burrito                           8.49
Veggie Crispy Tacos                      8.49
Veggie Salad                             8.49
Chicken Soft Tacos                       8.49
Chicken Salad                            8.19
Chicken Burrito                          8.19
Chicken Bowl                             8.19
Burrito                                  7.40
Bowl                                     7.40
Salad                                    7.40
Crispy Tacos                             7.40
6 Pack Soft Drink                        6.49
Chips and Guacamole                      3.89
Izze                                     3.39
Nantucket Nectar                         3.39
Chips and Mild Fresh Tomato Salsa        3.00
Chips and Roasted Chili Corn Salsa       2.95
Chips and Tomatillo Green Chili Salsa    2.95
Chips and Tomatillo Red Chili Salsa      2.95
Chips and Roasted Chili-Corn Salsa       2.39
Chips and Tomatillo-Green Chili Salsa    2.39
Chips and Tomatillo-Red Chili Salsa      2.39
Chips and Fresh Tomato Salsa             2.29
Chips                                    1.99
Side of Chips                            1.69
Canned Soft Drink                        1.25
Canned Soda                              1.09
Bottled Water                            1.09
Name: item_price, dtype: float64
df.groupby('order_id').sum()["item_price"].sort_values(ascending = False)[:5]

order_id
926     205.25
1443    160.74
1483    139.00
691     118.25
1786    114.30
Name: item_price, dtype: float64
result1 = df.groupby('order_id').sum()
result2 = result1['item_price']
df.groupby('order_id').sum().sort_values(by = 'item_price', ascending = False)[:5]
result1 = df.groupby('order_id').sum()
result2 = result1.sort_values(by="item_price",ascending = False)
df.groupby('order_id').sum().sort_values(by = 'quantity', ascending = False)[:10]

	quantity	item_price
order_id		
1443	35	160.74
926	23	205.25
1786	20	114.30
1660	19	70.25
759	18	86.30
511	17	104.59
1559	16	82.44
1483	14	139.00
205	12	109.90
691	11	118.25
df_salad = df[df['item_name'] == 'Veggie Salad Bowl']
df_salad['item_name']
df_salad.groupby('order_id').sum()
print(len(df_salad.groupby('order_id').sum()))

# 18
df_salad.groupby('order_id').sum().shape[0]
# 18
df_salad = df_salad.drop_duplicates(['item_name', 'order_id'])
print(len(df_salad))
# 18

 

반응형

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

18. 세계음주 데이터 분석  (0) 2021.11.03
17. 서울 기온 분석  (0) 2021.11.02
15. 스크래핑  (0) 2021.10.28
14. Stockprice (2  (0) 2021.10.26
13. Stockprice  (0) 2021.10.26
728x90
반응형
html = '''
<html><body>
<div id='potal'>
<h1>포탈목록</h1>

<ul class = 'items'>
<li><a href="http://www.naver.com">naver</a></li>
<li><a href="http://www.daum.net">daum</a></li>
</ul>
</div>
</body></html>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
links = soup.find_all("a")
for a in links :
    href = a.attrs["href"]
    text = a.string
    print(text, ">", href)
    
# naver > http://www.naver.com
# daum > http://www.daum.net
# "div#potal > h1" : id 속성값이 potal인 div 태그의 하위 태그 중 태그의 이름이 h1 태그 선택
# > : 하위태그, 자식 노드 바로 아래태그
#   : 하위태그 자손노드 아래 태그 전부
h1 = soup.select_one("div#potal > h1").string
print('h1=',h1)
# h1= 포탈목록
li_list = soup.select("div#potal > ul.items > li")
print(type(li_list))
print(li_list)
for li in li_list :
    print(li)
    print("li = ", li.string)
    
<class 'list'>
[<li><a href="http://www.naver.com">naver</a></li>, <li><a href="http://www.daum.net">daum</a></li>]
<li><a href="http://www.naver.com">naver</a></li>
li =  naver
<li><a href="http://www.daum.net">daum</a></li>
li =  daum
from bs4 import BeautifulSoup
import urllib.request as req
url = "https://www.weather.go.kr/weather/forecast/mid-term-rss3.jsp"
res = req.urlopen(url) # 연결
print(res)
soup = BeautifulSoup(res, "html.parser")
title = soup.find("title").string
wf = soup.find("wf").string
print(title)
print(wf)
# 
<http.client.HTTPResponse object at 0x0000028CAB0C6EB0>
기상청 육상 중기예보
○ (강수) 11월 1일(월)~2일(화)은 강원영동에 비가 오겠고, 6일(토)은 제주도에 비가 오겠습니다. <br />○ (기온) 이번 예보기간 아침 기온은 4~13도, 낮 기온은 13~21도로 오늘(28일, 아침최저기온 3~12도, 낮최고기온 18~23도)과 비슷하거나 조금 낮겠습니다.<br /><br />* 이번 예보기간 동안 내륙을 중심으로 낮과 밤의 기온차가 크겠으니, 건강관리에 각별히 유의하기 바랍니다.

# <![CDATA[...]]> :CDATA 섹션 > 순수문자열의 영역
# 마크업언어로 파싱하지 않고 순수문자열로 인식
# <태그>
# 10 > 5 PARSER 가 햇갈려함, 그래서 순수문자열의 영역으로 애매함 방지, CDATA 섹션

 

for w in wf.split("<br />") :
    print(w)
○ (강수) 11월 1일(월)~2일(화)은 강원영동에 비가 오겠고, 6일(토)은 제주도에 비가 오겠습니다. 
○ (기온) 이번 예보기간 아침 기온은 4~13도, 낮 기온은 13~21도로 오늘(28일, 아침최저기온 3~12도, 낮최고기온 18~23도)과 비슷하거나 조금 낮겠습니다.

* 이번 예보기간 동안 내륙을 중심으로 낮과 밤의 기온차가 크겠으니, 건강관리에 각별히 유의하기 바랍니다.
import os.path
if not os.path.exists("forcast.xml") : # 파일없으면
    req.urlretrieve(url, "forcast.xml") # 파일저장
# xml은 메모장으로 가능
# 모든 location 태그의 하위태그 중 city 태그를 조회하기
for location in soup.find_all("location") :
    name = location.find("city").string
    weather = location.find("wf").string
    print(name, weather)
    
서울 구름많음
인천 구름많음
수원 구름많음
파주 구름많음
이천 구름많음
평택 구름많음
춘천 구름많음
원주 구름많음
강릉 맑음
대전 맑음
세종 맑음
홍성 맑음
청주 맑음
충주 맑음
영동 맑음
광주 맑음
목포 맑음
여수 맑음
순천 맑음
광양 맑음
나주 맑음
전주 맑음
군산 맑음
정읍 맑음
남원 맑음
고창 맑음
무주 맑음
부산 맑음
울산 맑음
창원 맑음
진주 맑음
거창 맑음
통영 맑음
대구 맑음
안동 맑음
포항 맑음
경주 맑음
울진 맑음
울릉도 맑음
제주 맑음
서귀포 맑음
반응형

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

17. 서울 기온 분석  (0) 2021.11.02
16. EDA, 멕시코식당 주문 CHIPOTLE  (0) 2021.10.28
14. Stockprice (2  (0) 2021.10.26
13. Stockprice  (0) 2021.10.26
12. titanic (2  (0) 2021.10.26
728x90
반응형
import pandas as pd
pd.set_option('display.max_columns', 10) # 출력할 최대열 개수
pd.set_option('display.max_colwidth', 20) #  출력할 열의 너비
df1 = pd.read_excel('stockprice.xlsx', index_col = 'id', engine = 'openpyxl')
df2 = pd.read_excel('stockvaluation.xlsx', index_col = 'id', engine = 'openpyxl')

# left outer join
df3 = df1.join(df2)
print(df3)

       stock_name          value   price    name           eps       bps  \
id                                                                         
128940       한미약품   59385.666667  421000     NaN           NaN       NaN   
130960     CJ E&M   58540.666667   98900  CJ E&M   6301.333333   54068.0   
138250      엔에스쇼핑   14558.666667   13200     NaN           NaN       NaN   
139480        이마트  239230.833333  254500     이마트  18268.166667  295780.0   
142280     녹십자엠에스     468.833333   10200     NaN           NaN       NaN   
145990        삼양사   82750.000000   82000     삼양사   5741.000000  108090.0   
185750        종근당   40293.666667  100500     종근당   3990.333333   40684.0   
192400      쿠쿠홀딩스  179204.666667  177500     NaN           NaN       NaN   
199800         툴젠   -2514.333333  115400     NaN           NaN       NaN   
204210     모두투어리츠    3093.333333    3475  모두투어리츠     85.166667    5335.0   

              per       pbr  
id                           
128940        NaN       NaN  
130960  15.695091  1.829178  
138250        NaN       NaN  
139480  13.931338  0.860437  
142280        NaN       NaN  
145990  14.283226  0.758627  
185750  25.185866  2.470259  
192400        NaN       NaN  
199800        NaN       NaN  
204210  40.802348  0.651359
# 교집합
df4 = df1.join(df2, how = 'inner')
print(df4)
       stock_name          value   price    name           eps     bps  \
id                                                                       
130960     CJ E&M   58540.666667   98900  CJ E&M   6301.333333   54068   
139480        이마트  239230.833333  254500     이마트  18268.166667  295780   
145990        삼양사   82750.000000   82000     삼양사   5741.000000  108090   
185750        종근당   40293.666667  100500     종근당   3990.333333   40684   
204210     모두투어리츠    3093.333333    3475  모두투어리츠     85.166667    5335   

              per       pbr  
id                           
130960  15.695091  1.829178  
139480  13.931338  0.860437  
145990  14.283226  0.758627  
185750  25.185866  2.470259  
204210  40.802348  0.651359

 

 

 

 

Titanic

# groupby 컬럼을 기준으로 그룹화
import seaborn as sns
titanic = sns.load_dataset('titanic')
df = titanic.loc[:,['age','sex','class','fare','survived']]
print(df.head())

    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0
grouped = df.groupby(['class'])
print(grouped)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000203DE051C40>
for key, group in grouped :
    print('key :', key)
    print('number :', len(group))
    print(group.head())
    
key : First
number : 216
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
6   54.0    male  First  51.8625         0
11  58.0  female  First  26.5500         1
23  28.0    male  First  35.5000         1
key : Second
number : 184
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1
key : Third
number : 491
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0
average = grouped.mean()
print(average)
              age       fare  survived
class                                 
First   38.233441  84.154687  0.629630
Second  29.877630  20.662183  0.472826
Third   25.140620  13.675550  0.242363
stdall = grouped.std()
print(stdall)

              age       fare  survived
class                                 
First   14.802856  78.380373  0.484026
Second  14.001077  13.417399  0.500623
Third   12.495398  11.778142  0.428949
maxage = grouped.age.max()
minage = grouped.age.min()

def max_min(x) :
    return x.max() - x.min() 
    
agg_maxmin = grouped.agg(max_min)
agg_maxmin
	age	fare	survived
class			
First	79.08	512.3292	1
Second	69.33	73.5000	1
Third	73.58	69.5500	1
aggall = grouped.agg(['min','max'])
aggall
	age	sex	fare	survived
min	max	min	max	min	max	min	max
class								
First	0.92	80.0	female	male	0.0	512.3292	0	1
Second	0.67	70.0	female	male	0.0	73.5000	0	1
Third	0.42	74.0	female	male	0.0	69.5500	0	1
aggsep = grouped.agg({'fare' : ['max','min'], 'age' : 'mean'})
aggsep

	fare	age
max	min	mean
class			
First	512.3292	0.0	38.233441
Second	73.5000	0.0	29.877630
Third	69.5500	0.0	25.140620
aggsep = grouped.agg({'fare' :'mean', 'age' :  ['max','min']})
aggsep

	fare	age
mean	max	min
class			
First	84.154687	80.0	0.92
Second	20.662183	70.0	0.67
Third	13.675550	74.0	0.42
groupfilter1 = grouped.filter(lambda x : len(x) >= 200)
print(groupfilter1.head())
print(type(groupfilter1))
print(groupfilter1.info())

    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Int64Index: 707 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       541 non-null    float64 
 1   sex       707 non-null    object  
 2   class     707 non-null    category
 3   fare      707 non-null    float64 
 4   survived  707 non-null    int64   
dtypes: category(1), float64(2), int64(1), object(1)
memory usage: 28.4+ KB
None
groupfilter1 = grouped.apply(lambda x : x.describe())
print(groupfilter1)

                    age        fare    survived
class                                           
First  count  186.000000  216.000000  216.000000
       mean    38.233441   84.154687    0.629630
       std     14.802856   78.380373    0.484026
       min      0.920000    0.000000    0.000000
       25%     27.000000   30.923950    0.000000
       50%     37.000000   60.287500    1.000000
       75%     49.000000   93.500000    1.000000
       max     80.000000  512.329200    1.000000
Second count  173.000000  184.000000  184.000000
       mean    29.877630   20.662183    0.472826
       std     14.001077   13.417399    0.500623
       min      0.670000    0.000000    0.000000
       25%     23.000000   13.000000    0.000000
       50%     29.000000   14.250000    0.000000
       75%     36.000000   26.000000    1.000000
       max     70.000000   73.500000    1.000000
Third  count  355.000000  491.000000  491.000000
       mean    25.140620   13.675550    0.242363
       std     12.495398   11.778142    0.428949
       min      0.420000    0.000000    0.000000
       25%     18.000000    7.750000    0.000000
       50%     24.000000    8.050000    0.000000
       75%     32.000000   15.500000    0.000000
       max     74.000000   69.550000    1.000000
group2 = df.groupby(['class', 'sex'])
gdf = group2.mean()
print(gdf)

                     age        fare  survived
class  sex                                    
First  female  34.611765  106.125798  0.968085
       male    41.281386   67.226127  0.368852
Second female  28.722973   21.970121  0.921053
       male    30.740707   19.741782  0.157407
Third  female  21.750000   16.118810  0.500000
       male    26.507589   12.661633  0.135447
print(gdf.loc['First'])

              age        fare  survived
sex                                    
female  34.611765  106.125798  0.968085
male    41.281386   67.226127  0.368852
print(gdf.loc['First','female'])

age          34.611765
fare        106.125798
survived      0.968085
Name: (First, female), dtype: float64
print(gdf.xs('male',level = 'sex'))

              age       fare  survived
class                                 
First   41.281386  67.226127  0.368852
Second  30.740707  19.741782  0.157407
Third   26.507589  12.661633  0.135447
pdf1 = pd.pivot_table(df, # 비벗할 테이블
                     index = 'class', # 행위치에 들어갈 열
                     columns = 'sex', # 열위치에 들어갈 열
                     values = 'age', # 데이터로 사용할 열
                     aggfunc = 'mean') # 데이터 집계함수
print(pdf1)

sex        female       male
class                       
First   34.611765  41.281386
Second  28.722973  30.740707
Third   21.750000  26.507589
pdf2 = pd.pivot_table(df,
                     index = 'class',
                     columns = 'sex',
                     values = 'survived',
                     aggfunc = ['mean','sum'])
print(pdf2)
print(type(pdf2))

            mean              sum     
sex       female      male female male
class                                 
First   0.968085  0.368852     91   45
Second  0.921053  0.157407     70   17
Third   0.500000  0.135447     72   47
<class 'pandas.core.frame.DataFrame'>
pdf2.loc['First']

      sex   
mean  female     0.968085
      male       0.368852
sum   female    91.000000
      male      45.000000
Name: First, dtype: float64
pdf2.xs('First')
      sex   
mean  female     0.968085
      male       0.368852
sum   female    91.000000
      male      45.000000
Name: First, dtype: float64
print(pdf2.xs(('female'), level='sex', axis=1))

            mean  sum
class                
First   0.968085   91
Second  0.921053   70
Third   0.500000   72
print(pdf2.xs(('mean','female'), axis=1))
class
First     0.968085
Second    0.921053
Third     0.500000
Name: (mean, female), dtype: float64

 

 

반응형

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

16. EDA, 멕시코식당 주문 CHIPOTLE  (0) 2021.10.28
15. 스크래핑  (0) 2021.10.28
13. Stockprice  (0) 2021.10.26
12. titanic (2  (0) 2021.10.26
11. 행정안전부, 연령별 인구 분석  (0) 2021.10.26
728x90
반응형

stockvaluation.xlsx
0.01MB
stockprice.xlsx
0.01MB

import pandas as pd
df1 = pd.read_excel('stockprice.xlsx')
df2 = pd.read_excel('stockvaluation.xlsx')
df3 = pd.concat([df1, df2], axis=1) # 열기준 병합
print(df3)

       id stock_name          value   price      id       name           eps  \
0  128940       한미약품   59385.666667  421000  130960     CJ E&M   6301.333333   
1  130960     CJ E&M   58540.666667   98900  136480         하림    274.166667   
2  138250      엔에스쇼핑   14558.666667   13200  138040    메리츠금융지주   2122.333333   
3  139480        이마트  239230.833333  254500  139480        이마트  18268.166667   
4  142280     녹십자엠에스     468.833333   10200  145990        삼양사   5741.000000   
5  145990        삼양사   82750.000000   82000  161390      한국타이어   5648.500000   
6  185750        종근당   40293.666667  100500  181710  NHN엔터테인먼트   2110.166667   
7  192400      쿠쿠홀딩스  179204.666667  177500  185750        종근당   3990.333333   
8  199800         툴젠   -2514.333333  115400  204210     모두투어리츠     85.166667   
9  204210     모두투어리츠    3093.333333    3475  207940   삼성바이오로직스   4644.166667   

      bps        per       pbr  
0   54068  15.695091  1.829178  
1    3551  11.489362  0.887074  
2   14894   6.313806  0.899691  
3  295780  13.931338  0.860437  
4  108090  14.283226  0.758627  
5   51341   7.453306  0.820007  
6   78434  30.755864  0.827447  
7   40684  25.185866  2.470259  
8    5335  40.802348  0.651359  
9   60099  89.790059  6.938551
result2 = pd.merge(df1, df2)
print(result2)
print(result2.columns)

       id stock_name          value   price    name           eps     bps  \
0  130960     CJ E&M   58540.666667   98900  CJ E&M   6301.333333   54068   
1  139480        이마트  239230.833333  254500     이마트  18268.166667  295780   
2  145990        삼양사   82750.000000   82000     삼양사   5741.000000  108090   
3  185750        종근당   40293.666667  100500     종근당   3990.333333   40684   
4  204210     모두투어리츠    3093.333333    3475  모두투어리츠     85.166667    5335   

         per       pbr  
0  15.695091  1.829178  
1  13.931338  0.860437  
2  14.283226  0.758627  
3  25.185866  2.470259  
4  40.802348  0.651359  
Index(['id', 'stock_name', 'value', 'price', 'name', 'eps', 'bps', 'per',
       'pbr'],
      dtype='object')
# 병합시, 두개의 데이터 프레임을 연결컬럼을 설정하기
# 연결컬럼을 key라고 한다,
# outer 방식
result3 = pd.merge(df1, df2, on='id', how='outer')

print(result3)
       id stock_name          value     price       name           eps  \
0   128940       한미약품   59385.666667  421000.0        NaN           NaN   
1   130960     CJ E&M   58540.666667   98900.0     CJ E&M   6301.333333   
2   138250      엔에스쇼핑   14558.666667   13200.0        NaN           NaN   
3   139480        이마트  239230.833333  254500.0        이마트  18268.166667   
4   142280     녹십자엠에스     468.833333   10200.0        NaN           NaN   
5   145990        삼양사   82750.000000   82000.0        삼양사   5741.000000   
6   185750        종근당   40293.666667  100500.0        종근당   3990.333333   
7   192400      쿠쿠홀딩스  179204.666667  177500.0        NaN           NaN   
8   199800         툴젠   -2514.333333  115400.0        NaN           NaN   
9   204210     모두투어리츠    3093.333333    3475.0     모두투어리츠     85.166667   
10  136480        NaN            NaN       NaN         하림    274.166667   
11  138040        NaN            NaN       NaN    메리츠금융지주   2122.333333   
12  161390        NaN            NaN       NaN      한국타이어   5648.500000   
13  181710        NaN            NaN       NaN  NHN엔터테인먼트   2110.166667   
14  207940        NaN            NaN       NaN   삼성바이오로직스   4644.166667   

         bps        per       pbr  
0        NaN        NaN       NaN  
1    54068.0  15.695091  1.829178  
2        NaN        NaN       NaN  
3   295780.0  13.931338  0.860437  
4        NaN        NaN       NaN  
5   108090.0  14.283226  0.758627  
6    40684.0  25.185866  2.470259  
7        NaN        NaN       NaN  
8        NaN        NaN       NaN  
9     5335.0  40.802348  0.651359  
10    3551.0  11.489362  0.887074  
11   14894.0   6.313806  0.899691  
12   51341.0   7.453306  0.820007  
13   78434.0  30.755864  0.827447  
14   60099.0  89.790059  6.938551  



print(result3.columns)

Index(['id', 'stock_name', 'value', 'price', 'name', 'eps', 'bps', 'per',
       'pbr'],
      dtype='object')
# 변합시 사용되는 키의 이름이 다른 경우
result4 = pd.merge(df1, df2, how='left', left_on='stock_name', right_on='name')
print(result4)
print(result4.columns)

     id_x stock_name          value   price      id_y    name           eps  \
0  128940       한미약품   59385.666667  421000       NaN     NaN           NaN   
1  130960     CJ E&M   58540.666667   98900  130960.0  CJ E&M   6301.333333   
2  138250      엔에스쇼핑   14558.666667   13200       NaN     NaN           NaN   
3  139480        이마트  239230.833333  254500  139480.0     이마트  18268.166667   
4  142280     녹십자엠에스     468.833333   10200       NaN     NaN           NaN   
5  145990        삼양사   82750.000000   82000  145990.0     삼양사   5741.000000   
6  185750        종근당   40293.666667  100500  185750.0     종근당   3990.333333   
7  192400      쿠쿠홀딩스  179204.666667  177500       NaN     NaN           NaN   
8  199800         툴젠   -2514.333333  115400       NaN     NaN           NaN   
9  204210     모두투어리츠    3093.333333    3475  204210.0  모두투어리츠     85.166667   

        bps        per       pbr  
0       NaN        NaN       NaN  
1   54068.0  15.695091  1.829178  
2       NaN        NaN       NaN  
3  295780.0  13.931338  0.860437  
4       NaN        NaN       NaN  
5  108090.0  14.283226  0.758627  
6   40684.0  25.185866  2.470259  
7       NaN        NaN       NaN  
8       NaN        NaN       NaN  
9    5335.0  40.802348  0.651359  
Index(['id_x', 'stock_name', 'value', 'price', 'id_y', 'name', 'eps', 'bps',
       'per', 'pbr'],
      dtype='object')
# df2 데이터셋의 내용을 모두 조회되도록,
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_colwidth', 20)

result5 = pd.merge(df1, df2, how = 'right', left_on = 'stock_name', right_on = 'name')

print(result5)

       id_x stock_name          value     price    id_y       name  \
0  130960.0     CJ E&M   58540.666667   98900.0  130960     CJ E&M   
1       NaN        NaN            NaN       NaN  136480         하림   
2       NaN        NaN            NaN       NaN  138040    메리츠금융지주   
3  139480.0        이마트  239230.833333  254500.0  139480        이마트   
4  145990.0        삼양사   82750.000000   82000.0  145990        삼양사   
5       NaN        NaN            NaN       NaN  161390      한국타이어   
6       NaN        NaN            NaN       NaN  181710  NHN엔터테인먼트   
7  185750.0        종근당   40293.666667  100500.0  185750        종근당   
8  204210.0     모두투어리츠    3093.333333    3475.0  204210     모두투어리츠   
9       NaN        NaN            NaN       NaN  207940   삼성바이오로직스   

            eps     bps        per       pbr  
0   6301.333333   54068  15.695091  1.829178  
1    274.166667    3551  11.489362  0.887074  
2   2122.333333   14894   6.313806  0.899691  
3  18268.166667  295780  13.931338  0.860437  
4   5741.000000  108090  14.283226  0.758627  
5   5648.500000   51341   7.453306  0.820007  
6   2110.166667   78434  30.755864  0.827447  
7   3990.333333   40684  25.185866  2.470259  
8     85.166667    5335  40.802348  0.651359  
9   4644.166667   60099  89.790059  6.938551
price_under = df1[df1['price'] < 50000]
print(price_under)

       id stock_name         value  price
2  138250      엔에스쇼핑  14558.666667  13200
4  142280     녹십자엠에스    468.833333  10200
9  204210     모두투어리츠   3093.333333   3475
result6 = pd.merge(price_under, df2)
print(result6)

       id stock_name        value  price    name        eps   bps        per  \
0  204210     모두투어리츠  3093.333333   3475  모두투어리츠  85.166667  5335  40.802348   

        pbr  
0  0.651359

 

 

 

 

반응형

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

15. 스크래핑  (0) 2021.10.28
14. Stockprice (2  (0) 2021.10.26
12. titanic (2  (0) 2021.10.26
11. 행정안전부, 연령별 인구 분석  (0) 2021.10.26
10. folium 2  (0) 2021.10.26
728x90
반응형
import seaborn as sns
df = sns.load_dataset('titanic')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
# deck열의 nan  개수 확인
nan_deck = df['deck'].value_counts(dropna = False)
nan_deck

NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: deck, dtype: int64
# isnull 누락데이터 여부 누락 True, 아니면 False
df.isnull().sum(axis=0)
# 반대 notnull()
# df.head().isnull().sum(axis=0) # 누락분의 합계

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
# 각열의 nan개수 계산하기
missing_df = df.isnull()
for col in missing_df.columns :
    missing_count = missing_df[col].value_counts()
    try :
        print(col, ':', missing_count[True]) # True 있으면 에러처리됨,
    except :
        print(col, ':', 0) # 예외 처리되면 0, 없음

survived : 0
pclass : 0
sex : 0
age : 177
sibsp : 0
parch : 0
fare : 0
embarked : 2
class : 0
who : 0
adult_male : 0
deck : 688
embark_town : 2
alive : 0
alone : 0
#  dropna nan 500개되는 열  삭제
df_thresh = df.dropna(axis = 1, thresh = 500) 
print(df_thresh.columns)
df_thresh.info

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')
<bound method DataFrame.info of      survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0           0       3    male  22.0      1      0   7.2500        S   Third   
1           1       1  female  38.0      1      0  71.2833        C   First   
2           1       3  female  26.0      0      0   7.9250        S   Third   
3           1       1  female  35.0      1      0  53.1000        S   First   
4           0       3    male  35.0      0      0   8.0500        S   Third   
..        ...     ...     ...   ...    ...    ...      ...      ...     ...   
886         0       2    male  27.0      0      0  13.0000        S  Second   
887         1       1  female  19.0      0      0  30.0000        S   First   
888         0       3  female   NaN      1      2  23.4500        S   Third   
889         1       1    male  26.0      0      0  30.0000        C   First   
890         0       3    male  32.0      0      0   7.7500        Q   Third   

       who  adult_male  embark_town alive  alone  
0      man        True  Southampton    no  False  
1    woman       False    Cherbourg   yes  False  
2    woman       False  Southampton   yes   True  
3    woman       False  Southampton   yes  False  
4      man        True  Southampton    no   True  
..     ...         ...          ...   ...    ...  
886    man        True  Southampton    no   True  
887  woman       False  Southampton   yes   True  
888  woman       False  Southampton    no  False  
889    man        True    Cherbourg   yes   True  
890    man        True   Queenstown    no   True  

[891 rows x 14 columns]>
# nan 값있는 행 삭제
df_age = df.dropna(subset = ['age'], how = 'any', axis = 0)
print(len(df_age))
print(df_age.info())

714
<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     714 non-null    int64   
 1   pclass       714 non-null    int64   
 2   sex          714 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        714 non-null    int64   
 5   parch        714 non-null    int64   
 6   fare         714 non-null    float64 
 7   embarked     712 non-null    object  
 8   class        714 non-null    category
 9   who          714 non-null    object  
 10  adult_male   714 non-null    bool    
 11  deck         184 non-null    category
 12  embark_town  712 non-null    object  
 13  alive        714 non-null    object  
 14  alone        714 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 70.2+ KB
None
# df 데이터 : age 열의 nan 값을 다른 나이 데이터의 평균으로 변경하기
print(df.info())
mean_age = df['age'].mean(axis=0)
df['age'].fillna(mean_age, inplace = True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None
# embarktown 컬럼의 결측값은 컬럼의 값 중 빈도수가 가장 많은 값으로 치환하기
# most_freq = df['embark_town'].value_counts(dropna = True)
most_freq = df['embark_town'].value_counts(dropna = True).idxmax() # 가장 많은 것 출력
print(most_freq)

# Southampton
df['embark_town'].fillna(most_freq, inplace = True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  891 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None
# 재 업로드
df = sns.load_dataset('titanic')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
print(df['embark_town'][825:830])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829            NaN
Name: embark_town, dtype: object
# 결측치를 앞의 값으로 치환
df['embark_town'].fillna(method='ffill', inplace = True)
print(df['embark_town'][825:830])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829     Queenstown
Name: embark_town, dtype: object

 

반응형

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

14. Stockprice (2  (0) 2021.10.26
13. Stockprice  (0) 2021.10.26
11. 행정안전부, 연령별 인구 분석  (0) 2021.10.26
10. folium 2  (0) 2021.10.26
9. tips || '21.06.28.  (0) 2021.10.26
728x90
반응형

age1_utf.csv
1.54MB

# 행안부, 연령별 인구 현황 파일 다운받기, 계, 연령구분단위 1세, 0~100세
import csv
f = open('./age1_utf.csv',encoding="utf8")
data = csv.reader(f)
next(data) # 헤더 제거, 첫줄 읽어버리기
for row in data :
    print(row)
    
['서울특별시  (1100000000)', '9,575,355', '9,575,355', '44,116', '47,612', '51,389', '54,644', '60,226', '66,775', '68,122', '67,105', '72,631', '72,433', '74,702', '69,966', '72,360', '78,681', '74,635', '71,679', '75,052', '80,554', '81,153', '89,544', '108,121', '114,369', '120,701', '133,889', '142,513', '155,641', '160,766', '166,742', '172,551', '170,425', '155,942', '149,727', '146,537', '138,384', '137,327', '138,403', '134,457', '144,175', '151,702', '158,712', '157,393', '155,800', '139,387', '137,722', '136,405', '139,782', '148,291', '158,851', '163,010', '166,672', '171,150', '162,638', '166,009', '149,296', '141,771', '143,737', '145,888', '134,601', '151,699', '143,144', '162,666', '152,287', '142,186', '136,982', '127,836', '123,489', '127,802', '99,709', '99,786', '91,396', '76,633', '85,104', '80,194', '81,207', '79,145', '55,147', '59,598', '56,681', '63,261', '59,700', '46,471', '42,577', '39,035', '33,287', '29,726', '26,231', '21,958', '17,597', '14,591', '12,017', '9,371', '7,894', '6,808', '5,082', '3,741', '2,523', '1,790', '1,469', '1,321', '828', '2,520']
#  인구구조 알고싶은 동 입력
import numpy as np
import csv
import matplotlib.pyplot as plt
f = open('./age1_utf.csv',encoding="utf8")
# 스트림 : 데이터의 이동 통로
# 한번 읽으면 사라짐, 그래서 문제가 될지도, 또 쓰고 싶으면, 별도로 저장해야 함.
data = csv.reader(f)
next(data) # 헤더 제거, 첫줄 읽어버리기
name = input('지역의 이름(읍면동 단위) 입력')

for row in data :
    if name in row[0] :
        name = row[0]
        print(name)
        row = list(map((lambda x : x.replace(',', '')), row)) # 문자열 제거 숫자내부 m 제거해서 정수값 변경가능
        home = np.array(row[3:], dtype = int)
        
plt.style.use('ggplot')        
plt.figure(figsize = (10, 5), dpi = 100)
plt.rc('font', family = 'Malgun Gothic')
plt.title(name + ' 지역의 인구 구조')
plt.plot(home)
plt.show()

# 지역의 이름(읍면동 단위) 입력강남구
# 서울특별시 강남구 (1168000000)

가장 비슷한 인구구조를 가진 그래플와 지역 출력

# 가장 비슷한 인구구조를 가진 그래플와 지역 출력
import numpy as np
import csv
import matplotlib.pyplot as plt
import re
f = open('./age1_utf.csv',encoding="utf8")
# 스트림 : 데이터의 이동 통로
# 한번 읽으면 사라짐, 그래서 문제가 될지도, 또 쓰고 싶으면, 별도로 저장해야 함.
data = csv.reader(f)
next(data) # 헤더 제거, 첫줄 읽어버리기
# name = input('지역의 이름(읍면동 단위) 입력')
data =list(data)
name = '상록'
mn = 1 # 파라미터 초기화, 최소차이 저장
result_name = ''
result = 0

for row in data :
    if name in row[0]:
        row = list(map((lambda x : x.replace(',', '')), row)) # 문자열 제거 숫자내부 m 제거해서 정수값 변경가능
        home = np.array(row[3:], dtype =int) / int(row[2])

for row in data :
    row = list(map((lambda x : x.replace(',', '')), row))
    away = np.array(row[3:], dtype =int) / int(row[2])
    s = np.sum((home - away)**2) # 차이가 마이너스가 될수도, 절대값 개념
    if s < mn and name not in row[0] :
        mn = s #  mn 저장
        result_name = row[0]
        result = away
        
plt.style.use('ggplot')        
plt.figure(figsize = (10, 5), dpi = 100)
plt.rc('font', family = 'Malgun Gothic')
plt.title(name + ' 지역의 인구 구조')
plt.plot(home, label = name) # 선택지역
plt.plot(result, label = result_name) # 가장 비슷한 지역
plt.legend()
plt.show()

 

age_14.csv
1.42MB

import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import re

df = pd.read_csv('age_14.csv', encoding="cp949", index_col = 0)
print(df.head())
df = df.div(df['총인구수'], axis=0)
del df['총인구수'], df['연령구간인구수']
name = '신림동'
a = df.index.str.contains(name) # 신림동 레코드
df2 = df[a]

plt.rc('font', family = 'Malgun Gothic')
df2.T.plot()
plt.show()

                               총인구수  연령구간인구수     0세     1세     2세     3세  \
행정구역                                                                        
서울특별시  (1100000000)          9857426  9857426  61253  70532  74322  72482   
서울특별시 종로구 (1111000000)        154770   154770    652    794    911    895   
서울특별시 종로구 청운효자동(1111051500)    13272    13272     81     82     90    105   
서울특별시 종로구 사직동(1111053000)       9441     9441     43     63     75     68   
서울특별시 종로구 삼청동(1111054000)       2907     2907     12      4     22     13   

                                4세     5세     6세     7세  ...   91세   92세  \
행정구역                                                     ...               
서울특별시  (1100000000)          71688  78941  75386  75929  ...  5892  4695   
서울특별시 종로구 (1111000000)         859   1046    909   1012  ...   149   115   
서울특별시 종로구 청운효자동(1111051500)     97    116    117    116  ...    12    14   
서울특별시 종로구 사직동(1111053000)       76     81     74     83  ...    11    13   
서울특별시 종로구 삼청동(1111054000)       12     13     20     11  ...     5     2   

                              93세   94세   95세   96세   97세   98세  99세  100세 이상  
행정구역                                                                           
서울특별시  (1100000000)          3589  3501  2569  1851  1436  1010  736     5519  
서울특별시 종로구 (1111000000)         95    79    81    60    46    37   26      226  
서울특별시 종로구 청운효자동(1111051500)     7     7     8     3     5     1    1       17  
서울특별시 종로구 사직동(1111053000)       7     5     7     3     6     4    4       17  
서울특별시 종로구 삼청동(1111054000)       3     3     2     2     0     2    1        7  

[5 rows x 103 columns]

 

 

 

 

반응형

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

13. Stockprice  (0) 2021.10.26
12. titanic (2  (0) 2021.10.26
10. folium 2  (0) 2021.10.26
9. tips || '21.06.28.  (0) 2021.10.26
8. iris || '21.06.28.  (0) 2021.10.26

+ Recent posts