728x90
반응형
import pandas as pd
pd.set_option('display.max_columns', 10) # 출력할 최대열 개수
pd.set_option('display.max_colwidth', 20) # 출력할 열의 너비
df1 = pd.read_excel('stockprice.xlsx', index_col = 'id', engine = 'openpyxl')
df2 = pd.read_excel('stockvaluation.xlsx', index_col = 'id', engine = 'openpyxl')
# left outer join
df3 = df1.join(df2)
print(df3)
stock_name value price name eps bps \
id
128940 한미약품 59385.666667 421000 NaN NaN NaN
130960 CJ E&M 58540.666667 98900 CJ E&M 6301.333333 54068.0
138250 엔에스쇼핑 14558.666667 13200 NaN NaN NaN
139480 이마트 239230.833333 254500 이마트 18268.166667 295780.0
142280 녹십자엠에스 468.833333 10200 NaN NaN NaN
145990 삼양사 82750.000000 82000 삼양사 5741.000000 108090.0
185750 종근당 40293.666667 100500 종근당 3990.333333 40684.0
192400 쿠쿠홀딩스 179204.666667 177500 NaN NaN NaN
199800 툴젠 -2514.333333 115400 NaN NaN NaN
204210 모두투어리츠 3093.333333 3475 모두투어리츠 85.166667 5335.0
per pbr
id
128940 NaN NaN
130960 15.695091 1.829178
138250 NaN NaN
139480 13.931338 0.860437
142280 NaN NaN
145990 14.283226 0.758627
185750 25.185866 2.470259
192400 NaN NaN
199800 NaN NaN
204210 40.802348 0.651359
# 교집합
df4 = df1.join(df2, how = 'inner')
print(df4)
stock_name value price name eps bps \
id
130960 CJ E&M 58540.666667 98900 CJ E&M 6301.333333 54068
139480 이마트 239230.833333 254500 이마트 18268.166667 295780
145990 삼양사 82750.000000 82000 삼양사 5741.000000 108090
185750 종근당 40293.666667 100500 종근당 3990.333333 40684
204210 모두투어리츠 3093.333333 3475 모두투어리츠 85.166667 5335
per pbr
id
130960 15.695091 1.829178
139480 13.931338 0.860437
145990 14.283226 0.758627
185750 25.185866 2.470259
204210 40.802348 0.651359
Titanic
# groupby 컬럼을 기준으로 그룹화
import seaborn as sns
titanic = sns.load_dataset('titanic')
df = titanic.loc[:,['age','sex','class','fare','survived']]
print(df.head())
age sex class fare survived
0 22.0 male Third 7.2500 0
1 38.0 female First 71.2833 1
2 26.0 female Third 7.9250 1
3 35.0 female First 53.1000 1
4 35.0 male Third 8.0500 0
grouped = df.groupby(['class'])
print(grouped)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000203DE051C40>
for key, group in grouped :
print('key :', key)
print('number :', len(group))
print(group.head())
key : First
number : 216
age sex class fare survived
1 38.0 female First 71.2833 1
3 35.0 female First 53.1000 1
6 54.0 male First 51.8625 0
11 58.0 female First 26.5500 1
23 28.0 male First 35.5000 1
key : Second
number : 184
age sex class fare survived
9 14.0 female Second 30.0708 1
15 55.0 female Second 16.0000 1
17 NaN male Second 13.0000 1
20 35.0 male Second 26.0000 0
21 34.0 male Second 13.0000 1
key : Third
number : 491
age sex class fare survived
0 22.0 male Third 7.2500 0
2 26.0 female Third 7.9250 1
4 35.0 male Third 8.0500 0
5 NaN male Third 8.4583 0
7 2.0 male Third 21.0750 0
average = grouped.mean()
print(average)
age fare survived
class
First 38.233441 84.154687 0.629630
Second 29.877630 20.662183 0.472826
Third 25.140620 13.675550 0.242363
stdall = grouped.std()
print(stdall)
age fare survived
class
First 14.802856 78.380373 0.484026
Second 14.001077 13.417399 0.500623
Third 12.495398 11.778142 0.428949
maxage = grouped.age.max()
minage = grouped.age.min()
def max_min(x) :
return x.max() - x.min()
agg_maxmin = grouped.agg(max_min)
agg_maxmin
age fare survived
class
First 79.08 512.3292 1
Second 69.33 73.5000 1
Third 73.58 69.5500 1
aggall = grouped.agg(['min','max'])
aggall
age sex fare survived
min max min max min max min max
class
First 0.92 80.0 female male 0.0 512.3292 0 1
Second 0.67 70.0 female male 0.0 73.5000 0 1
Third 0.42 74.0 female male 0.0 69.5500 0 1
aggsep = grouped.agg({'fare' : ['max','min'], 'age' : 'mean'})
aggsep
fare age
max min mean
class
First 512.3292 0.0 38.233441
Second 73.5000 0.0 29.877630
Third 69.5500 0.0 25.140620
aggsep = grouped.agg({'fare' :'mean', 'age' : ['max','min']})
aggsep
fare age
mean max min
class
First 84.154687 80.0 0.92
Second 20.662183 70.0 0.67
Third 13.675550 74.0 0.42
groupfilter1 = grouped.filter(lambda x : len(x) >= 200)
print(groupfilter1.head())
print(type(groupfilter1))
print(groupfilter1.info())
age sex class fare survived
0 22.0 male Third 7.2500 0
1 38.0 female First 71.2833 1
2 26.0 female Third 7.9250 1
3 35.0 female First 53.1000 1
4 35.0 male Third 8.0500 0
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Int64Index: 707 entries, 0 to 890
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 541 non-null float64
1 sex 707 non-null object
2 class 707 non-null category
3 fare 707 non-null float64
4 survived 707 non-null int64
dtypes: category(1), float64(2), int64(1), object(1)
memory usage: 28.4+ KB
None
groupfilter1 = grouped.apply(lambda x : x.describe())
print(groupfilter1)
age fare survived
class
First count 186.000000 216.000000 216.000000
mean 38.233441 84.154687 0.629630
std 14.802856 78.380373 0.484026
min 0.920000 0.000000 0.000000
25% 27.000000 30.923950 0.000000
50% 37.000000 60.287500 1.000000
75% 49.000000 93.500000 1.000000
max 80.000000 512.329200 1.000000
Second count 173.000000 184.000000 184.000000
mean 29.877630 20.662183 0.472826
std 14.001077 13.417399 0.500623
min 0.670000 0.000000 0.000000
25% 23.000000 13.000000 0.000000
50% 29.000000 14.250000 0.000000
75% 36.000000 26.000000 1.000000
max 70.000000 73.500000 1.000000
Third count 355.000000 491.000000 491.000000
mean 25.140620 13.675550 0.242363
std 12.495398 11.778142 0.428949
min 0.420000 0.000000 0.000000
25% 18.000000 7.750000 0.000000
50% 24.000000 8.050000 0.000000
75% 32.000000 15.500000 0.000000
max 74.000000 69.550000 1.000000
group2 = df.groupby(['class', 'sex'])
gdf = group2.mean()
print(gdf)
age fare survived
class sex
First female 34.611765 106.125798 0.968085
male 41.281386 67.226127 0.368852
Second female 28.722973 21.970121 0.921053
male 30.740707 19.741782 0.157407
Third female 21.750000 16.118810 0.500000
male 26.507589 12.661633 0.135447
print(gdf.loc['First'])
age fare survived
sex
female 34.611765 106.125798 0.968085
male 41.281386 67.226127 0.368852
print(gdf.loc['First','female'])
age 34.611765
fare 106.125798
survived 0.968085
Name: (First, female), dtype: float64
print(gdf.xs('male',level = 'sex'))
age fare survived
class
First 41.281386 67.226127 0.368852
Second 30.740707 19.741782 0.157407
Third 26.507589 12.661633 0.135447
pdf1 = pd.pivot_table(df, # 비벗할 테이블
index = 'class', # 행위치에 들어갈 열
columns = 'sex', # 열위치에 들어갈 열
values = 'age', # 데이터로 사용할 열
aggfunc = 'mean') # 데이터 집계함수
print(pdf1)
sex female male
class
First 34.611765 41.281386
Second 28.722973 30.740707
Third 21.750000 26.507589
pdf2 = pd.pivot_table(df,
index = 'class',
columns = 'sex',
values = 'survived',
aggfunc = ['mean','sum'])
print(pdf2)
print(type(pdf2))
mean sum
sex female male female male
class
First 0.968085 0.368852 91 45
Second 0.921053 0.157407 70 17
Third 0.500000 0.135447 72 47
<class 'pandas.core.frame.DataFrame'>
pdf2.loc['First']
sex
mean female 0.968085
male 0.368852
sum female 91.000000
male 45.000000
Name: First, dtype: float64
pdf2.xs('First')
sex
mean female 0.968085
male 0.368852
sum female 91.000000
male 45.000000
Name: First, dtype: float64
print(pdf2.xs(('female'), level='sex', axis=1))
mean sum
class
First 0.968085 91
Second 0.921053 70
Third 0.500000 72
print(pdf2.xs(('mean','female'), axis=1))
class
First 0.968085
Second 0.921053
Third 0.500000
Name: (mean, female), dtype: float64
반응형
'Data_Science > Data_Analysis_Py' 카테고리의 다른 글
| 16. EDA, 멕시코식당 주문 CHIPOTLE (0) | 2021.10.28 |
|---|---|
| 15. 스크래핑 (0) | 2021.10.28 |
| 13. Stockprice (0) | 2021.10.26 |
| 12. titanic (2 (0) | 2021.10.26 |
| 11. 행정안전부, 연령별 인구 분석 (0) | 2021.10.26 |