728x90
반응형
import pandas as pd
pd.set_option('display.max_columns', 10) # 출력할 최대열 개수
pd.set_option('display.max_colwidth', 20) #  출력할 열의 너비
df1 = pd.read_excel('stockprice.xlsx', index_col = 'id', engine = 'openpyxl')
df2 = pd.read_excel('stockvaluation.xlsx', index_col = 'id', engine = 'openpyxl')

# left outer join
df3 = df1.join(df2)
print(df3)

       stock_name          value   price    name           eps       bps  \
id                                                                         
128940       한미약품   59385.666667  421000     NaN           NaN       NaN   
130960     CJ E&M   58540.666667   98900  CJ E&M   6301.333333   54068.0   
138250      엔에스쇼핑   14558.666667   13200     NaN           NaN       NaN   
139480        이마트  239230.833333  254500     이마트  18268.166667  295780.0   
142280     녹십자엠에스     468.833333   10200     NaN           NaN       NaN   
145990        삼양사   82750.000000   82000     삼양사   5741.000000  108090.0   
185750        종근당   40293.666667  100500     종근당   3990.333333   40684.0   
192400      쿠쿠홀딩스  179204.666667  177500     NaN           NaN       NaN   
199800         툴젠   -2514.333333  115400     NaN           NaN       NaN   
204210     모두투어리츠    3093.333333    3475  모두투어리츠     85.166667    5335.0   

              per       pbr  
id                           
128940        NaN       NaN  
130960  15.695091  1.829178  
138250        NaN       NaN  
139480  13.931338  0.860437  
142280        NaN       NaN  
145990  14.283226  0.758627  
185750  25.185866  2.470259  
192400        NaN       NaN  
199800        NaN       NaN  
204210  40.802348  0.651359
# 교집합
df4 = df1.join(df2, how = 'inner')
print(df4)
       stock_name          value   price    name           eps     bps  \
id                                                                       
130960     CJ E&M   58540.666667   98900  CJ E&M   6301.333333   54068   
139480        이마트  239230.833333  254500     이마트  18268.166667  295780   
145990        삼양사   82750.000000   82000     삼양사   5741.000000  108090   
185750        종근당   40293.666667  100500     종근당   3990.333333   40684   
204210     모두투어리츠    3093.333333    3475  모두투어리츠     85.166667    5335   

              per       pbr  
id                           
130960  15.695091  1.829178  
139480  13.931338  0.860437  
145990  14.283226  0.758627  
185750  25.185866  2.470259  
204210  40.802348  0.651359

 

 

 

 

Titanic

# groupby 컬럼을 기준으로 그룹화
import seaborn as sns
titanic = sns.load_dataset('titanic')
df = titanic.loc[:,['age','sex','class','fare','survived']]
print(df.head())

    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0
grouped = df.groupby(['class'])
print(grouped)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000203DE051C40>
for key, group in grouped :
    print('key :', key)
    print('number :', len(group))
    print(group.head())
    
key : First
number : 216
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
6   54.0    male  First  51.8625         0
11  58.0  female  First  26.5500         1
23  28.0    male  First  35.5000         1
key : Second
number : 184
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1
key : Third
number : 491
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0
average = grouped.mean()
print(average)
              age       fare  survived
class                                 
First   38.233441  84.154687  0.629630
Second  29.877630  20.662183  0.472826
Third   25.140620  13.675550  0.242363
stdall = grouped.std()
print(stdall)

              age       fare  survived
class                                 
First   14.802856  78.380373  0.484026
Second  14.001077  13.417399  0.500623
Third   12.495398  11.778142  0.428949
maxage = grouped.age.max()
minage = grouped.age.min()

def max_min(x) :
    return x.max() - x.min() 
    
agg_maxmin = grouped.agg(max_min)
agg_maxmin
	age	fare	survived
class			
First	79.08	512.3292	1
Second	69.33	73.5000	1
Third	73.58	69.5500	1
aggall = grouped.agg(['min','max'])
aggall
	age	sex	fare	survived
min	max	min	max	min	max	min	max
class								
First	0.92	80.0	female	male	0.0	512.3292	0	1
Second	0.67	70.0	female	male	0.0	73.5000	0	1
Third	0.42	74.0	female	male	0.0	69.5500	0	1
aggsep = grouped.agg({'fare' : ['max','min'], 'age' : 'mean'})
aggsep

	fare	age
max	min	mean
class			
First	512.3292	0.0	38.233441
Second	73.5000	0.0	29.877630
Third	69.5500	0.0	25.140620
aggsep = grouped.agg({'fare' :'mean', 'age' :  ['max','min']})
aggsep

	fare	age
mean	max	min
class			
First	84.154687	80.0	0.92
Second	20.662183	70.0	0.67
Third	13.675550	74.0	0.42
groupfilter1 = grouped.filter(lambda x : len(x) >= 200)
print(groupfilter1.head())
print(type(groupfilter1))
print(groupfilter1.info())

    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Int64Index: 707 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       541 non-null    float64 
 1   sex       707 non-null    object  
 2   class     707 non-null    category
 3   fare      707 non-null    float64 
 4   survived  707 non-null    int64   
dtypes: category(1), float64(2), int64(1), object(1)
memory usage: 28.4+ KB
None
groupfilter1 = grouped.apply(lambda x : x.describe())
print(groupfilter1)

                    age        fare    survived
class                                           
First  count  186.000000  216.000000  216.000000
       mean    38.233441   84.154687    0.629630
       std     14.802856   78.380373    0.484026
       min      0.920000    0.000000    0.000000
       25%     27.000000   30.923950    0.000000
       50%     37.000000   60.287500    1.000000
       75%     49.000000   93.500000    1.000000
       max     80.000000  512.329200    1.000000
Second count  173.000000  184.000000  184.000000
       mean    29.877630   20.662183    0.472826
       std     14.001077   13.417399    0.500623
       min      0.670000    0.000000    0.000000
       25%     23.000000   13.000000    0.000000
       50%     29.000000   14.250000    0.000000
       75%     36.000000   26.000000    1.000000
       max     70.000000   73.500000    1.000000
Third  count  355.000000  491.000000  491.000000
       mean    25.140620   13.675550    0.242363
       std     12.495398   11.778142    0.428949
       min      0.420000    0.000000    0.000000
       25%     18.000000    7.750000    0.000000
       50%     24.000000    8.050000    0.000000
       75%     32.000000   15.500000    0.000000
       max     74.000000   69.550000    1.000000
group2 = df.groupby(['class', 'sex'])
gdf = group2.mean()
print(gdf)

                     age        fare  survived
class  sex                                    
First  female  34.611765  106.125798  0.968085
       male    41.281386   67.226127  0.368852
Second female  28.722973   21.970121  0.921053
       male    30.740707   19.741782  0.157407
Third  female  21.750000   16.118810  0.500000
       male    26.507589   12.661633  0.135447
print(gdf.loc['First'])

              age        fare  survived
sex                                    
female  34.611765  106.125798  0.968085
male    41.281386   67.226127  0.368852
print(gdf.loc['First','female'])

age          34.611765
fare        106.125798
survived      0.968085
Name: (First, female), dtype: float64
print(gdf.xs('male',level = 'sex'))

              age       fare  survived
class                                 
First   41.281386  67.226127  0.368852
Second  30.740707  19.741782  0.157407
Third   26.507589  12.661633  0.135447
pdf1 = pd.pivot_table(df, # 비벗할 테이블
                     index = 'class', # 행위치에 들어갈 열
                     columns = 'sex', # 열위치에 들어갈 열
                     values = 'age', # 데이터로 사용할 열
                     aggfunc = 'mean') # 데이터 집계함수
print(pdf1)

sex        female       male
class                       
First   34.611765  41.281386
Second  28.722973  30.740707
Third   21.750000  26.507589
pdf2 = pd.pivot_table(df,
                     index = 'class',
                     columns = 'sex',
                     values = 'survived',
                     aggfunc = ['mean','sum'])
print(pdf2)
print(type(pdf2))

            mean              sum     
sex       female      male female male
class                                 
First   0.968085  0.368852     91   45
Second  0.921053  0.157407     70   17
Third   0.500000  0.135447     72   47
<class 'pandas.core.frame.DataFrame'>
pdf2.loc['First']

      sex   
mean  female     0.968085
      male       0.368852
sum   female    91.000000
      male      45.000000
Name: First, dtype: float64
pdf2.xs('First')
      sex   
mean  female     0.968085
      male       0.368852
sum   female    91.000000
      male      45.000000
Name: First, dtype: float64
print(pdf2.xs(('female'), level='sex', axis=1))

            mean  sum
class                
First   0.968085   91
Second  0.921053   70
Third   0.500000   72
print(pdf2.xs(('mean','female'), axis=1))
class
First     0.968085
Second    0.921053
Third     0.500000
Name: (mean, female), dtype: float64

 

 

반응형

'Data_Science > Data_Analysis_Py' 카테고리의 다른 글

16. EDA, 멕시코식당 주문 CHIPOTLE  (0) 2021.10.28
15. 스크래핑  (0) 2021.10.28
13. Stockprice  (0) 2021.10.26
12. titanic (2  (0) 2021.10.26
11. 행정안전부, 연령별 인구 분석  (0) 2021.10.26

+ Recent posts