728x90
반응형
import seaborn as sns
# Load the Titanic example dataset through seaborn and print its column summary.
df = sns.load_dataset('titanic')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 714 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 class 891 non-null category
9 who 891 non-null object
10 adult_male 891 non-null bool
11 deck 203 non-null category
12 embark_town 889 non-null object
13 alive 891 non-null object
14 alone 891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
# Count each value in the 'deck' column; dropna=False keeps NaN as its own bucket.
nan_deck = df['deck'].value_counts(dropna = False)
nan_deck
NaN 688
C 59
B 47
D 33
E 32
A 15
F 13
G 4
Name: deck, dtype: int64
# isnull() marks missing entries as True and present ones as False;
# summing along axis=0 gives the number of missing values per column.
df.isnull().sum(axis=0)
# notnull() is the inverse of isnull().
# df.head().isnull().sum(axis=0) # missing-value totals for the head() rows only
survived 0
pclass 0
sex 0
age 177
sibsp 0
parch 0
fare 0
embarked 2
class 0
who 0
adult_male 0
deck 688
embark_town 2
alive 0
alone 0
dtype: int64
# Count the NaN values in each column.
missing_df = df.isnull()
for col in missing_df.columns:
    missing_count = missing_df[col].value_counts()
    try:
        # value_counts() has no True entry when the column contains no NaN,
        # so this lookup raises KeyError for fully populated columns.
        print(col, ':', missing_count[True])
    except KeyError:
        # No True label means the column has zero missing values.
        print(col, ':', 0)
survived : 0
pclass : 0
sex : 0
age : 177
sibsp : 0
parch : 0
fare : 0
embarked : 2
class : 0
who : 0
adult_male : 0
deck : 688
embark_town : 2
alive : 0
alone : 0
# Drop the columns that have fewer than 500 non-NaN values
# (thresh is the minimum number of non-null values a column needs to be kept).
df_thresh = df.dropna(axis = 1, thresh = 500)
print(df_thresh.columns)
# NOTE(review): `info` is referenced without parentheses, so this evaluates to the
# bound method (whose repr shows the whole frame) instead of calling info().
df_thresh.info
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
'alone'],
dtype='object')
<bound method DataFrame.info of survived pclass sex age sibsp parch fare embarked class \
0 0 3 male 22.0 1 0 7.2500 S Third
1 1 1 female 38.0 1 0 71.2833 C First
2 1 3 female 26.0 0 0 7.9250 S Third
3 1 1 female 35.0 1 0 53.1000 S First
4 0 3 male 35.0 0 0 8.0500 S Third
.. ... ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 0 13.0000 S Second
887 1 1 female 19.0 0 0 30.0000 S First
888 0 3 female NaN 1 2 23.4500 S Third
889 1 1 male 26.0 0 0 30.0000 C First
890 0 3 male 32.0 0 0 7.7500 Q Third
who adult_male embark_town alive alone
0 man True Southampton no False
1 woman False Cherbourg yes False
2 woman False Southampton yes True
3 woman False Southampton yes False
4 man True Southampton no True
.. ... ... ... ... ...
886 man True Southampton no True
887 woman False Southampton yes True
888 woman False Southampton no False
889 man True Cherbourg yes True
890 man True Queenstown no True
[891 rows x 14 columns]>
# Drop the rows whose 'age' value is NaN.
df_age = df.dropna(subset = ['age'], how = 'any', axis = 0)
print(len(df_age))
# info() prints its summary itself and returns None, so print() also emits a trailing "None".
print(df_age.info())
714
<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 714 non-null int64
1 pclass 714 non-null int64
2 sex 714 non-null object
3 age 714 non-null float64
4 sibsp 714 non-null int64
5 parch 714 non-null int64
6 fare 714 non-null float64
7 embarked 712 non-null object
8 class 714 non-null category
9 who 714 non-null object
10 adult_male 714 non-null bool
11 deck 184 non-null category
12 embark_town 712 non-null object
13 alive 714 non-null object
14 alone 714 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 70.2+ KB
None
# Replace the NaN values in the 'age' column with the mean of the known ages.
print(df.info())
mean_age = df['age'].mean(axis=0)
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['age']: that chained in-place call operates on an intermediate object and
# is not guaranteed to update df (it does not under pandas Copy-on-Write).
df['age'] = df['age'].fillna(mean_age)
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 714 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 class 891 non-null category
9 who 891 non-null object
10 adult_male 891 non-null bool
11 deck 203 non-null category
12 embark_town 889 non-null object
13 alive 891 non-null object
14 alone 891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 891 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 class 891 non-null category
9 who 891 non-null object
10 adult_male 891 non-null bool
11 deck 203 non-null category
12 embark_town 889 non-null object
13 alive 891 non-null object
14 alone 891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None
# Replace the missing values in 'embark_town' with the column's most frequent value.
# value_counts(dropna=True) yields the per-town counts; idxmax() returns the
# label with the highest count.
most_freq = df['embark_town'].value_counts(dropna = True).idxmax()
print(most_freq)
# prints: Southampton
# Assign the result back rather than using fillna(inplace=True) on the selected
# column, which is not guaranteed to modify df (chained in-place operation).
df['embark_town'] = df['embark_town'].fillna(most_freq)
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 891 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 class 891 non-null category
9 who 891 non-null object
10 adult_male 891 non-null bool
11 deck 203 non-null category
12 embark_town 891 non-null object
13 alive 891 non-null object
14 alone 891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None
# Reload the dataset so the missing values filled above are present again.
df = sns.load_dataset('titanic')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 pclass 891 non-null int64
2 sex 891 non-null object
3 age 714 non-null float64
4 sibsp 891 non-null int64
5 parch 891 non-null int64
6 fare 891 non-null float64
7 embarked 889 non-null object
8 class 891 non-null category
9 who 891 non-null object
10 adult_male 891 non-null bool
11 deck 203 non-null category
12 embark_town 889 non-null object
13 alive 891 non-null object
14 alone 891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
# Show 'embark_town' for positions 825-829; the last of these rows is NaN.
print(df['embark_town'][825:830])
825 Queenstown
826 Southampton
827 Cherbourg
828 Queenstown
829 NaN
Name: embark_town, dtype: object
# Replace each missing value with the value from the preceding row (forward fill).
# Series.ffill() is used instead of fillna(method='ffill'), whose `method`
# argument is deprecated, and the result is assigned back to avoid the chained
# in-place call that may leave df unchanged.
df['embark_town'] = df['embark_town'].ffill()
print(df['embark_town'][825:830])
825 Queenstown
826 Southampton
827 Cherbourg
828 Queenstown
829 Queenstown
Name: embark_town, dtype: object
반응형
'Data_Science > Data_Analysis_Py' 카테고리의 다른 글
14. Stockprice (2 (0) | 2021.10.26 |
---|---|
13. Stockprice (0) | 2021.10.26 |
11. 행정안전부, 연령별 인구 분석 (0) | 2021.10.26 |
10. folium 2 (0) | 2021.10.26 |
9. tips || '21.06.28. (0) | 2021.10.26 |