[Python] Data manipulation with pandas(1)
Pandas
- Inspecting dataframe
.info()
.shape
.describe()
- sort
.sort_values()
- select
- df
[['column1', 'column2']]
- df
- filter
- df
[df[column] == 'condition']
- df
- mutate
- df
[column] = df[column] + 1
- df
# import data
import seaborn as sns
import pandas as pd
iris = sns.load_dataset('iris')
.info() & .shape & .describe()
- df
.info()
- df
.shape
- df
.describe()
# Print information about iris
print(iris.info())
# Print the shape of iris
print('-'*60)
print(iris.shape)
# Print a short statistical description of iris
print('-'*60)
print(iris.describe())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sepal_length 150 non-null float64
1 sepal_width 150 non-null float64
2 petal_length 150 non-null float64
3 petal_width 150 non-null float64
4 species 150 non-null object
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
------------------------------------------------------------
(150, 5)
------------------------------------------------------------
sepal_length sepal_width petal_length petal_width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000 1.199333
std 0.828066 0.435866 1.765298 0.762238
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
.head() & .columns & .index
- df
.head()
- df
.columns
- df
.index
# Print the values of the first 5 rows of iris
print('-'*60)
print(iris.head(5).values)
# Print the column index of iris
print('-'*60)
print(iris.columns)
# Print the row index of iris
print('-'*60)
print(iris.index)
------------------------------------------------------------
[[5.1 3.5 1.4 0.2 'setosa']
[4.9 3.0 1.4 0.2 'setosa']
[4.7 3.2 1.3 0.2 'setosa']
[4.6 3.1 1.5 0.2 'setosa']
[5.0 3.6 1.4 0.2 'setosa']]
------------------------------------------------------------
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
'species'],
dtype='object')
------------------------------------------------------------
RangeIndex(start=0, stop=150, step=1)
.sort_values()
- single column - df
.sort_values("column_name")
- multiple column - df
.sort_values(["column_name1", "column_name2"])
iris.sort_values('petal_width').head()
| | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
32 | 5.2 | 4.1 | 1.5 | 0.1 | setosa |
13 | 4.3 | 3.0 | 1.1 | 0.1 | setosa |
37 | 4.9 | 3.6 | 1.4 | 0.1 | setosa |
9 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
12 | 4.8 | 3.0 | 1.4 | 0.1 | setosa |
iris.sort_values(['petal_length', 'petal_width'], ascending = [True, False]).head()
| | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
22 | 4.6 | 3.6 | 1.0 | 0.2 | setosa |
13 | 4.3 | 3.0 | 1.1 | 0.1 | setosa |
14 | 5.8 | 4.0 | 1.2 | 0.2 | setosa |
35 | 5.0 | 3.2 | 1.2 | 0.2 | setosa |
16 | 5.4 | 3.9 | 1.3 | 0.4 | setosa |
df[['column1', 'column2']]
- select하는 column이 1개일 경우 square bracket이 1개여도 가능(pandas.Series로 처리됨)
iris[['species', 'sepal_length']].head(5)
| | species | sepal_length |
---|---|---|
0 | setosa | 5.1 |
1 | setosa | 4.9 |
2 | setosa | 4.7 |
3 | setosa | 4.6 |
4 | setosa | 5.0 |
df[df['column'] == 'condition']
- 조건이 두개인 경우 각 조건마다
()
로 묶어야함 - 조건을 줄 때는 Series로 주어야함
iris[(iris['species']=='setosa') & (iris['sepal_length'] > 5.0)].head(5)
# 위와 같음
# iris_setosa = iris['species']=='setosa'
# sepal_bigger_than_5 = iris['sepal_length'] > 5.0
# iris[iris_setosa & sepal_bigger_than_5].head(5)
# 조건이 복수일 때, 아래 .isin() 활용
iris[(iris['species'].isin(['setosa', 'versicolor'])) & (iris['sepal_length'] > 5.0)]
| | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
5 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
10 | 5.4 | 3.7 | 1.5 | 0.2 | setosa |
14 | 5.8 | 4.0 | 1.2 | 0.2 | setosa |
15 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |
.isin()
- filter시 여러 조건을 걸 때 사용
cond = iris['species'].isin(['setosa', 'virginica'])
iris[cond].shape # setosa 50개 virginica 50개
(100, 5)
df['column'] = value
iris['sepal_agg'] = iris['sepal_length'] * iris['sepal_width']
iris.head(5)
| | sepal_length | sepal_width | petal_length | petal_width | species | sepal_agg |
---|---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa | 17.85 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa | 14.70 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa | 15.04 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa | 14.26 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa | 18.00 |