[Python] Data manipulation with pandas(1)
Pandas
- Inspecting dataframe
    
.info(), .shape, .describe()
 - sort
    
.sort_values()
 - select
    
- df[['column1', 'column2']]
 - filter
    
- df[df[column] == 'condition']
 - mutate
    
- df[column] = df[column] + 1
 
# import data
import seaborn as sns
import pandas as pd
iris = sns.load_dataset('iris')
.info() & .shape & .describe()
- df.info()
- df.shape
- df.describe()
# Print information about iris
print(iris.info())
# Print the shape of iris
print('-'*60)
print(iris.shape)
# Print a short statistical description of iris
print('-'*60)
print(iris.describe())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
------------------------------------------------------------
(150, 5)
------------------------------------------------------------
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
.head() & .columns & .index
- df.head()
- df.columns
- df.index
# Print the head 5 values of iris
print('-'*60)
print(iris.head(5).values)
# Print the column index of iris
print('-'*60)
print(iris.columns)
# Print the row index of iris
print('-'*60)
print(iris.index)
------------------------------------------------------------
[[5.1 3.5 1.4 0.2 'setosa']
 [4.9 3.0 1.4 0.2 'setosa']
 [4.7 3.2 1.3 0.2 'setosa']
 [4.6 3.1 1.5 0.2 'setosa']
 [5.0 3.6 1.4 0.2 'setosa']]
------------------------------------------------------------
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')
------------------------------------------------------------
RangeIndex(start=0, stop=150, step=1)
.sort_values()
- single column: df.sort_values("column_name")
- multiple columns: df.sort_values(["column_name1", "column_name2"])
iris.sort_values('petal_width').head()
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 32 | 5.2 | 4.1 | 1.5 | 0.1 | setosa | 
| 13 | 4.3 | 3.0 | 1.1 | 0.1 | setosa | 
| 37 | 4.9 | 3.6 | 1.4 | 0.1 | setosa | 
| 9 | 4.9 | 3.1 | 1.5 | 0.1 | setosa | 
| 12 | 4.8 | 3.0 | 1.4 | 0.1 | setosa | 
iris.sort_values(['petal_length', 'petal_width'], ascending = [True, False]).head()
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 22 | 4.6 | 3.6 | 1.0 | 0.2 | setosa | 
| 13 | 4.3 | 3.0 | 1.1 | 0.1 | setosa | 
| 14 | 5.8 | 4.0 | 1.2 | 0.2 | setosa | 
| 35 | 5.0 | 3.2 | 1.2 | 0.2 | setosa | 
| 16 | 5.4 | 3.9 | 1.3 | 0.4 | setosa | 
df[['column1', 'column2']]
- select하는 column이 1개일 경우 square bracket이 1개여도 가능(pandas.Series로 처리됨)
 
iris[['species', 'sepal_length']].head(5)
| | species | sepal_length |
|---|---|---|
| 0 | setosa | 5.1 | 
| 1 | setosa | 4.9 | 
| 2 | setosa | 4.7 | 
| 3 | setosa | 4.6 | 
| 4 | setosa | 5.0 | 
df[df['column'] == 'condition']
- 조건이 두 개인 경우 각 조건마다 ()로 묶어야 함
- 조건을 줄 때는 Series로 주어야 함
 
iris[(iris['species']=='setosa') & (iris['sepal_length'] > 5.0)].head(5)
# 위와 같음
# iris_setosa = iris['species']=='setosa'
# sepal_bigger_than_5 = iris['sepal_length'] > 5.0
# iris[iris_setosa & sepal_bigger_than_5].head(5)
# 조건이 복수일 때, 아래 .isin() 활용
iris[(iris['species'].isin(['setosa', 'versicolor'])) & (iris['sepal_length'] > 5.0)]
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa | 
| 5 | 5.4 | 3.9 | 1.7 | 0.4 | setosa | 
| 10 | 5.4 | 3.7 | 1.5 | 0.2 | setosa | 
| 14 | 5.8 | 4.0 | 1.2 | 0.2 | setosa | 
| 15 | 5.7 | 4.4 | 1.5 | 0.4 | setosa | 
.isin()
- filter시 여러 조건을 걸 때 사용
 
cond = iris['species'].isin(['setosa', 'virginica'])
iris[cond].shape # setosa 50개 virginica 50개
(100, 5)
df['column'] = ...
iris['sepal_agg'] = iris['sepal_length'] * iris['sepal_width']
iris.head(5)
| | sepal_length | sepal_width | petal_length | petal_width | species | sepal_agg |
|---|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa | 17.85 | 
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa | 14.70 | 
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa | 15.04 | 
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa | 14.26 | 
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa | 18.00 |