[Python] Data manipulation with pandas(1)

2 minute read


Pandas

  • Inspecting dataframe
    • .info()
    • .shape
    • .describe()
  • sort
    • .sort_values()
  • select
    • df[['column1', 'column2']]
  • filter
    • df[df[column] == 'condition']
  • mutate
    • df[column] = df[column] + 1

# import data
import seaborn as sns
import pandas as pd

iris = sns.load_dataset('iris')

.info() & .shape & .describe()

  • df.info()
  • df.shape
  • df.describe()
# Print information about iris
print(iris.info())

# Print the shape of iris
print('-'*60)
print(iris.shape)

# Print a short statistical description of iris
print('-'*60)
print(iris.describe())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
------------------------------------------------------------
(150, 5)
------------------------------------------------------------
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

.head() & .columns & .index

  • df.head()
  • df.columns
  • df.index
# Print the values of the first 5 rows of iris
print('-'*60)
print(iris.head(5).values)

# Print the column index of iris
print('-'*60)
print(iris.columns)

# Print the row index of iris
print('-'*60)
print(iris.index)
------------------------------------------------------------
[[5.1 3.5 1.4 0.2 'setosa']
 [4.9 3.0 1.4 0.2 'setosa']
 [4.7 3.2 1.3 0.2 'setosa']
 [4.6 3.1 1.5 0.2 'setosa']
 [5.0 3.6 1.4 0.2 'setosa']]
------------------------------------------------------------
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')
------------------------------------------------------------
RangeIndex(start=0, stop=150, step=1)

.sort_values()

  • single column - df.sort_values("column_name")
  • multiple columns - df.sort_values(["column_name1", "column_name2"])
iris.sort_values('petal_width').head()
sepal_length sepal_width petal_length petal_width species
32 5.2 4.1 1.5 0.1 setosa
13 4.3 3.0 1.1 0.1 setosa
37 4.9 3.6 1.4 0.1 setosa
9 4.9 3.1 1.5 0.1 setosa
12 4.8 3.0 1.4 0.1 setosa
iris.sort_values(['petal_length', 'petal_width'], ascending = [True, False]).head()
sepal_length sepal_width petal_length petal_width species
22 4.6 3.6 1.0 0.2 setosa
13 4.3 3.0 1.1 0.1 setosa
14 5.8 4.0 1.2 0.2 setosa
35 5.0 3.2 1.2 0.2 setosa
16 5.4 3.9 1.3 0.4 setosa

df[['column1', 'column2']]

  • select하는 column이 1개일 경우 square bracket이 1개여도 가능(pandas.Series로 처리됨)
iris[['species', 'sepal_length']].head(5)
species sepal_length
0 setosa 5.1
1 setosa 4.9
2 setosa 4.7
3 setosa 4.6
4 setosa 5.0

df[df['column'] == 'condition']

  • 조건이 두개인 경우 각 조건마다 ()로 묶어야함
  • 조건을 줄 때는 Series로 주어야함
iris[(iris['species']=='setosa') & (iris['sepal_length'] > 5.0)].head(5)

# 위와 같음
# iris_setosa = iris['species']=='setosa'
# sepal_bigger_than_5 = iris['sepal_length'] > 5.0

# iris[iris_setosa & sepal_bigger_than_5].head(5)

# 조건이 복수일 때, 아래 .isin() 활용
iris[(iris['species'].isin(['setosa', 'versicolor'])) & (iris['sepal_length'] > 5.0)]
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
5 5.4 3.9 1.7 0.4 setosa
10 5.4 3.7 1.5 0.2 setosa
14 5.8 4.0 1.2 0.2 setosa
15 5.7 4.4 1.5 0.4 setosa

.isin()

  • filter시 여러 조건을 걸 때 사용
cond = iris['species'].isin(['setosa', 'virginica'])

iris[cond].shape # setosa 50개 virginica 50개
(100, 5)

df['column'] = ...

iris['sepal_agg'] = iris['sepal_length'] * iris['sepal_width']
iris.head(5)
sepal_length sepal_width petal_length petal_width species sepal_agg
0 5.1 3.5 1.4 0.2 setosa 17.85
1 4.9 3.0 1.4 0.2 setosa 14.70
2 4.7 3.2 1.3 0.2 setosa 15.04
3 4.6 3.1 1.5 0.2 setosa 14.26
4 5.0 3.6 1.4 0.2 setosa 18.00