[Python] Reshaping Data with pandas(3)

5 minute read


Pandas

  • pd.MultiIndex.from_arrays() - setting MultiIndex to a dataframe
  • .stack() - rearrange innermost column index to become the innermost row index (즉, column들을 새로운 하나의 index로 처리)
  • .unstack() - rearrange innermost row index to become the innermost column index
    • innermost column or row index have the highest level
  • .swaplevel() - exchange row/column between different levels

pd.MultiIndex.from_arrays()

# import pkg & dataset
import pandas as pd

churn = pd.read_csv("churn-bigml-20.csv")
df3 = churn[["Area code", "Total day calls", "Total day charge", "Total night calls", "Total night charge"]]
df3 = df3.iloc[:4]
df3
Area code Total day calls Total day charge Total night calls Total night charge
0 408 97 31.37 90 9.71
1 415 137 21.95 111 9.40
2 415 67 56.59 128 7.23
3 415 103 18.77 105 8.53

# predefine index to use as index
new_index = [['California', 'California', 'New York', 'Ohio'], 
             ['Los Angeles', 'San Francisco', 'New York', 'Cleveland']]

# create a multi-level index using predefined new_index
multi_index = pd.MultiIndex.from_arrays(new_index, names=['state', 'city'])

# Assign the new index to the churn index
df3.index = multi_index

df3
Area code Total day calls Total day charge Total night calls Total night charge
state city
California Los Angeles 408 97 31.37 90 9.71
San Francisco 415 137 21.95 111 9.40
New York New York 415 67 56.59 128 7.23
Ohio Cleveland 415 103 18.77 105 8.53

.stack()

  • level을 통해 어떤 column을 stack할 것인지 정할 수 있음
  • stack의 dropna=True가 default임
# dataset
df4 = df3.drop(columns=["Area code"])
df4.reset_index(inplace=True)
df4_melt = df4.melt(id_vars=["state", "city"])
df4_melt["variable"] = df4_melt["variable"].str.replace("Total ", "")
df4_melt[["time", "feature"]] = df4_melt["variable"].str.split(" ", expand=True)
df4_pivot = df4_melt.pivot(index=["state","city"], columns=["time", "feature"], values="value")
df4_pivot
time day night
feature calls charge calls charge
state city
California Los Angeles 97.0 31.37 90.0 9.71
San Francisco 137.0 21.95 111.0 9.40
New York New York 67.0 56.59 128.0 7.23
Ohio Cleveland 103.0 18.77 105.0 8.53
# default는 innermost column
df4_pivot.stack()
time day night
state city feature
California Los Angeles calls 97.00 90.00
charge 31.37 9.71
San Francisco calls 137.00 111.00
charge 21.95 9.40
New York New York calls 67.00 128.00
charge 56.59 7.23
Ohio Cleveland calls 103.00 105.00
charge 18.77 8.53
# dataset 준비
df5_melt = df4.drop(columns=["Total night charge"]).melt(id_vars=["state", "city"])
df5_melt["variable"] = df5_melt["variable"].str.replace("Total ", "")
df5_melt[["time", "feature"]] = df4_melt["variable"].str.split(" ", expand=True)
df5_pivot = df5_melt.pivot(index=["state","city"], columns=["time", "feature"], values="value")
df5_pivot
time day night
feature calls charge calls
state city
California Los Angeles 97.0 31.37 90.0
San Francisco 137.0 21.95 111.0
New York New York 67.0 56.59 128.0
Ohio Cleveland 103.0 18.77 105.0
# dropna=False를 통해 없는 default로 사라진 행까지 생성되게 할 수 있으며, fillna를 통해 빈 값을 채울 수 있음
df5_pivot.stack(dropna=False).fillna("채움")
time day night
state city feature
California Los Angeles calls 97.00 90.0
charge 31.37 채움
San Francisco calls 137.00 111.0
charge 21.95 채움
New York New York calls 67.00 128.0
charge 56.59 채움
Ohio Cleveland calls 103.00 105.0
charge 18.77 채움
# level을 통해 어떤 열을 stack할 것인지 명시 가능
df4_pivot.stack(level=1)
# 위와 같음
df4_pivot.stack(level="feature")
time day night
state city feature
California Los Angeles calls 97.00 90.00
charge 31.37 9.71
San Francisco calls 137.00 111.00
charge 21.95 9.40
New York New York calls 67.00 128.00
charge 56.59 7.23
Ohio Cleveland calls 103.00 105.00
charge 18.77 8.53
# 복수의 column index stack
df4_pivot.stack(level=[0,1])
# # 위와 같음
# df4_pivot.stack(level=["time", "feature"])
state       city           time   feature
California  Los Angeles    day    calls       97.00
                                  charge      31.37
                           night  calls       90.00
                                  charge       9.71
            San Francisco  day    calls      137.00
                                  charge      21.95
                           night  calls      111.00
                                  charge       9.40
New York    New York       day    calls       67.00
                                  charge      56.59
                           night  calls      128.00
                                  charge       7.23
Ohio        Cleveland      day    calls      103.00
                                  charge      18.77
                           night  calls      105.00
                                  charge       8.53
dtype: float64

.unstack()

  • level을 통해 어떤 column을 unstack할 것인지 정할 수 있음
  • fill_value를 통해 unstack 시 발생하는 NaN을 다른 값으로 채울 수 있음
# default는 innermost row
df4_pivot.unstack()
time day night
feature calls charge calls charge
city Cleveland Los Angeles New York San Francisco Cleveland Los Angeles New York San Francisco Cleveland Los Angeles New York San Francisco Cleveland Los Angeles New York San Francisco
state
California NaN 97.0 NaN 137.0 NaN 31.37 NaN 21.95 NaN 90.0 NaN 111.0 NaN 9.71 NaN 9.4
New York NaN NaN 67.0 NaN NaN NaN 56.59 NaN NaN NaN 128.0 NaN NaN NaN 7.23 NaN
Ohio 103.0 NaN NaN NaN 18.77 NaN NaN NaN 105.0 NaN NaN NaN 8.53 NaN NaN NaN
# fill_value=0을 통해 missing value 채울 수 있음
df4_pivot.unstack(fill_value=0)
time day night
feature calls charge calls charge
city Cleveland Los Angeles New York San Francisco Cleveland Los Angeles New York San Francisco Cleveland Los Angeles New York San Francisco Cleveland Los Angeles New York San Francisco
state
California 0.0 97.0 0.0 137.0 0.00 31.37 0.00 21.95 0.0 90.0 0.0 111.0 0.00 9.71 0.00 9.4
New York 0.0 0.0 67.0 0.0 0.00 0.00 56.59 0.00 0.0 0.0 128.0 0.0 0.00 0.00 7.23 0.0
Ohio 103.0 0.0 0.0 0.0 18.77 0.00 0.00 0.00 105.0 0.0 0.0 0.0 8.53 0.00 0.00 0.0
# level을 통해 어떤 row를 stack할 것인지 명시 가능
df4_pivot.unstack(level=0)
# 위와 같음
# df4_pivot.unstack(level="state")
time day night
feature calls charge calls charge
state California New York Ohio California New York Ohio California New York Ohio California New York Ohio
city
Cleveland NaN NaN 103.0 NaN NaN 18.77 NaN NaN 105.0 NaN NaN 8.53
Los Angeles 97.0 NaN NaN 31.37 NaN NaN 90.0 NaN NaN 9.71 NaN NaN
New York NaN 67.0 NaN NaN 56.59 NaN NaN 128.0 NaN NaN 7.23 NaN
San Francisco 137.0 NaN NaN 21.95 NaN NaN 111.0 NaN NaN 9.40 NaN NaN
# 복수의 row index unstack
df4_pivot.unstack(level=[0,1])
# # 위와 같음
# df4_pivot.unstack(level=["state", "city"])
time   feature  state       city         
day    calls    California  Cleveland           NaN
                            Los Angeles       97.00
                            New York            NaN
                            San Francisco    137.00
                New York    Cleveland           NaN
                            Los Angeles         NaN
                            New York          67.00
                            San Francisco       NaN
                Ohio        Cleveland        103.00
                            Los Angeles         NaN
                            New York            NaN
                            San Francisco       NaN
       charge   California  Cleveland           NaN
                            Los Angeles       31.37
                            New York            NaN
                            San Francisco     21.95
                New York    Cleveland           NaN
                            Los Angeles         NaN
                            New York          56.59
                            San Francisco       NaN
                Ohio        Cleveland         18.77
                            Los Angeles         NaN
                            New York            NaN
                            San Francisco       NaN
night  calls    California  Cleveland           NaN
                            Los Angeles       90.00
                            New York            NaN
                            San Francisco    111.00
                New York    Cleveland           NaN
                            Los Angeles         NaN
                            New York         128.00
                            San Francisco       NaN
                Ohio        Cleveland        105.00
                            Los Angeles         NaN
                            New York            NaN
                            San Francisco       NaN
       charge   California  Cleveland           NaN
                            Los Angeles        9.71
                            New York            NaN
                            San Francisco      9.40
                New York    Cleveland           NaN
                            Los Angeles         NaN
                            New York           7.23
                            San Francisco       NaN
                Ohio        Cleveland          8.53
                            Los Angeles         NaN
                            New York            NaN
                            San Francisco       NaN
dtype: float64

.swaplevel()

  • axis=1을 통해 column의 level 간 이동이 가능
# row index 간 교환
df4_pivot.swaplevel(0,1)
time day night
feature calls charge calls charge
city state
Los Angeles California 97.0 31.37 90.0 9.71
San Francisco California 137.0 21.95 111.0 9.40
New York New York 67.0 56.59 128.0 7.23
Cleveland Ohio 103.0 18.77 105.0 8.53
# column index 간 교환
df4_pivot.swaplevel(0,1,axis=1)
feature calls charge calls charge
time day day night night
state city
California Los Angeles 97.0 31.37 90.0 9.71
San Francisco 137.0 21.95 111.0 9.40
New York New York 67.0 56.59 128.0 7.23
Ohio Cleveland 103.0 18.77 105.0 8.53