[Python] Reshaping Data with pandas(3)
Pandas
pd.MultiIndex.from_arrays()
- setting MultiIndex to a dataframe.stack()
- rearrange innermost column index to become the innermost row index (즉, column들을 새로운 하나의 index로 처리).unstack()
- rearrange innermost row index to become the innermost column index- innermost column or row index have the highest level
.swaplevel()
- exchange row/column between different levels
pd.MultiIndex.from_arrays()
# import pkg & dataset
import pandas as pd
churn = pd.read_csv("churn-bigml-20.csv")
df3 = churn[["Area code", "Total day calls", "Total day charge", "Total night calls", "Total night charge"]]
df3 = df3.iloc[:4]
df3
Area code | Total day calls | Total day charge | Total night calls | Total night charge | |
---|---|---|---|---|---|
0 | 408 | 97 | 31.37 | 90 | 9.71 |
1 | 415 | 137 | 21.95 | 111 | 9.40 |
2 | 415 | 67 | 56.59 | 128 | 7.23 |
3 | 415 | 103 | 18.77 | 105 | 8.53 |
# predefine index to use as index
new_index = [['California', 'California', 'New York', 'Ohio'],
['Los Angeles', 'San Francisco', 'New York', 'Cleveland']]
# create a multi-level index using predefined new_index
multi_index = pd.MultiIndex.from_arrays(new_index, names=['state', 'city'])
# Assign the new index to the churn index
df3.index = multi_index
df3
Area code | Total day calls | Total day charge | Total night calls | Total night charge | ||
---|---|---|---|---|---|---|
state | city | |||||
California | Los Angeles | 408 | 97 | 31.37 | 90 | 9.71 |
San Francisco | 415 | 137 | 21.95 | 111 | 9.40 | |
New York | New York | 415 | 67 | 56.59 | 128 | 7.23 |
Ohio | Cleveland | 415 | 103 | 18.77 | 105 | 8.53 |
.stack()
level
을 통해 어떤 column을 stack할 것인지 정할 수 있음- stack의 dropna=True가 default임
# dataset
df4 = df3.drop(columns=["Area code"])
df4.reset_index(inplace=True)
df4_melt = df4.melt(id_vars=["state", "city"])
df4_melt["variable"] = df4_melt["variable"].str.replace("Total ", "")
df4_melt[["time", "feature"]] = df4_melt["variable"].str.split(" ", expand=True)
df4_pivot = df4_melt.pivot(index=["state","city"], columns=["time", "feature"], values="value")
df4_pivot
time | day | night | |||
---|---|---|---|---|---|
feature | calls | charge | calls | charge | |
state | city | ||||
California | Los Angeles | 97.0 | 31.37 | 90.0 | 9.71 |
San Francisco | 137.0 | 21.95 | 111.0 | 9.40 | |
New York | New York | 67.0 | 56.59 | 128.0 | 7.23 |
Ohio | Cleveland | 103.0 | 18.77 | 105.0 | 8.53 |
# default는 innermost column
df4_pivot.stack()
time | day | night | ||
---|---|---|---|---|
state | city | feature | ||
California | Los Angeles | calls | 97.00 | 90.00 |
charge | 31.37 | 9.71 | ||
San Francisco | calls | 137.00 | 111.00 | |
charge | 21.95 | 9.40 | ||
New York | New York | calls | 67.00 | 128.00 |
charge | 56.59 | 7.23 | ||
Ohio | Cleveland | calls | 103.00 | 105.00 |
charge | 18.77 | 8.53 |
# dataset 준비
df5_melt = df4.drop(columns=["Total night charge"]).melt(id_vars=["state", "city"])
df5_melt["variable"] = df5_melt["variable"].str.replace("Total ", "")
df5_melt[["time", "feature"]] = df4_melt["variable"].str.split(" ", expand=True)
df5_pivot = df5_melt.pivot(index=["state","city"], columns=["time", "feature"], values="value")
df5_pivot
time | day | night | ||
---|---|---|---|---|
feature | calls | charge | calls | |
state | city | |||
California | Los Angeles | 97.0 | 31.37 | 90.0 |
San Francisco | 137.0 | 21.95 | 111.0 | |
New York | New York | 67.0 | 56.59 | 128.0 |
Ohio | Cleveland | 103.0 | 18.77 | 105.0 |
# dropna=False를 통해 없는 default로 사라진 행까지 생성되게 할 수 있으며, fillna를 통해 빈 값을 채울 수 있음
df5_pivot.stack(dropna=False).fillna("채움")
time | day | night | ||
---|---|---|---|---|
state | city | feature | ||
California | Los Angeles | calls | 97.00 | 90.0 |
charge | 31.37 | 채움 | ||
San Francisco | calls | 137.00 | 111.0 | |
charge | 21.95 | 채움 | ||
New York | New York | calls | 67.00 | 128.0 |
charge | 56.59 | 채움 | ||
Ohio | Cleveland | calls | 103.00 | 105.0 |
charge | 18.77 | 채움 |
# level을 통해 어떤 열을 stack할 것인지 명시 가능
df4_pivot.stack(level=1)
# 위와 같음
df4_pivot.stack(level="feature")
time | day | night | ||
---|---|---|---|---|
state | city | feature | ||
California | Los Angeles | calls | 97.00 | 90.00 |
charge | 31.37 | 9.71 | ||
San Francisco | calls | 137.00 | 111.00 | |
charge | 21.95 | 9.40 | ||
New York | New York | calls | 67.00 | 128.00 |
charge | 56.59 | 7.23 | ||
Ohio | Cleveland | calls | 103.00 | 105.00 |
charge | 18.77 | 8.53 |
# 복수의 column index stack
df4_pivot.stack(level=[0,1])
# # 위와 같음
# df4_pivot.stack(level=["time", "feature"])
state city time feature
California Los Angeles day calls 97.00
charge 31.37
night calls 90.00
charge 9.71
San Francisco day calls 137.00
charge 21.95
night calls 111.00
charge 9.40
New York New York day calls 67.00
charge 56.59
night calls 128.00
charge 7.23
Ohio Cleveland day calls 103.00
charge 18.77
night calls 105.00
charge 8.53
dtype: float64
.unstack()
level
을 통해 어떤 column을 unstack할 것인지 정할 수 있음fill_value
를 통해 unstack 시 발생하는 NaN을 다른 값으로 채울 수 있음
# default는 innermost row
df4_pivot.unstack()
time | day | night | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
feature | calls | charge | calls | charge | ||||||||||||
city | Cleveland | Los Angeles | New York | San Francisco | Cleveland | Los Angeles | New York | San Francisco | Cleveland | Los Angeles | New York | San Francisco | Cleveland | Los Angeles | New York | San Francisco |
state | ||||||||||||||||
California | NaN | 97.0 | NaN | 137.0 | NaN | 31.37 | NaN | 21.95 | NaN | 90.0 | NaN | 111.0 | NaN | 9.71 | NaN | 9.4 |
New York | NaN | NaN | 67.0 | NaN | NaN | NaN | 56.59 | NaN | NaN | NaN | 128.0 | NaN | NaN | NaN | 7.23 | NaN |
Ohio | 103.0 | NaN | NaN | NaN | 18.77 | NaN | NaN | NaN | 105.0 | NaN | NaN | NaN | 8.53 | NaN | NaN | NaN |
# fill_value=0을 통해 missing value 채울 수 있음
df4_pivot.unstack(fill_value=0)
time | day | night | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
feature | calls | charge | calls | charge | ||||||||||||
city | Cleveland | Los Angeles | New York | San Francisco | Cleveland | Los Angeles | New York | San Francisco | Cleveland | Los Angeles | New York | San Francisco | Cleveland | Los Angeles | New York | San Francisco |
state | ||||||||||||||||
California | 0.0 | 97.0 | 0.0 | 137.0 | 0.00 | 31.37 | 0.00 | 21.95 | 0.0 | 90.0 | 0.0 | 111.0 | 0.00 | 9.71 | 0.00 | 9.4 |
New York | 0.0 | 0.0 | 67.0 | 0.0 | 0.00 | 0.00 | 56.59 | 0.00 | 0.0 | 0.0 | 128.0 | 0.0 | 0.00 | 0.00 | 7.23 | 0.0 |
Ohio | 103.0 | 0.0 | 0.0 | 0.0 | 18.77 | 0.00 | 0.00 | 0.00 | 105.0 | 0.0 | 0.0 | 0.0 | 8.53 | 0.00 | 0.00 | 0.0 |
# level을 통해 어떤 row를 stack할 것인지 명시 가능
df4_pivot.unstack(level=0)
# 위와 같음
# df4_pivot.unstack(level="state")
time | day | night | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
feature | calls | charge | calls | charge | ||||||||
state | California | New York | Ohio | California | New York | Ohio | California | New York | Ohio | California | New York | Ohio |
city | ||||||||||||
Cleveland | NaN | NaN | 103.0 | NaN | NaN | 18.77 | NaN | NaN | 105.0 | NaN | NaN | 8.53 |
Los Angeles | 97.0 | NaN | NaN | 31.37 | NaN | NaN | 90.0 | NaN | NaN | 9.71 | NaN | NaN |
New York | NaN | 67.0 | NaN | NaN | 56.59 | NaN | NaN | 128.0 | NaN | NaN | 7.23 | NaN |
San Francisco | 137.0 | NaN | NaN | 21.95 | NaN | NaN | 111.0 | NaN | NaN | 9.40 | NaN | NaN |
# 복수의 row index unstack
df4_pivot.unstack(level=[0,1])
# # 위와 같음
# df4_pivot.unstack(level=["state", "city"])
time feature state city
day calls California Cleveland NaN
Los Angeles 97.00
New York NaN
San Francisco 137.00
New York Cleveland NaN
Los Angeles NaN
New York 67.00
San Francisco NaN
Ohio Cleveland 103.00
Los Angeles NaN
New York NaN
San Francisco NaN
charge California Cleveland NaN
Los Angeles 31.37
New York NaN
San Francisco 21.95
New York Cleveland NaN
Los Angeles NaN
New York 56.59
San Francisco NaN
Ohio Cleveland 18.77
Los Angeles NaN
New York NaN
San Francisco NaN
night calls California Cleveland NaN
Los Angeles 90.00
New York NaN
San Francisco 111.00
New York Cleveland NaN
Los Angeles NaN
New York 128.00
San Francisco NaN
Ohio Cleveland 105.00
Los Angeles NaN
New York NaN
San Francisco NaN
charge California Cleveland NaN
Los Angeles 9.71
New York NaN
San Francisco 9.40
New York Cleveland NaN
Los Angeles NaN
New York 7.23
San Francisco NaN
Ohio Cleveland 8.53
Los Angeles NaN
New York NaN
San Francisco NaN
dtype: float64
.swaplevel()
- axis=1을 통해 column의 level 간 이동이 가능
# row index 간 교환
df4_pivot.swaplevel(0,1)
time | day | night | |||
---|---|---|---|---|---|
feature | calls | charge | calls | charge | |
city | state | ||||
Los Angeles | California | 97.0 | 31.37 | 90.0 | 9.71 |
San Francisco | California | 137.0 | 21.95 | 111.0 | 9.40 |
New York | New York | 67.0 | 56.59 | 128.0 | 7.23 |
Cleveland | Ohio | 103.0 | 18.77 | 105.0 | 8.53 |
# column index 간 교환
df4_pivot.swaplevel(0,1,axis=1)
feature | calls | charge | calls | charge | |
---|---|---|---|---|---|
time | day | day | night | night | |
state | city | ||||
California | Los Angeles | 97.0 | 31.37 | 90.0 | 9.71 |
San Francisco | 137.0 | 21.95 | 111.0 | 9.40 | |
New York | New York | 67.0 | 56.59 | 128.0 | 7.23 |
Ohio | Cleveland | 103.0 | 18.77 | 105.0 | 8.53 |