[Python] Reshaping Data with pandas(4)
Pandas
- Using
.stack
with.mean()
or.diff()
.explode()
- converts list contained column into separate rowspd.json_normalize()
- converts json data into dataframe- Convert nested data column
# import pkg & dataset
import numpy as np
import pandas as pd
df = pd.read_csv("obesity.csv")
df_melted = df.melt(id_vars="country")
df_melted[["gender", "year"]] = df_melted["variable"].str.extract("([a-z]+)([0-9]+)", expand=True)
df_melted = df_melted.drop(columns="variable").rename(columns={"value":"perc_obesity"})
df_melted = df_melted.set_index(["country", "gender", "year"], drop=True)
df_melted
perc_obesity | |||
---|---|---|---|
country | gender | year | |
argentina | male | 2005 | 21.5 |
japan | male | 2005 | 2.5 |
norway | male | 2005 | 17.6 |
argentina | female | 2005 | 24.2 |
japan | female | 2005 | 2.6 |
norway | female | 2005 | 18.6 |
argentina | male | 2015 | 26.8 |
japan | male | 2015 | 4.6 |
norway | male | 2015 | 23.0 |
argentina | female | 2015 | 28.5 |
japan | female | 2015 | 3.6 |
norway | female | 2015 | 22.2 |
Using .stack() with .mean() or .diff()
# mean of obesity percent by gender & year
df_general = df_melted.unstack(level=0).mean(axis=1)
df_general
gender year
female 2005 15.133333
2015 18.100000
male 2005 13.866667
2015 18.133333
dtype: float64
# difference of obesity percent by year considering country & gender
df_melted.unstack(level=2).diff(axis=1)
perc_obesity | |||
---|---|---|---|
year | 2005 | 2015 | |
country | gender | ||
argentina | female | NaN | 4.3 |
male | NaN | 5.3 | |
japan | female | NaN | 1.0 |
male | NaN | 2.1 | |
norway | female | NaN | 3.6 |
male | NaN | 5.4 |
# mean of obesity by country
df_melted.stack().groupby("country").mean()
country
argentina 25.250
japan 3.325
norway 20.350
dtype: float64
.explode()
# dataset
# csv는 python의 list로써 저장되는 것이 아니므로, 데이터를 읽을 때 해당 열이 python의 list임을 명시해야 함
import ast
df2 = pd.read_csv("obesity_list.csv", converters={"bounds": ast.literal_eval})
# 방법 1) explode한 df와 기존 df의 열 merge
df2_bounds = df2["bounds"].explode()
df2[["country", "perc_obesity"]].merge(df2_bounds, left_index=True, right_index=True)
country | perc_obesity | bounds | |
---|---|---|---|
0 | Argentina | 21.5 | 15.4 |
0 | Argentina | 21.5 | 31.5 |
1 | Germany | 22.3 | 16.2 |
1 | Germany | 22.3 | 32.4 |
2 | Japan | 2.5 | 1.1 |
2 | Japan | 2.5 | 3.5 |
3 | Norway | 23.0 | 13.1 |
3 | Norway | 23.0 | 33.0 |
# 방법 2) 해당 행을 explode 함수 안에 명시, 이 경우 index를 reset할 필요 존재
df2.explode("bounds").reset_index(drop=True)
country | perc_obesity | bounds | |
---|---|---|---|
0 | Argentina | 21.5 | 15.4 |
1 | Argentina | 21.5 | 31.5 |
2 | Germany | 22.3 | 16.2 |
3 | Germany | 22.3 | 32.4 |
4 | Japan | 2.5 | 1.1 |
5 | Japan | 2.5 | 3.5 |
6 | Norway | 23.0 | 13.1 |
7 | Norway | 23.0 | 33.0 |
# dataset
df3 = pd.read_csv("obesity3.csv")
df3
country | perc_obesity | bounds | |
---|---|---|---|
0 | Argentina | 21.5 | 11.4-25.5 |
1 | Germany | 22.3 | 16.2-32.4 |
2 | Japan | 2.5 | 8.1-16.5 |
3 | Norway | 23.0 | 9.1-20.1 |
# 한 열이 특정한 string을 기준을 바탕으로 list로 분리한 뒤 explode를 사용할 수도 있음
df3_splited = df3.assign(bounds = df3["bounds"].str.split("-").explode("bounds"))
df3_splited
country | perc_obesity | bounds | |
---|---|---|---|
0 | Argentina | 21.5 | 11.4 |
1 | Germany | 22.3 | 25.5 |
2 | Japan | 2.5 | 16.2 |
3 | Norway | 23.0 | 32.4 |
pd.json_normalize()
- nest가 한번인 경우까지 dataframe으로 normalize 됨
- parameter
sep
- column name의 separator인 string 명시 가능 (default는 .)record_path
- double nested 되어 있을 경우 해당 데이터를 dataframe으로 반환meta
- Fields to use as metadata for each record in resulting table
# dataset
import json
movies = json.load(open("movies.json"))
movies
[{'director': 'Woody Allen',
'producer': 'Letty Aronson',
'features': {'title': 'Magic in the Moonlight', 'year': 2014}},
{'director': 'Niki Caro',
'producer': 'Jason Reed',
'features': {'title': 'Mulan', 'year': 2020}}]
pd.json_normalize(movies, sep="_")
director | producer | features_title | features_year | |
---|---|---|---|---|
0 | Woody Allen | Letty Aronson | Magic in the Moonlight | 2014 |
1 | Niki Caro | Jason Reed | Mulan | 2020 |
# dataset
movies2 = json.load(open("movies2.json"))
movies2
[{'director': 'Woody Allen',
'producer': 'Letty Aronson',
'features': [{'title': 'Magic in the Moonlight', 'year': 2014},
{'title': 'Vicky Cristina Barcelona', 'year': 2008},
{'title': 'Midnight in Paris', 'year': 2011}]},
{'director': 'Niki Caro',
'producer': 'Jason Reed',
'features': [{'title': 'Mulan', 'year': 2020}]}]
# 열 안에 또 json 형식의 데이터가 있으면 flatten되지 않음
pd.json_normalize(movies2)
director | producer | features | |
---|---|---|---|
0 | Woody Allen | Letty Aronson | [{'title': 'Magic in the Moonlight', 'year': 2... |
1 | Niki Caro | Jason Reed | [{'title': 'Mulan', 'year': 2020}] |
# features 열의 json 형식의 데이터를 dataframe으로 반환
pd.json_normalize(movies2, record_path="features")
title | year | |
---|---|---|
0 | Magic in the Moonlight | 2014 |
1 | Vicky Cristina Barcelona | 2008 |
2 | Midnight in Paris | 2011 |
3 | Mulan | 2020 |
pd.json_normalize(movies2, record_path='features', meta=["director", "producer"])
title | year | director | producer | |
---|---|---|---|---|
0 | Magic in the Moonlight | 2014 | Woody Allen | Letty Aronson |
1 | Vicky Cristina Barcelona | 2008 | Woody Allen | Letty Aronson |
2 | Midnight in Paris | 2011 | Woody Allen | Letty Aronson |
3 | Mulan | 2020 | Niki Caro | Jason Reed |
Convert nested data column
json.loads
- convert json string into python dictionary
# dataset
# bird_name이 담겨 있는 list
bird_name = ['Killdeer', 'Chipping Sparrow', 'Cedar Waxwing']
# bird feature이 담겨 있는 list(각 bird의 feature은 dictionary 형태의 String임)
bird_features = ['{"Size" : "Large", "Color": "Golden brown", "Behavior": "Runs swiftly along ground", "Habitat": "Rocky areas"}',
'{"Size":"Small", "Color": "Gray-white", "Behavior": "Often in flocks", "Habitat": "Open woodlands"}',
'{"Size":"Small", "Color": "Gray-brown", "Behavior": "Catch insects over open water", "Habitat": "Parks"}']
# make dataframe using dict
birds = pd.DataFrame(dict(bird_name=bird_name, bird_features=bird_features))
birds
bird_name | bird_features | |
---|---|---|
0 | Killdeer | {"Size" : "Large", "Color": "Golden brown", "B... |
1 | Chipping Sparrow | {"Size":"Small", "Color": "Gray-white", "Behav... |
2 | Cedar Waxwing | {"Size":"Small", "Color": "Gray-brown", "Behav... |
- 방법1
# convert bird_features into dataframe using json.loads & pd.Series
# json.loads - json 형식의 문자열을 dictionary으로 convert, 이후 dictionary를 pd.Series으로 convert
bird_features_df = birds["bird_features"].apply(json.loads).apply(pd.Series)
bird_features_df
Size | Color | Behavior | Habitat | |
---|---|---|---|---|
0 | Large | Golden brown | Runs swiftly along ground | Rocky areas |
1 | Small | Gray-white | Often in flocks | Open woodlands |
2 | Small | Gray-brown | Catch insects over open water | Parks |
# bird_name과 bird_feature 최종 합치기
bird_name_df = birds.drop(columns="bird_features")
pd.concat([bird_name_df, bird_features_df], axis=1)
bird_name | Size | Color | Behavior | Habitat | |
---|---|---|---|---|---|
0 | Killdeer | Large | Golden brown | Runs swiftly along ground | Rocky areas |
1 | Chipping Sparrow | Small | Gray-white | Often in flocks | Open woodlands |
2 | Cedar Waxwing | Small | Gray-brown | Catch insects over open water | Parks |
- 방법2
# Apply json.loads to the bird_features column and transform it to a list
birds_features = birds['bird_features'].apply(json.loads).to_list()
# Convert birds_features into a JSON
birds_dump = json.dumps(birds_features)
# Read the JSON birds_dump into a DataFrame
birds_df = pd.read_json(birds_dump)
# Concatenate the 'bird_name' column of birds with birds_df
pd.concat([birds["bird_name"], birds_df], axis=1)
bird_name | Size | Color | Behavior | Habitat | |
---|---|---|---|---|---|
0 | Killdeer | Large | Golden brown | Runs swiftly along ground | Rocky areas |
1 | Chipping Sparrow | Small | Gray-white | Often in flocks | Open woodlands |
2 | Cedar Waxwing | Small | Gray-brown | Catch insects over open water | Parks |