[Python] Reshaping Data with pandas(4)

4 minute read

Pandas

Using .stack with .mean() or .diff()
.explode() - converts list contained column into separate rows
pd.json_normalize() - converts json data into dataframe
Convert nested data column

# import pkg & dataset
import numpy as np
import pandas as pd
df = pd.read_csv("obesity.csv")
df_melted = df.melt(id_vars="country")

df_melted[["gender", "year"]] = df_melted["variable"].str.extract("([a-z]+)([0-9]+)", expand=True)
df_melted = df_melted.drop(columns="variable").rename(columns={"value":"perc_obesity"})
df_melted = df_melted.set_index(["country", "gender", "year"], drop=True)
df_melted

			perc_obesity
country	gender	year
argentina	male	2005	21.5
japan	male	2005	2.5
norway	male	2005	17.6
argentina	female	2005	24.2
japan	female	2005	2.6
norway	female	2005	18.6
argentina	male	2015	26.8
japan	male	2015	4.6
norway	male	2015	23.0
argentina	female	2015	28.5
japan	female	2015	3.6
norway	female	2015	22.2

Using .stack() with .mean() or .diff()

# mean of obesity percent by gender & year
df_general = df_melted.unstack(level=0).mean(axis=1)
df_general

gender  year
female  2005    15.133333
        2015    18.100000
male    2005    13.866667
        2015    18.133333
dtype: float64

# difference of obesity percent by year considering country & gender
df_melted.unstack(level=2).diff(axis=1)

		perc_obesity
	year	2005	2015
country	gender
argentina	female	NaN	4.3
argentina	male	NaN	5.3
japan	female	NaN	1.0
japan	male	NaN	2.1
norway	female	NaN	3.6
norway	male	NaN	5.4

# mean of obesity by country
df_melted.stack().groupby("country").mean()

country
argentina    25.250
japan         3.325
norway       20.350
dtype: float64

.explode()

# dataset
# csv는 python의 list로써 저장되는 것이 아니므로, 데이터를 읽을 때 해당 열이 python의 list임을 명시해야 함
import ast
df2 = pd.read_csv("obesity_list.csv", converters={"bounds": ast.literal_eval})

# 방법 1) explode한 df와 기존 df의 열 merge
df2_bounds = df2["bounds"].explode()
df2[["country", "perc_obesity"]].merge(df2_bounds, left_index=True, right_index=True)

	country	perc_obesity	bounds
0	Argentina	21.5	15.4
0	Argentina	21.5	31.5
1	Germany	22.3	16.2
1	Germany	22.3	32.4
2	Japan	2.5	1.1
2	Japan	2.5	3.5
3	Norway	23.0	13.1
3	Norway	23.0	33.0

# 방법 2) 해당 행을 explode 함수 안에 명시, 이 경우 index를 reset할 필요 존재
df2.explode("bounds").reset_index(drop=True)

	country	perc_obesity	bounds
0	Argentina	21.5	15.4
1	Argentina	21.5	31.5
2	Germany	22.3	16.2
3	Germany	22.3	32.4
4	Japan	2.5	1.1
5	Japan	2.5	3.5
6	Norway	23.0	13.1
7	Norway	23.0	33.0

# dataset
df3 = pd.read_csv("obesity3.csv")
df3

	country	perc_obesity	bounds
0	Argentina	21.5	11.4-25.5
1	Germany	22.3	16.2-32.4
2	Japan	2.5	8.1-16.5
3	Norway	23.0	9.1-20.1

# 한 열이 특정한 string을 기준을 바탕으로 list로 분리한 뒤 explode를 사용할 수도 있음
df3_splited = df3.assign(bounds = df3["bounds"].str.split("-").explode("bounds"))
df3_splited

	country	perc_obesity	bounds
0	Argentina	21.5	11.4
1	Germany	22.3	25.5
2	Japan	2.5	16.2
3	Norway	23.0	32.4

pd.json_normalize()

nest가 한번인 경우까지 dataframe으로 normalize 됨
parameter
- sep - column name의 separator인 string 명시 가능 (default는 .)
- record_path - double nested 되어 있을 경우 해당 데이터를 dataframe으로 반환
- meta - Fields to use as metadata for each record in resulting table

# dataset
import json
movies = json.load(open("movies.json"))
movies

[{'director': 'Woody Allen',
  'producer': 'Letty Aronson',
  'features': {'title': 'Magic in the Moonlight', 'year': 2014}},
 {'director': 'Niki Caro',
  'producer': 'Jason Reed',
  'features': {'title': 'Mulan', 'year': 2020}}]

pd.json_normalize(movies, sep="_")

	director	producer	features_title	features_year
0	Woody Allen	Letty Aronson	Magic in the Moonlight	2014
1	Niki Caro	Jason Reed	Mulan	2020

# dataset
movies2 = json.load(open("movies2.json"))
movies2

[{'director': 'Woody Allen',
  'producer': 'Letty Aronson',
  'features': [{'title': 'Magic in the Moonlight', 'year': 2014},
   {'title': 'Vicky Cristina Barcelona', 'year': 2008},
   {'title': 'Midnight in Paris', 'year': 2011}]},
 {'director': 'Niki Caro',
  'producer': 'Jason Reed',
  'features': [{'title': 'Mulan', 'year': 2020}]}]

# 열 안에 또 json 형식의 데이터가 있으면 flatten되지 않음
pd.json_normalize(movies2)

	director	producer	features
0	Woody Allen	Letty Aronson	[{'title': 'Magic in the Moonlight', 'year': 2...
1	Niki Caro	Jason Reed	[{'title': 'Mulan', 'year': 2020}]

# features 열의 json 형식의 데이터를 dataframe으로 반환
pd.json_normalize(movies2, record_path="features")

	title	year
0	Magic in the Moonlight	2014
1	Vicky Cristina Barcelona	2008
2	Midnight in Paris	2011
3	Mulan	2020

pd.json_normalize(movies2, record_path='features', meta=["director", "producer"])

	title	year	director	producer
0	Magic in the Moonlight	2014	Woody Allen	Letty Aronson
1	Vicky Cristina Barcelona	2008	Woody Allen	Letty Aronson
2	Midnight in Paris	2011	Woody Allen	Letty Aronson
3	Mulan	2020	Niki Caro	Jason Reed

Convert nested data column

json.loads - convert json string into python dictionary

# dataset
# bird_name이 담겨 있는 list
bird_name = ['Killdeer', 'Chipping Sparrow', 'Cedar Waxwing']
# bird feature이 담겨 있는 list(각 bird의 feature은 dictionary 형태의 String임)
bird_features = ['{"Size" : "Large", "Color": "Golden brown", "Behavior": "Runs swiftly along ground", "Habitat": "Rocky areas"}',
                 '{"Size":"Small", "Color": "Gray-white", "Behavior": "Often in flocks", "Habitat": "Open woodlands"}',
                 '{"Size":"Small", "Color": "Gray-brown", "Behavior": "Catch insects over open water", "Habitat": "Parks"}']

# make dataframe using dict
birds = pd.DataFrame(dict(bird_name=bird_name, bird_features=bird_features))
birds

	bird_name	bird_features
0	Killdeer	{"Size" : "Large", "Color": "Golden brown", "B...
1	Chipping Sparrow	{"Size":"Small", "Color": "Gray-white", "Behav...
2	Cedar Waxwing	{"Size":"Small", "Color": "Gray-brown", "Behav...

방법1

# convert bird_features into dataframe using json.loads & pd.Series
# json.loads - json 형식의 문자열을 dictionary으로 convert, 이후 dictionary를 pd.Series으로 convert
bird_features_df = birds["bird_features"].apply(json.loads).apply(pd.Series)
bird_features_df

	Size	Color	Behavior	Habitat
0	Large	Golden brown	Runs swiftly along ground	Rocky areas
1	Small	Gray-white	Often in flocks	Open woodlands
2	Small	Gray-brown	Catch insects over open water	Parks

# bird_name과 bird_feature 최종 합치기
bird_name_df = birds.drop(columns="bird_features")
pd.concat([bird_name_df, bird_features_df], axis=1)

	bird_name	Size	Color	Behavior	Habitat
0	Killdeer	Large	Golden brown	Runs swiftly along ground	Rocky areas
1	Chipping Sparrow	Small	Gray-white	Often in flocks	Open woodlands
2	Cedar Waxwing	Small	Gray-brown	Catch insects over open water	Parks

방법2

# Apply json.loads to the bird_features column and transform it to a list 
birds_features = birds['bird_features'].apply(json.loads).to_list()

# Convert birds_features into a JSON 
birds_dump = json.dumps(birds_features)

# Read the JSON birds_dump into a DataFrame
birds_df = pd.read_json(birds_dump)

# Concatenate the 'bird_name' column of birds with birds_df
pd.concat([birds["bird_name"], birds_df], axis=1)

	bird_name	Size	Color	Behavior	Habitat
0	Killdeer	Large	Golden brown	Runs swiftly along ground	Rocky areas
1	Chipping Sparrow	Small	Gray-white	Often in flocks	Open woodlands
2	Cedar Waxwing	Small	Gray-brown	Catch insects over open water	Parks

Share on

Twitter Facebook LinkedIn

ZSU

[Python] Reshaping Data with pandas(4)

Pandas

Using .stack() with .mean() or .diff()

.explode()

pd.json_normalize()

Convert nested data column

Share on

You may also enjoy

[Kubernetes] Force Delete Pod, PV, and PVC

[Software Engineering at Google] CH8 Style Guides and Rules

[Software Engineering at Google] CH7 Measuring Engineering Productivity

[Software Engineering at Google] CH6 Leading at Scale