[Python] Reshaping Data with pandas(4)

4 minute read


Pandas

  • Using .stack with .mean() or .diff()
  • .explode() - converts list contained column into separate rows
  • pd.json_normalize() - converts json data into dataframe
  • Convert nested data column

# import pkg & dataset
import numpy as np
import pandas as pd
df = pd.read_csv("obesity.csv")
df_melted = df.melt(id_vars="country")

df_melted[["gender", "year"]] = df_melted["variable"].str.extract("([a-z]+)([0-9]+)", expand=True)
df_melted = df_melted.drop(columns="variable").rename(columns={"value":"perc_obesity"})
df_melted = df_melted.set_index(["country", "gender", "year"], drop=True)
df_melted
perc_obesity
country gender year
argentina male 2005 21.5
japan male 2005 2.5
norway male 2005 17.6
argentina female 2005 24.2
japan female 2005 2.6
norway female 2005 18.6
argentina male 2015 26.8
japan male 2015 4.6
norway male 2015 23.0
argentina female 2015 28.5
japan female 2015 3.6
norway female 2015 22.2

Using .stack() with .mean() or .diff()

# mean of obesity percent by gender & year
df_general = df_melted.unstack(level=0).mean(axis=1)
df_general
gender  year
female  2005    15.133333
        2015    18.100000
male    2005    13.866667
        2015    18.133333
dtype: float64
# difference of obesity percent by year considering country & gender
df_melted.unstack(level=2).diff(axis=1)
perc_obesity
year 2005 2015
country gender
argentina female NaN 4.3
male NaN 5.3
japan female NaN 1.0
male NaN 2.1
norway female NaN 3.6
male NaN 5.4
# mean of obesity by country
df_melted.stack().groupby("country").mean()
country
argentina    25.250
japan         3.325
norway       20.350
dtype: float64

.explode()

# dataset
# csv는 python의 list로써 저장되는 것이 아니므로, 데이터를 읽을 때 해당 열이 python의 list임을 명시해야 함
import ast
df2 = pd.read_csv("obesity_list.csv", converters={"bounds": ast.literal_eval})
# 방법 1) explode한 df와 기존 df의 열 merge
df2_bounds = df2["bounds"].explode()
df2[["country", "perc_obesity"]].merge(df2_bounds, left_index=True, right_index=True)
country perc_obesity bounds
0 Argentina 21.5 15.4
0 Argentina 21.5 31.5
1 Germany 22.3 16.2
1 Germany 22.3 32.4
2 Japan 2.5 1.1
2 Japan 2.5 3.5
3 Norway 23.0 13.1
3 Norway 23.0 33.0
# 방법 2) 해당 행을 explode 함수 안에 명시, 이 경우 index를 reset할 필요 존재
df2.explode("bounds").reset_index(drop=True)
country perc_obesity bounds
0 Argentina 21.5 15.4
1 Argentina 21.5 31.5
2 Germany 22.3 16.2
3 Germany 22.3 32.4
4 Japan 2.5 1.1
5 Japan 2.5 3.5
6 Norway 23.0 13.1
7 Norway 23.0 33.0
# dataset
df3 = pd.read_csv("obesity3.csv")
df3
country perc_obesity bounds
0 Argentina 21.5 11.4-25.5
1 Germany 22.3 16.2-32.4
2 Japan 2.5 8.1-16.5
3 Norway 23.0 9.1-20.1
# 한 열이 특정한 string을 기준을 바탕으로 list로 분리한 뒤 explode를 사용할 수도 있음
df3_splited = df3.assign(bounds = df3["bounds"].str.split("-").explode("bounds"))
df3_splited
country perc_obesity bounds
0 Argentina 21.5 11.4
1 Germany 22.3 25.5
2 Japan 2.5 16.2
3 Norway 23.0 32.4

pd.json_normalize()

  • nest가 한번인 경우까지 dataframe으로 normalize 됨
  • parameter
    • sep - column name의 separator인 string 명시 가능 (default는 .)
    • record_path - double nested 되어 있을 경우 해당 데이터를 dataframe으로 반환
    • meta - Fields to use as metadata for each record in resulting table
# dataset
import json
movies = json.load(open("movies.json"))
movies
[{'director': 'Woody Allen',
  'producer': 'Letty Aronson',
  'features': {'title': 'Magic in the Moonlight', 'year': 2014}},
 {'director': 'Niki Caro',
  'producer': 'Jason Reed',
  'features': {'title': 'Mulan', 'year': 2020}}]
pd.json_normalize(movies, sep="_")
director producer features_title features_year
0 Woody Allen Letty Aronson Magic in the Moonlight 2014
1 Niki Caro Jason Reed Mulan 2020
# dataset
movies2 = json.load(open("movies2.json"))
movies2
[{'director': 'Woody Allen',
  'producer': 'Letty Aronson',
  'features': [{'title': 'Magic in the Moonlight', 'year': 2014},
   {'title': 'Vicky Cristina Barcelona', 'year': 2008},
   {'title': 'Midnight in Paris', 'year': 2011}]},
 {'director': 'Niki Caro',
  'producer': 'Jason Reed',
  'features': [{'title': 'Mulan', 'year': 2020}]}]
# 열 안에 또 json 형식의 데이터가 있으면 flatten되지 않음
pd.json_normalize(movies2)
director producer features
0 Woody Allen Letty Aronson [{'title': 'Magic in the Moonlight', 'year': 2...
1 Niki Caro Jason Reed [{'title': 'Mulan', 'year': 2020}]
# features 열의 json 형식의 데이터를 dataframe으로 반환
pd.json_normalize(movies2, record_path="features")
title year
0 Magic in the Moonlight 2014
1 Vicky Cristina Barcelona 2008
2 Midnight in Paris 2011
3 Mulan 2020
pd.json_normalize(movies2, record_path='features', meta=["director", "producer"])
title year director producer
0 Magic in the Moonlight 2014 Woody Allen Letty Aronson
1 Vicky Cristina Barcelona 2008 Woody Allen Letty Aronson
2 Midnight in Paris 2011 Woody Allen Letty Aronson
3 Mulan 2020 Niki Caro Jason Reed

Convert nested data column

  • json.loads - convert json string into python dictionary
# dataset
# bird_name이 담겨 있는 list
bird_name = ['Killdeer', 'Chipping Sparrow', 'Cedar Waxwing']
# bird feature이 담겨 있는 list(각 bird의 feature은 dictionary 형태의 String임)
bird_features = ['{"Size" : "Large", "Color": "Golden brown", "Behavior": "Runs swiftly along ground", "Habitat": "Rocky areas"}',
                 '{"Size":"Small", "Color": "Gray-white", "Behavior": "Often in flocks", "Habitat": "Open woodlands"}',
                 '{"Size":"Small", "Color": "Gray-brown", "Behavior": "Catch insects over open water", "Habitat": "Parks"}']
# make dataframe using dict
birds = pd.DataFrame(dict(bird_name=bird_name, bird_features=bird_features))
birds
bird_name bird_features
0 Killdeer {"Size" : "Large", "Color": "Golden brown", "B...
1 Chipping Sparrow {"Size":"Small", "Color": "Gray-white", "Behav...
2 Cedar Waxwing {"Size":"Small", "Color": "Gray-brown", "Behav...
  • 방법1
# convert bird_features into dataframe using json.loads & pd.Series
# json.loads - json 형식의 문자열을 dictionary으로 convert, 이후 dictionary를 pd.Series으로 convert
bird_features_df = birds["bird_features"].apply(json.loads).apply(pd.Series)
bird_features_df
Size Color Behavior Habitat
0 Large Golden brown Runs swiftly along ground Rocky areas
1 Small Gray-white Often in flocks Open woodlands
2 Small Gray-brown Catch insects over open water Parks
# bird_name과 bird_feature 최종 합치기
bird_name_df = birds.drop(columns="bird_features")
pd.concat([bird_name_df, bird_features_df], axis=1)
bird_name Size Color Behavior Habitat
0 Killdeer Large Golden brown Runs swiftly along ground Rocky areas
1 Chipping Sparrow Small Gray-white Often in flocks Open woodlands
2 Cedar Waxwing Small Gray-brown Catch insects over open water Parks
  • 방법2
# Apply json.loads to the bird_features column and transform it to a list 
birds_features = birds['bird_features'].apply(json.loads).to_list()

# Convert birds_features into a JSON 
birds_dump = json.dumps(birds_features)

# Read the JSON birds_dump into a DataFrame
birds_df = pd.read_json(birds_dump)

# Concatenate the 'bird_name' column of birds with birds_df
pd.concat([birds["bird_name"], birds_df], axis=1)
bird_name Size Color Behavior Habitat
0 Killdeer Large Golden brown Runs swiftly along ground Rocky areas
1 Chipping Sparrow Small Gray-white Often in flocks Open woodlands
2 Cedar Waxwing Small Gray-brown Catch insects over open water Parks