[Python] Reshaping Data with pandas(2)
4 minute read
Pandas
.melt()
- wide to long
pd.wide_to_long
- wide to long
.str
- handling string in pandas
# import pkg & dataset
import numpy as np
import pandas as pd
df_book = pd.read_csv("books.csv", sep =",", index_col="bookID")
df_book.head(1)
|
title |
authors |
average_rating |
isbn |
isbn13 |
language_code |
num_pages |
ratings_count |
text_reviews_count |
publication_date |
publisher |
bookID |
|
|
|
|
|
|
|
|
|
|
|
1 |
Harry Potter and the Half-Blood Prince (Harry ... |
J.K. Rowling/Mary GrandPré |
4.57 |
0439785960 |
9780439785969 |
eng |
652 |
2095690.0 |
27591.0 |
9/16/2006 |
Scholastic Inc. |
.melt()
- parameter
- id_vars - identifier variable
- value_vars - category variable (default는 전체)
- var_name - categroy variable cloumn name
- value_name - value column name
# value_vars default는 모든 열
df_book.melt(id_vars="title")
|
title |
variable |
value |
0 |
Harry Potter and the Half-Blood Prince (Harry ... |
authors |
J.K. Rowling/Mary GrandPré |
1 |
Harry Potter and the Order of the Phoenix (Har... |
authors |
J.K. Rowling/Mary GrandPré |
2 |
Harry Potter and the Chamber of Secrets (Harry... |
authors |
J.K. Rowling |
3 |
Harry Potter and the Prisoner of Azkaban (Harr... |
authors |
J.K. Rowling/Mary GrandPré |
4 |
Harry Potter Boxed Set Books 1-5 (Harry Potte... |
authors |
J.K. Rowling/Mary GrandPré |
... |
... |
... |
... |
111305 |
Expelled from Eden: A William T. Vollmann Reader |
publisher |
Da Capo Press |
111306 |
You Bright and Risen Angels |
publisher |
Penguin Books |
111307 |
The Ice-Shirt (Seven Dreams #1) |
publisher |
Penguin Books |
111308 |
Poor People |
publisher |
Ecco |
111309 |
Las aventuras de Tom Sawyer |
publisher |
Edimat Libros |
111310 rows × 3 columns
# 관심있는 열만 지정할 수 있음
df_book.melt(id_vars=["title",'authors'],
value_vars=["ratings_count","num_pages"],
var_name="feature",
value_name="number")
|
title |
authors |
feature |
number |
0 |
Harry Potter and the Half-Blood Prince (Harry ... |
J.K. Rowling/Mary GrandPré |
ratings_count |
2095690.0 |
1 |
Harry Potter and the Order of the Phoenix (Har... |
J.K. Rowling/Mary GrandPré |
ratings_count |
2153167.0 |
2 |
Harry Potter and the Chamber of Secrets (Harry... |
J.K. Rowling |
ratings_count |
6333.0 |
3 |
Harry Potter and the Prisoner of Azkaban (Harr... |
J.K. Rowling/Mary GrandPré |
ratings_count |
2339585.0 |
4 |
Harry Potter Boxed Set Books 1-5 (Harry Potte... |
J.K. Rowling/Mary GrandPré |
ratings_count |
41428.0 |
... |
... |
... |
... |
... |
22257 |
Expelled from Eden: A William T. Vollmann Reader |
William T. Vollmann/Larry McCaffery/Michael He... |
num_pages |
512 |
22258 |
You Bright and Risen Angels |
William T. Vollmann |
num_pages |
635 |
22259 |
The Ice-Shirt (Seven Dreams #1) |
William T. Vollmann |
num_pages |
415 |
22260 |
Poor People |
William T. Vollmann |
num_pages |
434 |
22261 |
Las aventuras de Tom Sawyer |
Mark Twain |
num_pages |
272 |
22262 rows × 4 columns
pd.wide_to_long
- parameter
- dataframe
- stubnames - prefix
- i - id
- j - suffix
- sep - separation criterion (default=””)
- always assumed that prefix is immediately followed by a numeric suffix
- suffix - if suffix is not numeric, suffix can be assigned using regex
# dataset
df1 = df_book[["title","language_code","publication_date","num_pages"]]
df1 = df1.rename(columns={"language_code":"language","publication_date":"publication date","num_pages":"page number"})
df1 = df1.iloc[:3]
df1["publication number"] = [2,6,4]
df1.set_index("title", inplace=True)
df1
|
language |
publication date |
page number |
publication number |
title |
|
|
|
|
Harry Potter and the Half-Blood Prince (Harry Potter #6) |
eng |
9/16/2006 |
652 |
2 |
Harry Potter and the Order of the Phoenix (Harry Potter #5) |
eng |
9/1/2004 |
870 |
6 |
Harry Potter and the Chamber of Secrets (Harry Potter #2) |
eng |
11/1/2003 |
352 |
4 |
# wide_to_long은 index을 사용할 수 없기에 일반 column으로 전환
df1 = df1.reset_index(drop=False)
pd.wide_to_long(
df1,
stubnames=["publication", "page"],
i=["title", "language"],
j="feature",
sep=" ",
suffix="\w+"
)
|
|
|
publication |
page |
title |
language |
feature |
|
|
Harry Potter and the Half-Blood Prince (Harry Potter #6) |
eng |
date |
9/16/2006 |
NaN |
number |
2 |
652 |
Harry Potter and the Order of the Phoenix (Harry Potter #5) |
eng |
date |
9/1/2004 |
NaN |
number |
6 |
870 |
Harry Potter and the Chamber of Secrets (Harry Potter #2) |
eng |
date |
11/1/2003 |
NaN |
number |
4 |
352 |
.str
- Pandas series를 string으로 데이터 처리 가능
.split
으로 문자열 분리
- 첫 번째 parameter는 분리 기준
expand=True
를 통해 분리된 값을 dataframe으로 반환
.get
으로 분리된 문자열 선택 가능
# dataset
df2 = df_book[["title", "num_pages", "ratings_count"]]
df2 = df2[df2["title"].str.contains("The Science of Sherlock Holmes|The New Annotated Sherlock Holmes", na=False)] # na=False - df2["title"] 값이 NA인 경우에는 제외
df2 = df2.rename(columns={"title":"main_title", "num_pages":"number_pages", "ratings_count":"number_ratings"})
df2["version"] = ["Vol I", "Vol II", "Vol I"]
df2.reset_index(drop=True, inplace=True)
df2
|
main_title |
number_pages |
number_ratings |
version |
0 |
The New Annotated Sherlock Holmes: The Complet... |
1878 |
1411.0 |
Vol I |
1 |
The New Annotated Sherlock Holmes: The Novels |
907 |
2203.0 |
Vol II |
2 |
The Science of Sherlock Holmes: From Baskervil... |
244 |
2037.0 |
Vol I |
# Split main_title by a colon and assign it to two columns named title and subtitle
df2[['title', 'subtitle']] = df2['main_title'].str.split(':', expand=True)
# Split version by a space and assign the second element to the column named volume
df2['volume'] = df2['version'].str.split(' ').str.get(1)
# Drop the main_title and version columns modifying books_sh
df2.drop(['main_title', 'version'], axis=1, inplace=True)
# Reshape using title, subtitle and volume as index, name feature the new variable from columns starting with number, separated by undescore and ending in words
df2_long = pd.wide_to_long(df2,
stubnames="number",
i=["title", "subtitle", "volume"],
j="feature",
sep="_",
suffix="\w+")
df2_long
|
|
|
|
number |
title |
subtitle |
volume |
feature |
|
The New Annotated Sherlock Holmes |
The Complete Short Stories |
I |
pages |
1878 |
ratings |
1411.0 |
The Novels |
II |
pages |
907 |
ratings |
2203.0 |
The Science of Sherlock Holmes |
From Baskerville Hall to the Valley of Fear the Real Forensics Behind the Great Detective's Greatest Cases |
I |
pages |
244 |
ratings |
2037.0 |