[Python] Reshaping Data with pandas(1)
2 minute read
Pandas
.transpose() - switch column & row
.pivot() - long to wide
.pivot_table() - long to wide
# import pkg & dataset
import numpy as np
import pandas as pd
df_fifa = pd.read_csv("players_20.csv")
.transpose()
df1 = df_fifa.set_index("short_name")[["height_cm", "weight_kg"]]
df1.head(3)
|
height_cm |
weight_kg |
| short_name |
|
|
| L. Messi |
170 |
72 |
| Cristiano Ronaldo |
187 |
83 |
| Neymar Jr |
175 |
68 |
| short_name |
L. Messi |
Cristiano Ronaldo |
Neymar Jr |
J. Oblak |
E. Hazard |
K. De Bruyne |
M. ter Stegen |
V. van Dijk |
L. Modrić |
M. Salah |
... |
M. Gallagher |
Huang Jiahui |
M. Sagaf |
E. Tweed |
P. Martin |
Shao Shuai |
Xiao Mingjie |
Zhang Wei |
Wang Haijian |
Pan Ximing |
| height_cm |
170 |
187 |
175 |
188 |
175 |
181 |
187 |
193 |
172 |
175 |
... |
178 |
183 |
177 |
180 |
188 |
186 |
177 |
186 |
185 |
182 |
| weight_kg |
72 |
83 |
68 |
87 |
74 |
70 |
85 |
92 |
66 |
71 |
... |
70 |
74 |
70 |
72 |
84 |
79 |
66 |
75 |
74 |
78 |
2 rows × 18278 columns
.pivot()
- values가 전체인 경우 생략
- 중복값이 있는 경우 제거해야함
# 데이터 불러오기
fifa_long = pd.read_csv("fifa_long.csv")
fifa_long.head(3)
|
name |
movement |
overall |
attacking |
| 0 |
messi |
shooting |
92 |
90 |
| 1 |
ronaldo |
shooting |
91 |
89 |
| 2 |
messi |
passing |
95 |
94 |
# values가 1개인 경우
fifa_long.pivot(index="name", columns="movement", values="overall")
| movement |
dribbling |
passing |
shooting |
| name |
|
|
|
| messi |
96 |
95 |
92 |
| ronaldo |
88 |
82 |
91 |
# values가 복수개인 경우
fifa_long.pivot(index="name", columns="movement", values=["overall", "attacking"])
# values가 2개이므로 위와 같음
# fifa_long.pivot(index="name", columns="movement")
|
overall |
attacking |
| movement |
dribbling |
passing |
shooting |
dribbling |
passing |
shooting |
| name |
|
|
|
|
|
|
| messi |
96 |
95 |
92 |
88 |
94 |
90 |
| ronaldo |
88 |
82 |
91 |
83 |
83 |
89 |
# 중복 행 만들기
fifa_long = fifa_long.append({"name":"ronaldo","movement":"dribbling","overall":85,"attacking":84}, ignore_index=True)
fifa_long
|
name |
movement |
overall |
attacking |
| 0 |
messi |
shooting |
92 |
90 |
| 1 |
ronaldo |
shooting |
91 |
89 |
| 2 |
messi |
passing |
95 |
94 |
| 3 |
ronaldo |
passing |
82 |
83 |
| 4 |
messi |
dribbling |
96 |
88 |
| 5 |
ronaldo |
dribbling |
88 |
83 |
| 6 |
ronaldo |
dribbling |
85 |
84 |
| 7 |
ronaldo |
dribbling |
85 |
84 |
# 중복 행 제거
fifa_long.drop(7, axis=0)
|
name |
movement |
overall |
attacking |
| 0 |
messi |
shooting |
92 |
90 |
| 1 |
ronaldo |
shooting |
91 |
89 |
| 2 |
messi |
passing |
95 |
94 |
| 3 |
ronaldo |
passing |
82 |
83 |
| 4 |
messi |
dribbling |
96 |
88 |
| 5 |
ronaldo |
dribbling |
88 |
83 |
| 6 |
ronaldo |
dribbling |
85 |
84 |
.pivot_table()
- 다음 3가지의 조건 중 하나라도 만족하는 경우 사용(즉, pivot과의 차이)
- index/column pair이 한 개 이상인 경우
- multi-index로 pivot을 원하는 경우
- summary statistic을 산출하고 싶은 경우
- parameter
- index - 행 기준
- columns - 열 기준
- values - 값 기준
- aggfunc - 값에 대한 통계치, default는 mean
- margins - 총계
# dataset player included in fc barcelona or real madrid
df2 = df_fifa[["sofifa_id", "short_name", "age", "height_cm", "weight_kg", "nationality", "club"]]
df_rm_fbc = df2[df2.club.isin(["FC Barcelona", "Real Madrid"])]
# maximum height of player by nationality & club
df_rm_fbc.pivot_table(index="nationality",
columns="club",
values="height_cm",
aggfunc="max").head(3)
| club |
FC Barcelona |
Real Madrid |
| nationality |
|
|
| Argentina |
170.0 |
NaN |
| Belgium |
NaN |
199.0 |
| Brazil |
190.0 |
186.0 |
# total count of players of RM and FBC by nationality & club
df_rm_fbc.pivot_table(index="nationality",
columns="club",
values="sofifa_id",
aggfunc="count",
margins=True)
| club |
FC Barcelona |
Real Madrid |
All |
| nationality |
|
|
|
| Argentina |
1.0 |
NaN |
1 |
| Belgium |
NaN |
2.0 |
2 |
| Brazil |
3.0 |
6.0 |
9 |
| Chile |
1.0 |
NaN |
1 |
| Colombia |
NaN |
1.0 |
1 |
| Costa Rica |
NaN |
1.0 |
1 |
| Croatia |
1.0 |
1.0 |
2 |
| Dominican Republic |
NaN |
1.0 |
1 |
| France |
5.0 |
3.0 |
8 |
| Germany |
1.0 |
1.0 |
2 |
| Japan |
1.0 |
1.0 |
2 |
| Netherlands |
2.0 |
NaN |
2 |
| Portugal |
1.0 |
NaN |
1 |
| Senegal |
1.0 |
NaN |
1 |
| Serbia |
NaN |
1.0 |
1 |
| Spain |
15.0 |
13.0 |
28 |
| Uruguay |
1.0 |
1.0 |
2 |
| Wales |
NaN |
1.0 |
1 |
| All |
33.0 |
33.0 |
66 |