[Python] iterrows & itertuples & pd.applys
Writing Efficient Python Code
- PART4
iterrows
를 통해 row별 iterration 가능iterrows
를 쓸 경우 index와 pandas Series로 구성되어 있는 tuple을 반환
itertuples
를 통해 row별 iterration 가능itertuples
를 쓸 경우 namedtuple 형태를 반환
pd.applys
을 통해 loop 대체 가능- 0: row, 1: column
- 효율성:
.values
>pd.applys
>itertuples
>iterrows
# import data
import pandas as pd
import numpy as np
baseball_df = pd.read_csv('baseball_stats.csv')
baseball_df.head(5)
Team | League | Year | RS | RA | W | OBP | SLG | BA | Playoffs | RankSeason | RankPlayoffs | G | OOBP | OSLG | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ARI | NL | 2012 | 734 | 688 | 81 | 0.328 | 0.418 | 0.259 | 0 | NaN | NaN | 162 | 0.317 | 0.415 |
1 | ATL | NL | 2012 | 700 | 600 | 94 | 0.320 | 0.389 | 0.247 | 1 | 4.0 | 5.0 | 162 | 0.306 | 0.378 |
2 | BAL | AL | 2012 | 712 | 705 | 93 | 0.311 | 0.417 | 0.247 | 1 | 5.0 | 4.0 | 162 | 0.315 | 0.403 |
3 | BOS | AL | 2012 | 734 | 806 | 69 | 0.315 | 0.415 | 0.260 | 0 | NaN | NaN | 162 | 0.331 | 0.428 |
4 | CHC | NL | 2012 | 613 | 759 | 61 | 0.302 | 0.378 | 0.240 | 0 | NaN | NaN | 162 | 0.335 | 0.424 |
for i,row in baseball_df.head(1).iterrows():
print(i,row)
for series in baseball_df.head(1).iterrows():
print(series[1]['Team'])
0 Team ARI
League NL
Year 2012
RS 734
RA 688
W 81
OBP 0.328
SLG 0.418
BA 0.259
Playoffs 0
RankSeason NaN
RankPlayoffs NaN
G 162
OOBP 0.317
OSLG 0.415
Name: 0, dtype: object
ARI
def calc_run_diff(run_s, run_a):
return run_s - run_a
run_diffs = []
for i,row in baseball_df.iterrows():
run_scored = row['RS']
run_allowed = row['RA']
run_diff = calc_run_diff(run_scored, run_allowed)
run_diffs.append(run_diff)
baseball_df['RD'] = run_diffs
baseball_df.head(5)
Team | League | Year | RS | RA | W | OBP | SLG | BA | Playoffs | RankSeason | RankPlayoffs | G | OOBP | OSLG | RD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ARI | NL | 2012 | 734 | 688 | 81 | 0.328 | 0.418 | 0.259 | 0 | NaN | NaN | 162 | 0.317 | 0.415 | 46 |
1 | ATL | NL | 2012 | 700 | 600 | 94 | 0.320 | 0.389 | 0.247 | 1 | 4.0 | 5.0 | 162 | 0.306 | 0.378 | 100 |
2 | BAL | AL | 2012 | 712 | 705 | 93 | 0.311 | 0.417 | 0.247 | 1 | 5.0 | 4.0 | 162 | 0.315 | 0.403 | 7 |
3 | BOS | AL | 2012 | 734 | 806 | 69 | 0.315 | 0.415 | 0.260 | 0 | NaN | NaN | 162 | 0.331 | 0.428 | -72 |
4 | CHC | NL | 2012 | 613 | 759 | 61 | 0.302 | 0.378 | 0.240 | 0 | NaN | NaN | 162 | 0.335 | 0.424 | -146 |
iterrows
- 1) loop over pit_df and print each row
for i,row in baseball_df.head(1).iterrows():
print(i,row)
0 Team ARI
League NL
Year 2012
RS 734
RA 688
W 81
OBP 0.328
SLG 0.418
BA 0.259
Playoffs 0
RankSeason NaN
RankPlayoffs NaN
G 162
OOBP 0.317
OSLG 0.415
Name: 0, dtype: object
itertuples
- In what year within your DataFrame did the New York Yankees have the highest run differential?
yankees_df = baseball_df[baseball_df.Team == 'NYY']
run_diff = []
for row in yankees_df.itertuples():
RS = row.RS
RA = row.RA
run_diff.append(abs(calc_run_diff(RS, RA)))
yankees_df['RD']= run_diff
yankees_df.sort_values('RD', ascending = 0).head(5)
Team | League | Year | RS | RA | W | OBP | SLG | BA | Playoffs | RankSeason | RankPlayoffs | G | OOBP | OSLG | RD | WP | WP_preds | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
439 | NYY | AL | 1998 | 965 | 656 | 114 | 0.364 | 0.460 | 0.288 | 1 | 1.0 | 1.0 | 162 | NaN | NaN | 309 | 0.70 | 0.68 |
48 | NYY | AL | 2011 | 867 | 657 | 97 | 0.343 | 0.444 | 0.263 | 1 | 2.0 | 4.0 | 162 | 0.322 | 0.399 | 210 | 0.60 | 0.64 |
468 | NYY | AL | 1997 | 891 | 688 | 96 | 0.362 | 0.436 | 0.287 | 1 | 3.0 | 4.0 | 162 | NaN | NaN | 203 | 0.59 | 0.63 |
319 | NYY | AL | 2002 | 897 | 697 | 103 | 0.354 | 0.455 | 0.275 | 1 | 1.0 | 4.0 | 161 | 0.309 | 0.395 | 200 | 0.64 | 0.62 |
168 | NYY | AL | 2007 | 968 | 777 | 94 | 0.366 | 0.463 | 0.290 | 1 | 2.0 | 4.0 | 162 | 0.340 | 0.417 | 191 | 0.58 | 0.61 |
pd.apply
# row sum
baseball_df[['RS', 'RA']].sum(axis=0)
RS 880981
RA 880981
dtype: int64
# column sum
baseball_df[['RS', 'RA']].sum(axis=1)
0 1422
1 1300
2 1417
3 1540
4 1372
...
1227 1464
1228 1332
1229 1568
1230 1438
1231 1315
Length: 1232, dtype: int64
def text_playoffs(num_playoffs):
if num_playoffs == 1:
return 'Yes'
else:
return 'No'
baseball_df['Playoffs'].apply(text_playoffs)
# 위와 같음
# baseball_df.apply(lambda row: text_playoffs(row['Playoffs']), axis=1)
0 No
1 Yes
2 Yes
3 No
4 No
...
1227 No
1228 No
1229 Yes
1230 No
1231 No
Name: Playoffs, Length: 1232, dtype: object
def calc_win_perc(wins, games_played):
win_perc = wins / games_played
return np.round(win_perc,2)
win_percs = yankees_df.apply(lambda row: calc_win_perc(row['W'], row['G']), axis=1)
yankees_df['WP'] = win_percs
print(yankees_df[yankees_df['WP'] >= 0.50].head(5))
Team League Year RS RA W OBP SLG BA Playoffs \
18 NYY AL 2012 804 668 95 0.337 0.453 0.265 1
48 NYY AL 2011 867 657 97 0.343 0.444 0.263 1
78 NYY AL 2010 859 693 95 0.350 0.436 0.267 1
108 NYY AL 2009 915 753 103 0.362 0.478 0.283 1
138 NYY AL 2008 789 727 89 0.342 0.427 0.271 0
RankSeason RankPlayoffs G OOBP OSLG RD WP
18 3.0 3.0 162 0.311 0.419 136 0.59
48 2.0 4.0 162 0.322 0.399 210 0.60
78 3.0 3.0 162 0.322 0.399 166 0.59
108 1.0 1.0 162 0.327 0.408 162 0.64
138 NaN NaN 162 0.329 0.405 62 0.55
- Pandas series에 .values를 사용해 numpy array로 바꾼 후 연산하면 빠르게 연산 가능
win_percs_np = calc_win_perc(baseball_df['W'].values, baseball_df['G'].values)
baseball_df['WP'] = win_percs_np
baseball_df['WP'].head()
0 0.50
1 0.58
2 0.57
3 0.43
4 0.38
Name: WP, dtype: float64
win_perc_preds_loop = []
# Use a loop and .itertuples() to collect each row's predicted win percentage
for row in baseball_df.itertuples():
runs_scored = row.RS
runs_allowed = row.RA
win_perc_pred = predict_win_perc(runs_scored, runs_allowed)
win_perc_preds_loop.append(win_perc_pred)
# Apply predict_win_perc to each row of the DataFrame
win_perc_preds_apply = baseball_df.apply(lambda row: predict_win_perc(row['RS'], row['RA']), axis=1)
# Calculate the win percentage predictions using NumPy arrays
win_perc_preds_np = predict_win_perc(baseball_df['RS'].values, baseball_df['RA'].values)
baseball_df['WP_preds'] = win_perc_preds_np
print(baseball_df.head())
Team League Year RS RA W OBP SLG BA Playoffs RankSeason \
0 ARI NL 2012 734 688 81 0.328 0.418 0.259 0 NaN
1 ATL NL 2012 700 600 94 0.320 0.389 0.247 1 4.0
2 BAL AL 2012 712 705 93 0.311 0.417 0.247 1 5.0
3 BOS AL 2012 734 806 69 0.315 0.415 0.260 0 NaN
4 CHC NL 2012 613 759 61 0.302 0.378 0.240 0 NaN
RankPlayoffs G OOBP OSLG RD WP WP_preds
0 NaN 162 0.317 0.415 46 0.50 0.53
1 5.0 162 0.306 0.378 100 0.58 0.58
2 4.0 162 0.315 0.403 7 0.57 0.50
3 NaN 162 0.331 0.428 -72 0.43 0.45
4 NaN 162 0.335 0.424 -146 0.38 0.39