[Python] Dplyr to Pandas

2 minute read


dplyr to pandas


# import data
import seaborn as sns
import pandas as pd
import re

iris_df = sns.load_dataset('iris')
print(type(iris_df))
<class 'pandas.core.frame.DataFrame'>

dplyr::mutate()

# dplyr
iris_df %>% mutate(New_feature= Petal.Width*Petal.Length/2)

# pandas
iris_df["New_feature"] = iris_df["petal_width"] * iris_df["petal_length"] / 2

dplyr::select()

# dplyr
iris_df %>% select('sepal_length', 'sepal_width')

# pandas
iris_df[['sepal_length', 'sepal_width']]
# 위와 같음
iris_df.loc[:,['sepal_length', 'sepal_width']]
iris_df.iloc[:,[1,2]]

dplyr::select(-)

# dplyr
iris_df %>% select(-c(Sepal.Length, Sepal.Width))

# pandas
# cf. inplace = False means the result would be stored in a new DataFrame instead of the original one
iris_df.drop(["sepal_length", "sepal_width"], axis=1, inplace=True)

dplyr::filter()

# dplyr
iris_df %>% filter(species %in% c('setosa', 'virginica'))
iris_df %>% filter(sepal_length > 5.0 & species == 'versicolor')

# pandas
iris_df.loc[(iris_df.species == 'setosa') | (iris_df.species == 'virginica'),:]
iris_df.loc[(iris_df.sepal_length > 5.0) & (iris_df.species == 'versicolor') ]

dplyr::arrange()

# 오름차순
dplyr
iris_df %>% arrange(sepal_length)

# pandas
iris_df.sort_values('sepal_length', ascending=1)

# 내림차순
# dplyr
iris_df %>% arrange(desc(sepal_length))

# pandas
iris_df.sort_values('sepal_length', ascending=0)

dplyr::rename()

# dplyr
iris_df_renamed = iris_df %>% rename(SEPAL_LENGTH = sepal_length)

# pandas
# cf. inplace = False means the result would be stored in a new DataFrame instead of the original one
iris_df_renamed = iris_df.rename(columns={'sepal_length':'SEPAL_LENGTH'}, inplace=False) 
# dplyr
iris_df %>% rename_with(toupper, matches("length|width"))

# pandas
pattern = re.compile(r".*(length|width)")
for col in iris_df.columns:
  if bool((pattern.match(col))):
    iris_df.rename(columns = {col: col.upper()}, inplace = True)

dplyr::case_when()

# dplyr
iris_df %>% 
  mutate(Species = case_when(Species == 'setosa' ~ 0,
                             Species == 'versicolor' ~ 1,
                             Species == 'virginica' ~ 2))

# pandas
iris_df.loc[iris_df['species'] == 'setosa', "species"] = 0
iris_df.loc[iris_df['species'] == 'versicolor', "species"] = 1
iris_df.loc[iris_df['species'] == 'virginica', "species"] = 2
iris_df

dplyr::distinct()

# dplyr
iris_df %>% select(Species) %>% distinct()

# pandas
iris_df.species.unique()

dplyr::summarise()

  • get mean and min for each column
# dplyr
iris_df %>% summarise(across(everything(), mean))
iris_df %>% summarise(across(everything(), min))

# pandas
iris_df.agg(['mean', 'min'])

dplyr::group_by()

# dplyr
# aggregation by group for all columns
iris_df %>% group_by(Species) %>% summarise_all(list(mean,min))

# aggregation by group for a specific column
iris_df %>% group_by(Species) %>% summarise(mean=mean(Sepal.Length))

# pandas
# aggregation by group for all columns
iris_df.groupby(['species']).agg(['mean', 'min'])

# aggregation by group for a specific column
iris_df.groupby(['species']).agg({'sepal_length':['mean']})

dplyr::relocate()

# dplyr
iris_df %>% relocate(Species, .before = Sepal.Length)

# pandas
# cf. iris_df에 저장이 안 됨
iris_df.reindex(['species', 'petal_length', 'sepal_length', 'sepal_width', 'petal_width'], axis=1)

dplyr::slice()

# dplyr, r은 1부터 시작
# 첫 5개행
iris_df %>% slice(1:6)

# 첫 2개열
iris_df[,1:2]

# 1st, 4th, 25th 행 & 1st 6th 열
iris_df %>% select(Sepal.Length, Petal.Width) %>% slice(1,4,25)

# 첫 5개 행 & 5th~7th 열
iris_df %>% select(Sepal.Length, Sepal.Width, Petal.Length) %>% slice(0:5)

# pandas, python은 0부터 시작
# 첫 5개행
iris_df.iloc[0:5]

# 첫 2개열
iris_df.iloc[:, 0:2] 

# 1st, 4th, 25th 행 & 1st, 4th 열
iris_df.iloc[[0,3,24], [0,3]]

# 첫 5개 행 & 1th~3th 열
iris_df.iloc[0:5, 0:3] 

dplyr::slice_head() & slice_tail()

# dplyr
iris_df %>% slice_head(n=5)
iris_df %>% slice_tail(prop=0.1)

# pandas
iris_df.head(5)
iris_df.tail(n=int(len(iris_df)*0.1))

dplyr::bind_rows() & bind_cols()

# dplyr
a %>% bind_rows(b)
a %>% bind_cols(b)

# pandas
# rows
pd.concat([A,B])

# cols
pd.concat([A,B], axis=1)

count records

# dplyr
# Total number of records in dataframe
iris_df %>% nrow()

# Number of records per Group(two ways)
iris_df %>% group_by(Species) %>% count()
iris_df %>% group_by(Species) %>% tally()


# Total number of records in dataframe
len(iris_df)

# Number of records per Group(two ways)
iris_df.value_counts('species')
iris_df.groupby(['species']).size()