[PySpark] DataFrame Basics


PySpark DataFrame Basics

  • PySpark Basics 1
  • DataFrame Schema
  • PySpark Basics 2

PySpark Basics 1

from pyspark.sql import SparkSession
# start a Spark session; getOrCreate() reuses an existing session if one is running
spark = SparkSession.builder.appName("Basics").getOrCreate()
# read data
df = spark.read.json("people.json")
# show dataframe
df.show()
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+
# show df schema
df.printSchema()
root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
# show df columns
df.columns
['age', 'name']
# describe() returns a summary DataFrame (nothing is displayed yet)
df.describe()
DataFrame[summary: string, age: string, name: string]
# call show() to display the statistical summary
df.describe().show()
+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+
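
describe() also accepts specific column names, and because its output columns are strings, they can be cast and formatted. A minimal sketch on the same data (rounding to two decimals is just a choice):

from pyspark.sql.functions import format_number
# summary statistics for a single column
summary = df.describe("age")
# describe() returns strings, so cast to float before formatting
summary.select(summary["summary"],
               format_number(summary["age"].cast("float"), 2).alias("age")).show()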

DataFrame Schema

from pyspark.sql.types import StructField, StringType, IntegerType, StructType
# third parameter True means the field is nullable, i.e. values may be null
data_schema = [StructField("age", IntegerType(), True),
               StructField("name", StringType(), True)]
final_struc = StructType(fields=data_schema)
df = spark.read.json('people.json', schema=final_struc)
df.printSchema()
root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)
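
Since Spark 2.3, the same schema can be passed as a DDL-formatted string, which avoids the StructType boilerplate. A quick sketch, assuming the same people.json:

# equivalent: supply the schema as a DDL string
df = spark.read.schema("age INT, name STRING").json("people.json")
df.printSchema()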

PySpark Basics 2

  • selecting a column
# select column from a df
df["age"]
Column<'age'>
type(df["age"])
pyspark.sql.column.Column
# select a column of a df as a df
df.select("age")
DataFrame[age: int]
df.select("age").show()
+----+
| age|
+----+
|null|
|  30|
|  19|
+----+
type(df.select("age"))
pyspark.sql.dataframe.DataFrame
# selecting multiple columns
df.select(["age","name"])
DataFrame[age: int, name: string]
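
Columns can also be referenced with the col() helper, which is convenient for building derived expressions inside select(). A small sketch (the column name age_plus_one is just illustrative):

from pyspark.sql.functions import col
# build a derived column inside select(); alias() names it
df.select(col("name"), (col("age") + 1).alias("age_plus_one")).show()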
  • selecting a row
df.head(2)[0]
Row(age=None, name='Michael')
type(df.head(2)[0])
pyspark.sql.types.Row
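
A Row behaves like a named tuple: fields are accessible by name, and asDict() converts it into a plain Python dictionary.

row = df.head(2)[0]
row["name"]
'Michael'
row.asDict()
{'age': None, 'name': 'Michael'}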
  • mutate
df.withColumn("double_age", df["age"]*2).show()
+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+
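
withColumn pairs well with when()/otherwise() for conditional values. A sketch (the is_adult column is just illustrative); note that null >= 18 evaluates to null, so Michael falls through to otherwise():

from pyspark.sql.functions import when
# conditional column: rows with null age get the otherwise() value
df.withColumn("is_adult", when(df["age"] >= 18, True).otherwise(False)).show()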
  • rename
df.withColumnRenamed("age", "new_age").show()
+-------+-------+
|new_age|   name|
+-------+-------+
|   null|Michael|
|     30|   Andy|
|     19| Justin|
+-------+-------+
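
Like all DataFrame operations, withColumnRenamed returns a new DataFrame and leaves the original unchanged. The same rename can also be done during selection with alias():

# rename as part of a projection; output matches the table above
df.select(df["age"].alias("new_age"), df["name"]).show()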
  • SQL in Spark
# make a temporary view
df.createOrReplaceTempView("people")
# query
results = spark.sql("SELECT * FROM people WHERE age = 30")
results.show()
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+
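
The same query can be written with the DataFrame API using filter():

# DataFrame API equivalent of the SQL query
df.filter(df["age"] == 30).show()
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+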