[PySpark] DataFrame Basics
- PySpark Basics 1
- DataFrame Schema
- PySpark Basics 2
PySpark Basics 1
from pyspark.sql import SparkSession
# start a Spark session (creates one if none exists) and name the application
spark = SparkSession.builder.appName("Basics").getOrCreate()
# read data
df = spark.read.json("people.json")
# show dataframe
df.show()
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
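spark.read handles other formats the same way; a minimal sketch (the CSV and Parquet file names here are hypothetical):
# read a CSV with a header row, letting Spark infer column types
df_csv = spark.read.csv("people.csv", header=True, inferSchema=True)
# Parquet files carry their own schema
df_parquet = spark.read.parquet("people.parquet")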
# show df schema
df.printSchema()
root
|-- age: long (nullable = true)
|-- name: string (nullable = true)
# show df columns
df.columns
['age', 'name']
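Alongside df.columns, df.dtypes lists each column with its Spark SQL type; a minimal sketch (for this file, the long age column prints as 'bigint'):
# list (column, type) pairs
df.dtypes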
# describe() returns a summary DataFrame (nothing is printed yet)
df.describe()
DataFrame[summary: string, age: string, name: string]
# show the statistical summary of the df
df.describe().show()
+-------+------------------+-------+
|summary| age| name|
+-------+------------------+-------+
| count| 2| 3|
| mean| 24.5| null|
| stddev|7.7781745930520225| null|
| min| 19| Andy|
| max| 30|Michael|
+-------+------------------+-------+
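Note the age count is 2, not 3, because describe() skips nulls. The mean and stddev print with long decimals; one way to tidy them is a sketch using format_number from pyspark.sql.functions (describe() returns string columns, so cast first):
from pyspark.sql.functions import format_number
summary = df.describe()
# cast the string summary values to float, then format to 2 decimal places
summary.select(summary["summary"],
               format_number(summary["age"].cast("float"), 2).alias("age")).show()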
DataFrame Schema
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
# the third parameter (nullable=True) means the field may contain nulls
data_schema = [StructField("age", IntegerType(), True),
               StructField("name", StringType(), True)]
final_struc = StructType(fields=data_schema)
df = spark.read.json('people.json', schema=final_struc)
df.printSchema()
root
|-- age: integer (nullable = true)
|-- name: string (nullable = true)
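The same StructType can also build a DataFrame from in-memory rows; a minimal sketch with made-up sample data:
# hypothetical rows, validated against final_struc
rows = [(25, "Alice"), (None, "Bob")]
df2 = spark.createDataFrame(rows, schema=final_struc)
df2.printSchema()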
PySpark Basics 2
- selecting a column
# index a single column of the df (returns a Column, not a DataFrame)
df["age"]
Column<'age'>
type(df["age"])
pyspark.sql.column.Column
# select a column, returned as a single-column DataFrame
df.select("age")
DataFrame[age: int]
df.select("age").show()
+----+
| age|
+----+
|null|
| 30|
| 19|
+----+
type(df.select("age"))
pyspark.sql.dataframe.DataFrame
# selecting multiple columns
df.select(["age","name"])
DataFrame[age: int, name: string]
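Columns can also be referenced with the col() helper from pyspark.sql.functions, which is convenient inside expressions; a minimal sketch:
from pyspark.sql.functions import col
# equivalent to df.select(["age", "name"]); expressions like col("age") + 1 also work
df.select(col("age"), col("name")).show()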
- selecting a row
df.head(2)[0]
Row(age=None, name='Michael')
type(df.head(2)[0])
pyspark.sql.types.Row
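A Row behaves like a named tuple, so fields can be read by name or dumped to a dict; a minimal sketch using the row shown above:
row = df.head(2)[0]
row["name"]    # 'Michael'
row.asDict()   # {'age': None, 'name': 'Michael'}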
- mutate (add a derived column)
df.withColumn("double_age", df["age"]*2).show()
+----+-------+----------+
| age| name|double_age|
+----+-------+----------+
|null|Michael| null|
| 30| Andy| 60|
| 19| Justin| 38|
+----+-------+----------+
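withColumn returns a new DataFrame and leaves df unchanged. Combined with when/otherwise it can also build conditional columns; a minimal sketch:
from pyspark.sql.functions import when
# a null age fails the comparison, so it falls through to otherwise()
df.withColumn("is_adult", when(df["age"] >= 18, True).otherwise(False)).show()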
- rename
df.withColumnRenamed("age", "new_age").show()
+-------+-------+
|new_age| name|
+-------+-------+
| null|Michael|
| 30| Andy|
| 19| Justin|
+-------+-------+
- SQL in Spark
# register the DataFrame as a temporary SQL view
df.createOrReplaceTempView("people")
# query
results = spark.sql("SELECT * FROM people WHERE age = 30")
results.show()
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+
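Any Spark SQL statement can run against the registered view; a minimal sketch of two more queries:
# rows with a missing age
spark.sql("SELECT * FROM people WHERE age IS NULL").show()
# aggregates over the whole view
spark.sql("SELECT COUNT(*) AS n, AVG(age) AS avg_age FROM people").show()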