This tutorial explains the pivot function available in PySpark, which transforms the distinct values of a row-level column into separate columns.
pivot(pivot_col, values=None)
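Here pivot_col is the column whose distinct values become the new column headers, and values is an optional list that restricts (and orders) which of those values are turned into columns. pivot is called on the GroupedData object returned by groupBy and must be followed by an aggregation, so the general call pattern looks like this (a sketch with placeholder column names, not tied to any real dataset):

df.groupBy("grouping_col").pivot("pivot_col").sum("value_col")

To demonstrate it, we start from a small sales ledger stored as a parquet file.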
df = spark.read.parquet("file:///path_to_files/ledger.parquet")
df.show()
+-------+-------+-----+
|year_nr|Quarter|Sales|
+-------+-------+-----+
| 2015| Q1| 90|
| 2015| Q2| 70|
| 2015| Q3| 130|
| 2015| Q4| 30|
| 2016| Q1| 40|
| 2016| Q2| 50|
| 2016| Q3| 120|
| 2016| Q4| 20|
| 2015| Q2| 1000|
| 2015| Q3| 1500|
| 2016| Q1| 1100|
| 2016| Q4| 150|
+-------+-------+-----+
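If you do not have the parquet file at hand, the same DataFrame can be built in memory (a minimal sketch, assuming an active SparkSession named spark; the column names simply mirror the output above):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Recreate the ledger shown above
data = [
    (2015, "Q1", 90), (2015, "Q2", 70), (2015, "Q3", 130), (2015, "Q4", 30),
    (2016, "Q1", 40), (2016, "Q2", 50), (2016, "Q3", 120), (2016, "Q4", 20),
    (2015, "Q2", 1000), (2015, "Q3", 1500), (2016, "Q1", 1100), (2016, "Q4", 150),
]
df = spark.createDataFrame(data, ["year_nr", "Quarter", "Sales"])

Notice that some year/quarter combinations appear twice, so the pivot will have to aggregate them. Grouping by year_nr and pivoting on Quarter with a sum of Sales turns each quarter into its own column: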
df_updated = df.groupBy("year_nr").pivot("Quarter").sum("Sales")
df_updated.show()
+-------+----+----+----+---+
|year_nr| Q1| Q2| Q3| Q4|
+-------+----+----+----+---+
| 2015| 90|1070|1630| 30|
| 2016|1140| 50| 120|170|
+-------+----+----+----+---+
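Every distinct value of Quarter has become a column, and the duplicate rows were summed, for example 2015 Q2 = 70 + 1000 = 1070. Because no values list was supplied, Spark first has to determine the distinct quarter values itself before it can pivot, roughly the equivalent of:

df.select("Quarter").distinct().show()

Passing the list of values explicitly skips that extra pass over the data and fixes the column order: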
df_updated = df.groupBy("year_nr").pivot("Quarter", ["Q1","Q2","Q3","Q4"]).sum("Sales")
df_updated.show()
+-------+----+----+----+---+
|year_nr| Q1| Q2| Q3| Q4|
+-------+----+----+----+---+
| 2015| 90|1070|1630| 30|
| 2016|1140| 50| 120|170|
+-------+----+----+----+---+
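With the full list the output is unchanged. The values argument can also name just a subset of the pivot values, in which case everything not listed is simply left out of the result: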
df_updated = df.groupBy("year_nr").pivot("Quarter", ["Q1","Q3","Q4"]).sum("Sales")
df_updated.show()
+-------+----+----+---+
|year_nr| Q1| Q3| Q4|
+-------+----+----+---+
| 2015| 90|1630| 30|
| 2016|1140| 120|170|
+-------+----+----+---+
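Q2 has disappeared entirely; rows for unlisted quarters are ignored rather than folded into another column. The aggregation does not have to be sum either: any aggregate function from pyspark.sql.functions can follow the pivot. Here first keeps only the first Sales value seen for each year/quarter combination instead of adding them up: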
from pyspark.sql.functions import first
df_updated = df.groupBy("year_nr").pivot("Quarter", ["Q1","Q2","Q3","Q4"]).agg(first("Sales"))
df_updated.show()
+-------+---+---+---+---+
|year_nr| Q1| Q2| Q3| Q4|
+-------+---+---+---+---+
| 2015| 90| 70|130| 30|
| 2016| 40| 50|120| 20|
+-------+---+---+---+---+
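Finally, several aggregations can be combined in a single pivot by passing them all to agg; Spark then prefixes each output column with the quarter value and suffixes it with the aggregation alias (a sketch building on the same df, with alias names chosen here purely for readability):

from pyspark.sql import functions as f

df.groupBy("year_nr") \
    .pivot("Quarter", ["Q1", "Q2", "Q3", "Q4"]) \
    .agg(f.sum("Sales").alias("total"), f.first("Sales").alias("first")) \
    .show()

This produces columns such as Q1_total and Q1_first alongside year_nr.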