empdf = spark.read.parquet("file:///path_to_files/employee_with_comm.parquet")
empdf.show()
+-------+--------+-------+----------+-------+-----+
| emp_no|emp_name| salary|manager_id|dept_no| comm|
+-------+--------+-------+----------+-------+-----+
|1000245| PRADEEP|5000.00| null| 100| 0.00|
|1000258| BLAKE|2850.00| 1000245| 300|50.00|
|1000262| CLARK|2450.00| 1000245| 100|50.00|
|1000276| JONES|2975.00| 1000245| 200|75.00|
|1000288| SCOTT|3000.00| 1000276| 200| 0.00|
|1000292| FORD|3000.00| 1000276| 200| 0.00|
|1000294| SMITH| 800.00| 1000292| 200| 0.00|
|1000299| ALLEN|1600.00| 1000258| 300| 0.00|
|1000310| WARD|1250.00| 1000258| 300|50.00|
|1000312| MARTIN|1250.00| 1000258| 300|50.00|
|1000315| TURNER|1500.00| 1000258| 300| 0.00|
|1000326| ADAMS|1100.00| 1000288| 200| 0.00|
|1000336| JAMES| 950.00| 1000258| 300|50.00|
|1000346| MILLER|1300.00| 1000262| 100| 0.00|
|1000347| DAVID|1400.00| 1000245| 500| 0.00|
+-------+--------+-------+----------+-------+-----+
empdf.groupBy("dept_no").sum("salary").show()
+-------+-----------+
|dept_no|sum(salary)|
+-------+-----------+
| 100| 8750.00|
| 300| 9400.00|
| 500| 1400.00|
| 200| 10875.00|
+-------+-----------+
empdf.groupBy("dept_no").sum("salary", "comm").show()
+-------+-----------+---------+
|dept_no|sum(salary)|sum(comm)|
+-------+-----------+---------+
| 100| 8750.00| 50.00|
| 300| 9400.00| 200.00|
| 500| 1400.00| 0.00|
| 200| 10875.00| 75.00|
+-------+-----------+---------+
empdf.groupBy("dept_no", "manager_id").sum("salary", "comm").show()
+-------+----------+-----------+---------+
|dept_no|manager_id|sum(salary)|sum(comm)|
+-------+----------+-----------+---------+
| 300| 1000258| 6550.00| 150.00|
| 100| null| 5000.00| 0.00|
| 200| 1000276| 6000.00| 0.00|
| 500| 1000245| 1400.00| 0.00|
| 200| 1000292| 800.00| 0.00|
| 300| 1000245| 2850.00| 50.00|
| 200| 1000288| 1100.00| 0.00|
| 100| 1000245| 2450.00| 50.00|
| 100| 1000262| 1300.00| 0.00|
| 200| 1000245| 2975.00| 75.00|
+-------+----------+-----------+---------+
empdf.groupBy("dept_no").count().show()
+-------+-----+
|dept_no|count|
+-------+-----+
| 100| 3|
| 300| 6|
| 500| 1|
| 200| 5|
+-------+-----+
empdf.groupBy("dept_no").min("salary", "emp_name").show()
+-------+-----------+
|dept_no|min(salary)|
+-------+-----------+
| 100| 1300.00|
| 300| 950.00|
| 500| 1400.00|
| 200| 800.00|
+-------+-----------+
empdf.groupBy("dept_no").min("salary","comm").show()
+-------+-----------+---------+
|dept_no|min(salary)|min(comm)|
+-------+-----------+---------+
| 100| 1300.00| 0.00|
| 300| 950.00| 0.00|
| 500| 1400.00| 0.00|
| 200| 800.00| 0.00|
+-------+-----------+---------+
empdf.groupBy("dept_no", "manager_id").min("salary", "comm").show()
+-------+----------+-----------+---------+
|dept_no|manager_id|min(salary)|min(comm)|
+-------+----------+-----------+---------+
| 300| 1000258| 950.00| 0.00|
| 100| null| 5000.00| 0.00|
| 200| 1000276| 3000.00| 0.00|
| 500| 1000245| 1400.00| 0.00|
| 200| 1000292| 800.00| 0.00|
| 300| 1000245| 2850.00| 50.00|
| 200| 1000288| 1100.00| 0.00|
| 100| 1000245| 2450.00| 50.00|
| 100| 1000262| 1300.00| 0.00|
| 200| 1000245| 2975.00| 75.00|
+-------+----------+-----------+---------+
empdf.groupBy("dept_no").max("salary").show()
+-------+-----------+
|dept_no|max(salary)|
+-------+-----------+
| 100| 5000.00|
| 300| 2850.00|
| 500| 1400.00|
| 200| 3000.00|
+-------+-----------+
empdf.groupBy("dept_no").max("salary","comm").show()
+-------+-----------+---------+
|dept_no|max(salary)|max(comm)|
+-------+-----------+---------+
| 100| 5000.00| 50.00|
| 300| 2850.00| 50.00|
| 500| 1400.00| 0.00|
| 200| 3000.00| 75.00|
+-------+-----------+---------+
empdf.groupBy("dept_no", "manager_id").max("salary", "comm").show()
+-------+----------+-----------+---------+
|dept_no|manager_id|max(salary)|max(comm)|
+-------+----------+-----------+---------+
| 300| 1000258| 1600.00| 50.00|
| 100| null| 5000.00| 0.00|
| 200| 1000276| 3000.00| 0.00|
| 500| 1000245| 1400.00| 0.00|
| 200| 1000292| 800.00| 0.00|
| 300| 1000245| 2850.00| 50.00|
| 200| 1000288| 1100.00| 0.00|
| 100| 1000245| 2450.00| 50.00|
| 100| 1000262| 1300.00| 0.00|
| 200| 1000245| 2975.00| 75.00|
+-------+----------+-----------+---------+
empdf.groupBy("dept_no").avg("salary").show()
+-------+-----------+
|dept_no|avg(salary)|
+-------+-----------+
| 100|2916.666667|
| 300|1566.666667|
| 500|1400.000000|
| 200|2175.000000|
+-------+-----------+
empdf.groupBy("dept_no").avg("salary","comm").show()
+-------+-----------+---------+
|dept_no|avg(salary)|avg(comm)|
+-------+-----------+---------+
| 100|2916.666667|16.666667|
| 300|1566.666667|33.333333|
| 500|1400.000000| 0.000000|
| 200|2175.000000|15.000000|
+-------+-----------+---------+
empdf.groupBy("dept_no", "manager_id").avg("salary", "comm").show()
+-------+----------+-----------+---------+
|dept_no|manager_id|avg(salary)|avg(comm)|
+-------+----------+-----------+---------+
| 300| 1000258|1310.000000|30.000000|
| 100| null|5000.000000| 0.000000|
| 200| 1000276|3000.000000| 0.000000|
| 500| 1000245|1400.000000| 0.000000|
| 200| 1000292| 800.000000| 0.000000|
| 300| 1000245|2850.000000|50.000000|
| 200| 1000288|1100.000000| 0.000000|
| 100| 1000245|2450.000000|50.000000|
| 100| 1000262|1300.000000| 0.000000|
| 200| 1000245|2975.000000|75.000000|
+-------+----------+-----------+---------+
empdf.groupBy("dept_no").agg({"*":"count","salary":"sum"}).show()
+-------+--------+-----------+
|dept_no|count(1)|sum(salary)|
+-------+--------+-----------+
| 100| 3| 8750.00|
| 300| 6| 9400.00|
| 500| 1| 1400.00|
| 200| 5| 10875.00|
+-------+--------+-----------+
empdf.groupBy("dept_no").agg({"*":"count","salary":"sum","salary":"max","salary":"min","emp_name":"max"}).show()
+-------+-------------+--------+-----------+
|dept_no|max(emp_name)|count(1)|min(salary)|
+-------+-------------+--------+-----------+
| 100| PRADEEP| 3| 1300.00|
| 300| WARD| 6| 950.00|
| 500| DAVID| 1| 1400.00|
| 200| SMITH| 5| 800.00|
+-------+-------------+--------+-----------+
from pyspark.sql.functions import min, max, avg, sum, count
empdf.groupBy("dept_no").agg(sum("salary"), min("salary"),max("salary"),avg("salary"),count("*")).show()
+-------+-----------+-----------+-----------+-----------+--------+
|dept_no|sum(salary)|min(salary)|max(salary)|avg(salary)|count(1)|
+-------+-----------+-----------+-----------+-----------+--------+
| 100| 8750.00| 1300.00| 5000.00|2916.666667| 3|
| 300| 9400.00| 950.00| 2850.00|1566.666667| 6|
| 500| 1400.00| 1400.00| 1400.00|1400.000000| 1|
| 200| 10875.00| 800.00| 3000.00|2175.000000| 5|
+-------+-----------+-----------+-----------+-----------+--------+
from pyspark.sql.functions import min, max, avg, sum, count
empdf.groupBy("dept_no").agg(sum("salary").alias("sum_salary"), min("salary").alias("min_salary"), max("salary").alias("max_salary"), avg("salary").alias("avg_salary"), count("*").alias("count_*") ).show()
+-------+----------+----------+----------+-----------+-------+
|dept_no|sum_salary|min_salary|max_salary| avg_salary|count_*|
+-------+----------+----------+----------+-----------+-------+
| 100| 8750.00| 1300.00| 5000.00|2916.666667| 3|
| 300| 9400.00| 950.00| 2850.00|1566.666667| 6|
| 500| 1400.00| 1400.00| 1400.00|1400.000000| 1|
| 200| 10875.00| 800.00| 3000.00|2175.000000| 5|
+-------+----------+----------+----------+-----------+-------+