This tutorial explains how to use the sampling functions available in PySpark to extract a subset of rows from a DataFrame. There are multiple DataFrame functions for data sampling; click on a function name in the list below to jump to the respective section of the page.
empdf = spark.read.parquet("file:///path_to_files/employee.parquet")
empdf.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000245| PRADEEP|5000.00|      null|    100|
|1000258|   BLAKE|2850.00|   1000245|    300|
|1000262|   CLARK|2450.00|   1000245|    100|
|1000276|   JONES|2975.00|   1000245|    200|
|1000288|   SCOTT|3000.00|   1000276|    200|
|1000292|    FORD|3000.00|   1000276|    200|
|1000294|   SMITH| 800.00|   1000292|    200|
|1000299|   ALLEN|1600.00|   1000258|    300|
|1000310|    WARD|1250.00|   1000258|    300|
|1000312|  MARTIN|1250.00|   1000258|    300|
|1000315|  TURNER|1500.00|   1000258|    300|
|1000326|   ADAMS|1100.00|   1000288|    200|
|1000336|   JAMES| 950.00|   1000258|    300|
|1000346|  MILLER|1300.00|   1000262|    100|
|1000347|   DAVID|1400.00|   1000245|    500|
+-------+--------+-------+----------+-------+
sample(withReplacement=None, fraction=None, seed=None)
empdf_sample = empdf.sample(0.2)
empdf_sample.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000262|   CLARK|2450.00|   1000245|    100|
+-------+--------+-------+----------+-------+
empdf_sample = empdf.sample(True, 0.3)
empdf_sample.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000288|   SCOTT|3000.00|   1000276|    200|
|1000292|    FORD|3000.00|   1000276|    200|
|1000312|  MARTIN|1250.00|   1000258|    300|
|1000312|  MARTIN|1250.00|   1000258|    300|
+-------+--------+-------+----------+-------+
empdf_sample = empdf.sample(True,0.4,3)
empdf_sample.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000292|    FORD|3000.00|   1000276|    200|
|1000294|   SMITH| 800.00|   1000292|    200|
|1000315|  TURNER|1500.00|   1000258|    300|
|1000346|  MILLER|1300.00|   1000262|    100|
+-------+--------+-------+----------+-------+
empdf_sample = empdf.sample(0.4,2)
empdf_sample.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000258|   BLAKE|2850.00|   1000245|    300|
|1000292|    FORD|3000.00|   1000276|    200|
|1000312|  MARTIN|1250.00|   1000258|    300|
|1000326|   ADAMS|1100.00|   1000288|    200|
|1000336|   JAMES| 950.00|   1000258|    300|
|1000346|  MILLER|1300.00|   1000262|    100|
+-------+--------+-------+----------+-------+
empdf_sample = empdf.sample(0.4,2)
empdf_sample.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000258|   BLAKE|2850.00|   1000245|    300|
|1000292|    FORD|3000.00|   1000276|    200|
|1000312|  MARTIN|1250.00|   1000258|    300|
|1000326|   ADAMS|1100.00|   1000288|    200|
|1000336|   JAMES| 950.00|   1000258|    300|
|1000346|  MILLER|1300.00|   1000262|    100|
+-------+--------+-------+----------+-------+
empdf_sample = empdf.sample(True,0.4,3)
empdf_sample.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000292|    FORD|3000.00|   1000276|    200|
|1000294|   SMITH| 800.00|   1000292|    200|
|1000315|  TURNER|1500.00|   1000258|    300|
|1000346|  MILLER|1300.00|   1000262|    100|
+-------+--------+-------+----------+-------+
sampleBy(col, fractions, seed=None)
empdf_sample = empdf.sampleBy("dept_no", {200:0.1,300:0.2})
empdf_sample.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000310|    WARD|1250.00|   1000258|    300|
|1000312|  MARTIN|1250.00|   1000258|    300|
+-------+--------+-------+----------+-------+
empdf_sample = empdf.sampleBy("dept_no", {200:0.1,300:0.2})
empdf_sample.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000258|   BLAKE|2850.00|   1000245|    300|
|1000294|   SMITH| 800.00|   1000292|    200|
|1000299|   ALLEN|1600.00|   1000258|    300|
|1000310|    WARD|1250.00|   1000258|    300|
|1000312|  MARTIN|1250.00|   1000258|    300|
|1000336|   JAMES| 950.00|   1000258|    300|
+-------+--------+-------+----------+-------+
empdf_sample = empdf.sampleBy("dept_no", {200:0.1,300:0.2})
empdf_sample.show()
+-------+--------+------+----------+-------+
| emp_no|emp_name|salary|manager_id|dept_no|
+-------+--------+------+----------+-------+
|1000294|   SMITH|800.00|   1000292|    200|
+-------+--------+------+----------+-------+
empdf_sample = empdf.sampleBy("dept_no", {200:0.1,300:0.2},1)
empdf_sample.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000276|   JONES|2975.00|   1000245|    200|
|1000299|   ALLEN|1600.00|   1000258|    300|
+-------+--------+-------+----------+-------+
empdf_sample = empdf.sampleBy("dept_no", {200:0.1,300:0.2},1)
empdf_sample.show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000276|   JONES|2975.00|   1000245|    200|
|1000299|   ALLEN|1600.00|   1000258|    300|
+-------+--------+-------+----------+-------+