This tutorial will explain how filters can be used on dataframes in Pyspark. where() function is an alias for filter() function. Following topics will be covered on this page:
Dataframe.filter(condition)
from pyspark.sql.functions import col #importing col function
empdf=spark.read.parquet("file:///path_to_file/employee.parquet")
deptdf=spark.read.parquet("file:///path_to_file/department.parquet")
from pyspark.sql.functions import col #importing col function
empdf.filter(col("emp_name")=="SCOTT").show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000288| SCOTT|3000.00| 1000276| 200|
+-------+--------+-------+----------+-------+
empdf.filter("emp_name=='SCOTT'").show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000288| SCOTT|3000.00| 1000276| 200|
+-------+--------+-------+----------+-------+
empdf.filter(col("emp_no")%10==9).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000299| ALLEN|1600.00| 1000258| 300|
+-------+--------+-------+----------+-------+
empdf.filter(col("manager_id")!=1000258).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000258| BLAKE|2850.00| 1000245| 300|
|1000262| CLARK|2450.00| 1000245| 100|
|1000276| JONES|2975.00| 1000245| 200|
|1000288| SCOTT|3000.00| 1000276| 200|
|1000292| FORD|3000.00| 1000276| 200|
|1000294| SMITH| 800.00| 1000292| 200|
|1000326| ADAMS|1100.00| 1000288| 200|
|1000346| MILLER|1300.00| 1000262| 100|
|1000347| DAVID|1400.00| 1000245| 500|
+-------+--------+-------+----------+-------+
empdf.filter(col("salary") < 1400).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000294| SMITH| 800.00| 1000292| 200|
|1000310| WARD|1250.00| 1000258| 300|
|1000312| MARTIN|1250.00| 1000258| 300|
|1000326| ADAMS|1100.00| 1000288| 200|
|1000336| JAMES| 950.00| 1000258| 300|
|1000346| MILLER|1300.00| 1000262| 100|
+-------+--------+-------+----------+-------+
from pyspark.sql.functions import col
empdf.filter((col("emp_no").isin(1000262,1000276,1000310))).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000262| CLARK|2450.00| 1000245| 100|
|1000276| JONES|2975.00| 1000245| 200|
|1000310| WARD|1250.00| 1000258| 300|
+-------+--------+-------+----------+-------+
li=[1000346,1000288,1000294]
empdf.filter((col("emp_no").isin(li))).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000288| SCOTT|3000.00| 1000276| 200|
|1000294| SMITH| 800.00| 1000292| 200|
|1000346| MILLER|1300.00| 1000262| 100|
+-------+--------+-------+----------+-------+
from pyspark.sql.functions import col
empdf.filter(~(col("emp_no").isin(1000262,1000276,1000310))).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000245| PRADEEP|5000.00| null| 100|
|1000258| BLAKE|2850.00| 1000245| 300|
|1000288| SCOTT|3000.00| 1000276| 200|
|1000292| FORD|3000.00| 1000276| 200|
|1000294| SMITH| 800.00| 1000292| 200|
|1000299| ALLEN|1600.00| 1000258| 300|
|1000312| MARTIN|1250.00| 1000258| 300|
|1000315| TURNER|1500.00| 1000258| 300|
|1000326| ADAMS|1100.00| 1000288| 200|
|1000336| JAMES| 950.00| 1000258| 300|
|1000346| MILLER|1300.00| 1000262| 100|
|1000347| DAVID|1400.00| 1000245| 500|
+-------+--------+-------+----------+-------+
li=[1000346,1000288,1000294]
empdf.filter(~(col("emp_no").isin(li))).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000245| PRADEEP|5000.00| null| 100|
|1000258| BLAKE|2850.00| 1000245| 300|
|1000262| CLARK|2450.00| 1000245| 100|
|1000276| JONES|2975.00| 1000245| 200|
|1000292| FORD|3000.00| 1000276| 200|
|1000299| ALLEN|1600.00| 1000258| 300|
|1000310| WARD|1250.00| 1000258| 300|
|1000312| MARTIN|1250.00| 1000258| 300|
|1000315| TURNER|1500.00| 1000258| 300|
|1000326| ADAMS|1100.00| 1000288| 200|
|1000336| JAMES| 950.00| 1000258| 300|
|1000347| DAVID|1400.00| 1000245| 500|
+-------+--------+-------+----------+-------+
from pyspark.sql.functions import col
li=[1000346,1000288,1000294]
empdf.filter((col("emp_no").isin(li))).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000288| SCOTT|3000.00| 1000276| 200|
|1000294| SMITH| 800.00| 1000292| 200|
|1000346| MILLER|1300.00| 1000262| 100|
+-------+--------+-------+----------+-------+
from pyspark.sql.functions import col
empdf.filter(col("manager_id").isNull()).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000245| PRADEEP|5000.00| null| 100|
+-------+--------+-------+----------+-------+
from pyspark.sql.functions import col
empdf.filter(col("manager_id").isNotNull()).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000258| BLAKE|2850.00| 1000245| 300|
|1000262| CLARK|2450.00| 1000245| 100|
|1000276| JONES|2975.00| 1000245| 200|
|1000288| SCOTT|3000.00| 1000276| 200|
|1000292| FORD|3000.00| 1000276| 200|
|1000294| SMITH| 800.00| 1000292| 200|
|1000299| ALLEN|1600.00| 1000258| 300|
|1000310| WARD|1250.00| 1000258| 300|
|1000312| MARTIN|1250.00| 1000258| 300|
|1000315| TURNER|1500.00| 1000258| 300|
|1000326| ADAMS|1100.00| 1000288| 200|
|1000336| JAMES| 950.00| 1000258| 300|
|1000346| MILLER|1300.00| 1000262| 100|
|1000347| DAVID|1400.00| 1000245| 500|
+-------+--------+-------+----------+-------+
from pyspark.sql.functions import col
empdf.filter(col("emp_name").like("%ES%")).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000276| JONES|2975.00| 1000245| 200|
|1000336| JAMES| 950.00| 1000258| 300|
+-------+--------+-------+----------+-------+
empdf.filter(col("emp_name").like("%S")).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000276| JONES|2975.00| 1000245| 200|
|1000326| ADAMS|1100.00| 1000288| 200|
|1000336| JAMES| 950.00| 1000258| 300|
+-------+--------+-------+----------+-------+
empdf.filter(col("emp_name").like("S%")).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000288| SCOTT|3000.00| 1000276| 200|
|1000294| SMITH| 800.00| 1000292| 200|
+-------+--------+-------+----------+-------+
empdf.filter(col("emp_name").like("%A%D")).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000310| WARD|1250.00| 1000258| 300|
|1000347| DAVID|1400.00| 1000245| 500|
+-------+--------+-------+----------+-------+
from pyspark.sql.functions import col
empdf.filter(~col("emp_name").like("%ES%")).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000245| PRADEEP|5000.00| null| 100|
|1000258| BLAKE|2850.00| 1000245| 300|
|1000262| CLARK|2450.00| 1000245| 100|
|1000288| SCOTT|3000.00| 1000276| 200|
|1000292| FORD|3000.00| 1000276| 200|
|1000294| SMITH| 800.00| 1000292| 200|
|1000299| ALLEN|1600.00| 1000258| 300|
|1000310| WARD|1250.00| 1000258| 300|
|1000312| MARTIN|1250.00| 1000258| 300|
|1000315| TURNER|1500.00| 1000258| 300|
|1000326| ADAMS|1100.00| 1000288| 200|
|1000346| MILLER|1300.00| 1000262| 100|
|1000347| DAVID|1400.00| 1000245| 500|
+-------+--------+-------+----------+-------+
empdf.filter(~col("emp_name").like("%S")).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000245| PRADEEP|5000.00| null| 100|
|1000258| BLAKE|2850.00| 1000245| 300|
|1000262| CLARK|2450.00| 1000245| 100|
|1000288| SCOTT|3000.00| 1000276| 200|
|1000292| FORD|3000.00| 1000276| 200|
|1000294| SMITH| 800.00| 1000292| 200|
|1000299| ALLEN|1600.00| 1000258| 300|
|1000310| WARD|1250.00| 1000258| 300|
|1000312| MARTIN|1250.00| 1000258| 300|
|1000315| TURNER|1500.00| 1000258| 300|
|1000346| MILLER|1300.00| 1000262| 100|
|1000347| DAVID|1400.00| 1000245| 500|
+-------+--------+-------+----------+-------+
empdf.filter(~col("emp_name").like("S%")).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000245| PRADEEP|5000.00| null| 100|
|1000258| BLAKE|2850.00| 1000245| 300|
|1000262| CLARK|2450.00| 1000245| 100|
|1000276| JONES|2975.00| 1000245| 200|
|1000292| FORD|3000.00| 1000276| 200|
|1000299| ALLEN|1600.00| 1000258| 300|
|1000310| WARD|1250.00| 1000258| 300|
|1000312| MARTIN|1250.00| 1000258| 300|
|1000315| TURNER|1500.00| 1000258| 300|
|1000326| ADAMS|1100.00| 1000288| 200|
|1000336| JAMES| 950.00| 1000258| 300|
|1000346| MILLER|1300.00| 1000262| 100|
|1000347| DAVID|1400.00| 1000245| 500|
+-------+--------+-------+----------+-------+
from pyspark.sql.functions import col
empdf.filter(col("emp_name").contains("S")).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000276| JONES|2975.00| 1000245| 200|
|1000288| SCOTT|3000.00| 1000276| 200|
|1000294| SMITH| 800.00| 1000292| 200|
|1000326| ADAMS|1100.00| 1000288| 200|
|1000336| JAMES| 950.00| 1000258| 300|
+-------+--------+-------+----------+-------+
from pyspark.sql.functions import col
empdf.filter(col("salary").between(2500,4500)).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000258| BLAKE|2850.00| 1000245| 300|
|1000276| JONES|2975.00| 1000245| 200|
|1000288| SCOTT|3000.00| 1000276| 200|
|1000292| FORD|3000.00| 1000276| 200|
+-------+--------+-------+----------+-------+
from pyspark.sql.functions import col
empdf.filter((col("emp_name")=="FORD")|(col("emp_name")=="ALLEN")).show()
+-------+--------+-------+-------+
| emp_no|emp_name| salary|dept_no|
+-------+--------+-------+-------+
|1000292| FORD|3000.00| 200|
|1000299| ALLEN|1600.00| 300|
+-------+--------+-------+-------+
empdf.filter((col("emp_name")=="FORD")|(col("salary")>2000)).show()
+-------+--------+-------+-------+
| emp_no|emp_name| salary|dept_no|
+-------+--------+-------+-------+
|1000245| PRADEEP|5000.00| 100|
|1000258| BLAKE|2850.00| 300|
|1000262| CLARK|2450.00| 100|
|1000276| JONES|2975.00| 200|
|1000288| SCOTT|3000.00| 200|
|1000292| FORD|3000.00| 200|
+-------+--------+-------+-------+
empdf.filter((col("dept_no")==200)&(col("salary")>2000)).show()
+-------+--------+-------+-------+
| emp_no|emp_name| salary|dept_no|
+-------+--------+-------+-------+
|1000276| JONES|2975.00| 200|
|1000288| SCOTT|3000.00| 200|
|1000292| FORD|3000.00| 200|
+-------+--------+-------+-------+
empdf.filter((((col("dept_no")==200)&(col("salary")>2000))|(col("emp_name")=="JAMES"))).show()
+-------+--------+-------+----------+-------+
| emp_no|emp_name| salary|manager_id|dept_no|
+-------+--------+-------+----------+-------+
|1000276| JONES|2975.00| 1000245| 200|
|1000288| SCOTT|3000.00| 1000276| 200|
|1000292| FORD|3000.00| 1000276| 200|
|1000336| JAMES| 950.00| 1000258| 300|
+-------+--------+-------+----------+-------+