df = spark.read.parquet("file:///path_to_files/null_handling_sample_data.parquet")
df.show()
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| null| Vertica| null|
| null| null| null|
| 12| null| RDBMS|
| 22| Mysql| null|
| 50|Snowflake| RDBMS|
| 51| null|CloudDB|
+-----+---------+-------+
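If the sample parquet file is not at hand, an equivalent DataFrame can be built in memory. This is only a sketch: the values are copied from the output above, and the schema (a numeric db_id plus two string columns) is assumed.
# Rebuild the sample data without the parquet file (schema assumed)
data = [
    (12, "Teradata", "RDBMS"),
    (None, "Vertica", None),
    (None, None, None),
    (12, None, "RDBMS"),
    (22, "Mysql", None),
    (50, "Snowflake", "RDBMS"),
    (51, None, "CloudDB"),
]
df = spark.createDataFrame(data, ["db_id", "db_name", "db_type"])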
Both entry points are equivalent: dropna() is defined on the DataFrame itself, while drop() lives on the df.na property (DataFrameNaFunctions).
na.drop(how='any', thresh=None, subset=None)
dropna(how='any', thresh=None, subset=None)
df.dropna().show() # same as df.na.drop().show(); the default how='any' drops rows containing at least one null
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| 50|Snowflake| RDBMS|
+-----+---------+-------+
df.na.drop('all').show() # same as df.dropna('all').show(); drops only rows where every column is null
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| null| Vertica| null|
| 12| null| RDBMS|
| 22| Mysql| null|
| 50|Snowflake| RDBMS|
| 51| null|CloudDB|
+-----+---------+-------+
df.na.drop('any', subset=["db_id"]).show()
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| 12| null| RDBMS|
| 22| Mysql| null|
| 50|Snowflake| RDBMS|
| 51| null|CloudDB|
+-----+---------+-------+
df.na.drop('any', subset=["db_id", "db_type"]).show()
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| 12| null| RDBMS|
| 50|Snowflake| RDBMS|
| 51| null|CloudDB|
+-----+---------+-------+
df.na.drop('all', subset=["db_id", "db_type"]).show() # drop rows where both db_id and db_type are null
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| 12| null| RDBMS|
| 22| Mysql| null|
| 50|Snowflake| RDBMS|
| 51| null|CloudDB|
+-----+---------+-------+
df.dropna('all', 2).show() # thresh=2 keeps rows with at least 2 non-null values; thresh overrides how
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| 12| null| RDBMS|
| 22| Mysql| null|
| 50|Snowflake| RDBMS|
| 51| null|CloudDB|
+-----+---------+-------+
df.dropna('all', 1).show() # thresh=1 drops only rows with no non-null values at all
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| null| Vertica| null|
| 12| null| RDBMS|
| 22| Mysql| null|
| 50|Snowflake| RDBMS|
| 51| null|CloudDB|
+-----+---------+-------+
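One subtlety worth calling out: when thresh is given, it takes precedence and the how argument is ignored. The line below is not part of the original walkthrough, but it should return the same five rows as df.dropna('all', 2) above.
df.dropna(how='any', thresh=2).show() # identical result; thresh overrides how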
The fill functions follow the same pattern: fill() on df.na and fillna() on the DataFrame are equivalent.
na.fill(value, subset=None)
fillna(value, subset=None)
df.fillna(0).show() # same as df.na.fill(0).show(); an integer value fills only the numeric columns
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| 0| Vertica| null|
| 0| null| null|
| 12| null| RDBMS|
| 22| Mysql| null|
| 50|Snowflake| RDBMS|
| 51| null|CloudDB|
+-----+---------+-------+
df.fillna("UnAssigned").show() # same as df.na.fill("UnAssigned").show()
+-----+----------+----------+
|db_id| db_name| db_type|
+-----+----------+----------+
| 12| Teradata| RDBMS|
| null| Vertica|UnAssigned|
| null|UnAssigned|UnAssigned|
| 12|UnAssigned| RDBMS|
| 22| Mysql|UnAssigned|
| 50| Snowflake| RDBMS|
| 51|UnAssigned| CloudDB|
+-----+----------+----------+
df.fillna("UnAssigned" , ["db_name"]).show()
+-----+----------+-------+
|db_id| db_name|db_type|
+-----+----------+-------+
| 12| Teradata| RDBMS|
| null| Vertica| null|
| null|UnAssigned| null|
| 12|UnAssigned| RDBMS|
| 22| Mysql| null|
| 50| Snowflake| RDBMS|
| 51|UnAssigned|CloudDB|
+-----+----------+-------+
df.fillna({"db_id" : 40, "db_type" : "UnAssigned"}).show()
+-----+---------+----------+
|db_id| db_name| db_type|
+-----+---------+----------+
| 12| Teradata| RDBMS|
| 40| Vertica|UnAssigned|
| 40| null|UnAssigned|
| 12| null| RDBMS|
| 22| Mysql|UnAssigned|
| 50|Snowflake| RDBMS|
| 51| null| CloudDB|
+-----+---------+----------+
from pyspark.sql.functions import col
df.filter(col("db_type").isNull()).show()
+-----+-------+-------+
|db_id|db_name|db_type|
+-----+-------+-------+
| null|Vertica| null|
| null| null| null|
| 22| Mysql| null|
+-----+-------+-------+
from pyspark.sql.functions import col
df.filter(col("db_id").isNotNull()).show()
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| 12| null| RDBMS|
| 22| Mysql| null|
| 50|Snowflake| RDBMS|
| 51| null|CloudDB|
+-----+---------+-------+
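As a side note, the isNotNull() filter above returns exactly the rows of df.na.drop('any', subset=["db_id"]) shown earlier. The same predicates can also drive per-column replacement; the following sketch (not part of the original walkthrough) is equivalent to df.fillna("UnAssigned", ["db_type"]).
from pyspark.sql.functions import col, when

# Replace nulls in db_type by hand with when/otherwise;
# equivalent to df.fillna("UnAssigned", ["db_type"])
df.withColumn(
    "db_type",
    when(col("db_type").isNull(), "UnAssigned").otherwise(col("db_type"))
).show()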