This tutorial will explain various approaches with examples on how to drop an existing column(s) from a dataframe. Below listed topics will be explained with examples on this page, click on item in the below list and it will take you to the respective section of the page:
df = spark.read.csv("file:///path_to_files/csv_file_with_duplicates.csv", header=True)
df.show()
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| 14|Snowflake|CloudDB|
| 15| Vertica| RDBMS|
| 12| Teradata| RDBMS|
| 22| Mysql| RDBMS|
+-----+---------+-------+
df_other = spark.read.csv("file:///path_to_files/join_example_file_2.csv", header=True)
df_other.show()
+-----+-----------+-------+
|db_id| db_name|db_type|
+-----+-----------+-------+
| 17| Oracle| RDBMS|
| 19| MongoDB| NOSQL|
| 21|SingleStore| RDBMS|
| 22| Mysql| RDBMS|
| 14| Snowflake| RDBMS|
+-----+-----------+-------+
drop(column name / comma separated column names)
This function takes 1 parameter i.e. either name of the column as string or comma separated strings of column names.
df_updated = df.drop("db_type")
df_updated.show()
+-----+---------+
|db_id| db_name|
+-----+---------+
| 12| Teradata|
| 14|Snowflake|
| 15| Vertica|
| 12| Teradata|
| 22| Mysql|
+-----+---------+
df_updated = df.drop("db_type_cd")
df_updated.show()
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| 14|Snowflake|CloudDB|
| 15| Vertica| RDBMS|
| 12| Teradata| RDBMS|
| 22| Mysql| RDBMS|
+-----+---------+-------+
df_updated = df.drop("db_id","db_type")
df_updated.show()
+---------+
| db_name|
+---------+
| Teradata|
|Snowflake|
| Vertica|
| Teradata|
| Mysql|
+---------+
df_updated = df.select("db_id", "db_name")
df_updated.show()
+-----+---------+
|db_id| db_name|
+-----+---------+
| 12| Teradata|
| 14|Snowflake|
| 15| Vertica|
| 12| Teradata|
| 22| Mysql|
+-----+---------+
#same example as above but by using list
col_name_to_drop="db_type"
column_li = df.columns
if col_name_to_drop in column_li:
column_li.remove(col_name_to_drop)
df_updated = df.select(column_li)
df_updated.show()
+-----+---------+
|db_id| db_name|
+-----+---------+
| 12| Teradata|
| 14|Snowflake|
| 15| Vertica|
| 12| Teradata|
| 22| Mysql|
+-----+---------+
df_updated = df.join(df_other,df.db_id==df_other.db_id).drop(df_other.db_id)
df_updated.show()
+-----+---------+-------+---------+-------+
|db_id| db_name|db_type| db_name|db_type|
+-----+---------+-------+---------+-------+
| 14|Snowflake|CloudDB|Snowflake| RDBMS|
| 22| Mysql| RDBMS| Mysql| RDBMS|
+-----+---------+-------+---------+-------+
df_updated = df.join(df_other,df.db_id==df_other.db_id).select(df.db_id, df.db_name, df.db_type, df_other.db_type.alias("other_db_type"))
df_updated.show()
+-----+---------+-------+-------------+
|db_id| db_name|db_type|other_db_type|
+-----+---------+-------+-------------+
| 14|Snowflake|CloudDB| RDBMS|
| 22| Mysql| RDBMS| RDBMS|
+-----+---------+-------+-------------+
df = df.drop("db_type")
df.show()
+-----+---------+
|db_id| db_name|
+-----+---------+
| 12| Teradata|
| 14|Snowflake|
| 15| Vertica|
| 12| Teradata|
| 22| Mysql|
+-----+---------+