This tutorial explains, with examples, how to use the array_union, array_intersect and array_except array functions in PySpark. Other array functions are covered in the related tutorials listed below.
from pyspark.sql import Row

# Build a one-row DataFrame with two array columns to demonstrate the
# array set-operations. No null elements yet — the null case is shown
# in a separate example further down.
df = spark.createDataFrame([Row(col1=["b", "a", "c"], col2=["c", "d", "a", "f"])])
df.show()
+---------+------------+
| col1| col2|
+---------+------------+
|[b, a, c]|[c, d, a, f]|
+---------+------------+
array_union(column1, column2): returns an array of the elements in the union of the two input arrays, with duplicates removed.
import pyspark.sql.functions as f

# Union of the two array columns; duplicate elements appear only once.
union_expr = f.array_union(f.col("col1"), f.col("col2")).alias("union_")
df_updated = df.select(union_expr)
df_updated.show()
+---------------+
| union_|
+---------------+
|[b, a, c, d, f]|
+---------------+
import pyspark.sql.functions as f

# Same data as before, but col1 now carries a null element so we can
# observe how array_union treats nulls.
rows = [Row(col1=["b", "a", "c", None], col2=["c", "d", "a", "f"])]
df = spark.createDataFrame(rows)
df.show()
+----------+------------+
| col1| col2|
+----------+------------+
|[b, a, c,]|[c, d, a, f]|
+----------+------------+
# Null elements survive the union: the null from col1 shows up in the result.
union_expr = f.array_union(f.col("col1"), f.col("col2")).alias("union_")
df_updated = df.select(union_expr)
df_updated.show()
+----------------+
| union_|
+----------------+
|[b, a, c,, d, f]|
+----------------+
array_intersect(column1, column2): returns an array of the elements common to both input arrays, with duplicates removed.
import pyspark.sql.functions as f

# Elements present in both col1 and col2, duplicates removed.
intersect_expr = f.array_intersect(f.col("col1"), f.col("col2"))
df_updated = df.select(intersect_expr.alias("array_intersect_"))
df_updated.show()
+----------------+
|array_intersect_|
+----------------+
| [a, c]|
+----------------+
array_except(column1, column2): returns an array of the elements that are in the first array but not in the second, with duplicates removed.
import pyspark.sql.functions as f

# Rebuild the null-free DataFrame first: df was last redefined with a null
# element in col1 (for the array_union null example), which would make
# array_except return [b,] instead of the [b] shown below.
df = spark.createDataFrame([Row(col1=["b", "a", "c"], col2=["c", "d", "a", "f"])])

# Elements of col1 that do not appear in col2.
df_updated = df.select(f.array_except(df.col1, df.col2).alias("array_except_"))
df_updated.show()
+-------------+
|array_except_|
+-------------+
| [b]|
+-------------+
import pyspark.sql.functions as f

# Reintroduce a null element in col1 to check how array_except treats nulls.
null_rows = [Row(col1=["b", "a", "c", None], col2=["c", "d", "a", "f"])]
df = spark.createDataFrame(null_rows)
df.show()
+----------+------------+
| col1| col2|
+----------+------------+
|[b, a, c,]|[c, d, a, f]|
+----------+------------+
# The null element of col1 is absent from col2, so it is kept in the result.
except_expr = f.array_except(f.col("col1"), f.col("col2")).alias("array_except_")
df_updated = df.select(except_expr)
df_updated.show()
+-------------+
|array_except_|
+-------------+
| [b,]|
+-------------+