This tutorial explains, with examples, how to use the arrays_overlap and arrays_zip array functions in PySpark. First, create a sample DataFrame with two array columns:
from pyspark.sql import Row
df = spark.createDataFrame([Row(col1=["b", "a", "c", None], col2=["c", "d", "a", "f"])])
df.show()
+----------+------------+
|      col1|        col2|
+----------+------------+
|[b, a, c,]|[c, d, a, f]|
+----------+------------+
arrays_overlap(array_column1, array_column2)
arrays_overlap returns true if the two arrays share at least one non-null element. If there is no common element, the result is null when both arrays are non-empty and either contains a null, and false otherwise. The three examples below cover each case. Here col1 and col2 share "a" and "c", so the result is true:
import pyspark.sql.functions as f
df_updated = df.select(df.col1, df.col2, f.arrays_overlap(df.col1, df.col2).alias("is_overlap"))
df_updated.show()
+----------+------------+----------+
|      col1|        col2|is_overlap|
+----------+------------+----------+
|[b, a, c,]|[c, d, a, f]|      true|
+----------+------------+----------+
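Since arrays_overlap produces a boolean column, it can also be used directly in a filter. A minimal sketch, reusing the DataFrame above (rows where the result is false or null are dropped):
import pyspark.sql.functions as f
# Keep only rows whose array columns share at least one non-null element.
df_overlapping = df.filter(f.arrays_overlap(df.col1, df.col2))
df_overlapping.show()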
In the next example, no element is shared and col1 contains a null, so arrays_overlap returns null:
import pyspark.sql.functions as f
from pyspark.sql import Row
df = spark.createDataFrame([Row(col1=["b", "g", "y", None], col2=["c", "d", "a", "f"])])
df.show()
+----------+------------+
| col1| col2|
+----------+------------+
|[b, g, y,]|[c, d, a, f]|
+----------+------------+
df_updated = df.select(df.col1, df.col2, f.arrays_overlap(df.col1, df.col2).alias("is_overlap"))
df_updated.show()
+----------+------------+----------+
| col1| col2|is_overlap|
+----------+------------+----------+
|[b, g, y,]|[c, d, a, f]| null|
+----------+------------+----------+
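If a plain boolean is needed downstream, one option is to coalesce the null result to false. A sketch, assuming "unknown" should be treated as "no overlap":
import pyspark.sql.functions as f
# arrays_overlap returns null here, so coalesce it to false.
df_updated = df.select(df.col1, df.col2, f.coalesce(f.arrays_overlap(df.col1, df.col2), f.lit(False)).alias("is_overlap"))
df_updated.show()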
Finally, when no element is shared and neither array contains a null, the result is false:
import pyspark.sql.functions as f
from pyspark.sql import Row
df = spark.createDataFrame([Row(col1=["b", "g", "y"], col2=["c", "d", "a", "f"])])
df.show()
+---------+------------+
| col1| col2|
+---------+------------+
|[b, g, y]|[c, d, a, f]|
+---------+------------+
df_updated = df.select(df.col1, df.col2, f.arrays_overlap(df.col1, df.col2).alias("is_overlap"))
df_updated.show()
+---------+------------+----------+
| col1| col2|is_overlap|
+---------+------------+----------+
|[b, g, y]|[c, d, a, f]| false|
+---------+------------+----------+
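The same check can also be written as a SQL expression; a minimal sketch using expr:
import pyspark.sql.functions as f
# Equivalent to the column-based call above.
df_updated = df.select("col1", "col2", f.expr("arrays_overlap(col1, col2)").alias("is_overlap"))
df_updated.show()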
arrays_zip(*columns)
arrays_zip merges the n-th elements of the input array columns into a single array of structs. If one array contains a null at some position, the corresponding struct field is null:
import pyspark.sql.functions as f
from pyspark.sql import Row
df = spark.createDataFrame([Row(col1=["b", "g", "y", None], col2=["c", "d", "a", "f"])])
df_updated = df.select(df.col1, df.col2, f.arrays_zip(df.col1, df.col2).alias("zipped_array"))
df_updated.show(truncate=False)
+----------+------------+-------------------------------+
|col1 |col2 |zipped_array |
+----------+------------+-------------------------------+
|[b, g, y,]|[c, d, a, f]|[[b, c], [g, d], [y, a], [, f]]|
+----------+------------+-------------------------------+
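The zipped column is an array of structs. In recent Spark versions the struct fields take their names from the input columns (col1 and col2 here), which can be confirmed with:
df_updated.printSchema()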
If the arrays have different lengths, the shorter one is padded with null, which again appears as the empty slot in [, f]:
import pyspark.sql.functions as f
from pyspark.sql import Row
df = spark.createDataFrame([Row(col1=["b", "g", "y"], col2=["c", "d", "a", "f"])])
df.show()
+---------+------------+
| col1| col2|
+---------+------------+
|[b, g, y]|[c, d, a, f]|
+---------+------------+
df_updated = df.select(df.col1, df.col2, f.arrays_zip(df.col1, df.col2).alias("zipped_array"))
df_updated.show(truncate=False)
+---------+------------+-------------------------------+
|col1 |col2 |zipped_array |
+---------+------------+-------------------------------+
|[b, g, y]|[c, d, a, f]|[[b, c], [g, d], [y, a], [, f]]|
+---------+------------+-------------------------------+
arrays_zip is not limited to two inputs; any number of array columns can be zipped:
import pyspark.sql.functions as f
from pyspark.sql import Row
df = spark.createDataFrame([Row(col1=["b", "g", "y"], col2=["c", "d", "a", "f"], col3=["p", "e", "k", "t"])])
df.show()
+---------+------------+------------+
| col1| col2| col3|
+---------+------------+------------+
|[b, g, y]|[c, d, a, f]|[p, e, k, t]|
+---------+------------+------------+
df_updated = df.select(df.col1, df.col2, df.col3, f.arrays_zip(df.col1, df.col2, df.col3).alias("zipped_array"))
df_updated.show(truncate=False)
+---------+------------+------------+-------------------------------------------+
|col1 |col2 |col3 |zipped_array |
+---------+------------+------------+-------------------------------------------+
|[b, g, y]|[c, d, a, f]|[p, e, k, t]|[[b, c, p], [g, d, e], [y, a, k], [, f, t]]|
+---------+------------+------------+-------------------------------------------+
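A common follow-up, not shown in the examples above, is to explode the zipped array into one row per position and read the struct fields back out. This sketch assumes the struct fields are named after the source columns, as noted earlier:
import pyspark.sql.functions as f
# One row per zipped position; struct fields keep the source column names.
df_exploded = df_updated.select(f.explode("zipped_array").alias("zipped"))
df_exploded.select("zipped.col1", "zipped.col2", "zipped.col3").show()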