This tutorial explains, with examples, how to use the array_position, array_contains, and array_remove array functions in PySpark.
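The examples below assume an active SparkSession named spark, as in the PySpark shell. If you are running a standalone script, a minimal sketch to create one (the app name is arbitrary):

from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession; "array-functions-demo" is just an illustrative name
spark = SparkSession.builder.appName("array-functions-demo").getOrCreate()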
First, create a sample DataFrame with a single array column. Note that the second row's array contains a None (null) element.
df = spark.createDataFrame([(["d", "a", "b", "a", "c"],), (["f", "d", "a", None],)], ['data'])
df.show()
+---------------+
| data|
+---------------+
|[d, a, b, a, c]|
| [f, d, a,]|
+---------------+
In the output above, show() renders the null element in the second array as a trailing comma: [f, d, a,].

array_position(column, value)
Returns the 1-based position of the first occurrence of value in the array, or 0 if the value is not found. The result is null only when the array column or the search value itself is null.
Find the position of "a", which appears in both arrays:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_position(df.data, "a").alias("position_check"))
df_updated.show()
+---------------+--------------+
| data|position_check|
+---------------+--------------+
|[d, a, b, a, c]| 2|
| [f, d, a,]| 3|
+---------------+--------------+
"c" appears only in the first array, so the second row returns 0:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_position(df.data, "c").alias("position_check"))
df_updated.show()
+---------------+--------------+
| data|position_check|
+---------------+--------------+
|[d, a, b, a, c]| 5|
| [f, d, a,]| 0|
+---------------+--------------+
"Z" does not appear in either array, so both rows return 0:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_position(df.data, "Z").alias("position_check"))
df_updated.show()
+---------------+--------------+
| data|position_check|
+---------------+--------------+
|[d, a, b, a, c]| 0|
| [f, d, a,]| 0|
+---------------+--------------+
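Because array_position returns 0 when the value is absent, it can double as a membership filter: keep only rows where the position is greater than zero. A small sketch using the df defined above:

import pyspark.sql.functions as f

# Keep only rows whose array contains "c" (a positive position means the value was found)
df_found = df.where(f.array_position(df.data, "c") > 0)
df_found.show()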
array_contains(column, value)
Returns true if the array contains the value and false if it does not. If the value is not found but the array contains a null element, the result is null rather than false.
Check whether each array contains "a", which is present in both rows:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_contains(df.data, "a").alias("contains_check"))
df_updated.show()
+---------------+--------------+
| data|contains_check|
+---------------+--------------+
|[d, a, b, a, c]| true|
| [f, d, a,]| true|
+---------------+--------------+
"c" is missing from the second array, but that array contains a null, so the result is null instead of false:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_contains(df.data, "c").alias("contains_check"))
df_updated.show()
+---------------+--------------+
| data|contains_check|
+---------------+--------------+
|[d, a, b, a, c]| true|
| [f, d, a,]| null|
+---------------+--------------+
"Z" is in neither array; the first row returns false, while the second again returns null because of its null element:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_contains(df.data, "Z").alias("contains_check"))
df_updated.show()
+---------------+--------------+
| data|contains_check|
+---------------+--------------+
|[d, a, b, a, c]| false|
| [f, d, a,]| null|
+---------------+--------------+
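array_contains is the idiomatic way to filter rows by array membership. In a filter, a null result behaves like false, so rows whose arrays contain a null but not the value are dropped. A short sketch:

import pyspark.sql.functions as f

# Keep rows whose array contains "a"; rows where the check yields null are filtered out
df_with_a = df.filter(f.array_contains(df.data, "a"))
df_with_a.show()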
array_remove(column, value)
Returns the array with all occurrences of value removed. Null elements are never removed, because elements are matched against value with an equality check.
Remove "a"; both occurrences are removed from the first array:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_remove(df.data, "a").alias("removed_data"))
df_updated.show()
+---------------+------------+
| data|removed_data|
+---------------+------------+
|[d, a, b, a, c]| [d, b, c]|
| [f, d, a,]| [f, d,]|
+---------------+------------+
Removing "c" only changes the first array; the null in the second array is untouched:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_remove(df.data, "c").alias("removed_data"))
df_updated.show()
+---------------+------------+
| data|removed_data|
+---------------+------------+
|[d, a, b, a, c]|[d, a, b, a]|
| [f, d, a,]| [f, d, a,]|
+---------------+------------+
Removing a value that is not present leaves both arrays unchanged:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_remove(df.data, "Z").alias("removed_data"))
df_updated.show()
+---------------+---------------+
| data| removed_data|
+---------------+---------------+
|[d, a, b, a, c]|[d, a, b, a, c]|
| [f, d, a,]| [f, d, a,]|
+---------------+---------------+
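Since array_remove matches elements by equality, it cannot strip the null out of the second array. On Spark 3.1+, one way to drop nulls (a sketch using the filter higher-order function from pyspark.sql.functions) is:

import pyspark.sql.functions as f

# Remove null elements by keeping only the non-null ones (requires Spark 3.1+)
df_no_nulls = df.select(df.data, f.filter(df.data, lambda x: x.isNotNull()).alias("no_nulls"))
df_no_nulls.show()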