This tutorial explains, with examples, how to use the array_distinct, array_min, array_max and array_repeat array functions in PySpark. Other array functions can be viewed by clicking the function names in the list below.
# Sample DataFrame with a single array<string> column named "data".
# The second row deliberately contains a null element.
sample_rows = [
    (["d", "a", "b", "a", "c"],),
    (["f", "d", "a", None],),
]
df = spark.createDataFrame(sample_rows, ['data'])
df.show()
+---------------+
| data|
+---------------+
|[d, a, b, a, c]|
| [f, d, a,]|
+---------------+
Syntax: array_distinct(column) — returns the array with duplicate elements removed.
import pyspark.sql.functions as f

# Remove duplicate elements from each array in the "data" column.
distinct_data = f.array_distinct(df.data).alias("distinct_data")
df_updated = df.select(df.data, distinct_data)
df_updated.show()
+---------------+-------------+
| data|distinct_data|
+---------------+-------------+
|[d, a, b, a, c]| [d, a, b, c]|
| [f, d, a,]| [f, d, a,]|
+---------------+-------------+
Syntax: array_min(column) — returns the minimum value in the array (null elements are skipped, as the second row below shows).
import pyspark.sql.functions as f

# Compute the smallest element of each array in the "data" column.
minimum_data = f.array_min(df.data).alias("minimum_data")
df_updated = df.select(df.data, minimum_data)
df_updated.show()
+---------------+------------+
| data|minimum_data|
+---------------+------------+
|[d, a, b, a, c]| a|
| [f, d, a,]| a|
+---------------+------------+
Syntax: array_max(column) — returns the maximum value in the array (null elements are skipped, as the second row below shows).
import pyspark.sql.functions as f

# Compute the largest element of each array in the "data" column.
maximum_data = f.array_max(df.data).alias("maximum_data")
df_updated = df.select(df.data, maximum_data)
df_updated.show()
+---------------+------------+
| data|maximum_data|
+---------------+------------+
|[d, a, b, a, c]| d|
| [f, d, a,]| f|
+---------------+------------+
Syntax: array_repeat(column, count) — returns an array containing the column value repeated count times.
import pyspark.sql.functions as f

# Repeat each "data" array 3 times, producing an array of arrays.
repeat_data = f.array_repeat(df.data, 3).alias("repeat_data")
df_updated = df.select(df.data, repeat_data)
# truncate=False so the nested arrays are displayed in full.
df_updated.show(truncate=False)
+---------------+---------------------------------------------------+
|data |repeat_data |
+---------------+---------------------------------------------------+
|[d, a, b, a, c]|[[d, a, b, a, c], [d, a, b, a, c], [d, a, b, a, c]]|
|[f, d, a,] |[[f, d, a,], [f, d, a,], [f, d, a,]] |
+---------------+---------------------------------------------------+