This tutorial explains, with examples, how to use the array_sort and array_join array functions in PySpark.
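The examples below assume an active SparkSession is available as spark (as in the pyspark shell or a notebook). If one is not already available, it can be created as follows; the application name is arbitrary:
from pyspark.sql import SparkSession
# Create (or reuse) a SparkSession; the variable name `spark` matches the examples below.
spark = SparkSession.builder.appName("array-functions-demo").getOrCreate()
First, create a sample DataFrame with an array column; the second row contains a null element: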
df = spark.createDataFrame([(["d", "a", "b", "c"],), (["f", "a", None],)], ['data'])
df.show()
+------------+
|        data|
+------------+
|[d, a, b, c]|
|     [f, a,]|
+------------+
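It can help to confirm the column type before applying array functions; printSchema should show an array of strings whose elements may be null:
df.printSchema()
root
 |-- data: array (nullable = true)
 |    |-- element: string (containsNull = true)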
array_sort(column)
array_sort sorts the elements of an array column in ascending order, placing null elements at the end of the returned array.
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_sort(df.data).alias("sorted"))
df_updated.show()
+------------+------------+
|        data|      sorted|
+------------+------------+
|[d, a, b, c]|[a, b, c, d]|
|     [f, a,]|     [a, f,]|
+------------+------------+
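array_sort has no descending option in this form; if a descending sort is needed, the separate sort_array function can be used with asc=False. A minimal sketch (note that sort_array places null elements at the end when sorting descending):
import pyspark.sql.functions as f
# sort_array with asc=False sorts in descending order; null elements end up last.
df_desc = df.select(df.data, f.sort_array(df.data, asc=False).alias("sorted_desc"))
df_desc.show()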
array_join(column, delimiter, null_replacement=None)
array_join concatenates the elements of an array column into a single string, separated by delimiter; null elements are replaced with null_replacement if it is given, otherwise they are skipped.
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_join(df.data, ",").alias("joined"))
df_updated.show()
+------------+-------+
|        data| joined|
+------------+-------+
|[d, a, b, c]|d,a,b,c|
|     [f, a,]|    f,a|
+------------+-------+
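The same result can also be written as a SQL expression, which is convenient when queries are assembled from strings; a minimal sketch using selectExpr (the alias joined is arbitrary):
df_updated = df.selectExpr("data", "array_join(data, ',') AS joined")
df_updated.show()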
Any string can be used as the delimiter, for example a pipe character:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_join(df.data, "|").alias("joined"))
df_updated.show()
+------------+-------+
|        data| joined|
+------------+-------+
|[d, a, b, c]|d|a|b|c|
|     [f, a,]|    f|a|
+------------+-------+
To substitute null elements with a placeholder instead of skipping them, pass the third argument (null_replacement):
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_join(df.data, "_", "*").alias("joined"))
df_updated.show()
+------------+-------+
|        data| joined|
+------------+-------+
|[d, a, b, c]|d_a_b_c|
|     [f, a,]|  f_a_*|
+------------+-------+
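Null elements can also be removed entirely before joining. One way to do this (assuming Spark 3.1 or later, where the higher-order filter function is exposed as pyspark.sql.functions.filter) is sketched below:
import pyspark.sql.functions as f
# Drop null elements first, then join the remaining values with "_".
df_updated = df.select(
    df.data,
    f.array_join(f.filter(df.data, lambda x: x.isNotNull()), "_").alias("joined")
)
df_updated.show()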
array_sort and array_join can be chained to sort the array and join the result in a single select:
import pyspark.sql.functions as f
df_updated = df.select(df.data, f.array_join(f.array_sort(df.data), "_").alias("ascending_and_joined"))
df_updated.show()
+------------+--------------------+
|        data|ascending_and_joined|
+------------+--------------------+
|[d, a, b, c]|             a_b_c_d|
|     [f, a,]|                 a_f|
+------------+--------------------+
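The same chaining works with a descending sort via sort_array; a minimal sketch (the alias descending_and_joined is arbitrary):
import pyspark.sql.functions as f
# Sort descending first, then join; null elements are skipped by array_join
# because no null_replacement is supplied.
df_updated = df.select(
    df.data,
    f.array_join(f.sort_array(df.data, asc=False), "_").alias("descending_and_joined")
)
df_updated.show()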