df = spark.read.parquet("file:///path_to_files/date_format_example_file.parquet")
df.show()
+---------+----------+-------------------+
| engine| dt_field| ts_field|
+---------+----------+-------------------+
| spark|2022-11-03|2022-06-06 06:06:06|
|mapreduce|2022-10-25|2022-06-21 13:04:43|
+---------+----------+-------------------+
df.printSchema()
root
|-- engine: string (nullable = true)
|-- dt_field: date (nullable = true)
|-- ts_field: timestamp (nullable = true)
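With the schema confirmed, date_trunc(format, timestamp) can be used to zero out everything below a given unit. It always returns a timestamp; supported formats include 'year'/'yyyy'/'yy', 'quarter', 'month'/'mon'/'mm', 'week', 'day'/'dd', 'hour', 'minute', and 'second'. Truncating to the year resets the timestamp to midnight on January 1st: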
from pyspark.sql.functions import date_trunc
df_update = df.select("ts_field", date_trunc('yyyy',"ts_field").alias("trunc_year"))
df_update.show(truncate=False)
+-------------------+-------------------+
|ts_field |trunc_year |
+-------------------+-------------------+
|2022-06-06 06:06:06|2022-01-01 00:00:00|
|2022-06-21 13:04:43|2022-01-01 00:00:00|
+-------------------+-------------------+
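Truncating to the month ('MM', 'mon', or 'month') resets the timestamp to midnight on the first day of the month: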
from pyspark.sql.functions import date_trunc
df_update = df.select("ts_field", date_trunc('MM',"ts_field").alias("trunc_month"))
df_update.show(truncate=False)
+-------------------+-------------------+
|ts_field |trunc_month |
+-------------------+-------------------+
|2022-06-06 06:06:06|2022-06-01 00:00:00|
|2022-06-21 13:04:43|2022-06-01 00:00:00|
+-------------------+-------------------+
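Truncating to the day keeps the date but zeroes out the time of day; 'day' and 'DD' are interchangeable formats: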
from pyspark.sql.functions import date_trunc
df_update = df.select("ts_field", date_trunc('DD',"ts_field").alias("trunc_day"))
df_update = df.select("ts_field", date_trunc('Day',"ts_field").alias("trunc_day"))
df_update.show(truncate=False)
+-------------------+-------------------+
|ts_field |trunc_day |
+-------------------+-------------------+
|2022-06-06 06:06:06|2022-06-06 00:00:00|
|2022-06-21 13:04:43|2022-06-21 00:00:00|
+-------------------+-------------------+
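Truncating to the quarter rolls the timestamp back to the first day of the current quarter, so the June dates above map to April 1st: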
from pyspark.sql.functions import date_trunc
df_update = df.select("ts_field", date_trunc('quarter',"ts_field").alias("trunc_quarter"))
df_update.show(truncate=False)
+-------------------+-------------------+
|ts_field |trunc_quarter |
+-------------------+-------------------+
|2022-06-06 06:06:06|2022-04-01 00:00:00|
|2022-06-21 13:04:43|2022-04-01 00:00:00|
+-------------------+-------------------+
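Truncating to the hour zeroes out the minutes and seconds: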
from pyspark.sql.functions import date_trunc
df_update = df.select("ts_field", date_trunc('hour',"ts_field").alias("trunc_hour"))
df_update.show(truncate=False)
+-------------------+-------------------+
|ts_field |trunc_hour |
+-------------------+-------------------+
|2022-06-06 06:06:06|2022-06-06 06:00:00|
|2022-06-21 13:04:43|2022-06-21 13:00:00|
+-------------------+-------------------+
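Truncating to the minute zeroes out only the seconds: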
from pyspark.sql.functions import date_trunc
df_update = df.select("ts_field", date_trunc('minute',"ts_field").alias("trunc_minute"))
df_update.show(truncate=False)
+-------------------+-------------------+
|ts_field |trunc_minute |
+-------------------+-------------------+
|2022-06-06 06:06:06|2022-06-06 06:06:00|
|2022-06-21 13:04:43|2022-06-21 13:04:00|
+-------------------+-------------------+
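Truncating to the second drops the fractional part of a timestamp. Since ts_field has no sub-second component, current_timestamp() is used here so the effect is visible: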
from pyspark.sql.functions import date_trunc, current_timestamp
df_update = df.select(current_timestamp(), date_trunc('second', current_timestamp()).alias("trunc_second"))
df_update.show(truncate=False)
+-----------------------+-------------------+
|current_timestamp() |trunc_second |
+-----------------------+-------------------+
|2022-06-14 22:04:46.562|2022-06-14 22:04:46|
|2022-06-14 22:04:46.562|2022-06-14 22:04:46|
+-----------------------+-------------------+
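Finally, truncating to the week returns midnight on the Monday of the current week: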
from pyspark.sql.functions import date_trunc, current_date, current_timestamp
df_update = df.select(current_date(), date_trunc('week', current_timestamp()).alias("trunc_week"))
df_update.show(truncate=False)
+--------------+-------------------+
|current_date()|trunc_week         |
+--------------+-------------------+
|2022-06-14 |2022-06-13 00:00:00|
|2022-06-14 |2022-06-13 00:00:00|
+--------------+-------------------+
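date_trunc also accepts a date column such as dt_field (the column is cast to timestamp first), but the result is still a timestamp. When a date is wanted back, the related trunc() function returns a date directly; note its arguments come in the opposite order, trunc(date, format). A minimal sketch against the same df as above (the values in the comments assume the first row of the sample data):
from pyspark.sql.functions import date_trunc, trunc
df_dates = df.select(
    "dt_field",
    date_trunc('month', "dt_field").alias("month_start_ts"),  # timestamp: 2022-11-01 00:00:00
    trunc("dt_field", 'month').alias("month_start_date")      # date: 2022-11-01
)
df_dates.show(truncate=False)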