# Load the sample parquet file into a DataFrame and preview its rows.
source_path = "file:///path_to_files/date_format_example_file.parquet"
df = spark.read.parquet(source_path)
df.show()
+---------+----------+-------------------+
| engine| dt_field| ts_field|
+---------+----------+-------------------+
| spark|2022-11-03|2022-06-06 06:06:06|
|mapreduce|2022-10-25|2022-06-21 13:04:43|
+---------+----------+-------------------+
df.printSchema()
root
|-- engine: string (nullable = true)
|-- dt_field: date (nullable = true)
|-- ts_field: timestamp (nullable = true)
# last_day(): last calendar day of the month, for a stored date and for today.
from pyspark.sql.functions import current_date, last_day

today = current_date()
# NOTE(review): both derived columns share the alias "last_day"; selecting
# df_update["last_day"] later would be ambiguous. Kept as-is to match the
# output table below — consider distinct aliases in real code.
df_update = df.select(
    "dt_field",
    last_day("dt_field").alias("last_day"),
    today,
    last_day(today).alias("last_day"),
)
df_update.show(truncate=False)
+----------+----------+--------------+----------+
|dt_field |last_day |current_date()|last_day |
+----------+----------+--------------+----------+
|2022-11-03|2022-11-30|2022-06-14 |2022-06-30|
|2022-10-25|2022-10-31|2022-06-14 |2022-06-30|
+----------+----------+--------------+----------+
# next_day(): first date after the input column that falls on the given weekday.
from pyspark.sql.functions import current_date, next_day

today = current_date()
# NOTE(review): both derived columns share the alias "next_day", which is
# ambiguous if referenced by name later; kept to match the output below.
df_update = df.select(
    "dt_field",
    next_day("dt_field", "wed").alias("next_day"),
    today,
    next_day(today, "Fri").alias("next_day"),
)
df_update.show(truncate=False)
+----------+----------+--------------+----------+
|dt_field |next_day |current_date()|next_day |
+----------+----------+--------------+----------+
|2022-11-03|2022-11-09|2022-06-14 |2022-06-17|
|2022-10-25|2022-10-26|2022-06-14 |2022-06-17|
+----------+----------+--------------+----------+
# date_add(): shift a date by a number of days (negative values go backwards).
from pyspark.sql.functions import current_date, date_add

today = current_date()
# NOTE(review): both derived columns share the alias "date_add" — ambiguous
# if selected by name later; kept to match the output below.
df_update = df.select(
    "dt_field",
    date_add("dt_field", 2).alias("date_add"),   # two days ahead
    today,
    date_add(today, -4).alias("date_add"),       # four days back
)
df_update.show(truncate=False)
+----------+----------+--------------+----------+
|dt_field |date_add |current_date()|date_add |
+----------+----------+--------------+----------+
|2022-11-03|2022-11-05|2022-06-14 |2022-06-10|
|2022-10-25|2022-10-27|2022-06-14 |2022-06-10|
+----------+----------+--------------+----------+
# add_months(): shift a date by whole months (negative values go backwards).
from pyspark.sql.functions import current_date, add_months

today = current_date()
# NOTE(review): both derived columns share the alias "add_months" — ambiguous
# if selected by name later; kept to match the output below.
df_update = df.select(
    "dt_field",
    add_months("dt_field", 2).alias("add_months"),   # two months ahead
    today,
    add_months(today, -4).alias("add_months"),       # four months back
)
df_update.show(truncate=False)
+----------+----------+--------------+----------+
|dt_field |add_months|current_date()|add_months|
+----------+----------+--------------+----------+
|2022-11-03|2023-01-03|2022-06-14 |2022-02-14|
|2022-10-25|2022-12-25|2022-06-14 |2022-02-14|
+----------+----------+--------------+----------+
# datediff(): whole days from the second argument up to the first (end - start).
from pyspark.sql.functions import current_date, datediff

days_until = datediff("dt_field", current_date()).alias("datediff")
df_update = df.select("dt_field", current_date(), days_until)
df_update.show(truncate=False)
+----------+--------------+--------+
|dt_field |current_date()|datediff|
+----------+--------------+--------+
|2022-11-03|2022-06-14 |142 |
|2022-10-25|2022-06-14 |133 |
+----------+--------------+--------+
# datediff() also accepts timestamps; the result is still a whole-day count
# (negative when ts_field is before the current timestamp, per the output below).
from pyspark.sql.functions import current_timestamp, datediff

now = current_timestamp()
day_gap = datediff("ts_field", now).alias("datediff")
df_update = df.select("ts_field", now, day_gap)
df_update.show(truncate=False)
+-------------------+----------------------+--------+
|ts_field |current_timestamp() |datediff|
+-------------------+----------------------+--------+
|2022-06-06 06:06:06|2022-06-14 21:04:21.34|-8 |
|2022-06-21 13:04:43|2022-06-14 21:04:21.34|7 |
+-------------------+----------------------+--------+
# months_between(): fractional months from the second date to the first;
# by default the result is rounded (compare with the unrounded example below).
from pyspark.sql.functions import current_date, months_between

month_gap = months_between("dt_field", current_date()).alias("months_between")
df_update = df.select("dt_field", current_date(), month_gap)
df_update.show(truncate=False)
+----------+--------------+--------------+
|dt_field |current_date()|months_between|
+----------+--------------+--------------+
|2022-11-03|2022-06-14 |4.64516129 |
|2022-10-25|2022-06-14 |4.35483871 |
+----------+--------------+--------------+
# months_between() with the third argument (roundOff) set to False returns
# the unrounded fractional month difference.
from pyspark.sql.functions import current_date, months_between

month_gap = months_between("dt_field", current_date(), False).alias("months_between")
df_update = df.select("dt_field", current_date(), month_gap)
df_update.show(truncate=False)
+----------+--------------+-----------------+
|dt_field |current_date()|months_between |
+----------+--------------+-----------------+
|2022-11-03|2022-06-14 |4.645161290322581|
|2022-10-25|2022-06-14 |4.354838709677419|
+----------+--------------+-----------------+
# months_between() also accepts timestamps; the time-of-day contributes to the
# fractional part (negative when ts_field precedes the current timestamp).
from pyspark.sql.functions import current_timestamp, months_between

now = current_timestamp()
month_gap = months_between("ts_field", now).alias("months_between")
df_update = df.select("ts_field", now, month_gap)
df_update.show(truncate=False)
+-------------------+----------------------+--------------+
|ts_field |current_timestamp() |months_between|
+-------------------+----------------------+--------------+
|2022-06-06 06:06:06|2022-06-14 21:10:44.71|-0.2783296 |
|2022-06-21 13:04:43|2022-06-14 21:10:44.71|0.21491898 |
+-------------------+----------------------+--------------+