This tutorial will explain (with examples) how to format date and timestamp datatypes using the date_format function in PySpark.
MetaCharacters |
Description / Functionality |
yyyy |
Return formatted year in four digits(example: 1987) |
yy |
Return formatted year in two digits(example: 87) |
MM |
Return formatted month of the year in number format(example: 12) |
MMM |
Return formatted month in 3 characters format(example: Jun) |
MMMM |
Return formatted full month name(example: June) |
dd |
Return formatted day of the month in two digits(example: 30) |
d |
Return formatted day of the month in a single digit without leading zero(example: 5) |
DDD |
Return formatted day of the year(example: 276) |
EEE |
Return formatted day of the week in 3 characters(example: Wed) |
EEEE |
Return formatted full name of week day(example: Wednesday) |
HH |
Return formatted hour of the time in two digits i.e. 24 hour format(example: 17) |
hh |
Return formatted hour of the time in 12 hour format(example: 11) |
a |
Return formatted timestamp string with AM/PM format |
mm |
Return formatted minutes of the time in two digits(example: 59) |
ss |
Return formatted seconds of the time in two digits(example: 58) |
SSS |
Return formatted milliseconds of the time in three digits(example: 545) |
SSSSSS |
Return formatted microseconds of the time in six digits(example: 545333) |
SSSSSSSSS |
Return formatted nanoseconds of the time (example: 545333444) |
VV |
Return formatted timestamp string with timezone Id (e.g. Asia/Calcutta) |
z |
Return formatted timestamp string with timezone abbreviation(e.g. IST) |
zzzz |
Return formatted timestamp string with timezone name(e.g. India Standard Time) |
O |
Return formatted timestamp offset from GMT (e.g. GMT-7) |
Z |
Return formatted timestamp offset from GMT (e.g. -0700 ) |
x |
Return formatted timestamp offset from GMT (e.g. -07 ) |
df = spark.read.parquet("file:///path_to_files/date_format_example_file.parquet")
df.show()
+---------+----------+-------------------+
| engine| dt_field| ts_field|
+---------+----------+-------------------+
| spark|2022-11-03|2022-06-06 06:06:06|
|mapreduce|2022-10-25|2022-06-21 13:04:43|
+---------+----------+-------------------+
df.printSchema()
root
|-- engine: string (nullable = true)
|-- dt_field: date (nullable = true)
|-- ts_field: timestamp (nullable = true)
from pyspark.sql.functions import date_format
date_format(timestamp/date column, format_string)
df.select("engine", date_format("dt_field", "dd/MMM/yy")).show()
+---------+--------------------------------+
| engine|date_format(dt_field, dd/MMM/yy)|
+---------+--------------------------------+
| spark| 03/Nov/22|
|mapreduce| 25/Oct/22|
+---------+--------------------------------+
df.select("engine", date_format("dt_field", "EEE dd/MM/yy")).show()
+---------+-----------------------------------+
| engine|date_format(dt_field, EEE dd/MM/yy)|
+---------+-----------------------------------+
| spark| Thu 03/11/22|
|mapreduce| Tue 25/10/22|
+---------+-----------------------------------+
df.select("engine", date_format("dt_field", "EEEE dd/MM/yy")).show()
+---------+--------------------------------------+
| engine|date_format(dt_field, EEEE dd/MM/yyyy)|
+---------+--------------------------------------+
| spark| Thursday 03/11/2022|
|mapreduce| Tuesday 25/10/2022|
+---------+--------------------------------------+
df.select("engine", date_format("dt_field", "EEEE MMMM dd,yyyy")).show(truncate=False)
+---------+----------------------------------------+
|engine |date_format(dt_field, EEEE MMMM dd,yyyy)|
+---------+----------------------------------------+
|spark |Thursday November 03,2022 |
|mapreduce|Tuesday October 25,2022 |
+---------+----------------------------------------+
df.select("engine", date_format("ts_field", "dd-MM-yyyy hh:mm:ss a")).show(truncate=False)
+---------+--------------------------------------------+
|engine |date_format(ts_field, dd-MM-yyyy hh:mm:ss a)|
+---------+--------------------------------------------+
|spark |06-06-2022 06:06:06 AM |
|mapreduce|21-06-2022 01:04:43 PM |
+---------+--------------------------------------------+
df.select("engine", date_format("ts_field", "EEEE MMMM dd, yyyy hh:mm:ss a")).show(truncate=False)
+---------+----------------------------------------------------+
|engine |date_format(ts_field, EEEE MMMM dd, yyyy hh:mm:ss a)|
+---------+----------------------------------------------------+
|spark |Monday June 06, 2022 06:06:06 AM |
|mapreduce|Tuesday June 21, 2022 01:04:43 PM |
+---------+----------------------------------------------------+
df.select("engine", date_format("ts_field", "MM/dd/yyyy HH:mm:ss.SSSSSS")).show(truncate=False)
+---------+-------------------------------------------------+
|engine |date_format(ts_field, MM/dd/yyyy HH:mm:ss.SSSSSS)|
+---------+-------------------------------------------------+
|spark |06/06/2022 06:06:06.000000 |
|mapreduce|06/21/2022 13:04:43.000000 |
+---------+-------------------------------------------------+
df.select("engine", date_format("ts_field", "MM/dd/yyyy HH:mm:ss z")).show(truncate=False)
+---------+--------------------------------------------+
|engine |date_format(ts_field, MM/dd/yyyy HH:mm:ss z)|
+---------+--------------------------------------------+
|spark |06/06/2022 06:06:06 PDT |
|mapreduce|06/21/2022 13:04:43 PDT |
+---------+--------------------------------------------+
df.select("engine", date_format("ts_field", "MM/dd/yyyy HH:mm:ss zzzz")).show(truncate=False)
+---------+-----------------------------------------------+
|engine |date_format(ts_field, MM/dd/yyyy HH:mm:ss zzzz)|
+---------+-----------------------------------------------+
|spark |06/06/2022 06:06:06 Pacific Daylight Time |
|mapreduce|06/21/2022 13:04:43 Pacific Daylight Time |
+---------+-----------------------------------------------+
df.select("engine", date_format("ts_field", "MM-dd-yyyy HH:mm:ss VV")).show(truncate=False)
+---------+---------------------------------------------+
|engine |date_format(ts_field, MM-dd-yyyy HH:mm:ss VV)|
+---------+---------------------------------------------+
|spark |06-06-2022 06:06:06 America/Los_Angeles |
|mapreduce|06-21-2022 13:04:43 America/Los_Angeles |
+---------+---------------------------------------------+
df.select("engine", date_format("ts_field", "MM-dd-yyyy HH:mm:ss O")).show(truncate=False)
+---------+--------------------------------------------+
|engine |date_format(ts_field, MM-dd-yyyy HH:mm:ss O)|
+---------+--------------------------------------------+
|spark |06-06-2022 06:06:06 GMT-7 |
|mapreduce|06-21-2022 13:04:43 GMT-7 |
+---------+--------------------------------------------+
df.select("engine", date_format("ts_field", "MM-dd-yyyy HH:mm:ss Z")).show(truncate=False)
+---------+--------------------------------------------+
|engine |date_format(ts_field, MM-dd-yyyy HH:mm:ss Z)|
+---------+--------------------------------------------+
|spark |06-06-2022 06:06:06 -0700 |
|mapreduce|06-21-2022 13:04:43 -0700 |
+---------+--------------------------------------------+
df.select("engine", date_format("ts_field", "MM-dd-yyyy HH:mm:ss x")).show(truncate=False)
+---------+--------------------------------------------+
|engine |date_format(ts_field, MM-dd-yyyy HH:mm:ss x)|
+---------+--------------------------------------------+
|spark |06-06-2022 06:06:06 -07 |
|mapreduce|06-21-2022 13:04:43 -07 |
+---------+--------------------------------------------+