This tutorial explains (with examples) how to convert strings into date/timestamp data types using the to_date / to_timestamp functions in PySpark.
MetaCharacters |
Description / Functionality |
yyyy |
Convert year in four digits(example: 1987) |
yy |
Convert year in two digits(example: 87) |
MM |
Convert month of the year in number format(example: 12) |
MMM |
Convert month in 3 characters format(example: Jun) |
MMMM |
Convert full month name (example: June) |
dd |
Convert day of the month in two digits(example: 30) |
d |
Convert day of the month in one digit, without a leading zero (example: 5) |
DDD |
Convert day of the year(example: 276) |
HH |
Convert hour of the time in two digits i.e. 24 hour format(example: 17) |
hh |
Convert hour of the time in 12 hour format(example: 11) |
a |
Convert string timestamp with AM/PM format |
mm |
Convert minutes of the time in two digits(example: 59) |
ss |
Convert seconds of the time in two digits(example: 58) |
SSS |
Convert milliseconds of the time in three digits(example: 545) |
SSSSSS |
Convert microseconds of the time in six digits(example: 545333) |
SSSSSSSSS |
Convert nanoseconds of the time (example: 545333444) |
VV |
Convert timestamp string with timezone Id (e.g. Asia/Calcutta or +05:30) into timestamp |
z |
Convert timestamp string with timezone abbreviation(e.g. IST or +05:30) into timestamp |
zzzz |
Convert timestamp string with a timezone name (e.g. India Standard Time) into timestamp |
# Load the sample CSV (first row is a header) that the examples below operate on.
source_path = "file:///path_to_files/string_to_timestamp_example_file.csv"
df = spark.read.option("header", True).csv(source_path)
df.show()
+-----+-----------+
|db_id| db_name|
+-----+-----------+
| 56|SingleStore|
| 14| Snowflake|
+-----+-----------+
from pyspark.sql.functions import to_date, lit
Syntax: to_date(date_string, format_string)
# Parse "1-Oct-22": single-digit day (d), 3-letter month (MMM), 2-digit year (yy).
abbrev_month_date = to_date(lit("1-Oct-22"), "d-MMM-yy")
df.select("db_id", abbrev_month_date).show()
+-----+-------------------------------+
|db_id|to_date('1-Oct-22', 'd-MMM-yy')|
+-----+-------------------------------+
| 56| 2022-10-01|
| 14| 2022-10-01|
+-----+-------------------------------+
# Parse "1-12-2022": single-digit day (d), numeric month (MM), 4-digit year (yyyy).
numeric_month_date = to_date(lit("1-12-2022"), "d-MM-yyyy")
df.select("db_id", numeric_month_date).show()
+-----+---------------------------------+
|db_id|to_date('1-12-2022', 'd-MM-yyyy')|
+-----+---------------------------------+
| 56| 2022-12-01|
| 14| 2022-12-01|
+-----+---------------------------------+
# Parse "12-Jun-22": 2-digit day (dd), 3-letter month (MMM), 2-digit year (yy).
two_digit_day_date = to_date(lit("12-Jun-22"), "dd-MMM-yy")
df.select("db_id", two_digit_day_date).show()
+-----+---------------------------------+
|db_id|to_date('12-Jun-22', 'dd-MMM-yy')|
+-----+---------------------------------+
| 56| 2022-06-12|
| 14| 2022-06-12|
+-----+---------------------------------+
# Parse "12-Jun-2022": 2-digit day (dd), 3-letter month (MMM), 4-digit year (yyyy).
full_year_date = to_date(lit("12-Jun-2022"), "dd-MMM-yyyy")
df.select("db_id", full_year_date).show()
+-----+-------------------------------------+
|db_id|to_date('12-Jun-2022', 'dd-MMM-yyyy')|
+-----+-------------------------------------+
| 56| 2022-06-12|
| 14| 2022-06-12|
+-----+-------------------------------------+
# Parse "12-June-22": the full month name is matched with MMMM.
full_month_date = to_date(lit("12-June-22"), "dd-MMMM-yy")
df.select("db_id", full_month_date).show()
+-----+-----------------------------------+
|db_id|to_date('12-June-22', 'dd-MMMM-yy')|
+-----+-----------------------------------+
| 56| 2022-06-12|
| 14| 2022-06-12|
+-----+-----------------------------------+
# Parse "151-2022": DDD is the day of the year, so day 151 of 2022 resolves to 2022-05-31.
day_of_year_date = to_date(lit("151-2022"), "DDD-yyyy")
df.select("db_id", day_of_year_date).show()
+-----+-------------------------------+
|db_id|to_date('151-2022', 'DDD-yyyy')|
+-----+-------------------------------+
| 56| 2022-05-31|
| 14| 2022-05-31|
+-----+-------------------------------+
# Parse date strings held in a DataFrame column (rather than a literal):
# build two (label, month-first date string) rows, then convert column _2.
var_date = [
    ("icecream", "11/03/2022"),
    ("cake", "10/25/2022"),
]
df_2 = spark.createDataFrame(sc.parallelize(var_date))
df_2.select("_1", to_date("_2", "MM/dd/yyyy")).show()
+--------+---------------------------+
|_1 |to_date(`_2`, 'MM/dd/yyyy')|
+--------+---------------------------+
|icecream|2022-11-03 |
|cake |2022-10-25 |
+--------+---------------------------+
from pyspark.sql.functions import to_timestamp, lit
Syntax: to_timestamp(timestamp_string, format)
# Parse an ISO-style date plus a 24-hour clock time (HH) into a TimestampType column.
basic_ts = to_timestamp(lit("2022-03-15 10:22:22"), "yyyy-MM-dd HH:mm:ss")
df.select("db_id", basic_ts).show(truncate=False)
+-----+----------------------------------------------------------+
|db_id|to_timestamp('2022-03-15 10:22:22', 'yyyy-MM-dd HH:mm:ss')|
+-----+----------------------------------------------------------+
|56 |2022-03-15 10:22:22 |
|14 |2022-03-15 10:22:22 |
+-----+----------------------------------------------------------+
# Same as above, with a 3-digit milliseconds component matched by .SSS.
millis_ts = to_timestamp(lit("2022-03-15 10:22:22.545"), "yyyy-MM-dd HH:mm:ss.SSS")
df.select("db_id", millis_ts).show(truncate=False)
+-----+------------------------------------------------------------------+
|db_id|to_timestamp('2022-03-15 10:22:22.545', 'yyyy-MM-dd HH:mm:ss.SSS')|
+-----+------------------------------------------------------------------+
|56 |2022-03-15 10:22:22.545 |
|14 |2022-03-15 10:22:22.545 |
+-----+------------------------------------------------------------------+
# Parse a full month name (MMMM) followed by "dd,yyyy" and a 24-hour time.
month_name_ts = to_timestamp(lit("October 03,2022 12:05:47"), "MMMM dd,yyyy HH:mm:ss")
df.select("db_id", month_name_ts).show(truncate=False)
+-----+-----------------------------------------------------------------+
|db_id|to_timestamp('October 03,2022 12:05:47', 'MMMM dd,yyyy HH:mm:ss')|
+-----+-----------------------------------------------------------------+
|56 |2022-10-03 12:05:47 |
|14 |2022-10-03 12:05:47 |
+-----+-----------------------------------------------------------------+
# Parse a 12-hour clock time with an AM/PM marker: hh is the 1-12 hour and "a"
# matches the AM/PM token, so "10:12:11 PM" becomes 22:12:11 in the result.
# (A stray duplicate call with the inconsistent input "01-10-19" was removed;
# the call below is the one whose output table follows.)
df.select("db_id", to_timestamp( lit("01-10-22 10:12:11 PM"), "dd-MM-yy hh:mm:ss a") ).show(truncate=False)
+-----+-----------------------------------------------------------+
|db_id|to_timestamp('01-10-22 10:12:11 PM', 'dd-MM-yy hh:mm:ss a')|
+-----+-----------------------------------------------------------+
|56 |2022-10-01 22:12:11 |
|14 |2022-10-01 22:12:11 |
+-----+-----------------------------------------------------------+
# Parse a timestamp carrying a timezone abbreviation (IST) matched by the z pattern.
tz_abbrev_ts = to_timestamp(lit("June 03,2022 18:05:47 IST"), "MMMM dd,yyyy HH:mm:ss z")
df.select("db_id", tz_abbrev_ts).show(truncate=False)
+-----+--------------------------------------------------------------------+
|db_id|to_timestamp('June 03,2022 18:05:47 IST', 'MMMM dd,yyyy HH:mm:ss z')|
+-----+--------------------------------------------------------------------+
|56 |2022-06-03 18:05:47 |
|14 |2022-06-03 18:05:47 |
+-----+--------------------------------------------------------------------+
# Parse a timestamp carrying a "+05:30" zone offset matched by the zone-id pattern VV;
# the displayed value is shifted to the session's local timezone.
offset_ts = to_timestamp(lit("June 03,2022 18:05:47 +05:30"), "MMMM dd,yyyy HH:mm:ss VV")
df.select("db_id", offset_ts).show(truncate=False)
+-----+------------------------------------------------------------------------+
|db_id|to_timestamp('June 03,2022 18:05:47 +05:30', 'MMMM dd,yyyy HH:mm:ss VV')|
+-----+------------------------------------------------------------------------+
|56 |2022-06-03 05:35:47 |
|14 |2022-06-03 05:35:47 |
+-----+------------------------------------------------------------------------+
# Parse a timestamp carrying a region-based zone id (Asia/Calcutta) matched by VV;
# the displayed value is shifted to the session's local timezone.
region_ts = to_timestamp(lit("June 03,2022 18:05:47 Asia/Calcutta"), "MMMM dd,yyyy HH:mm:ss VV")
df.select("db_id", region_ts).show(truncate=False)
+-----+-------------------------------------------------------------------------------+
|db_id|to_timestamp('June 03,2022 18:05:47 Asia/Calcutta', 'MMMM dd,yyyy HH:mm:ss VV')|
+-----+-------------------------------------------------------------------------------+
|56 |2022-06-03 05:35:47 |
|14 |2022-06-03 05:35:47 |
+-----+-------------------------------------------------------------------------------+
# Parse a timestamp carrying a full timezone name ("India Standard Time") matched by zzzz.
tz_name_ts = to_timestamp(lit("May 03,2022 18:05:47 India Standard Time"), "MMMM dd,yyyy HH:mm:ss zzzz")
df.select("db_id", tz_name_ts).show(truncate=False)
+-----+--------------------------------------------------------------------------------------+
|db_id|to_timestamp('May 03,2022 18:05:47 India Standard Time', 'MMMM dd,yyyy HH:mm:ss zzzz')|
+-----+--------------------------------------------------------------------------------------+
|56 |2022-05-03 05:35:47 |
|14 |2022-05-03 05:35:47 |
+-----+--------------------------------------------------------------------------------------+
# Parse timestamp strings held in a DataFrame column (rather than a literal):
# build two (label, "MMMM dd,yyyy HH:mm:ss"-style string) rows, then convert column _2.
var_tm_stamp = [
    ("icecream", "May 03,2022 18:05:47"),
    ("cake", "June 05,2022 18:05:47"),
]
df_1 = spark.createDataFrame(sc.parallelize(var_tm_stamp))
df_1.select("_1", to_timestamp("_2", "MMMM dd,yyyy HH:mm:ss")).show(truncate=False)
+--------+-------------------------------------------+
|_1 |to_timestamp(`_2`, 'MMMM dd,yyyy HH:mm:ss')|
+--------+-------------------------------------------+
|icecream|2022-05-03 18:05:47 |
|cake |2022-06-05 18:05:47 |
+--------+-------------------------------------------+