This tutorial explains how to read data from various types of databases (such as MySQL, SingleStore, and Teradata) into a Spark DataFrame using a JDBC connection.
Example 1: Using options()
# Collect all JDBC connection settings in one dict and pass them to
# DataFrameReader.options() in a single call via ** unpacking.
mysql_opts = {
    "driver": "com.mysql.cj.jdbc.Driver",
    "user": "tutorial_user",
    "password": "user_password",
    "url": "jdbc:mysql://Mysql.dbmstutorials.com:3306?serverTimezone=UTC&useSSL=false",
    "dbtable": "tutorial_db.department",  # fully qualified: db.table
}
df = spark.read.format("jdbc").options(**mysql_opts).load()
df.show()
+-------+--------------------+-----------+
|dept_no| department_name| loc_name|
+-------+--------------------+-----------+
| 100| ACCOUNTS| JAIPUR|
| 200| R & D| NEW DELHI|
| 300| SALES| BENGALURU|
| 400|INFORMATION TECHN...|BHUBANESWAR|
+-------+--------------------+-----------+
Example 2: Using option()
# Same read as Example 1, but supplying each setting with a separate
# .option() call; parentheses let the chain span lines without backslashes.
df = (
    spark.read.format("jdbc")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("user", "tutorial_user")
    .option("password", "user_password")
    .option("url", "jdbc:mysql://Mysql.dbmstutorials.com:3306?serverTimezone=UTC&useSSL=false")
    .option("dbtable", "tutorial_db.department")
    .load()
)
df.show()
+-------+--------------------+-----------+
|dept_no| department_name| loc_name|
+-------+--------------------+-----------+
| 100| ACCOUNTS| JAIPUR|
| 200| R & D| NEW DELHI|
| 300| SALES| BENGALURU|
| 400|INFORMATION TECHN...|BHUBANESWAR|
+-------+--------------------+-----------+
# Push a SQL query down to MySQL instead of reading a whole table:
# use the "query" option in place of "dbtable" (the two are mutually
# exclusive in the JDBC source).
query_opts = {
    "driver": "com.mysql.cj.jdbc.Driver",
    "user": "tutorial_user",
    "password": "user_password",
    "url": "jdbc:mysql://Mysql.dbmstutorials.com:3306?serverTimezone=UTC&useSSL=false",
    "query": "SELECT * FROM tutorial_db.department WHERE dept_no IN (100,200)",
}
df = spark.read.format("jdbc").options(**query_opts).load()
df.show()
+-------+---------------+---------+
|dept_no|department_name| loc_name|
+-------+---------------+---------+
| 100| ACCOUNTS| JAIPUR|
| 200| R & D|NEW DELHI|
+-------+---------------+---------+
# Parallel read from SingleStore: Spark issues `numPartitions` JDBC queries,
# each covering a stride of `partitionColumn`.
# NOTE: lowerBound/upperBound do NOT filter rows — they only set the
# partition stride. They must span the actual data range (dept_no is
# 100-400 here); with bounds like 10/30 every row falls outside the
# strides and lands in a single edge partition, defeating the parallelism.
df=spark.read.format("jdbc").options(driver="org.mariadb.jdbc.Driver",
user="tutorial_user",
password="user_password",
url= "jdbc:mysql://singlestore.dbmstutorials.com:3306/tutorial_db",
dbtable="department",
partitionColumn="dept_no",   # must be a numeric/date/timestamp column
lowerBound=100,
upperBound=400,
numPartitions=5
).load()
df.show()
+-------+--------------------+-----------+
|dept_no| department_name| loc_name|
+-------+--------------------+-----------+
| 100| ACCOUNTS| JAIPUR|
| 200| R & D| NEW DELHI|
| 300| SALES| BENGALURU|
| 400|INFORMATION TECHN...|BHUBANESWAR|
+-------+--------------------+-----------+
# Push a filtering query down to SingleStore via the "query" option
# (used instead of "dbtable"); settings gathered in a dict and unpacked.
singlestore_opts = {
    "driver": "org.mariadb.jdbc.Driver",
    "user": "tutorial_user",
    "password": "user_password",
    "url": "jdbc:mysql://singlestore.dbmstutorials.com:3306/tutorial_db",
    "query": "SELECT * FROM tutorial_db.department WHERE dept_no=300",
}
df = spark.read.format("jdbc").options(**singlestore_opts).load()
df.show()
+-------+---------------+---------+
|dept_no|department_name| loc_name|
+-------+---------------+---------+
| 300| SALES|BENGALURU|
+-------+---------------+---------+
# Full-table read from Teradata over JDBC; the chain is wrapped in
# parentheses so no line-continuation backslashes are needed.
teradata_df = (
    spark.read.format("jdbc")
    .option("driver", "com.teradata.jdbc.TeraDriver")
    .option("url", "jdbc:teradata://Teradata.dbmstutorials.com/COLUMN_NAME=ON")
    .option("user", "tutorial_user")
    .option("password", "user_password")
    .option("dbtable", "tutorial_db.department")
    .load()
)
teradata_df.show()
+-------+--------------------+-----------+
|dept_no| department_name| loc_name|
+-------+--------------------+-----------+
| 200| R & D| NEW DELHI|
| 100| ACCOUNTS| JAIPUR|
| 400|INFORMATION TECHN...|BHUBANESWAR|
| 300| SALES| BENGALURU|
+-------+--------------------+-----------+
# Query pushdown to Teradata: "query" replaces "dbtable" so only the
# matching rows travel over the wire.
teradata_df = (
    spark.read.format("jdbc")
    .option("driver", "com.teradata.jdbc.TeraDriver")
    .option("url", "jdbc:teradata://Teradata.dbmstutorials.com/COLUMN_NAME=ON")
    .option("user", "tutorial_user")
    .option("password", "user_password")
    .option("query", "SELECT * FROM tutorial_db.department WHERE dept_no=400")
    .load()
)
teradata_df.show()
+-------+--------------------+-----------+
|dept_no| department_name| loc_name|
+-------+--------------------+-----------+
| 400|INFORMATION TECHN...|BHUBANESWAR|
+-------+--------------------+-----------+
# Read a Hive table; spark.table() is the documented shorthand for
# spark.read.table() and returns the same DataFrame.
hive_df = spark.table("retail.orders")
hive_df.show(5)  # preview only the first 5 rows
+--------+--------------------+-----------------+------------+
|order_id| order_date|order_customer_id|order_status|
+--------+--------------------+-----------------+------------+
| 68817|2014-03-27 00:00:...| 6704| COMPLETE|
| 68818|2014-03-31 00:00:...| 12393| PROCESSING|
| 68819|2014-04-03 00:00:...| 1212| COMPLETE|
| 68820|2014-04-04 00:00:...| 6358| COMPLETE|
| 68821|2014-04-05 00:00:...| 2564| COMPLETE|
+--------+--------------------+-----------------+------------+
# Run SQL against the Hive metastore; naming the statement keeps the
# spark.sql() call short and the query easy to reuse.
orders_query = "SELECT * FROM retail.orders WHERE order_id IN (68817, 68819)"
hive_df = spark.sql(orders_query)
hive_df.show()
+--------+--------------------+-----------------+------------+
|order_id| order_date|order_customer_id|order_status|
+--------+--------------------+-----------------+------------+
| 68817|2014-03-27 00:00:...| 6704| COMPLETE|
| 68819|2014-04-03 00:00:...| 1212| COMPLETE|
+--------+--------------------+-----------------+------------+