This tutorial explains how to list all the columns of a DataFrame, how to view their data types, how to print or inspect the schema, and how to define a custom schema for reading files. Each topic is explained with an example, starting from a sample CSV file read with the header option:
df = spark.read.csv("file:///path_to_files/csv_file_with_duplicates.csv", header=True)
df.show()
+-----+---------+-------+
|db_id| db_name|db_type|
+-----+---------+-------+
| 12| Teradata| RDBMS|
| 14|Snowflake|CloudDB|
| 15| Vertica| RDBMS|
| 12| Teradata| RDBMS|
| 22| Mysql| RDBMS|
+-----+---------+-------+
df.printSchema()
Output:
root
|-- db_id: string (nullable = true)
|-- db_name: string (nullable = true)
|-- db_type: string (nullable = true)
df.columns
Output: ['db_id', 'db_name', 'db_type']
column_name_list = df.columns # getting column list
column_name_list.remove('db_type') # removing unwanted column from list
df.select(column_name_list).show() # displaying data for the remaining columns
+-----+---------+
|db_id| db_name|
+-----+---------+
| 12| Teradata|
| 14|Snowflake|
| 15| Vertica|
| 12| Teradata|
| 22| Mysql|
+-----+---------+
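If only a single column needs to be excluded, the same result can be obtained without building a column list by dropping it directly. A minimal sketch (the output matches the table above):
df.drop('db_type').show() # drop() returns a new DataFrame without the given column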
df.dtypes
Output:
[('db_id', 'string'), ('db_name', 'string'), ('db_type', 'string')]
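Because dtypes returns (column name, type) pairs, it is handy for selecting columns of a particular type. A minimal sketch that keeps only the string columns (in this sample every column was read as string, so all three are kept):
string_columns = [name for name, dtype in df.dtypes if dtype == 'string'] # filter column names by type
df.select(string_columns).show()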
df.schema
Output:
StructType(List(StructField(db_id,StringType,true), StructField(db_name,StringType,true),StructField(db_type,StringType,true)))
df.schema.fields
Output:
[StructField(db_id,StringType,true), StructField(db_name,StringType,true), StructField(db_type,StringType,true)]
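A single field can also be looked up by name on the schema object, which is useful for checking one column's data type or nullability. A small sketch:
field = df.schema['db_name'] # StructType supports lookup by field name
print(field.dataType) # StringType (printed as StringType() in newer Spark versions)
print(field.nullable) # True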
df.schema.names
Output: ['db_id', 'db_name', 'db_type']
df.schema.fieldNames()
Output:
['db_id', 'db_name', 'db_type']
df.schema.jsonValue()
Output:
{'type': 'struct', 'fields': [{'name': 'db_id', 'type': 'string', 'nullable': True, 'metadata': {}}, {'name': 'db_name', 'type': 'string', 'nullable': True, 'metadata': {}}, {'name': 'db_type', 'type': 'string', 'nullable': True, 'metadata': {}}]}
df.schema.json()
Output:
'{"fields":[{"metadata":{},"name":"db_id","nullable":true,"type":"string"},{"metadata":{},"name":"db_name","nullable":true,"type":"string"},{"metadata":{},"name":"db_type","nullable":true,"type":"string"}],"type":"struct"}'
Fields can be appended to a StructType object with its add() method, whose signature is:
StructType_variable.add(field, data_type=None, nullable=True, metadata=None)
from pyspark.sql.types import StructType # import StructType to define a custom schema
schema_def = StructType() # Created a StructType object
schema_def.add("db_id","integer",True) # Adding column 1 to StructType
schema_def.add("db_name","string",True) # Adding column 2 to StructType
schema_def.add("db_type_cd","string",True) # Adding column 3 to StructType
print(schema_def)
Output:
StructType(List(StructField(db_id,IntegerType,true), StructField(db_name,StringType,true),StructField(db_type_cd,StringType,true)))
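Since add() returns the StructType it was called on, the same schema can also be built by chaining the calls, or equivalently by passing StructField objects to the constructor. A sketch of both variants:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
schema_def = StructType().add("db_id", "integer", True).add("db_name", "string", True).add("db_type_cd", "string", True) # chained add() calls
schema_def = StructType([StructField("db_id", IntegerType(), True),
                         StructField("db_name", StringType(), True),
                         StructField("db_type_cd", StringType(), True)]) # same schema from a list of StructField objects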
from pyspark.sql.types import StructType # imported StructType
schema_def = StructType() # Created a StructType object
schema_def.add("db_id","integer",True) # Adding column 1 to StructType
schema_def.add("db_name","string",True) # Adding column 2 to StructType
schema_def.add("db_type_cd","string",True) # Adding column 3 to StructType
df_with_schema = spark.read.csv("file:///path_to_files/csv_file_with_duplicates.csv", schema=schema_def, header=True)
df_with_schema.printSchema()
Output:
root
|-- db_id: integer (nullable = true)
|-- db_name: string (nullable = true)
|-- db_type_cd: string (nullable = true)
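In recent Spark versions the schema argument also accepts a DDL-formatted string, so the same file can be read without building a StructType first. A minimal sketch (the printed schema matches the one above):
ddl_schema = "db_id INT, db_name STRING, db_type_cd STRING" # DDL-formatted schema string
df_with_schema = spark.read.csv("file:///path_to_files/csv_file_with_duplicates.csv", schema=ddl_schema, header=True)
df_with_schema.printSchema()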