This tutorials provides most of the information related to tables in Hive. There are 2 type of tables in Hive.
→ Internal Table: Internal Tables stores data inside HDFS hive/warehouse with tablename as directory. Internal tables are also called managed tables.
→ External Table: External Tables stores data in the user defined HDFS directory.
|
Internal/Managed Table |
External Table |
Table Creation |
User don't have to specify internal keyword in create table statement. Example: 'CREATE TABLE tablename'. |
User has to specifically specify external keyword in create table statement. example: 'CREATE EXTERNAL TABLE tablename'. |
Table Data Location |
Data will stored in system defined HDFS hive/warehouse directory. |
User has to specify the HDFS data directory in create table statement. |
DROP Table |
Drop table command will remove all the data from HDFS as well and data will lost. |
Drop table command will not remove any data(no data loss) from HDFS and it will just remove table defintion. |
CREATE TABLE hiveFirstTable
(
order_id INT,
order_date STRING,
cust_id INT,
order_status STRING
)
ROW FORMAT delimited
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS textfile;
CREATE EXTERNAL TABLE hiveFirstExternalTable
(
order_id INT,
order_date STRING,
cust_id INT,
order_status STRING
)
ROW FORMAT delimited
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS textfile
LOCATION '/user/username/somedirectory';
Attribute |
Description |
ROW FORMAT |
This is used to specify the format of data file that will be loaded into HIVE. Possible types: DELIMITED, SERDE |
TBLPROPERTIES |
This attribute can be used to specify multiple properties for tables as key-value pair. Example: Ignoring 'n' number of lines of header or footer from file to be loaded. |
STORED AS |
This is used to specify the format of data file that will be loaded/stored into HIVE. Default is plain text file, other types includes: ORC, PARQUET, RCFILE, AVRO, JSONFILE, SEQUENCEFILE. |
LINES TERMINATED BY |
This is used to specify the character which can be used to differentiate multiple rows. Default is newline character. |
PARTITION BY |
This is used to partition the table data into multiple files based on partition column defined. |
CLUSTERED BY |
This is used to classify data into specified number of bucket(again multiple files) using hash function. |
LOCATION |
This property is specifically for external table to specify the directory location where data file loaded into the external table will be stored. |
ROW FORMAT delimited FIELDS TERMINATED BY '\t'
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
TBLPROPERTIES("comment"="Adding table information as comment")
TBLPROPERTIES ("skip.header.line.count"="1");
TBLPROPERTIES("skip.footer.line.count"="1")
TBLPROPERTIES("transactional"="true")
TBLPROPERTIES("transactional"="false")
TBLPROPERTIES("orc.compress"="ZLIB")
TBLPROPERTIES("orc.compress"="SNAPPY")
TBLPROPERTIES("orc.create.index"="true")
CREATE TABLE hiveFirstTableSkipHeader
(
order_id INT,
order_date STRING,
cust_id INT,
order_status STRING
)
ROW FORMAT delimited
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS textfile
TBLPROPERTIES("skip.header.line.count"="1");
CREATE TABLE hiveFirstTableSkipFooter
(
order_id INT,
order_date STRING,
cust_id INT,
order_status STRING
)
ROW FORMAT delimited
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS textfile
TBLPROPERTIES("skip.footer.line.count"="1");
CREATE TABLE hiveFirstTableSkipHeaderFooter
(
order_id INT,
order_date STRING,
cust_id INT,
order_status STRING
)
ROW FORMAT delimited
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS textfile
tblproperties("skip.header.line.count"="1", "skip.footer.line.count"="1");
CREATE TABLE hiveFirstTransTable
(
order_id INT,
order_date STRING,
cust_id INT,
order_status STRING
)
ROW FORMAT delimited
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS ORC
TBLPROPERTIES("transactional"="true");
Note: Below properties need to be set otherwise create table statement will fail
SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
SET hive.support.concurrency=true;
Create table hiveFirstPartitionedClusteredTable
(
order_id INT,
order_date STRING,
cust_id INT
)
PARTITIONED BY (order_status STRING)
CLUSTERED BY (order_id) INTO 10 buckets
ROW format delimited
FIELDS TERMINATED BY ','
STORED AS textfile;
CREATE TABLE hiveFirstTransTable
(
order_id INT,
order_date STRING,
cust_id INT,
order_status STRING
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
CREATE TABLE hiveFirstTable_copy LIKE hiveFirstTable;
CREATE TABLE hiveFirstTable_copy_with_date AS SELECT * FROM hiveFirstTable;