This tutorials will explain how to incrementally load data in Hive tables. Incremental load in Hive can be achieved using transient table and partition overwrite.
***************** Main table ****************
CREATE TABLE orders_incr
(
order_id INT,
cust_id INT,
order_status STRING
)
PARTITIONED BY (order_date DATE)
Row format delimited
fields terminated by ','
Stored as textfile;
***************** Transient table ****************
CREATE TABLE orders_temp
(
order_id INT,
cust_id INT,
order_status STRING
)
PARTITIONED BY (order_date DATE)
Row format delimited
fields terminated by ','
Stored as textfile;
***************** order_incr_1.txt **************
68784,10349,COMPLETE,2014-01-14
68785,10778,PENDING,2014-01-16
68786,1847,COMPLETE,2014-01-19
68787,3219,PROCESSING,2014-01-20
68788,11334,COMPLETE,2014-01-23
68798,2595,COMPLETE,2014-01-13
68799,11190,COMPLETE,2014-01-14
68800,10037,PROCESSING,2014-01-16
68801,2079,PENDING_PAYMENT,2014-01-18
68802,10670,COMPLETE,2014-01-19
68803,9397,CLOSED,2014-01-20
*************************************************
***************** order_incr_2.txt **************
68785,10778,COMPLETE,2014-01-16
68787,3219,PROCESSING,2014-01-20
68788,11334,CLOSED,2014-01-23
68800,10037,COMPLETE,2014-01-16
68801,2079,PROCESSING,2014-01-18
*************************************************
LOAD DATA local inpath '/path_to_file/data_files/order_incr_1.txt' OVERWRITE INTO TABLE orders_temp;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE orders_incr PARTITION(order_date) SELECT order_id,cust_id,order_status,order_date from orders_temp;
LOAD DATA local inpath '/path_to_file/data_files/order_incr_2.txt' OVERWRITE INTO TABLE orders_temp;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE orders_incr PARTITION(order_date)
SELECT COALESCE(orders_temp.order_id, orders_incr.order_id),
COALESCE(orders_temp.cust_id, orders_incr.cust_id),
COALESCE(orders_temp.order_status, orders_incr.order_status),
COALESCE(orders_temp.order_date, orders_incr.order_date)
FROM orders_temp FULL OUTER JOIN orders_incr ON (orders_incr.order_id = orders_temp.order_id
and orders_incr.order_date = orders_temp.order_date
)
;
LOAD DATA local inpath '/path_to_file/data_files/order_incr_2.txt' OVERWRITE INTO TABLE orders_temp;
INSERT TABLE orders_temp
SELECT orders_incr.order_id, orders_incr.cust_id, orders_incr.order_status, orders_incr.order_date
FROM orders_incr
LEFT OUTER JOIN orders_temp ON (orders_incr.order_id = orders_temp.order_id )
WHERE orders_incr.order_date IN (SELECT distinct order_date FROM orders_temp)
AND orders_temp.order_id IS NULL;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE orders_incr PARTITION(order_date)
SELECT order_id,cust_id,order_status,order_date
FROM orders_temp;
FAILED: SemanticException [Error 10096]: Dynamic partition strict mode requires at least one static partition column. To turn this off set hive.exec.dynamic.partition.mode=nonstrict