SERDE is a combination of Serializer and Deserializer i.e SERIALIZER + DESERIALIZER = SERDE. SERDE is popularly used to load from sources storing data in JSON format.
CREATE TABLE order_json
(
order_id INT,
order_date STRING,
cust_id STRING,
order_status STRING
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
LOAD DATA local inpath '/Users/username/hive/data_files/order_json.txt' OVERWRITE INTO TABLE order_json;
- --Sample data in JSON file
{"order_id":68784,"order_date":"2014-01-14 00:00:00.0","order_status":"COMPLETE"}
{"order_id":68785,"order_date":"2014-01-16 00:00:00.0","cust_id":"10778","order_status":"PENDING"}
{"order_id":68786,"order_date":"2014-01-19 00:00:00.0","cust_id":"1847","order_status":"COMPLETE"}
{"order_id":68787,"order_date":"2014-01-20 00:00:00.0","cust_id":"3219","order_status":"PROCESSING"}
Points to be noted:
Failed with exception java.io.IOException:org.apache.hadoop.hive.serde2.SerDeException: org.codehaus.jackson.JsonParseException: Current token (VALUE_STRING) not numeric, can not use numeric value accessors
{"order_id":68784,"order_date":"2014-01-14 00:00:00.0","cust_id":"10349","order_status":"COMPLETE"}
{"order_id":68784,"order_date":"2014-01-14 00:00:00.0","cust_id":10349,"order_status":"COMPLETE"}
CREATE TABLE order_json_number
(
order_id INT,
order_date STRING,
cust_id STRING,
order_status STRING
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
LOAD DATA local inpath '/Users/username/hive/data_files/order_json_number.txt' OVERWRITE INTO TABLE order_json_number;
--Sample data in JSON file
{"order_id":68834,"order_date":"2014-05-01 00:00:00.0","cust_id":6938,"order_status":"COMPLETE"}
{"order_id":68835,"order_date":"2014-05-02 00:00:00.0","cust_id":764,"order_status":"COMPLETE"}
{"order_id":68836,"order_date":"2014-05-03 00:00:00.0","cust_id":8009,"order_status":"PENDING_PAYMENT"}
{"order_id":68837,"order_date":"2014-05-07 00:00:00.0","cust_id":1223,"order_status":"COMPLETE"}