Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
kb:hive [2017/11/20 10:28] – yehuda | kb:hive [2022/01/03 16:03] (current) – external edit 127.0.0.1 | ||
---|---|---|---|
Line 1: | Line 1: | ||
====== Hive ====== | ====== Hive ====== | ||
- | ===== Migrate CSV to hive table ===== | + | https:// |
- | <code sql> | + | |
- | + | ||
- | DROP TABLE IF EXISTS schema.tbl_external_csv; | + | |
- | DROP TABLE IF EXISTS schema.tbl_internal; | + | |
+ | <code sql> | ||
-- | -- | ||
- | -- Create external csv reader | + | -- In order to support an on-going reconciliation between current records in HIVE and new change records, two tables |
+ | -- should be defined: base_table and incremental_table | ||
-- | -- | ||
- | CREATE | + | -- base_table |
- | name STRING, | + | -- The example below shows DDL for the Hive table “base_table” that will include any delimited files located in |
- | guid STRING, | + | -- HDFS under the ‘/ |
- | src_date String, | + | -- source system. After the first processing run, it will house the on-going, most up-to-date set of records from the source system: |
- | key INT) | + | -- |
- | COMMENT ' | + | CREATE TABLE base_table |
+ | id string, | ||
+ | field1 string, field2 string, field3 string, field4 string, field5 string, | ||
+ | modified_date string | ||
+ | ) | ||
ROW FORMAT DELIMITED | ROW FORMAT DELIMITED | ||
FIELDS TERMINATED BY ',' | FIELDS TERMINATED BY ',' | ||
- | STORED AS TEXTFILE | + | LOCATION '/ |
- | LOCATION '/ | + | |
- | tblproperties(" | + | |
- | -- | + | -- # |
- | -- Create tbl_internal | + | -- The DDL below shows an external Hive table “incremental_table” that will include |
- | -- | + | -- any delimited files with incremental change records, located in HDFS under the ‘/ |
- | CREATE TABLE IF NOT EXISTS schema.tbl_internal | + | -- directory: |
- | name STRING, | + | CREATE |
- | guid STRING, | + | id string, |
- | `date` TIMESTAMP, | + | field1 string, field2 string, field3 string, field4 string, field5 string, |
- | key INT | + | modified_date string |
- | ) STORED AS parquet; | + | ) |
+ | ROW FORMAT DELIMITED | ||
+ | FIELDS TERMINATED BY ',' | ||
+ | LOCATION '/ | ||
- | -- | + | -- # |
- | -- Import data form external | + | -- This view combines record sets from both the Base (base_table) and |
- | -- | + | -- Change (incremental_table) tables and is reduced only to the most recent records for each unique “id”. |
- | INSERT OVERWRITE TABLE `schema`.`tbl_internal` | + | -- It is defined as follows: |
- | SELECT | + | CREATE VIEW reconcile_view AS |
- | | + | SELECT t1.* FROM |
- | | + | ( |
- | | + | SELECT * FROM base_table |
- | `key` | + | UNION ALL |
- | FROM `schema`.`tbl_external_csv`; | + | |
+ | ) t1 | ||
+ | JOIN | ||
+ | |||
+ | ( | ||
+ | | ||
+ | | ||
+ | ( | ||
+ | SELECT * FROM base_table | ||
+ | UNION ALL | ||
+ | SELECT * FROM incremental_table | ||
+ | ) t2 | ||
+ | GROUP BY id | ||
+ | |||
+ | ) s | ||
+ | ON t1.id = s.id AND t1.modified_date = s.max_modified; | ||
</ | </ | ||
+ | |||
+ | Source: https:// |