Differences
This shows you the differences between two versions of the page.
| Next revision | Previous revision | ||
| kb:hive [2017/11/20 10:23] – created yehuda | kb:hive [2022/01/03 16:03] (current) – external edit 127.0.0.1 | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| ====== Hive ====== | ====== Hive ====== | ||
| - | ===== Migrate CSV to hive table ===== | + | https:// |
| - | <code sql> | + | |
| - | + | ||
| - | DROP TABLE IF EXISTS schema.tbl_external_csv; | + | |
| - | DROP TABLE IF EXISTS schema.tbl_internal; | + | |
| + | <code sql> | ||
| -- | -- | ||
| - | -- Create external csv reader | + | -- In order to support an on-going reconciliation between current records in HIVE and new change records, two tables |
| + | -- should be defined: base_table and incremental_table | ||
| -- | -- | ||
| - | CREATE | + | -- base_table |
| - | name STRING, | + | -- The example below shows DDL for the Hive table “base_table” that will include any delimited files located in |
| - | guid STRING, | + | -- HDFS under the ‘/ |
| - | src_date String, | + | -- source system. After the first processing run, it will house the on-going, most up-to-date set of records from the source system: |
| - | key INT) | + | -- |
| - | COMMENT ' | + | CREATE TABLE base_table |
| + | id string, | ||
| + | field1 string, field2 string, field3 string, field4 string, field5 string, | ||
| + | modified_date string | ||
| + | ) | ||
| ROW FORMAT DELIMITED | ROW FORMAT DELIMITED | ||
| FIELDS TERMINATED BY ',' | FIELDS TERMINATED BY ',' | ||
| - | STORED AS TEXTFILE | + | LOCATION '/ |
| - | LOCATION '/ | + | |
| - | tblproperties(" | + | |
| - | -- | + | -- # |
| - | -- Create tbl_internal | + | -- The DDL below shows an external Hive table “incremental_table” that will include |
| - | -- | + | -- any delimited files with incremental change records, located in HDFS under the ‘/ |
| - | CREATE TABLE IF NOT EXISTS schema.tbl_internal | + | -- directory: |
| - | name STRING, | + | CREATE |
| - | guid STRING, | + | id string, |
| - | `date` TIMESTAMP, | + | field1 string, field2 string, field3 string, field4 string, field5 string, |
| - | key INT | + | modified_date string |
| - | ) STORED AS parquet | + | ) |
| - | """ | + | ROW FORMAT DELIMITED |
| + | FIELDS TERMINATED BY ',' | ||
| + | LOCATION '/ | ||
| + | -- # | ||
| + | -- This view combines record sets from both the Base (base_table) and | ||
| + | -- Change (incremental_table) tables and is reduced only to the most recent records for each unique “id”. | ||
| + | -- It is defined as follows: | ||
| + | CREATE VIEW reconcile_view AS | ||
| + | SELECT t1.* FROM | ||
| + | ( | ||
| + | | ||
| + | UNION ALL | ||
| + | | ||
| + | ) t1 | ||
| + | JOIN | ||
| - | hiveContext.sql(" | + | ( |
| - | hiveContext.sql(""" | + | |
| - | | + | FROM |
| - | | + | ( |
| - | | + | SELECT |
| - | | + | UNION ALL |
| - | | + | |
| - | | + | |
| - | FROM `schema`.`tbl_external_csv` | + | GROUP BY id |
| + | ) s | ||
| + | ON t1.id = s.id AND t1.modified_date = s.max_modified; | ||
| </ | </ | ||
| + | |||
| + | Source: https:// | ||