Differences

This shows you the differences between two versions of the page.

--- kb:hive [2017/11/20 10:23] – created yehuda
+++ kb:hive [2022/01/03 16:03] (current) – external edit 127.0.0.1
@@ Line 1: / Line 1: @@
 ====== Hive ======
-===== Migrate CSV to hive table =====
+https://hortonworks.com/blog/four-step-strategy-incremental-updates-hive/
-<code sql>
-DROP TABLE IF EXISTS schema.tbl_external_csv;
-DROP TABLE IF EXISTS schema.tbl_internal;
+<code sql>
 --
--- Create external csv reader
+-- In order to support an on-going reconciliation between current records in HIVE and new change records, two tables
+-- should be defined: base_table and incremental_table
 --
-CREATE EXTERNAL TABLE IF NOT EXISTS schema.tbl_external_csv(
+-- base_table
-    name STRING,
+-- The example below shows DDL for the Hive table “base_table” that will include any delimited files located in
-    guid STRING,
+-- HDFS under the ‘/user/hive/base_table’ directory. This table will house the  initial, complete record load from the
-    src_date String,
+-- source system. After the first processing run, it will house the on-going, most up-to-date set of records from the source system:
-    key INT)
+--
-COMMENT 'tbl_internal external table'
+CREATE TABLE base_table (
+id string,
+field1 string, field2 string, field3 string, field4 string, field5 string,
+modified_date string
+)
 ROW FORMAT DELIMITED
 FIELDS TERMINATED BY ','
-STORED AS TEXTFILE
+LOCATION '/user/hive/base_table';
-LOCATION '/user/raw_user/raw_files'
-tblproperties("skip.header.line.count"="1");
---
+-- #incremental_table
--- Create tbl_internal
+-- The DDL below shows an external Hive table “incremental_table” that will include
---
+-- any delimited files with incremental change records, located in HDFS under the ‘/user/hive/incremental_append’
-CREATE TABLE IF NOT EXISTS schema.tbl_internal (
+-- directory:
-    name STRING,
+CREATE EXTERNAL TABLE incremental_table (
-    guid STRING,
+id string,
-    `date` TIMESTAMP,
+field1 string, field2 string, field3 string, field4 string, field5 string,
-    key INT
+modified_date string
-) STORED AS parquet
+)
-"""
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ','
+LOCATION '/user/hive/incremental_table';
+-- #reconcile_view
+-- This view combines record sets from both the Base (base_table) and
+-- Change (incremental_table) tables and is reduced only to the most recent records for each unique “id”.
+-- It is defined as follows:
+CREATE VIEW reconcile_view AS
+SELECT t1.* FROM
+(
+   SELECT * FROM base_table
+   UNION ALL
+   SELECT * FROM incremental_table
+) t1
+JOIN
-hiveContext.sql("TRUNCATE TABLE `schema`.`tbl_internal`")
+(
-hiveContext.sql("""
+   SELECT id, max(modified_date) max_modified
-    INSERT OVERWRITE TABLE `schema`.`tbl_internal`
+   FROM
-    SELECT
+   (
-        `client_name`,
+      SELECT * FROM base_table
-        `topguid`,
+      UNION ALL
-        from_unixtime(unix_timestamp(`src_date`, 'dd/MM/yyyy'),'yyyy-MM-dd') as `date`,
+      SELECT * FROM incremental_table
-        `key`
+   ) t2
-    FROM `schema`.`tbl_external_csv`
+   GROUP BY id
+) s
+ON t1.id = s.id AND t1.modified_date = s.max_modified;
 </code>
+Source: https://hortonworks.com/blog/four-step-strategy-incremental-updates-hive/

kb/hive.1511173415.txt.gz · Last modified: 2022/01/03 16:03 (external edit)

Back to top