Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
kb:hive [2018/11/13 12:30] yehudakb:hive [2022/01/03 16:03] (current) – external edit 127.0.0.1
Line 4: Line 4:
  
 <code sql> <code sql>
--- In order to support an on-going reconciliation between current records in HIVE and new change records, two tables should be defined: base_table and incremental_table+-- 
 +-- In order to support an on-going reconciliation between current records in HIVE and new change records, two tables  
 +-- should be defined: base_table and incremental_table 
 +--
 -- base_table -- base_table
--- The example below shows DDL for the Hive table “base_table” that will include any delimited files located in HDFS under the ‘/user/hive/base_table’ directory. This table will house the  initial, complete record load from the source system. After the first processing run, it will house the on-going, most up-to-date set of records from the source system: +-- The example below shows DDL for the Hive table “base_table” that will include any delimited files located in  
- +-- HDFS under the ‘/user/hive/base_table’ directory. This table will house the  initial, complete record load from the  
- +-- source system. After the first processing run, it will house the on-going, most up-to-date set of records from the source system: 
 +--
 CREATE TABLE base_table ( CREATE TABLE base_table (
- 
 id string, id string,
- +field1 string, field2 string, field3 string, field4 string, field5 string,
-field1 string, +
- +
-field2 string, +
- +
-field3 string, +
- +
-field4 string, +
- +
-field5 string, +
 modified_date string modified_date string
- 
 ) )
- 
 ROW FORMAT DELIMITED ROW FORMAT DELIMITED
- 
 FIELDS TERMINATED BY ',' FIELDS TERMINATED BY ','
- 
 LOCATION '/user/hive/base_table'; LOCATION '/user/hive/base_table';
  
 -- #incremental_table -- #incremental_table
--- The DDL below shows an external Hive table “incremental_table” that will include any delimited files with incremental change records, located in HDFS under the ‘/user/hive/incremental_append’ -- directory: +-- The DDL below shows an external Hive table “incremental_table” that will include  
- +-- any delimited files with incremental change records, located in HDFS under the ‘/user/hive/incremental_append’  
- +-- directory:
 CREATE EXTERNAL TABLE incremental_table ( CREATE EXTERNAL TABLE incremental_table (
- 
 id string, id string,
- +field1 string, field2 string, field3 string, field4 string, field5 string,
-field1 string, +
- +
-field2 string, +
- +
-field3 string, +
- +
-field4 string, +
- +
-field5 string, +
 modified_date string modified_date string
- 
 ) )
- 
 ROW FORMAT DELIMITED ROW FORMAT DELIMITED
- 
 FIELDS TERMINATED BY ',' FIELDS TERMINATED BY ','
- 
 LOCATION '/user/hive/incremental_table'; LOCATION '/user/hive/incremental_table';
  
 -- #reconcile_view -- #reconcile_view
--- This view combines record sets from both the Base (base_table) and Change (incremental_table) tables and is reduced only to the most recent records for each unique “id”. +-- This view combines record sets from both the Base (base_table) and  
 +-- Change (incremental_table) tables and is reduced only to the most recent records for each unique “id”. 
 -- It is defined as follows: -- It is defined as follows:
- 
- 
 CREATE VIEW reconcile_view AS CREATE VIEW reconcile_view AS
- 
 SELECT t1.* FROM SELECT t1.* FROM
- +( 
-(SELECT * FROM base_table +   SELECT * FROM base_table 
- +   UNION ALL 
-UNION ALL +   SELECT * FROM incremental_table 
- +) t1
-SELECT * FROM incremental_table) t1+
  
 JOIN JOIN
  
-(SELECT id, max(modified_date) max_modified FROM +( 
- +   SELECT id, max(modified_date) max_modified  
-(SELECT * FROM base_table +   FROM  
- +   ( 
-UNION ALL +      SELECT * FROM base_table 
- +      UNION ALL 
-SELECT * FROM incremental_table) t2 +      SELECT * FROM incremental_table 
- +   ) t2 
-GROUP BY id) s+   GROUP BY id
  
 +) s
 ON t1.id = s.id AND t1.modified_date = s.max_modified; ON t1.id = s.id AND t1.modified_date = s.max_modified;
 </code> </code>
 +
 +Source: https://hortonworks.com/blog/four-step-strategy-incremental-updates-hive/
kb/hive.1542112204.txt.gz · Last modified: (external edit)
Back to top
Driven by DokuWiki Recent changes RSS feed Valid CSS Valid XHTML 1.0