Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Next revision
Previous revision
kb:hive [2017/11/20 10:23] – created yehudakb:hive [2022/01/03 16:03] (current) – external edit 127.0.0.1
Line 1: Line 1:
 ====== Hive ====== ====== Hive ======
  
-===== Migrate CSV to hive table ===== +https://hortonworks.com/blog/four-step-strategy-incremental-updates-hive/
-<code sql> +
- +
-DROP TABLE IF EXISTS schema.tbl_external_csv; +
-DROP TABLE IF EXISTS schema.tbl_internal;+
  
 +<code sql>
 -- --
--- Create external csv reader+-- In order to support an on-going reconciliation between current records in HIVE and new change records, two tables  
 +-- should be defined: base_table and incremental_table
 -- --
-CREATE EXTERNAL TABLE IF NOT EXISTS schema.tbl_external_csv+-- base_table 
-    name STRING+-- The example below shows DDL for the Hive table “base_table” that will include any delimited files located in  
-    guid STRING, +-- HDFS under the ‘/user/hive/base_table’ directory. This table will house the  initial, complete record load from the  
-    src_date String+-- source system. After the first processing run, it will house the on-going, most up-to-date set of records from the source system: 
-    key INT) +-- 
-COMMENT 'tbl_internal external table'+CREATE TABLE base_table 
 +id string
 +field1 stringfield2 string, field3 string, field4 string, field5 string
 +modified_date string 
 +)
 ROW FORMAT DELIMITED ROW FORMAT DELIMITED
 FIELDS TERMINATED BY ',' FIELDS TERMINATED BY ','
-STORED AS TEXTFILE +LOCATION '/user/hive/base_table';
-LOCATION '/user/raw_user/raw_files' +
-tblproperties("skip.header.line.count"="1");+
  
--- +-- #incremental_table 
--- Create tbl_internal +-- The DDL below shows an external Hive table “incremental_table” that will include  
--- +-- any delimited files with incremental change records, located in HDFS under the ‘/user/hive/incremental_append’  
-CREATE TABLE IF NOT EXISTS schema.tbl_internal +-- directory: 
-    name STRING+CREATE EXTERNAL TABLE incremental_table 
-    guid STRING, +id string
-    `date` TIMESTAMP+field1 stringfield2 string, field3 string, field4 string, field5 string
-    key INT +modified_date string 
-STORED AS parquet +
-"""+ROW FORMAT DELIMITED 
 +FIELDS TERMINATED BY ',' 
 +LOCATION '/user/hive/incremental_table';
  
 +-- #reconcile_view
 +-- This view combines record sets from both the Base (base_table) and 
 +-- Change (incremental_table) tables and is reduced only to the most recent records for each unique “id”. 
 +-- It is defined as follows:
 +CREATE VIEW reconcile_view AS
 +SELECT t1.* FROM
 +(
 +   SELECT * FROM base_table
 +   UNION ALL
 +   SELECT * FROM incremental_table
 +) t1
  
 +JOIN
  
-hiveContext.sql("TRUNCATE TABLE `schema`.`tbl_internal`"+( 
-hiveContext.sql(""" +   SELECT id, max(modified_datemax_modified  
-    INSERT OVERWRITE TABLE `schema`.`tbl_internal` +   FROM  
-    SELECT +   
-        `client_name`, +      SELECT * FROM base_table 
-        `topguid`, +      UNION ALL 
-        from_unixtime(unix_timestamp(`src_date`, 'dd/MM/yyyy'),'yyyy-MM-dd') as `date`, +      SELECT * FROM incremental_table 
-        `key` +   t2 
-    FROM `schema`.`tbl_external_csv`+   GROUP BY id
  
 +) s
 +ON t1.id = s.id AND t1.modified_date = s.max_modified;
 </code> </code>
 +
 +Source: https://hortonworks.com/blog/four-step-strategy-incremental-updates-hive/
kb/hive.1511173415.txt.gz · Last modified: (external edit)
Back to top
Driven by DokuWiki Recent changes RSS feed Valid CSS Valid XHTML 1.0