Table of Contents

Implementation Steps:

1. Hive Data Warehouse Modeling

2. Run the Hive Table-Creation Script

3. Business Analysis

① Basic Verification Script

② Business Analysis Script

4. Commit to the Git Repository (your own files)

Case Summary


Implementation Steps:

1. Hive Data Warehouse Modeling

Create an HQL script in the scripts directory that defines the two-layer table structure and points directly at the raw and cleaned data already on HDFS:

-- 1. Initialize the warehouse database (IF NOT EXISTS keeps this idempotent)
CREATE DATABASE IF NOT EXISTS ecommerce_dw;
USE ecommerce_dw;

-- 2. ODS layer: raw data layer
-- Column names match the CSV exactly; all types are STRING to preserve the
-- raw format and avoid conversion failures at load time
CREATE EXTERNAL TABLE IF NOT EXISTS ods_ecommerce_transactions (
    Transaction_ID STRING COMMENT 'Transaction ID (raw format; a string before Spark cleaning)',
    User_Name STRING COMMENT 'User name (unique user identifier, raw CSV field)',
    Age STRING COMMENT 'User age (raw format; Spark later casts to INT)',
    Country STRING COMMENT 'Transaction country (as in the CSV)',
    Product_Category STRING COMMENT 'Product category (raw format)',
    Purchase_Amount STRING COMMENT 'Purchase amount (raw format; Spark later casts to DECIMAL)',
    Payment_Method STRING COMMENT 'Payment method (raw format)',
    Transaction_Date STRING COMMENT 'Transaction date (raw format: yyyy/M/d)'
)
COMMENT 'Cross-border e-commerce raw transaction data, ODS layer (1:1 with CSV fields)'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/raw_data/transactions/'
TBLPROPERTIES (
    'skip.header.line.count' = '1',
    'serialization.encoding' = 'UTF-8'
);
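
-- Optional sanity check (a minimal sketch): since every ODS column is STRING,
-- count the raw rows whose numeric fields would fail a cast. In Hive, CAST
-- returns NULL on failure, so a non-NULL value that casts to NULL flags a
-- malformed row that Spark's cleaning will have to handle.
SELECT
  SUM(CASE WHEN Age IS NOT NULL
            AND CAST(Age AS INT) IS NULL
           THEN 1 ELSE 0 END) AS bad_age_rows,
  SUM(CASE WHEN Purchase_Amount IS NOT NULL
            AND CAST(Purchase_Amount AS DECIMAL(10,2)) IS NULL
           THEN 1 ELSE 0 END) AS bad_amount_rows
FROM ods_ecommerce_transactions;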


-- 3. DWD layer: cleaned detail layer (matches the Spark cleaning output exactly)
CREATE EXTERNAL TABLE IF NOT EXISTS dwd_ecommerce_transactions (
    Transaction_ID INT COMMENT 'Transaction ID (deduplicated, nulls removed)',
    User_Name STRING COMMENT 'User name (unique user identifier)',
    Age INT COMMENT 'User age (18-100; outliers filtered)',
    Country STRING COMMENT 'Transaction country',
    Product_Category STRING COMMENT 'Product category',
    Purchase_Amount DECIMAL(10,2) COMMENT 'Purchase amount (2 decimal places; default unit: USD)',
    Payment_Method STRING COMMENT 'Payment method',
    Transaction_Date DATE COMMENT 'Transaction date (yyyy-MM-dd)'
)
COMMENT 'Cross-border e-commerce cleaned transaction detail, DWD layer (aligned with Spark output)'
PARTITIONED BY (dt DATE COMMENT 'Partition column: transaction date (daily partitions)')
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/processed_data/transactions_clean/'
TBLPROPERTIES (
    'parquet.compression' = 'snappy',
    'external.table.purge' = 'true'
);

-- 4. DWD layer partition metadata
-- After Spark writes Parquet by partition, Hive must refresh its partition
-- metadata before it can see the newly added data
MSCK REPAIR TABLE dwd_ecommerce_transactions;
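
MSCK REPAIR scans the entire table location, which can be slow on large directories. As an alternative, a single partition can also be registered explicitly; a minimal sketch, assuming Spark wrote the standard dt=YYYY-MM-DD directory layout (the date below is illustrative):

-- Sketch: register one partition by hand instead of repairing the whole table.
-- Assumes a directory like .../transactions_clean/dt=2024-01-15/ exists.
ALTER TABLE dwd_ecommerce_transactions
  ADD IF NOT EXISTS PARTITION (dt='2024-01-15');

-- Confirm Hive now sees the partition:
SHOW PARTITIONS dwd_ecommerce_transactions;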

2. Run the Hive Table-Creation Script

# 1. Make sure the Hive Metastore and HiveServer2 are running
#    (started in an earlier step; if not, start them now)
nohup hive --service metastore &
nohup hive --service hiveserver2 &

# 2. Run the HQL script (-f: specify the script file)
hive -f /home/Hadoop/ecommerce_dw_project/scripts/hive_create_table.hql

# 3. Verify the tables were created (check from the Hive CLI)
hive
USE ecommerce_dw;
SHOW TABLES;  -- expected: ods_ecommerce_transactions, dwd_ecommerce_transactions
DESC dwd_ecommerce_transactions;  -- inspect the DWD table structure
SELECT * FROM dwd_ecommerce_transactions LIMIT 10;  -- rows displaying normally means success
quit;  -- exit the Hive CLI

3. Business Analysis

Create the analysis scripts in the scripts directory.

① Basic Verification Script

Purpose: verify the completeness and consistency of the ODS and DWD layers and confirm the data pipeline is intact.

-- Basic Verification for ODS & DWD Layers
USE ecommerce_dw;

-- ==============================================
-- ODS Layer Verification
-- ==============================================
SELECT 'ODS Layer Structure' AS verification_step;
DESCRIBE ods_ecommerce_transactions;

SELECT 'ODS Layer Core Metrics' AS verification_step;
SELECT
  COUNT(*) AS total_raw_records,
  COUNT(DISTINCT Transaction_ID) AS unique_transaction_ids,
  COUNT(DISTINCT User_Name) AS unique_users,
  COUNT(DISTINCT Country) AS unique_countries,
  COUNT(DISTINCT Product_Category) AS unique_categories,
  MIN(Transaction_Date) AS earliest_transaction_date,  -- lexicographic on raw 'yyyy/M/d' strings,
  MAX(Transaction_Date) AS latest_transaction_date     -- not guaranteed chronological (DWD has typed dates)
FROM ods_ecommerce_transactions;

-- ==============================================
-- DWD Layer Verification
-- ==============================================
SELECT 'DWD Layer Partitions' AS verification_step;
SHOW PARTITIONS dwd_ecommerce_transactions;

SELECT 'DWD Layer Core Metrics' AS verification_step;
SELECT
  COUNT(*) AS total_cleaned_records,
  COUNT(DISTINCT Transaction_ID) AS unique_transaction_ids,
  COUNT(DISTINCT User_Name) AS unique_users,
  MIN(Age) AS min_age,
  MAX(Age) AS max_age,
  MIN(Purchase_Amount) AS min_purchase_amount,
  MAX(Purchase_Amount) AS max_purchase_amount,
  COUNT(DISTINCT dt) AS transaction_days
FROM dwd_ecommerce_transactions;

-- ==============================================
-- Data Consistency (ODS vs DWD)
-- ==============================================
SELECT 'Category Distribution Comparison' AS verification_step;
SELECT 
  'ODS' AS data_layer,
  Product_Category,
  COUNT(*) AS transaction_count
FROM ods_ecommerce_transactions
GROUP BY Product_Category
UNION ALL
SELECT 
  'DWD' AS data_layer,
  Product_Category,
  COUNT(*) AS transaction_count
FROM dwd_ecommerce_transactions
GROUP BY Product_Category
ORDER BY Product_Category, data_layer;

SELECT 'Payment Method Distribution Comparison' AS verification_step;
SELECT 
  'ODS' AS data_layer,
  Payment_Method,
  COUNT(*) AS transaction_count
FROM ods_ecommerce_transactions
GROUP BY Payment_Method
UNION ALL
SELECT 
  'DWD' AS data_layer,
  Payment_Method,
  COUNT(*) AS transaction_count
FROM dwd_ecommerce_transactions
GROUP BY Payment_Method
ORDER BY Payment_Method, data_layer;

-- ==============================================
-- Verification Completed
-- ==============================================
SELECT 'Basic Verification Finished' AS result;

Execute:

# Run with beeline (recommended; works against a Hive cluster)
beeline -u jdbc:hive2://node1:10000 -n Hadoop -f basic_verification.sql > basic_verification_result.txt

# Or run directly with the Hive CLI
hive -f basic_verification.sql > basic_verification_result.txt
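
The UNION ALL comparisons in the script still leave two result sets to eyeball. As a minimal sketch, the same category check can be collapsed into a single per-category delta (the FULL OUTER JOIN keeps categories that appear in only one layer):

-- Sketch: per-category record delta between ODS and DWD in one pass.
SELECT
  COALESCE(o.Product_Category, d.Product_Category) AS product_category,
  COALESCE(o.cnt, 0) AS ods_count,
  COALESCE(d.cnt, 0) AS dwd_count,
  COALESCE(o.cnt, 0) - COALESCE(d.cnt, 0) AS dropped_by_cleaning
FROM (SELECT Product_Category, COUNT(*) AS cnt
      FROM ods_ecommerce_transactions
      GROUP BY Product_Category) o
FULL OUTER JOIN
     (SELECT Product_Category, COUNT(*) AS cnt
      FROM dwd_ecommerce_transactions
      GROUP BY Product_Category) d
  ON o.Product_Category = d.Product_Category
ORDER BY dropped_by_cleaning DESC;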

② Business Analysis Script

Purpose: analyze transaction scale, user value, product performance, and regional & payment preferences.

-- Core Business Analysis
USE ecommerce_dw;

-- ==============================================
-- 1. Transaction Scale Analysis (Time Dimension)
-- ==============================================
SELECT '1. Monthly Transaction Scale' AS analysis_topic;
CREATE TABLE IF NOT EXISTS monthly_transaction_scale (
  transaction_year INT,
  transaction_month INT,
  transaction_count INT,
  total_sales DECIMAL(12,2),
  average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/monthly_transaction_scale';

INSERT OVERWRITE TABLE monthly_transaction_scale
SELECT
  YEAR(dt) AS transaction_year,
  MONTH(dt) AS transaction_month,
  COUNT(*) AS transaction_count,
  SUM(Purchase_Amount) AS total_sales,
  AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
GROUP BY YEAR(dt), MONTH(dt)
ORDER BY transaction_year, transaction_month;

SELECT * FROM monthly_transaction_scale;

SELECT '1. Quarterly Transaction Scale' AS analysis_topic;
CREATE TABLE IF NOT EXISTS quarterly_transaction_scale (
  transaction_year INT,
  transaction_quarter INT,
  transaction_count INT,
  total_sales DECIMAL(12,2),
  average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/quarterly_transaction_scale';

INSERT OVERWRITE TABLE quarterly_transaction_scale
SELECT
  YEAR(dt) AS transaction_year,
  QUARTER(dt) AS transaction_quarter,
  COUNT(*) AS transaction_count,
  SUM(Purchase_Amount) AS total_sales,
  AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
GROUP BY YEAR(dt), QUARTER(dt)
ORDER BY transaction_year, transaction_quarter;

SELECT * FROM quarterly_transaction_scale;

-- ==============================================
-- 2. User Analysis
-- ==============================================
SELECT '2. User Age Group Analysis' AS analysis_topic;
CREATE TABLE IF NOT EXISTS user_age_group_analysis (
  age_group STRING,
  user_count INT,
  transaction_count INT,
  total_sales DECIMAL(12,2),
  average_order_value DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/user_age_group_analysis';

INSERT OVERWRITE TABLE user_age_group_analysis
SELECT
  CASE
    WHEN Age BETWEEN 18 AND 25 THEN '18-25'
    WHEN Age BETWEEN 26 AND 35 THEN '26-35'
    WHEN Age BETWEEN 36 AND 45 THEN '36-45'
    WHEN Age BETWEEN 46 AND 55 THEN '46-55'
    ELSE '56+'
  END AS age_group,
  COUNT(DISTINCT User_Name) AS user_count,
  COUNT(*) AS transaction_count,
  SUM(Purchase_Amount) AS total_sales,
  AVG(Purchase_Amount) AS average_order_value
FROM dwd_ecommerce_transactions
GROUP BY
  CASE
    WHEN Age BETWEEN 18 AND 25 THEN '18-25'
    WHEN Age BETWEEN 26 AND 35 THEN '26-35'
    WHEN Age BETWEEN 36 AND 45 THEN '36-45'
    WHEN Age BETWEEN 46 AND 55 THEN '46-55'
    ELSE '56+'
  END
ORDER BY total_sales DESC;

SELECT * FROM user_age_group_analysis;

SELECT '2. User Value Segmentation' AS analysis_topic;
CREATE TABLE IF NOT EXISTS user_value_segmentation (
  user_value_segment STRING,
  user_count INT,
  total_transactions INT,
  total_spend_amount DECIMAL(12,2),
  avg_user_spend DECIMAL(10,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/user_value_segmentation';

INSERT OVERWRITE TABLE user_value_segmentation
WITH user_transaction_count AS (
  SELECT
    User_Name,
    COUNT(*) AS transaction_count,
    SUM(Purchase_Amount) AS total_spend
  FROM dwd_ecommerce_transactions
  GROUP BY User_Name
)
SELECT
  CASE
    WHEN transaction_count >= 1000 THEN 'High_Value_1000+'
    WHEN transaction_count >= 500 THEN 'Mid_High_Value_500_999'
    WHEN transaction_count >= 100 THEN 'Mid_Value_100_499'
    ELSE 'Regular_Value_Below_100'
  END AS user_value_segment,
  COUNT(User_Name) AS user_count,
  SUM(transaction_count) AS total_transactions,
  SUM(total_spend) AS total_spend_amount,
  AVG(total_spend) AS avg_user_spend
FROM user_transaction_count
GROUP BY
  CASE
    WHEN transaction_count >= 1000 THEN 'High_Value_1000+'
    WHEN transaction_count >= 500 THEN 'Mid_High_Value_500_999'
    WHEN transaction_count >= 100 THEN 'Mid_Value_100_499'
    ELSE 'Regular_Value_Below_100'
  END
ORDER BY total_spend_amount DESC;

SELECT * FROM user_value_segmentation;
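
-- The segment table aggregates individual users away; as a minimal sketch
-- (not persisted to a result table), drill into the concrete accounts behind
-- the top segment:
SELECT
  User_Name,
  COUNT(*) AS transaction_count,
  SUM(Purchase_Amount) AS total_spend
FROM dwd_ecommerce_transactions
GROUP BY User_Name
ORDER BY total_spend DESC
LIMIT 10;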

-- ==============================================
-- 3. Product Analysis
-- ==============================================
SELECT '3. Product Category Performance' AS analysis_topic;
CREATE TABLE IF NOT EXISTS product_category_performance (
  product_category STRING,
  sales_count INT,
  total_sales DECIMAL(12,2),
  category_aov DECIMAL(10,2),
  sales_contribution_pct DECIMAL(5,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/product_category_performance';

INSERT OVERWRITE TABLE product_category_performance
SELECT
  Product_Category,
  COUNT(*) AS sales_count,
  SUM(Purchase_Amount) AS total_sales,
  AVG(Purchase_Amount) AS category_aov,
  -- Hive does not support scalar subqueries in the SELECT list, so the grand
  -- total comes from a window over the grouped result (same pattern as the
  -- payment-share query below)
  (SUM(Purchase_Amount) / SUM(SUM(Purchase_Amount)) OVER ()) * 100 AS sales_contribution_pct
FROM dwd_ecommerce_transactions
GROUP BY Product_Category
ORDER BY total_sales DESC;

SELECT * FROM product_category_performance;

SELECT '3. Quarterly Product Category Trend' AS analysis_topic;
CREATE TABLE IF NOT EXISTS quarterly_product_trend (
  product_category STRING,
  year INT,
  quarter INT,
  sales_count INT,
  quarterly_sales DECIMAL(12,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/quarterly_product_trend';

INSERT OVERWRITE TABLE quarterly_product_trend
SELECT
  Product_Category,
  YEAR(dt) AS year,
  QUARTER(dt) AS quarter,
  COUNT(*) AS sales_count,
  SUM(Purchase_Amount) AS quarterly_sales
FROM dwd_ecommerce_transactions
GROUP BY Product_Category, YEAR(dt), QUARTER(dt)
ORDER BY Product_Category, year, quarter;

SELECT * FROM quarterly_product_trend;

-- ==============================================
-- 4. Geography & Payment Method Analysis
-- ==============================================
SELECT '4. Country-wise Performance' AS analysis_topic;
CREATE TABLE IF NOT EXISTS country_performance (
  country STRING,
  transaction_count INT,
  total_sales DECIMAL(12,2),
  country_aov DECIMAL(10,2),
  active_user_count INT
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/country_performance';

INSERT OVERWRITE TABLE country_performance
SELECT
  Country,
  COUNT(*) AS transaction_count,
  SUM(Purchase_Amount) AS total_sales,
  AVG(Purchase_Amount) AS country_aov,
  COUNT(DISTINCT User_Name) AS active_user_count
FROM dwd_ecommerce_transactions
GROUP BY Country
ORDER BY total_sales DESC;

SELECT * FROM country_performance;

SELECT '4. Payment Method Preference by Country' AS analysis_topic;
CREATE TABLE IF NOT EXISTS payment_preference_by_country (
  country STRING,
  payment_method STRING,
  payment_count INT,
  country_payment_share_pct DECIMAL(5,2)
)
STORED AS PARQUET
LOCATION 'hdfs://node1:9000/user/hadoop/ecommerce_dw/analysis_results/payment_preference_by_country';

INSERT OVERWRITE TABLE payment_preference_by_country
SELECT
  Country,
  Payment_Method,
  COUNT(*) AS payment_count,
  (COUNT(*) / SUM(COUNT(*)) OVER (PARTITION BY Country)) * 100 AS country_payment_share_pct
FROM dwd_ecommerce_transactions
GROUP BY Country, Payment_Method
ORDER BY Country, payment_count DESC;

SELECT * FROM payment_preference_by_country;

-- ==============================================
-- Analysis Completed
-- ==============================================
SELECT 'All Business Analysis Completed' AS result;

Execute:

# Run and save the results to a file
beeline -u jdbc:hive2://node1:10000 -n Hadoop -f business_analysis.sql > business_analysis_result.txt
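
Once the script has run, the persisted result tables can feed further reporting. For example, month-over-month sales growth can be derived from monthly_transaction_scale with a LAG window; a minimal sketch:

-- Sketch: month-over-month sales growth from the monthly result table.
SELECT
  transaction_year,
  transaction_month,
  total_sales,
  LAG(total_sales) OVER (ORDER BY transaction_year, transaction_month) AS prev_month_sales,
  ROUND(
    (total_sales - LAG(total_sales) OVER (ORDER BY transaction_year, transaction_month))
      / LAG(total_sales) OVER (ORDER BY transaction_year, transaction_month) * 100,
    2
  ) AS mom_growth_pct
FROM monthly_transaction_scale;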

4. Commit to the Git Repository (your own files)

# 1. Enter the project root directory
cd /home/Hadoop/ecommerce_dw_project/

# 2. Stage the new files
git add scripts/spark_preprocess.py
git add scripts/hive_create_table.hql
git add scripts/basic_verification.sql
git add scripts/business_analysis.sql

# 3. Commit locally (write a clear message so the change is easy to trace)
git commit -m "Complete full pipeline: dataset upload + Spark preprocessing + Hive modeling + business analysis"

# 4. Check the commit history (confirm the commit succeeded)
git log --oneline -n 5
# Expected: the newest entry shows the commit message above

Case Summary

This was, admittedly, a very shallow pass through the pipeline.

On reflection, a walkthrough like this is rough at best; there are surely many techniques and tools that could speed it up or implement it better. Every error encountered along the way could have been sidestepped with the right approach, but lacking experience, I ended up hunting them down one by one, which was very inefficient.

With an immature knowledge base and no one to give direction, it felt like flying blind, and the result can look like going through the motions for its own sake. At the beginner stage, guidance on direction really matters; without it, efficiency stays low and the payoff stays small.

This case was started simply to walk the pipeline end to end, whereas real projects exist to solve a problem. Being aware of that distinction is enough for now.

Future iterations of this case can build on and improve it.