row_number 使用

按a分组取b字段最小和最大时对应的c字段

select a,
    min(if(asc_rn = 1, c, null)) as min_c,
    max(if(desc_rn = 1, c, null)) as max_c
from (
    select a, b, c,
        row_number() over(partition by a order by b asc) as asc_rn
        row_number() over(partition by a order by b desc) as desc_rn 
    from t1
    ) t2
where asc_rn = 1 or desc_rn = 1
group by a

统计函数


select 
  a, 
  b, 
  c, 
  --   对c累计求和
  sum(c) over(partition by a order by b) as sum_c
  -- 对c取累计平均值
  avg(c) over(partition by a order by b) as avg_c
  -- 对b取累计排名比例
  round(
    row_number() over(partition by a order by b) / (count(c) over(partition by a)),
    2 -- 比例保留 2 位小数
  ) as ratio_c
from t1;

lag/lead 控制计算窗口

-- 按a分组按b字段排序,对c取前后各一行的和
select 
  a,
  b,
  lag(c,1,0) over(partition by a order by b) + lead(c,1,0) over(partition by a order by b) as sum_c
from t1;

LATERAL VIEW explode 处理嵌套结构数据

  1. explode() 函数

作用:将数组(Array)或键值对(Map)类型的列拆分为多行

输入类型:array 或 map<K,V>

输出类型:

  • 数组:生成单列(默认名 col),每行一个元素
  • Map:生成两列(默认名 key 和 value),每行一个键值对

还有一个 posexplode(map<K,V>) , Hive 中 explode 的增强版,它在展开数组(Array)时 同时返回元素的位置索引。posexplode(array<T>) → 生成 pos(索引)、col(值)两列

  1. LATERAL VIEW 子句

作用:将 explode() 生成的虚拟表与主查询关联,允许在 SELECT 语句中引用展开后的列。

语法:必须与 explode() 配合使用,生成一个可被查询的临时视图

-- 展开数组和 Map 的键、值分别组合
-- 如果 actions 有 M 个元素,attributes 有 N 个键值对,会生成 M×N 行, 需要注意数据量问题
SELECT 
  user_id,
  action,
  attr_key,
  attr_value
FROM 
  user_data
LATERAL VIEW 
  explode(actions) exploded_actions AS action  -- 第一级展开:数组
LATERAL VIEW 
  explode(attributes) exploded_attrs AS attr_key, attr_value;  -- 第二级展开:Map

concat_ws 合并

-- 按a分组,将b字段合并成字符串
select
  a,
  concat_ws('、', collect_set(t.b)) b
from t1
group by a;

数据扩充与收缩

-- 将字符串"1-5,16,11-13,9"扩展成"1,2,3,4,5,16,11,12,13,9"

select  
  concat_ws(',',collect_list(cast(rn as string)))
from
(
  select  
   a.rn,
   b.num,
   b.pos
  from
   (
    select
     row_number() over() as rn
    from (select split(space(20), ' ') as x) t -- space(20)可灵活调整
    lateral view explode(x) pe
   ) a lateral view outer 
   posexplode(split('1-5,16,11-13,9', ',')) b as pos, num
   where a.rn between cast(split(num, '-')[0] as int) and cast(split(num, '-')[1] as int) or a.rn = num
   order by pos, rn 
) t;

不使用distinct或group by去重

通过唯一标识(如最小 id)或行号筛选,避免直接依赖 DISTINCT 或 GROUP BY 导致的性能问题

-- 去重 name 和 department 组合,保留每个组合的第一条记录
SELECT id, name, department
FROM (
  SELECT 
    *,
    ROW_NUMBER() OVER (
      PARTITION BY name, department 
      ORDER BY id
    ) AS rn
  FROM employee
) t
WHERE rn = 1;

元素反转

-- "123,456,789" => "987,654,321"(整体顺序和元素内容均反转)
SELECT 
  REVERSE(CONCAT_WS(",", COLLECT_LIST(REVERSE(str)))) 
FROM (
  SELECT str 
  FROM t10
  LATERAL VIEW EXPLODE(SPLIT(a, ",")) t AS str
) tmp1;