row_number 使用
按a分组取b字段最小和最大时对应的c字段
select a,
min(if(asc_rn = 1, c, null)) as min_c,
max(if(desc_rn = 1, c, null)) as max_c
from (
select a, b, c,
row_number() over(partition by a order by b asc) as asc_rn
row_number() over(partition by a order by b desc) as desc_rn
from t1
) t2
where asc_rn = 1 or desc_rn = 1
group by a
统计函数
select
a,
b,
c,
-- 对c累计求和
sum(c) over(partition by a order by b) as sum_c
-- 对c取累计平均值
avg(c) over(partition by a order by b) as avg_c
-- 对b取累计排名比例
round(
row_number() over(partition by a order by b) / (count(c) over(partition by a)),
2 -- 比例保留 2 位小数
) as ratio_c
from t1;
lag/lead 控制计算窗口
-- 按a分组按b字段排序,对c取前后各一行的和
select
a,
b,
lag(c,1,0) over(partition by a order by b) + lead(c,1,0) over(partition by a order by b) as sum_c
from t1;
LATERAL VIEW explode 处理嵌套结构数据
- explode() 函数
作用:将数组(Array)或键值对(Map)类型的列拆分为多行
输入类型:array 或 map<K,V>
输出类型:
- 数组:生成单列(默认名 col),每行一个元素
- Map:生成两列(默认名 key 和 value),每行一个键值对
还有一个 posexplode(map<K,V>)
, Hive 中 explode 的增强版,它在展开数组(Array)时 同时返回元素的位置索引。posexplode(array<T>) → 生成 pos(索引)、col(值)两列
- LATERAL VIEW 子句
作用:将 explode() 生成的虚拟表与主查询关联,允许在 SELECT 语句中引用展开后的列。
语法:必须与 explode() 配合使用,生成一个可被查询的临时视图
-- 展开数组和 Map 的键、值分别组合
-- 如果 actions 有 M 个元素,attributes 有 N 个键值对,会生成 M×N 行, 需要注意数据量问题
SELECT
user_id,
action,
attr_key,
attr_value
FROM
user_data
LATERAL VIEW
explode(actions) exploded_actions AS action -- 第一级展开:数组
LATERAL VIEW
explode(attributes) exploded_attrs AS attr_key, attr_value; -- 第二级展开:Map
concat_ws 合并
-- 按a分组,将b字段合并成字符串
select
a,
concat_ws('、', collect_set(t.b)) b
from t1
group by a;
数据扩充与收缩
-- 将字符串"1-5,16,11-13,9"扩展成"1,2,3,4,5,16,11,12,13,9"
select
concat_ws(',',collect_list(cast(rn as string)))
from
(
select
a.rn,
b.num,
b.pos
from
(
select
row_number() over() as rn
from (select split(space(20), ' ') as x) t -- space(20)可灵活调整
lateral view explode(x) pe
) a lateral view outer
posexplode(split('1-5,16,11-13,9', ',')) b as pos, num
where a.rn between cast(split(num, '-')[0] as int) and cast(split(num, '-')[1] as int) or a.rn = num
order by pos, rn
) t;
不使用distinct或group by去重
通过唯一标识(如最小 id)或行号筛选,避免直接依赖 DISTINCT 或 GROUP BY 导致的性能问题
-- 去重 name 和 department 组合,保留每个组合的第一条记录
SELECT id, name, department
FROM (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY name, department
ORDER BY id
) AS rn
FROM employee
) t
WHERE rn = 1;
元素反转
-- "123,456,789" => "987,654,321"(整体顺序和元素内容均反转)
SELECT
REVERSE(CONCAT_WS(",", COLLECT_LIST(REVERSE(str))))
FROM (
SELECT str
FROM t10
LATERAL VIEW EXPLODE(SPLIT(a, ",")) t AS str
) tmp1;