HiveQL使用总结

奔跑者-辉

已于 2022-09-19 16:46:48 修改

阅读量490

点赞数 2

分类专栏： hive 文章标签： hive 数据库 hadoop

于 2022-09-19 16:42:15 首次发布

本文链接：https://blog.csdn.net/index_test/article/details/126936598

版权

hive 专栏收录该内容

9 篇文章

订阅专栏

数据脱敏函数

hive有专门的脱敏函数供我们使用，就是mask()函数，返回值是string类型，默认需要脱敏的数据中大写字母就自动转换为X，小写字母就自动转换为x，数字就自动转换为n，也可通过mask()函数的参数来自定义转换格式。注意：入参也必须是string类型才不会有隐藏bug

-- Usage templates: the Chinese identifiers are placeholders for the column to mask and the table name.
select mask(要加密字段) from 表名;                     -- default masking: upper -> X, lower -> x, digit -> n
select mask(要加密字段,'X','x','#') from 表名;         -- custom replacement characters (upper, lower, digit)
select mask_first_n(要加密的字段,n) from 表名;         -- mask the first n characters
select mask_last_n(要加密的字段,n) from 表名;          -- mask the last n characters
select mask_show_first_n(要加密的字段,n) from 表名;    -- mask everything except the first n characters
select mask_show_last_n(要加密的字段,n) from 表名;     -- mask everything except the last n characters
select mask_hash(字段) from 表名;                     -- hash the value; returns NULL when the input is not a string type

举例：

① mask(string str [, string upper[, string lower[, string number]]] )
返回结果: 将字符串str中的大写字母替换为upper（默认为X），小写字母替换为lower（默认为x），数字替换为number（默认为n）
返回类型: string

-- Default replacement characters: uppercase -> X, lowercase -> x, digit -> n.
-- Spaces and punctuation are left untouched.
select mask('Hello Uncle Bean! 1024');
-- => Xxxxx Xxxxx Xxxx! nnnn

-- Caller-supplied replacement characters, in the order upper, lower, digit.
select mask('Hello Uncle Bean! 1024', 'A', 'a', '*');
-- => Aaaaa Aaaaa Aaaa! ****


② mask_first_n(string str[, int n])
返回结果: 对前n个字符进行脱敏
返回类型: string

-- Only the leading 5 characters ("Hello") are masked; the remainder is returned as-is.
select mask_first_n('Hello Uncle Bean!', 5);
-- => Xxxxx Uncle Bean!


③ mask_last_n(string str[, int n])
返回结果: 对后n个字符进行脱敏
返回类型: string

-- The last n characters are counted including punctuation: with n = 4 the masked tail is "ean!".
-- "!" stays as-is because only letters and digits are replaced by default.
select mask_last_n('Hello Uncle Bean!', 4);             -- result: Hello Uncle Bxxx!
-- Masking the whole word "Bean" therefore needs n = 5.
select mask_last_n('Hello Uncle Bean!', 5);             -- result: Hello Uncle Xxxx!


④ mask_show_first_n(string str[, int n])
返回结果: 对除了前n个字符之外的字符进行脱敏
返回类型: string

-- The leading 5 characters ("Hello") stay readable; every letter after them is masked.
select mask_show_first_n('Hello Uncle Bean!', 5);
-- => Hello Xxxxx Xxxx!


⑤ mask_show_last_n(string str[, int n])
返回结果: 对除了后n个字符之外的字符进行脱敏
返回类型: string

-- The visible tail is counted including punctuation: with n = 4 only "ean!" stays readable,
-- so the "B" of "Bean" is masked as well.
select mask_show_last_n('Hello Uncle Bean!', 4);         -- result: Xxxxx Xxxxx Xean!
-- Keeping the whole word "Bean!" readable needs n = 5.
select mask_show_last_n('Hello Uncle Bean!', 5);         -- result: Xxxxx Xxxxx Bean!


⑥ mask_hash(string|char|varchar str)
返回结果: 返回基于str的哈希值（对于非字符类型返回NULL）
返回类型: string

-- String input is replaced by a hash of the value.
-- NOTE(review): the digest below is the value recorded by the author; the hash output
-- may differ between Hive versions — confirm on the target cluster.
select mask_hash('Hello Uncle Bean!');
-- => c4db6bf1917509938e67a712305385f9

-- Non-string input (here an int literal) is not hashed.
select mask_hash(1024);
-- => NULL

hive获取当前天

-- Note: per the author, Hive 3 reworked unix_timestamp() and from_unixtime(), so the epoch
-- value has to be shifted by 8 hours (added or subtracted) for the result to be correct.
-- NOTE(review): the 8-hour shift presumably corresponds to a UTC+8 local zone — confirm for your cluster.

-- Current date, e.g. 2022-06-19
select current_date;

-- Current date-time with the 8-hour shift applied, e.g. 2022-06-19 15:30:54
select from_unixtime(unix_timestamp() + 8*3600);

hive格式化时间数据

-- Format the current time as year-month in two equivalent ways.
-- Both apply the same 8-hour shift used by the other time examples in this file, so the
-- two statements agree even in the hours around a month boundary.
select from_unixtime(unix_timestamp() + 8*3600,'yyyy-MM');                      -- 2022-06
select date_format(from_unixtime(unix_timestamp() + 8*3600),'yyyy-MM');         -- 2022-06

hive 获取本月第一天，本年第一天，上个月第一天，本月最后一天，下个月第一天等指标

-- All statements start from the current time shifted by 8 hours.

-- First day of the current month, e.g. 2022-06-01
select trunc(from_unixtime(unix_timestamp() + 8*3600), 'MM');

-- First day of the current year, e.g. 2022-01-01
select trunc(from_unixtime(unix_timestamp() + 8*3600), 'YEAR');

-- First day of the previous month, e.g. 2022-05-01
select trunc(add_months(from_unixtime(unix_timestamp() + 8*3600), -1), 'MM');

-- Last day of the current month, e.g. 2022-06-30
select last_day(from_unixtime(unix_timestamp() + 8*3600));

-- First day of the next month, e.g. 2022-07-01
select trunc(add_months(from_unixtime(unix_timestamp() + 8*3600), 1), 'MM');

datediff日期比较函数：第一个参数是结束日期，第二个参数是开始日期，返回结束日期减去开始日期的天数

-- datediff(end_date, start_date): 2020-07-05 minus 2020-06-15 => 20.
-- Dates must use "-" as the separator; per the author a "/"-separated date is not valid
-- here and has to be converted to the "-" format first.
select datediff('2020-07-05', '2020-06-15');

hive对yyyy/MM/dd格式的日期和yyyy-MM-dd格式的日期相互转换方案

第一种是通过from_unixtime()+unix_timestamp()先解析为时间戳、再按目标格式输出的方式转换；
第二种是通过concat()+substr()截取年、月、日后重新拼接的方式转换；
第三种是通过regexp_replace()正则匹配，将斜杠(/)替换为横杠(-)。

-- Convert a yyyy/MM/dd date string to yyyy-MM-dd in three ways.
-- The sample literal is defined once in a derived table and reused by every expression.
select
     d.source_text as source_text
    ,from_unixtime(unix_timestamp(d.source_text,'yyyy/MM/dd'),'yyyy-MM-dd') as func_text_1 -- option 1: parse with the source pattern, then re-format
    ,concat(substr(d.source_text,1,4),'-',substr(d.source_text,6,2),'-',substr(d.source_text,9,2)) as func_text_2 -- option 2: slice year / month / day and join with '-'
    ,regexp_replace(d.source_text,'/','-') as func_text_3 -- option 3: replace every '/' with '-'
from (select '2022/08/09' as source_text) d;

hive的多行转多列

方案一：利用拼接的方式构造map类型
方案二：利用if判断表达式+聚合收敛

-- Option 1: pivot rows to columns by building a per-day map of event_name -> count.
-- The map is assembled from text ("name:cnt,name:cnt"), so every looked-up value is a string
-- and a missing key yields NULL. The final select casts back to bigint and defaults to 0 so
-- the output matches the if()-based option (numeric counts, 0 when an event is absent).
with daily_event_cnt as (
    -- one row per (stat_date, event_name) with its row count
    select
         stat_date
        ,event_name
        ,count(1) as event_cnt
    from 表名
    where stat_date between 20220801 and 20220810
    and event_name in('test1','test2')
    group by stat_date
            ,event_name
),
daily_event_map as (
    -- one row per stat_date; str_to_map splits on its default ',' (pairs) and ':' (key/value) delimiters
    select
         stat_date
        ,str_to_map(concat_ws(',',collect_list(concat_ws(':',event_name,cast(event_cnt as string))))) as event_list
    from daily_event_cnt
    group by stat_date
)
select
     stat_date
    ,coalesce(cast(event_list['test1'] as bigint),0) as test1_cnt
    ,coalesce(cast(event_list['test2'] as bigint),0) as test2_cnt
from daily_event_map;
  
  
-- Option 2: pivot rows to columns with if() inside an aggregate.
with daily_event_cnt as (
    -- one row per (stat_date, event_name) with its row count
    select
         stat_date
        ,event_name
        ,count(1) as event_cnt
    from 表名
    where stat_date between 20220801 and 20220810
    and event_name in('test1','test2')
    group by stat_date
            ,event_name
)
-- collapse to one row per stat_date; each if() keeps only the count of its own event
select
     stat_date
    ,sum(if(event_name='test1',event_cnt,0)) as test1_cnt
    ,sum(if(event_name='test2',event_cnt,0)) as test2_cnt
from daily_event_cnt
group by stat_date;