Bitmap 最佳实践和使用手册 7 - Hive Bitmap UDF

Hive UDF

支持的 StarRocks 版本:

  • 2.5.19+
  • 3.0.10+
  • 3.1.8+
  • 3.2.3+

5.1 基础用法

# create table in StarRocks
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`)
PROPERTIES (
"replication_num" = "1",
"in_memory" = "false",
"enable_persistent_index" = "false",
"replicated_storage" = "true",
"fast_schema_evolution" = "true",
"compression" = "LZ4"
);
# 在 StarRocks 中插入数据(bitmap 包含 1~39)
insert into t1 select 1, bitmap_from_string("1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39");

# create table in Hive (the bitmap column is stored as binary)
create table t1(c1 int, c2 binary) stored as parquet;

# show create table
hive> show create table t1;
OK
CREATE TABLE `t1`(
  `c1` int, 
  `c2` binary)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/lxh.db/t1'
TBLPROPERTIES (
  'transient_lastDdlTime'='1703846468')
Time taken: 5.302 seconds, Fetched: 13 row(s)
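# insert into hive table (run in StarRocks: INSERT INTO FILES writes t1 as parquet into the Hive table's HDFS location; bitmap_to_binary converts the bitmap column to binary)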
insert into files ("path" = "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/lxh.db/t1/", "format"="parquet", "compression" = "uncompressed") select c1, bitmap_to_binary(c2) as c2 from t1;

# add jar (run in Hive)
add jar hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/lxh.db/hive1-udf-1.0.0.jar;

# create function
hive> create temporary function  bitmap_agg as 'com.starrocks.hive.udf.UDAFBitmapAgg';
OK
Time taken: 0.102 seconds
hive> create temporary function  bitmap_count as 'com.starrocks.hive.udf.UDFBitmapCount';
OK
Time taken: 0.072 seconds
hive> create temporary function  bitmap_from_string as 'com.starrocks.hive.udf.UDFBitmapFromString';
OK
Time taken: 0.072 seconds
hive> create temporary function  bitmap_to_string as 'com.starrocks.hive.udf.UDFBitmapToString';
OK
Time taken: 0.071 seconds

# 在 Hive 中建表
create table t2(c1 int, c2 bigint) stored as parquet;

# 查看 t2 中的测试数据
hive> select * from t2;
1       1
1       2
2       3
2       4

# bitmap_agg
hive> select c1, bitmap_to_string(bitmap_agg(c2)) from t2 group by c1;
1       1,2
2       3,4

# bitmap_from_string
hive> select bitmap_to_string(bitmap_from_string("1,2,3,4,5"));
1,2,3,4,5

5.2 支持的函数

//TODO