HiveUDF
版本:
- 2.5.19+
- 3.0.10+
- 3.1.8+
- 3.2.3+
5.1 基础用法
# create table in starrocks
-- Aggregate-key table: rows sharing the same `c1` have their `c2` bitmaps
-- merged with BITMAP_UNION.
-- Only `replication_num` is set explicitly. The other properties from a
-- SHOW CREATE TABLE dump are omitted on purpose: `fast_schema_evolution` is
-- documented as available from v3.2.0 onwards, so listing it would break this
-- example on the 2.5.x / 3.0.x / 3.1.x releases this page targets.
CREATE TABLE `t1` (
    `c1` int(11) NULL COMMENT "",
    `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`)
PROPERTIES (
    "replication_num" = "1"
);
# 向 StarRocks 表 t1 插入数据
-- Seed a single row: key 1 with a bitmap built from the integers 1..39.
-- The target columns are named explicitly so the statement does not depend on
-- the physical column order of `t1`.
INSERT INTO t1 (c1, c2)
SELECT
    1,
    bitmap_from_string("1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39");
# create table on hive
-- Hive-side counterpart of t1: `c2` is declared as BINARY and the table is
-- stored as Parquet.
CREATE TABLE t1 (
    c1 INT,
    c2 BINARY
)
STORED AS PARQUET;
# show create table
hive> show create table t1;
OK
CREATE TABLE `t1`(
`c1` int,
`c2` binary)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/lxh.db/t1'
TBLPROPERTIES (
'transient_lastDdlTime'='1703846468')
Time taken: 5.302 seconds, Fetched: 13 row(s)
# write the data into the hive table's location (run this statement in starrocks)
-- Write `c1` and the binary-encoded bitmap (bitmap_to_binary) as uncompressed
-- Parquet files under the HDFS path of the Hive table `t1`.
INSERT INTO FILES (
    "path" = "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/lxh.db/t1/",
    "format" = "parquet",
    "compression" = "uncompressed"
)
SELECT
    c1,
    bitmap_to_binary(c2) AS c2
FROM t1;
# add jar
add jar hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/lxh.db/hive1-udf-1.0.0.jar;
# create function
hive> create temporary function bitmap_agg as 'com.starrocks.hive.udf.UDAFBitmapAgg';
OK
Time taken: 0.102 seconds
hive> create temporary function bitmap_count as 'com.starrocks.hive.udf.UDFBitmapCount';
OK
Time taken: 0.072 seconds
hive> create temporary function bitmap_from_string as 'com.starrocks.hive.udf.UDFBitmapFromString';
OK
Time taken: 0.072 seconds
hive> create temporary function bitmap_to_string as 'com.starrocks.hive.udf.UDFBitmapToString';
OK
Time taken: 0.071 seconds
# 在 Hive 中建表
-- Hive table of (c1, c2) integer pairs; it is the input of the bitmap_agg
-- example query that groups by c1.
CREATE TABLE t2 (
    c1 INT,
    c2 BIGINT
)
STORED AS PARQUET;
# 查看表中数据
hive> select * from t2;
1 1
1 2
2 3
2 4
# bitmap_agg
hive> select c1, bitmap_to_string(bitmap_agg(c2)) from t2 group by c1;
1 1,2
2 3,4
# bitmap_from_string
hive> select bitmap_to_string(bitmap_from_string("1,2,3,4,5"));
1,2,3,4,5
5.2 支持的函数
//TODO