Bitmap 最佳实践和使用手册 3 - 导出

  1. 导出

2.1 Mysql 协议导出

bitmap_to_string

版本: 2.5 ~ main

mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1   | bitmap_to_string(c2) |
+------+----------------------+
|    1 | 1,2,3,4,5,6,7,8,9,10 |
+------+----------------------+

bitmap_to_base64

版本: 2.5 ~ main

mysql> select c1, bitmap_to_base64(c2) from t1;
+------+----------------------------------------------------------------------------------------------------------------------+
| c1   | bitmap_to_base64(c2)                                                                                                 |
+------+----------------------------------------------------------------------------------------------------------------------+
|    1 | CgoAAAABAAAAAAAAAAIAAAAAAAAAAwAAAAAAAAAEAAAAAAAAAAUAAAAAAAAABgAAAAAAAAAHAAAAAAAAAAgAAAAAAAAACQAAAAAAAAAKAAAAAAAAAA== |
+------+----------------------------------------------------------------------------------------------------------------------+
1 row in set (0.02 sec)

bitmap_to_binary + hex

版本: 3.0 ~ main

不建议使用

支持,但是会导出成一些不可见字符,需要转成Hex使用。

mysql> select c1, hex(bitmap_to_binary(c2)) from t1;
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| c1   | hex(bitmap_to_binary(c2))                                                                                                                                                  |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    1 | 0A0A0000000100000000000000020000000000000003000000000000000400000000000000050000000000000006000000000000000700000000000000080000000000000009000000000000000A00000000000000 |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

bitmap_to_array + unnest

版本: 2.5 ~ main

先转成Array,然后 Array 转成明细导出

mysql> select c1, unnest from t1, unnest(bitmap_to_array(c2));
+------+--------+
| c1   | unnest |
+------+--------+
|    1 |      1 |
|    1 |      2 |
|    1 |      3 |
|    1 |      4 |
|    1 |      5 |
|    1 |      6 |
|    1 |      7 |
|    1 |      8 |
|    1 |      9 |
|    1 |     10 |
+------+--------+

unnest_bitmap

版本: 3.0 ~ main

bitmap直接转成明细导出

mysql> select c1, unnest_bitmap from t1, unnest_bitmap(c2);
+------+---------------+
| c1   | unnest_bitmap |
+------+---------------+
|    1 |             1 |
|    1 |             2 |
|    1 |             3 |
|    1 |             4 |
|    1 |             5 |
|    1 |             6 |
|    1 |             7 |
|    1 |             8 |
|    1 |             9 |
|    1 |            10 |
+------+---------------+
10 rows in set (0.01 sec)

subdivde_bitmap 拆成小Bitmap导出

版本: 2.5 ~ main

防止1个包太大,Mysql协议报错

mysql> select bitmap_to_string(subdivide_bitmap) from t1, subdivide_bitmap(c2, 3);
+------------------------------------+
| bitmap_to_string(subdivide_bitmap) |
+------------------------------------+
| 1,2,3                              |
| 4,5,6                              |
| 7,8,9                              |
| 10                                 |
+------------------------------------+

bitmap_to_array 导出

版本: 2.5 ~ main

mysql> select c1, bitmap_to_array(c2) from t1;
+------+------------------------+
| c1   | bitmap_to_array(c2)    |
+------+------------------------+
|    1 | [1,2,3,4,5,6,7,8,9,10] |
+------+------------------------+

2.2 Insert into files 导出

bitmap_to_string

版本: 3.1 ~ main

# hive 表结构
create table t3(c1 int, c2 string) stored as parquet;

# StarRocks 表结构
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# insert into files
insert into files ("path" = "hdfs://xxx:9000/user/hive/warehouse/lxh.db/t3/", "format"="parquet", "compression" = "uncompressed") 
select c1, bitmap_to_string(c2) as c2 from t1;

bitmap_to_base64

版本: 3.1 ~ main

# hive 表结构
create table t3(c1 int, c2 string) stored as parquet;

# StarRocks 表结构
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# insert into files
insert into files ("path" = "hdfs://xxx:9000/user/hive/warehouse/lxh.db/t3/", "format"="parquet", "compression" = "uncompressed") select c1, bitmap_to_base64(c2) as c2 from t1;

bitmap_to_binary

版本: 3.1 ~ main

# hive 表结构
create table t4(c1 int, c2 binary) stored as parquet;

# StarRocks 表结构
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# insert into files
insert into files ("path" = "hdfs://xxx:9000/user/hive/warehouse/lxh.db/t4/", "format"="parquet", "compression" = "uncompressed") select c1, bitmap_to_binary(c2) as c2 from t1;

bitmap_to_array + unnest

转成明细导出

版本: 3.1 ~ main

# hive 表结构
create table t2(c1 bigint, c2 bigint) format as parquet;

# StarRocks 表结构
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# insert into files
insert into files ("path" = "hdfs://xxx:9000/user/hive/warehouse/lxh.db/t2/", "format"="parquet", "compression" = "uncompressed") select c1, unnest as c2 from t1, unnest(bitmap_to_array(c2));

bitmap_unnest

Bitmap 直接转成明细导出

版本: 3.1 ~ main

# hive 表结构
create table t2(c1 bigint, c2 bigint) format as parquet;

# StarRocks 表结构
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# insert into files
insert into files ("path" = "hdfs://xxxx:9000/user/hive/warehouse/lxh.db/t2/", "format"="parquet", "compression" = "uncompressed") select c1, unnest_bitmap as c2 from t1, unnest_bitmap(c2);

bitmap_to_array

版本: 3.1 ~ main

# hive 表结构
create table t_array(c1 int, c2 array<bigint>) stored as parquet;

# StarRocks 表结构
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# insert into files
insert into files ("path" = "hdfs://xxx:9000/user/hive/warehouse/lxh.db/t_array/", "format"="parquet", "compression" = "uncompressed") select c1, bitmap_to_array(c2) as c2 from t1;

2.3 select into outfile 导出

逐渐废弃,不建议使用,用 insert into files 代替

3.0 版本当前支持 select into outfile,但是不成熟

bitmap_to_string

版本: 3.1 ~ main

# hive 表结构 (parquet)
create table t3(c1 int, c2 string) stored as parquet;

# StarRocks 表结构
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

select c1, bitmap_to_string(c2) as c2 from t1 into outfile "hdfs://xxx:9000/user/hive/warehouse/lxh.db/t3/" format as parquet;

# hive 表结构 (csv)
create table t_string_csv(c1 int, c2 string) row format delimited fields 
terminated by '\t' lines terminated by '\n' stored stored as textfile;

select c1, bitmap_to_string(c2) as c2 from t1 
into outfile "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/lxh.db/t_string_csv/" format as csv 
properties("column_separator" = "\t", "line_delimiter" = "\n");

bitmap_to_base64

版本: 3.1 ~ main

# hive 表结构 (parquet)
create table t3(c1 int, c2 string) stored as parquet;

# StarRocks 表结构
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

select c1, bitmap_to_base64(c2) as c2 from t1 into outfile "hdfs://xxx:9000/user/hive/warehouse/lxh.db/t3/" format as parquet;

# hive 表结构 (csv)
create table t_string_csv(c1 int, c2 string) row format delimited fields 
terminated by '\t' lines terminated by '\n' stored stored as textfile;

select c1, bitmap_to_base64(c2) as c2 from t1 
into outfile "hdfs://xxx:9000/user/hive/warehouse/lxh.db/t_string_csv/" format as csv 
properties("column_separator" = "\t", "line_delimiter" = "\n");

2.4 [不支持] export 导出

不支持 Bitmap 类型

2.5 外表 (catalog方式) 导出

insert into hive_catalog_hms.lxh.t4 select c1, bitmap_to_binary(c2) from ssb_20.t1;

2.6 外表(非 catalog 方式) 导出

//TODO

老师, 请问bitmap_to_base64出来的结果,在业务应用服务中如何解析出原始结果?golang或java应用

https://github.com/StarRocks/starrocks/tree/main/fe/hive-udf/src/main/java/com/starrocks/hive/udf 把这个目录下的Java代码Copy过去改改直接用就行

import com.starrocks.types.BitmapValue; 还有这个目录下的代码

找到了,thx

img_v3_02dq_5f410616-8bd2-4f40-bffb-395ae134e96g

When I export data using catalog, I have such an error that it is very strange.
Looking forward to your reply!!