Bitmap 最佳实践和使用手册 2 - 导入

  1. 导入

//TODO: 本地 parquet 文件通过 broker 导入

示例表结构

# 建表
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

//TODO: 不同导入方式下的长度限制,还需要系统测试下,当前有些导入会受字符串1M长度的限制。

1.1 Stream Load

  • 版本: 2.5 ~ main

  • 格式: 只支持 CSV/JSON 格式

to_bitmap

# 准备数据 data.txt
1   11
1   12
2   22
2   32

curl --location-trusted -u root: -T data.txt 
-H "columns: tmp1, tmp2, c1=tmp1, c2=to_bitmap(tmp2)" 
http://FE_IP:FE_HTTP_PORT/api/ssb_20/t1/_stream_load

bitmap_from_string

# 准备数据 data.txt
1   11,12,13
2   22,23,24
2   30,31,32,33

curl --location-trusted -u root: -T data.txt 
-H "columns: tmp1, tmp2, c1=tmp1, c2=bitmap_from_string(tmp2)" 
http://FE_IP:FE_HTTP_PORT/api/ssb_20/t1/_stream_load

base64_to_bitmap

# 准备数据 data.txt
1   AjswAAABAAAnAAEAAQAnAA==
2   CgYAAABkAAAAAAAAAGUAAAAAAAAAZgAAAAAAAABnAAAAAAAAAGgAAAAAAAAAaQAAAAAAAAA=

# stream load
curl --location-trusted -u root: -T data.txt 
-H "columns: tmp1, tmp2, c1=tmp1, c2=base64_to_bitmap(tmp2)" 
http://BE_IP:BE_HTTP_PORT/api/ssb_20/t1/_stream_load

[不支持] bitmap_from_binary

[不支持] array_to_bitmap

1.2 Routine Load

to_bitmap

# 数据
1       1
1       2
2       3
2       4
2       5

# routine load
create routine load label_1 on t1 
columns(c1, tmp_c2, c2=to_bitmap(tmp_c2)) properties("format"="csv") 
from kafka(
    "kafka_broker_list"="IP:PORT", 
    "kafka_topic"="lxh-test", 
    "property.kafka_default_offsets" = "OFFSET_BEGINNING"
);

bitmap_from_string

# 数据
1       1,2,3,4
2       5,6,7,8

# routine load
create routine load label_1 on t1 
columns(c1, tmp_c2, c2=bitmap_from_string(tmp_c2))
properties("format"="csv")
from kafka(
    "kafka_broker_list"="IP:PORT",
    "kafka_topic"="lxh-test-string",
    "property.kafka_default_offsets" = "OFFSET_BEGINNING"
);

base64_to_bitmap

# 数据
1       AjswAAABAAAnAAEAAQAnAA==
2       CgYAAABkAAAAAAAAAGUAAAAAAAAAZgAAAAAAAABnAAAAAAAAAGgAAAAAAAAAaQAAAAAAAAA=

# routine load
create routine load label_1 on t1 
columns(c1, tmp_c2, c2=base64_to_bitmap(tmp_c2)) 
properties("format"="csv") 
from kafka(
    "kafka_broker_list"="172.26.194.239:9092", 
    "kafka_topic"="lxh-test-base64", 
    "property.kafka_default_offsets" = "OFFSET_BEGINNING"
);

[不支持] bitmap_from_binary

[不支持] array_to_bitmap

1.3 Broker Load

to_bitmap

版本: 2.5 ~ main

# Hive 表结构
create table t3(c1 int, c2 int) stored as parquet;
  
# Hive 数据
hive> select * from t2;
1       11
1       12
1       13
2       21
2       22

# StarRocks 建表
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# broker load 导入
load label label_t1(
    data infile("hdfs://IP:PORT/user/hive/warehouse/lxh.db/t2/*") 
    into table t1 format as "parquet" (c1, c2) 
    set (c1=c1, c2=to_bitmap(c2))
) with broker;

bitmap_from_string

版本: 2.5 ~ main

# hive 表结构
create table t3(c1 int, c2 string) stored as parquet;

# hive 数据
hive> select * from t3;
1       11,12,13,14,15
2       22,23,24

# StarRocks 建表
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# broker load 导入
load label label_t1_1(
    data infile("hdfs://xxx:9000/user/hive/warehouse/lxh.db/t3/*")
    into table t1 format as "parquet" (c1, c2)
    set (c1=c1, c2=bitmap_from_string(c2))
) with broker;

base64_to_bitmap

版本: 2.5 ~ main

# hive 表结构
create table t3(c1 int, c2 string) stored as parquet;

# hive 数据
hive> select * from t3;
1       AjswAAABAAAnAAEAAQAnAA==
2       CgYAAABkAAAAAAAAAGUAAAAAAAAAZgAAAAAAAABnAAAAAAAAAGgAAAAAAAAAaQAAAAAAAAA=

# StarRocks 建表
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# broker load 导入
load label label_t1_2(
    data infile("hdfs://IP:PORT/user/hive/warehouse/lxh.db/t3/*")
    into table t1 format as "parquet" (c1, c2)
    set (c1=c1, c2=base64_to_bitmap(c2))
) with broker;

bitmap_from_binary

版本: 3.0 ~ main

# hive 表结构
create table t3(c1 int, c2 binary) stored as parquet;

# hive 数据
hive> select c1, hex(c2) from t3;
1       0A0A0000000100000000000000020000000000000003000000000000000400000000000000050000000000000006000000000000000700000000000000080000000000000009000000000000000A00000000000000

# StarRocks 建表
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# broker load 导入
load label label_t1_4(
    data infile("hdfs://IP:PORT/user/hive/warehouse/lxh.db/t3/*") 
    into table t1 format as "parquet" (c1, c2) 
    set (c1=c1, c2=bitmap_from_binary(c2))
) with broker;

[不支持] array_to_bitmap

1.4 Spark Load

to_bitmap

版本: 2.5 ~ main

# hive 表结构
create table t2(c1 int, c2 int) stored as parquet;

# hive 数据
hive> select * from t2;
1       11
1       12
1       13
2       21
2       22

# StarRocks 建表
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# spark load 导入
load label lxh_tmp_18(data infile("hdfs://IP:PORT/user/hive/warehouse/lxh.db/t2/*")
into table t1 format as 'parquet' SET(c1=c1, c2=to_bitmap(c2)))
with resource 'spark_lxh' properties("timeout" = "3600");

[不支持] bitmap_from_string

[不支持] base64_to_bitmap

TODO:

bitmap_from_binary

版本: 3.0 ~ main

# hive 表结构
create table t1(c1 int, c2 binary) stored as parquet;

# hive 数据
hive> select c1, hex(c2) from t1;
1       0A0A0000000100000000000000020000000000000003000000000000000400000000000000050000000000000006000000000000000700000000000000080000000000000009000000000000000A00000000000000

# StarRocks 建表
CREATE TABLE `t1` (
  `c1` int(11) NULL COMMENT "",
  `c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP 
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);

# spark load 导入
load label lxh_tmp_19(data infile("hdfs://IP:PORT/user/hive/warehouse/lxh.db/t1/*") 
into table t1 format as 'parquet' SET(c1=c1, c2=bitmap_from_binary(c2)))
with resource 'spark_lxh' properties("timeout" = "3600")

[不支持] array_to_bitmap

bitmap_dict

在 Hive 里构建字典,导入时将字符串转成整数,并构建 bitmap

# create resource
create external resource hive0 properties("type"="hive", "hive.metastore.uris" = "thrift://172.26.194.238:9083");

# 创建外表
create external table hive_t1(c1 int, c2 string) engine=hive properties("resource"="hive0", "database"="lxh", "table"="t1");

# 查询外表
mysql> select * from hive_t1;
+------+------+
| c1   | c2   |
+------+------+
|    1 | str1 |
|    2 | str2 |
|    3 | str1 |
|    3 | str4 |
+------+------+

# load
load label lxh_tmp_19(data from table hive_t1 into table t1  SET(c1=c1, c2=bitmap_dict(c2))) with resource 'spark_lxh' properties("timeout" = "3600");

# 查询内表
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1   | bitmap_to_string(c2) |
+------+----------------------+
|    3 | 1,3                  |
|    1 | 1                    |
|    2 | 2                    |
+------+----------------------+

1.5 Mysql 协议导入

to_bitmap

2.5 ~ main

insert into t1 select 1, to_bitmap(1);
insert into t1 select 1, to_bitmap(2);

mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1   | bitmap_to_string(c2) |
+------+----------------------+
|    1 | 1,2                  |
+------+----------------------+

bitmap_from_string

2.5 ~ main

mysql> insert into t1 select 1, bitmap_from_string("1,2,3,4");
Query OK, 1 row affected (0.18 sec)
{'label':'insert_2910d237-a557-11ee-86a2-5254007b45e6', 'status':'VISIBLE', 'txnId':'260112'}

mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1   | bitmap_to_string(c2) |
+------+----------------------+
|    1 | 1,2,3,4              |
+------+----------------------+

base64_to_bitmap

2.5 ~ main

mysql> insert into t1 select 1, base64_to_bitmap("AjswAAABAAAnAAEAAQAnAA==");
Query OK, 1 row affected (0.20 sec)
{'label':'insert_412474e5-a557-11ee-86a2-5254007b45e6', 'status':'VISIBLE', 'txnId':'260127'}

mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------------------------------------------------------------------------------------------------+
| c1   | bitmap_to_string(c2)                                                                                           |
+------+----------------------------------------------------------------------------------------------------------------+
|    1 | 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40 |
+------+----------------------------------------------------------------------------------------------------------------+
1 row in set (0.01 sec)

bitmap_from_binary

3.0 ~ main

# c2 是通过 bitmap_to_binary 生成的列
select c1, bitmap_from_binary(c2) from t1;

array_to_bitmap

2.5 ~ main

mysql> insert into t1 select 1, array_to_bitmap([1,2,3,4]);
Query OK, 1 row affected (12.30 sec)

mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1   | bitmap_to_string(c2) |
+------+----------------------+
|    1 | 1,2,3,4              |
+------+----------------------+
1 row in set (0.01 sec)

1.6 外表导入

1.6.1 catalog 方式

支持 to_bitmap, bitmap_from_string, base64_to_bitmap, bitmap_from_binary, array_to_bitmap

# hive 建表
create table t4(c1 int, c2 binary) stored as parquet;

# hive 里查询数据
select c1, bitmap_to_string(c2) from t4;
1       1,2,3,4,5,6,7,8,9,10

# 创建 catalog
create external catalog hive_catalog_hms properties(
    "type"="hive", 
    "hive.metastore.type" = "hive", 
    "hive.metastore.uris" = "thrift://IP:PORT"
);

# insert into select
insert into ssb_20.t1 select c1, bitmap_from_binary(c2) from hive_catalog_hms.lxh.t4;

1.6.2 非 catalog 方式

支持 to_bitmap, bitmap_from_string, base64_to_bitmap, array_to_bitmap

不支持 bitmap_from_binary,

# hive 建表
create table t4(c1 int, c2 base64) stored as parquet;

# hive 里查询数据
select c1, bitmap_to_string(bitmap_from_base64(c2)) from t4;
1       1,2,3
2       4,5

# StarRocks 里创建外表
create external table t_external(c1 int, c2 string) engine=HIVE properties(
    "resource"="hive0", 
    "database"="lxh", 
    "table"="t4"
);

# 写入到 bitmap 表
insert into t1 select c1, base64_to_bitmap(c2) from t_external;

# 查看结果
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1   | bitmap_to_string(c2) |
+------+----------------------+
|    1 | 1,2,3                |
|    2 | 4,5                  |
+------+----------------------+

1.7 FILES 函数导入

to_bitmap

hive> create table t1(c1 int, c2 int) stored as parquet;

hive> select * from t1;
1       1
1       2
2       3
2       4
2       5

insert into t1 select c1, to_bitmap(c2) from files ("path" = "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/t1/*", "format"="parquet", "compression" = "uncompressed") ;

mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1   | bitmap_to_string(c2) |
+------+----------------------+
|    1 | 1,2                  |
|    2 | 3,4,5                |
+------+----------------------+

bitmap_from_string

hive> create table t2(c1 int, c2 string) stored as parquet;

hive> select * from t2;
1       1,2,3,4,5,7

insert into t1 select c1, bitmap_from_string(c2) from files ("path" = "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/t2/*", "format"="parquet", "compression" = "uncompressed") ;


mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1   | bitmap_to_string(c2) |
+------+----------------------+
|    1 | 1,2,3,4,5,7          |
+------+----------------------+

base64_to_bitmap

hive> create table t2(c1 int, c2 string) stored as parquet;

hive> select * from t2;
OK
1       AjswAAABAAAnAAEAAQAnAA==

mysql> insert into t1 select c1, base64_to_bitmap(c2) from files ("path" = "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/t2/*", "format"="parquet", "compression" = "uncompressed") ;
Query OK, 1 row affected (0.21 sec)
{'label':'insert_9ff9696c-a9ed-11ee-aa48-c6ffa5c8f7dc', 'status':'VISIBLE', 'txnId':'276143'}

mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------------------------------------------------------------------------------------------------+
| c1   | bitmap_to_string(c2)                                                                                           |
+------+----------------------------------------------------------------------------------------------------------------+
|    1 | 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40 |
+------+----------------------------------------------------------------------------------------------------------------+

bitmap_from_binary

# hive 表结构
create table t3(c1 int, c2 binary) stored as parquet;

insert into t3 select c1, bitmap_agg(c2) from t4 group by c1;

hive> select c1, bitmap_to_string(c2) from t3;
1       1,2,3,5
2       11,100

insert into t1 select c1, bitmap_from_binary(c2) from files ("path" = "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/lxh.db/t3/*", "format"="parquet", "compression" = "uncompressed") ;

mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1   | bitmap_to_string(c2) |
+------+----------------------+
|    1 | 1,2,3,5              |
|    2 | 11,100               |
+------+----------------------+

老师你好,这个有哪种方式不会受1MB限制吗?我这通过hive生成的bitmap大于1MB,通过 catalog 导入数据直接是空的

先用Files函数试试,通过Catalog的方式,我先测试下,如果受1M限制的话,我们优化下。你用的是哪个版本

你是怎么使用的,给个实例

版本是 3.2

hive:

– 2亿行,user_id去重后有240w
create table test_db.hive_tab_001(
user_id bigint
)stored as orcfile;

create table test_db.hive_tab_002(
user_bitmap binary
)stored as orcfile;

– 一行数据,user_bitmap 6.4MB
insert overwrite table test_db.hive_tab_002 select bitmap_agg(user_id) from test_db.hive_tab_001

starrocks:

CREATE TABLE sr_test_001 (
block_offset int not null comment “block_offset”,
user_bitmap bitmap not null comment “user bitmap”
)
PRIMARY KEY (block_offset)
DISTRIBUTED BY HASH (block_offset)
PROPERTIES (
“replication_num” = “1”,
“in_memory” = “false”,
“enable_persistent_index” = “false”,
“replicated_storage” = “true”,
“compression” = “LZ4”
);

insert into sr_test_001 select 1 as block_offset,bitmap_from_binary(user_bitmap) as user_bitmap from hive.test_db.tmp.sr_test_001;

– 结果为0,按上面步骤在生成hive_tab_002时limit 1w行数据,然后导入sr_test_001是可以查询出结果的
select
bitmap_count(user_bitmap) as bc
from sr_test_001;

我先复现下试试