-
导入
//TODO: 本地 parquet 文件通过 broker 导入
示例表结构
# 建表
CREATE TABLE `t1` (
`c1` int(11) NULL COMMENT "",
`c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);
//TODO: 不同导入方式下的长度限制,还需要系统测试下,当前有些导入会受字符串1M长度的限制。
1.1 Stream Load
-
版本: 2.5 ~ main
-
格式: 只支持 CSV/JSON 格式
to_bitmap
# 准备数据 data.txt
1 11
1 12
2 22
2 32
curl --location-trusted -u root: -T data.txt
-H "columns: tmp1, tmp2, c1=tmp1, c2=to_bitmap(tmp2)"
http://FE_IP:FE_HTTP_PORT/api/ssb_20/t1/_stream_load
bitmap_from_string
# 准备数据 data.txt
1 11,12,13
2 22,23,24
2 30,31,32,33
curl --location-trusted -u root: -T data.txt
-H "columns: tmp1, tmp2, c1=tmp1, c2=bitmap_from_string(tmp2)"
http://FE_IP:FE_HTTP_PORT/api/ssb_20/t1/_stream_load
base64_to_bitmap
# 准备数据 data.txt
1 AjswAAABAAAnAAEAAQAnAA==
2 CgYAAABkAAAAAAAAAGUAAAAAAAAAZgAAAAAAAABnAAAAAAAAAGgAAAAAAAAAaQAAAAAAAAA=
# stream load
curl --location-trusted -u root: -T data.txt
-H "columns: tmp1, tmp2, c1=tmp1, c2=base64_to_bitmap(tmp2)"
http://BE_IP:BE_HTTP_PORT/api/ssb_20/t1/_stream_load
[不支持] bitmap_from_binary
[不支持] array_to_bitmap
1.2 Routine Load
to_bitmap
# 数据
1 1
1 2
2 3
2 4
2 5
# routine load
create routine load label_1 on t1
columns(c1, tmp_c2, c2=to_bitmap(tmp_c2)) properties("format"="csv")
from kafka(
"kafka_broker_list"="IP:PORT",
"kafka_topic"="lxh-test",
"property.kafka_default_offsets" = "OFFSET_BEGINNING"
);
bitmap_from_string
# 数据
1 1,2,3,4
2 5,6,7,8
# routine load
create routine load label_1 on t1
columns(c1, tmp_c2, c2=bitmap_from_string(tmp_c2))
properties("format"="csv")
from kafka(
"kafka_broker_list"="IP:PORT",
"kafka_topic"="lxh-test-string",
"property.kafka_default_offsets" = "OFFSET_BEGINNING"
);
base64_to_bitmap
# 数据
1 AjswAAABAAAnAAEAAQAnAA==
2 CgYAAABkAAAAAAAAAGUAAAAAAAAAZgAAAAAAAABnAAAAAAAAAGgAAAAAAAAAaQAAAAAAAAA=
# routine load
create routine load label_1 on t1
columns(c1, tmp_c2, c2=base64_to_bitmap(tmp_c2))
properties("format"="csv")
from kafka(
"kafka_broker_list"="172.26.194.239:9092",
"kafka_topic"="lxh-test-base64",
"property.kafka_default_offsets" = "OFFSET_BEGINNING"
);
[不支持] bitmap_from_binary
[不支持] array_to_bitmap
1.3 Broker Load
to_bitmap
版本: 2.5 ~ main
# Hive 表结构
create table t3(c1 int, c2 int) stored as parquet;
# Hive 数据
hive> select * from t2;
1 11
1 12
1 13
2 21
2 22
# StarRocks 建表
CREATE TABLE `t1` (
`c1` int(11) NULL COMMENT "",
`c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);
# broker load 导入
load label label_t1(
data infile("hdfs://IP:PORT/user/hive/warehouse/lxh.db/t2/*")
into table t1 format as "parquet" (c1, c2)
set (c1=c1, c2=to_bitmap(c2))
) with broker;
bitmap_from_string
版本: 2.5 ~ main
# hive 表结构
create table t3(c1 int, c2 string) stored as parquet;
# hive 数据
hive> select * from t3;
1 11,12,13,14,15
2 22,23,24
# StarRocks 建表
CREATE TABLE `t1` (
`c1` int(11) NULL COMMENT "",
`c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);
# broker load 导入
load label label_t1_1(
data infile("hdfs://xxx:9000/user/hive/warehouse/lxh.db/t3/*")
into table t1 format as "parquet" (c1, c2)
set (c1=c1, c2=bitmap_from_string(c2))
) with broker;
base64_to_bitmap
版本: 2.5 ~ main
# hive 表结构
create table t3(c1 int, c2 string) stored as parquet;
# hive 数据
hive> select * from t3;
1 AjswAAABAAAnAAEAAQAnAA==
2 CgYAAABkAAAAAAAAAGUAAAAAAAAAZgAAAAAAAABnAAAAAAAAAGgAAAAAAAAAaQAAAAAAAAA=
# StarRocks 建表
CREATE TABLE `t1` (
`c1` int(11) NULL COMMENT "",
`c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);
# broker load 导入
load label label_t1_2(
data infile("hdfs://IP:PORT/user/hive/warehouse/lxh.db/t3/*")
into table t1 format as "parquet" (c1, c2)
set (c1=c1, c2=base64_to_bitmap(c2))
) with broker;
bitmap_from_binary
版本: 3.0 ~ main
# hive 表结构
create table t3(c1 int, c2 binary) stored as parquet;
# hive 数据
hive> select c1, hex(c2) from t3;
1 0A0A0000000100000000000000020000000000000003000000000000000400000000000000050000000000000006000000000000000700000000000000080000000000000009000000000000000A00000000000000
# StarRocks 建表
CREATE TABLE `t1` (
`c1` int(11) NULL COMMENT "",
`c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);
# broker load 导入
load label label_t1_4(
data infile("hdfs://IP:PORT/user/hive/warehouse/lxh.db/t3/*")
into table t1 format as "parquet" (c1, c2)
set (c1=c1, c2=bitmap_from_binary(c2))
) with broker;
[不支持] array_to_bitmap
1.4 Spark Load
to_bitmap
版本: 2.5 ~ main
# hive 表结构
create table t2(c1 int, c2 int) stored as parquet;
# hive 数据
hive> select * from t2;
1 11
1 12
1 13
2 21
2 22
# StarRocks 建表
CREATE TABLE `t1` (
`c1` int(11) NULL COMMENT "",
`c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);
# spark load 导入
load label lxh_tmp_18(data infile("hdfs://IP:PORT/user/hive/warehouse/lxh.db/t2/*")
into table t1 format as 'parquet' SET(c1=c1, c2=to_bitmap(c2)))
with resource 'spark_lxh' properties("timeout" = "3600");
[不支持] bitmap_from_string
[不支持] base64_to_bitmap
TODO:
bitmap_from_binary
版本: 3.0 ~ main
# hive 表结构
create table t1(c1 int, c2 binary) stored as parquet;
# hive 数据
hive> select c1, hex(c2) from t1;
1 0A0A0000000100000000000000020000000000000003000000000000000400000000000000050000000000000006000000000000000700000000000000080000000000000009000000000000000A00000000000000
# StarRocks 建表
CREATE TABLE `t1` (
`c1` int(11) NULL COMMENT "",
`c2` bitmap BITMAP_UNION NULL COMMENT ""
) ENGINE=OLAP
AGGREGATE KEY(`c1`)
DISTRIBUTED BY HASH(`c1`);
# spark load 导入
load label lxh_tmp_19(data infile("hdfs://IP:PORT/user/hive/warehouse/lxh.db/t1/*")
into table t1 format as 'parquet' SET(c1=c1, c2=bitmap_from_binary(c2)))
with resource 'spark_lxh' properties("timeout" = "3600")
[不支持] array_to_bitmap
bitmap_dict
在 Hive 里构建字典,导入时将字符串转成整数,并构建 bitmap
# create resource
create external resource hive0 properties("type"="hive", "hive.metastore.uris" = "thrift://172.26.194.238:9083");
# 创建外表
create external table hive_t1(c1 int, c2 string) engine=hive properties("resource"="hive0", "database"="lxh", "table"="t1");
# 查询外表
mysql> select * from hive_t1;
+------+------+
| c1 | c2 |
+------+------+
| 1 | str1 |
| 2 | str2 |
| 3 | str1 |
| 3 | str4 |
+------+------+
# load
load label lxh_tmp_19(data from table hive_t1 into table t1 SET(c1=c1, c2=bitmap_dict(c2))) with resource 'spark_lxh' properties("timeout" = "3600");
# 查询内表
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1 | bitmap_to_string(c2) |
+------+----------------------+
| 3 | 1,3 |
| 1 | 1 |
| 2 | 2 |
+------+----------------------+
1.5 Mysql 协议导入
to_bitmap
2.5 ~ main
insert into t1 select 1, to_bitmap(1);
insert into t1 select 1, to_bitmap(2);
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1 | bitmap_to_string(c2) |
+------+----------------------+
| 1 | 1,2 |
+------+----------------------+
bitmap_from_string
2.5 ~ main
mysql> insert into t1 select 1, bitmap_from_string("1,2,3,4");
Query OK, 1 row affected (0.18 sec)
{'label':'insert_2910d237-a557-11ee-86a2-5254007b45e6', 'status':'VISIBLE', 'txnId':'260112'}
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1 | bitmap_to_string(c2) |
+------+----------------------+
| 1 | 1,2,3,4 |
+------+----------------------+
base64_to_bitmap
2.5 ~ main
mysql> insert into t1 select 1, base64_to_bitmap("AjswAAABAAAnAAEAAQAnAA==");
Query OK, 1 row affected (0.20 sec)
{'label':'insert_412474e5-a557-11ee-86a2-5254007b45e6', 'status':'VISIBLE', 'txnId':'260127'}
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------------------------------------------------------------------------------------------------+
| c1 | bitmap_to_string(c2) |
+------+----------------------------------------------------------------------------------------------------------------+
| 1 | 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40 |
+------+----------------------------------------------------------------------------------------------------------------+
1 row in set (0.01 sec)
bitmap_from_binary
3.0 ~ main
# c2 是通过 bitmap_to_binary 生成的列
select c1, bitmap_from_binary(c2) from t1;
array_to_bitmap
2.5 ~ main
mysql> insert into t1 select 1, array_to_bitmap([1,2,3,4]);
Query OK, 1 row affected (12.30 sec)
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1 | bitmap_to_string(c2) |
+------+----------------------+
| 1 | 1,2,3,4 |
+------+----------------------+
1 row in set (0.01 sec)
1.6 外表导入
1.6.1 catalog 方式
支持 to_bitmap, bitmap_from_string, base64_to_bitmap, bitmap_from_binary, array_to_bitmap
# hive 建表
create table t4(c1 int, c2 binary) stored as parquet;
# hive 里查询数据
select c1, bitmap_to_string(c2) from t4;
1 1,2,3,4,5,6,7,8,9,10
# 创建 catalog
create external catalog hive_catalog_hms properties(
"type"="hive",
"hive.metastore.type" = "hive",
"hive.metastore.uris" = "thrift://IP:PORT"
);
# insert into select
insert into ssb_20.t1 select c1, bitmap_from_binary(c2) from hive_catalog_hms.lxh.t4;
1.6.2 非 catalog 方式
支持 to_bitmap, bitmap_from_string, base64_to_bitmap, array_to_bitmap
不支持 bitmap_from_binary,
# hive 建表
create table t4(c1 int, c2 base64) stored as parquet;
# hive 里查询数据
select c1, bitmap_to_string(bitmap_from_base64(c2)) from t4;
1 1,2,3
2 4,5
# StarRocks 里创建外表
create external table t_external(c1 int, c2 string) engine=HIVE properties(
"resource"="hive0",
"database"="lxh",
"table"="t4"
);
# 写入到 bitmap 表
insert into t1 select c1, base64_to_bitmap(c2) from t_external;
# 查看结果
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1 | bitmap_to_string(c2) |
+------+----------------------+
| 1 | 1,2,3 |
| 2 | 4,5 |
+------+----------------------+
1.7 FILES 函数导入
to_bitmap
hive> create table t1(c1 int, c2 int) stored as parquet;
hive> select * from t1;
1 1
1 2
2 3
2 4
2 5
insert into t1 select c1, to_bitmap(c2) from files ("path" = "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/t1/*", "format"="parquet", "compression" = "uncompressed") ;
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1 | bitmap_to_string(c2) |
+------+----------------------+
| 1 | 1,2 |
| 2 | 3,4,5 |
+------+----------------------+
bitmap_from_string
hive> create table t2(c1 int, c2 string) stored as parquet;
hive> select * from t2;
1 1,2,3,4,5,7
insert into t1 select c1, bitmap_from_string(c2) from files ("path" = "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/t2/*", "format"="parquet", "compression" = "uncompressed") ;
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1 | bitmap_to_string(c2) |
+------+----------------------+
| 1 | 1,2,3,4,5,7 |
+------+----------------------+
base64_to_bitmap
hive> create table t2(c1 int, c2 string) stored as parquet;
hive> select * from t2;
OK
1 AjswAAABAAAnAAEAAQAnAA==
mysql> insert into t1 select c1, base64_to_bitmap(c2) from files ("path" = "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/t2/*", "format"="parquet", "compression" = "uncompressed") ;
Query OK, 1 row affected (0.21 sec)
{'label':'insert_9ff9696c-a9ed-11ee-aa48-c6ffa5c8f7dc', 'status':'VISIBLE', 'txnId':'276143'}
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------------------------------------------------------------------------------------------------+
| c1 | bitmap_to_string(c2) |
+------+----------------------------------------------------------------------------------------------------------------+
| 1 | 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40 |
+------+----------------------------------------------------------------------------------------------------------------+
bitmap_from_binary
# hive 表结构
create table t3(c1 int, c2 binary) stored as parquet;
insert into t3 select c1, bitmap_agg(c2) from t4 group by c1;
hive> select c1, bitmap_to_string(c2) from t3;
1 1,2,3,5
2 11,100
insert into t1 select c1, bitmap_from_binary(c2) from files ("path" = "hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/lxh.db/t3/*", "format"="parquet", "compression" = "uncompressed") ;
mysql> select c1, bitmap_to_string(c2) from t1;
+------+----------------------+
| c1 | bitmap_to_string(c2) |
+------+----------------------+
| 1 | 1,2,3,5 |
| 2 | 11,100 |
+------+----------------------+