【详述】创建hive+starrocks物理表的视图,dt作为分区字段,查询视图表的sql where条件包含自定义函数对dt字段的操作,但自定义函数返回的是常量。观察执行计划,发现自定义函数谓词不下推,分区裁剪失效
【背景】自定义函数 谓词不下推
【业务影响】
【StarRocks版本】2.5.4
【集群规模】例如:3fe(1 follower+2observer)+5be(fe与be混部)
【机器信息】CPU虚拟核/内存/网卡,例如:48C/64G/万兆
【联系方式】社区群3-张立杰
【附件】
– 创建starrocks表
CREATE TABLE tmp.test
(
dt
date NOT NULL COMMENT “”,
id
int(11) NOT NULL COMMENT “”,
name
varchar(65533) NULL COMMENT “”
) ENGINE = OLAP PRIMARY KEY(dt
,id
)
COMMENT “OLAP”
PARTITION BY RANGE(dt
)
(
PARTITION p20230330 VALUES [(“2023-03-30”), (“2023-03-31”)),
PARTITION p20230331 VALUES [(“2023-03-31”), (“2023-04-01”)),
PARTITION p20230401 VALUES [(“2023-04-01”), (“2023-04-02”)),
PARTITION p20230402 VALUES [(“2023-04-02”), (“2023-04-03”)),
PARTITION p20230403 VALUES [(“2023-04-03”), (“2023-04-04”)),
PARTITION p20230404 VALUES [(“2023-04-04”), (“2023-04-05”)),
PARTITION p20230405 VALUES [(“2023-04-05”), (“2023-04-06”)),
PARTITION p20230406 VALUES [(“2023-04-06”), (“2023-04-07”)),
PARTITION p20230407 VALUES [(“2023-04-07”), (“2023-04-08”)),
PARTITION p20230408 VALUES [(“2023-04-08”), (“2023-04-09”)),
PARTITION p20230409 VALUES [(“2023-04-09”), (“2023-04-10”)))
DISTRIBUTED BY HASH(id
) BUCKETS 3
PROPERTIES (
“replication_num” = “1”
);
– 插入数据
insert into tmp.test
values (“2023-04-01”, “72777”, “tt”),
(“2023-04-02”, “7777”, “ttt”),
(“2023-04-03”, “7777”, “t”),
(“2023-04-04”, “7777”, “tttt”),
(“2023-04-05”, “7777”, “bb”),
(“2023-04-06”, “7777”, “ttt”),
(“2023-04-07”, “7777”, “ttt”),
(“2023-04-08”, “7777”, “ttt”),
(“2023-04-09”, “7”, “ttt”);
– 创建hive表
create table if not exists tmp.t0
(
id bigint,
name string
) partitioned by (dt String);
– 插入数据
insert into tmp.t0 PARTITION (dt=‘2023-03-25’)
values (“111”, “t”);
insert into tmp.t0 PARTITION (dt=‘2023-03-26’)
values (“222”, “t”);
insert into tmp.t0 PARTITION (dt=‘2023-03-27’)
values (“333”, “t”);
insert into tmp.t0 PARTITION (dt=‘2023-03-28’)
values (“444”, “t”);
insert into tmp.t0 PARTITION (dt=‘2023-03-29’)
values (“555”, “t”);
insert into tmp.t0 PARTITION (dt=‘2023-03-30’)
values (“666”, “t”);
insert into tmp.t0 PARTITION (dt=‘2023-03-31’)
values (“8888”, “t”);
insert into tmp.t0 PARTITION (dt=‘2023-04-01’)
values (“1111”, “bb”);
insert into tmp.t0 PARTITION (dt=‘2023-04-02’)
values (“2222”, “ttt”);
insert into tmp.t0 PARTITION (dt=‘2023-04-03’)
values (“3333”, “t”);
insert into tmp.t0 PARTITION (dt=‘2023-04-04’)
values (“4444”, “ttt”);
insert into tmp.t0 PARTITION (dt=‘2023-04-05’)
values (“5555”, “t”);
insert into tmp.t0 PARTITION (dt=‘2023-04-06’)
values (“6666”, “ttt”);
insert into tmp.t0 PARTITION (dt=‘2023-04-08’)
values (“7777”, “ttt”);
insert into tmp.t0 PARTITION (dt=‘2023-04-09’)
values (“8888”, “t”);
– 创建视图
drop view tmp.view_sr_hive;
create view tmp.view_sr_hive
(dt, id, name)
as
select dt, id, name
from default_catalog.tmp.test
where dt >= ‘2023-04-01’
union all
select dt, id, name
from hive_catalog.tmp.t0
where dt < ‘2023-04-01’;
– 查询计划
REFRESH EXTERNAL TABLE hive_catalog.tmp.t0;
explain select count(1), dt from tmp.view_sr_hive where dt >= CAST(db_udf.days_add(CAST(db_udf.first_day(‘2023-04-10’, 1, 0) AS DATETIME), -1) AS VARCHAR) group by dt order by dt;