【详述】row_number()分析函数很慢
【业务影响】
【StarRocks版本】2.3.0
【集群规模】 3fe +4be
【机器信息】四台机器,每台机器配置(48c,251g,6块ssd盘)
表结构如下:
CREATE TABLE mid_ship_site_seq (
sub_ship_id bigint(20) NULL COMMENT “”,
ent_site_min_tm datetime NULL COMMENT “”,
ship_id int(11) NULL COMMENT “”,
unit_tm datetime NULL COMMENT “”,
scan_site int(11) NULL COMMENT “”,
cur_site_typ smallint(6) NULL COMMENT “”,
stip_nxt_site int(11) NULL COMMENT “”,
lod_scan_tm datetime NULL COMMENT “”,
lod_db_tm datetime NULL COMMENT “”,
lod_hdvr_id varchar(30) NULL COMMENT “”,
lod_hdvr_ent_site_tm datetime NULL COMMENT “”,
uld_hdvr_out_site_tm datetime NULL COMMENT “”,
nxt_scan_site int(11) NULL COMMENT “”,
nxt_site_typ int(11) NULL COMMENT “”,
uld_scan_tm datetime NULL COMMENT “”,
uld_scan_tm_30m datetime NULL COMMENT “卸车扫描时间加半小时时间格式”,
uld_scan_dt_30m datetime NULL COMMENT “卸车扫描时间加半小时日期格式”,
uld_db_tm datetime NULL COMMENT “”,
uld_hdvr_id varchar(30) NULL COMMENT “”,
pre_scan_site int(11) NULL COMMENT “”,
pre_site_typ smallint(6) NULL COMMENT “”,
actl_ent_site_tm datetime NULL COMMENT “”,
actl_out_site_tm datetime NULL COMMENT “”,
actl_ent_site_tm_3h datetime NULL COMMENT “实际进站时间+3小时时间格式”,
actl_ent_site_dt_3h datetime NULL COMMENT “实际进站时间+3小时日期格式”,
delv_tm datetime NULL COMMENT “”,
rcv_site int(11) NULL COMMENT “”,
rcv_site_typ smallint(6) NULL COMMENT “”,
rcv_tm datetime NULL COMMENT “”
) ENGINE=OLAP
DUPLICATE KEY(sub_ship_id, ent_site_min_tm)
COMMENT “补充站点装卸车信息”
DISTRIBUTED BY HASH(ship_id) BUCKETS 48
PROPERTIES (
“replication_num” = “1”,
“in_memory” = “false”,
“storage_format” = “DEFAULT”,
“enable_persistent_index” = “false”
);
CREATE TABLE ods_sub_rtng_wid_unit (
sub_ship_id bigint(20) NULL COMMENT “子单号”,
ent_site_min_tm datetime NULL COMMENT “进站最小时间”,
unit_tm datetime NULL COMMENT “开单时间”,
site_seq int(11) NULL COMMENT “站点顺序”,
ship_id int(11) NULL COMMENT “主单号”,
actl_pre_scan_site int(11) NULL COMMENT “实际上一站”,
actl_pre_site_typ smallint(6) NULL COMMENT “上一站机构类型”,
scan_site int(11) NULL COMMENT “当前站点”,
cur_site_typ smallint(6) NULL COMMENT “当前站点类型”,
actl_nxt_scan_site int(11) NULL COMMENT “”,
actl_nxt_site_typ smallint(6) NULL COMMENT “”,
stip_nxt_site int(11) NULL COMMENT “”,
lod_scan_tm datetime NULL COMMENT “”,
lod_db_tm datetime NULL COMMENT “”,
lod_hdvr_id varchar(30) NULL COMMENT “”,
uld_scan_tm datetime NULL COMMENT “”,
uld_db_tm datetime NULL COMMENT “”,
uld_hdvr_id varchar(30) NULL COMMENT “”,
actl_ent_site_tm datetime NULL COMMENT “”,
actl_out_site_tm datetime NULL COMMENT “”,
stip_out_site_tm datetime NULL COMMENT “”,
delv_tm datetime NULL COMMENT “”,
rcv_site int(11) NULL COMMENT “”,
rcv_site_typ smallint(6) NULL COMMENT “”,
rcv_tm datetime NULL COMMENT “”
) ENGINE=OLAP
DUPLICATE KEY(sub_ship_id, ent_site_min_tm)
COMMENT “”
DISTRIBUTED BY HASH(ship_id) BUCKETS 48
PROPERTIES (
“replication_num” = “1”,
“in_memory” = “false”,
“storage_format” = “DEFAULT”,
“enable_persistent_index” = “false”
);
执行如下insert into select 语句使用row_number函数耗时10s,不使用row_number函数耗时2s,数据量大概1700w,并行度24,开启CBO优化器。同样的语句在greenplum里也是耗时10s左右,starrocks还有提升空间吗?
因为还要使用各种lead和lag函数,就会变的更慢。
profile.txt (24.6 KB)
explain costs.txt (7.6 KB)
insert into ods.ods_sub_rtng_wid_unit
(
unit_tm,
site_seq,
ent_site_min_tm,
ship_id,
sub_ship_id,
actl_pre_scan_site,
actl_pre_site_typ,
scan_site,
cur_site_typ,
actl_nxt_scan_site,
actl_nxt_site_typ,
stip_nxt_site,
lod_scan_tm,
lod_db_tm,
lod_hdvr_id,
uld_scan_tm,
uld_db_tm,
uld_hdvr_id,
actl_ent_site_tm,
actl_out_site_tm,
delv_tm,
rcv_site,
rcv_site_typ,
rcv_tm
)
select unit_tm
,row_number() over(partition by sub_ship_id order by ent_site_min_tm) site_seq
,ent_site_min_tm
,ship_id
,sub_ship_id
,scan_site actl_pre_scan_site
,cur_site_typ actl_pre_site_typ
,scan_site
,cur_site_typ
,scan_site actl_nxt_scan_site
,cur_site_typ actl_nxt_site_typ
,stip_nxt_site --规定下一站
,lod_scan_tm
,lod_db_tm
,lod_hdvr_id
,uld_scan_tm
,uld_db_tm
,uld_hdvr_id
,actl_ent_site_tm
,actl_out_site_tm
,delv_tm
,rcv_site
,rcv_site_typ
,rcv_tm
from ods.mid_ship_site_seq a
;