starRocks大表join 查询超时

我现在遇到一个场景,a 表 join b 表,a表 数据量 30亿条, b表 5000万,join 会超时 有好的解决方案吗?手动设置过 SET exec_mem_limit = 10589934592;
SET query_timeout = 600; 还是超时

您好,可以将您建表语句,查询语句脱敏发一下。集群配置,并行度设置也请补充一下。

SET exec_mem_limit = 10589934592;
SET query_timeout = 600;
select count(uid), sum(real_money) ,sum(count_in) from
(
select tbla.uid, tbla.product_mongo_id_real, tbla.act_type ,tblb2.real_money ,tblb2.count_in from
(
select distinct uid,
case
when component_id = ‘product_cart’ and act_type=‘click’ then ‘consumer’
when act_type=‘click’ then ‘click’
when act_type=‘exposure’ then ‘exposure’ else null end act_type,
product_mongo_id_real
from table_a where page_id= ‘categoryPage’
and date = ‘2021-11-14’ and product_mongo_id_real is not null
) tbla
left join
(select distinct order_number , pay_money, ifnull(real_money,0) as real_money,id_in,ifnull(count_in,0) as count_in from table_b where intime <= ‘2021-11-14 23:59:59’ and intime >= ‘2021-11-14 00:00:00’ and pay_status= 1 and status= 0 ) tblb2
on tbla.product_mongo_id_real = tblb2.id_in
) tbc;

集群三个节点,fe的 jvm 配置 80G, 三台机器 32 核 128G,并行度 默认

CREATE TABLE IF NOT EXISTS table_a
(
time bigint,
version VARCHAR(200),
build VARCHAR(200),
device VARCHAR(200),
channel VARCHAR(200),
ip VARCHAR(200),
station_id VARCHAR(200),
longitude double,
latitude double,
area_id VARCHAR(200),
is_make boolean,
uid VARCHAR(200),
is_vip int,
session_id VARCHAR(200),
android VARCHAR(200),
brand VARCHAR(200),
mobile_model VARCHAR(200),
seq VARCHAR(200),
app VARCHAR(200),
trace VARCHAR(200),
os int,
page VARCHAR(200) ,
act_type VARCHAR(200),
module VARCHAR(200),
component_id VARCHAR(200),
event int,
event_version int,
first_category_id VARCHAR(200),
first_category_name VARCHAR(200),
second_id VARCHAR(200),
second_name VARCHAR(200),
pv_id VARCHAR(200) ,
date date NOT NULL COMMENT “”
)
ENGINE=olap
PARTITION BY RANGE (date)
()
DISTRIBUTED BY HASH(time)
BUCKETS 5
PROPERTIES(
“replication_num” = “2”,
“storage_medium” = “HDD”,
“dynamic_partition.enable” = “true”,
“dynamic_partition.time_unit” = “DAY”,
“dynamic_partition.start” = “-3”,
“dynamic_partition.end” = “3”,
“dynamic_partition.prefix” = “p”,
“dynamic_partition.buckets” = “5”
);

CREATE TABLE IF NOT EXISTS table_b
(
_id VARCHAR(40960),
activity ARRAY<VARCHAR(40960)>,
address VARCHAR(40960),
add_city VARCHAR(40960),
client_id VARCHAR(40960),
delivery_time bigint,
balance_money DOUBLE,
buyer VARCHAR(40960),
cancel_note VARCHAR(40960),
cancel_reason VARCHAR(40960),
cancel_sub_type BIGINT,
cancel_time BIGINT,
cancel_type BIGINT,
clean_product_pack BOOLEAN,
create_time BIGINT,
current_position ARRAY,
cut_stock BOOLEAN,
deliverer_id VARCHAR(40960),
discount_ticket_money STRING,
discount_money DOUBLE,
event_id VARCHAR(40960),
finish_time BIGINT,
finish_type BIGINT,
first_blood BOOLEAN,
flags ARRAY<VARCHAR(40960)>,
free_type BIGINT,
freight_money DOUBLE,
origin_money DOUBLE,
full_to_off STRING,

activity_id_in VARCHAR(40960),
bar_code_in VARCHAR(40960),
type_in BIGINT,
category_path_in VARCHAR(65530),
conditions_num_in VARCHAR(40960),
count_in BIGINT,
discount_gift_ticket_money_in VARCHAR(40960),
discount_money_in DOUBLE,
doing_refund_count_in BIGINT,
doing_refund_price_in DOUBLE,
extra_part_refund_money_in DOUBLE,
full_name_in VARCHAR(40960),
gift_activity_id_in VARCHAR(40960),
gp_date_in BIGINT,
id_in VARCHAR(40960),
instant_rebate_money_in DOUBLE,
ish_booking_in BIGINT,
ish_bulk_in BIGINT,
ish_coupon_gift_in BIGINT,
ish_new_user_in BOOLEAN,
ish_unsold_in BIGINT,
manage_category_path_in VARCHAR(40960) ,
net_weight_in BIGINT,
net_weight_unit_in VARCHAR(40960),
number_in BIGINT,
origin_price_in DOUBLE,
parent_id_in VARCHAR(40960),
pay_money_in VARCHAR(40960),
presale_id_in VARCHAR(40960),
presale_type_in BIGINT,
price_in DOUBLE,
price_type_in BIGINT,
product_group_id_in VARCHAR(40960),
product_name_in VARCHAR(40960),
promotion_num_in BIGINT,
purchase_price_in VARCHAR(40960),
purchase_refund_money_in DOUBLE,
purchase_stockout_in BIGINT,
real_price_in DOUBLE,
real_weight_in BIGINT,
refund_count_in BIGINT,
refund_point_num_in BIGINT,
refund_point_product_count_in BIGINT,
refund_price_in DOUBLE,
return_money_in DOUBLE,
sale_type_in BIGINT,
size_price_in DOUBLE,
sku_activity_id_in VARCHAR(40960),
small_image_in VARCHAR(40960),
stockout_in BIGINT,
storage_value_id_in BIGINT,
tax_category_in VARCHAR(40960),
temperature_layer_in VARCHAR(40960),
total_price_in DOUBLE,
type_in BIGINT,
used_balance_money_in DOUBLE,
used_point_money_in DOUBLE,
used_point_num_in BIGINT,
view_total_weight_in VARCHAR(40960),
voucher_card_money_in VARCHAR(40960),
weight_in BIGINT,

goods_origin_money DOUBLE,
goods_real_money DOUBLE,
groupon_config VARCHAR(40960),
groupon_id VARCHAR(40960),
has_clean_product BOOLEAN,
instant_rebate_money DOUBLE,
invoice_money DOUBLE,
invoice_type VARCHAR(40960),
ish_bad_delivery BIGINT,
ish_booking BIGINT,
ish_comment BOOLEAN,
ish_comment_delivery BIGINT,
ish_cut_stock BOOLEAN,
ish_groupon BIGINT,
ish_help_buy BIGINT,
ish_limit_order BOOLEAN,
ish_notice_partner BIGINT,
ish_open_invoice BIGINT,
ish_presale BIGINT,
ish_purchase_stockout BIGINT,
ish_push_fdc BIGINT,
ish_refund BOOLEAN,
ish_timeout BIGINT,
ish_vip BIGINT,
need_pay_money DOUBLE,
new_share BIGINT,
order_note VARCHAR(65530),
order_number VARCHAR(40960),
order_type BIGINT,
pack_money DOUBLE,
partner_task_status BIGINT,
pay_discount_money DOUBLE,
pay_money DOUBLE,
pay_note VARCHAR(40960),
pay_status BIGINT,
pay_time BIGINT,
pay_type BIGINT,
platform_id VARCHAR(40960),
purchase_stockout_refund_money VARCHAR(40960),
record_create_time BIGINT,
refund_balance_money DOUBLE,
refund_money DOUBLE,
update_time BIGINT,
time_big_end BIGINT,
time_big_start BIGINT,
time_end BIGINT,
time_start BIGINT,
return_money DOUBLE,
schedule_note VARCHAR(40960),
shard_key BIGINT,
arrival BIGINT,
station_id VARCHAR(40960),
station_oid BIGINT,
status  BIGINT,
replenishment BIGINT,
return_coupon BIGINT,
order_type BIGINT,
time_type BIGINT,
total DOUBLE,
trade_id VARCHAR(40960),
update_time BIGINT,
used_money DOUBLE,
used_num BIGINT,
user VARCHAR(200),
user_delete BIGINT,
user_info VARCHAR(40960),
reserved_time_end BIGINT,
reserved_time_start BIGINT,
ticket_id VARCHAR(40960),
vipdiscount_money DOUBLE ,
vipuser_ticket_id VARCHAR(40960),
vip_money DOUBLE,
vip_number VARCHAR(40960),
viporiginmoney DOUBLE,
card_money VARCHAR(40960),
intime DATETIME

)
ENGINE=olap
PARTITION BY RANGE (intime)
()
DISTRIBUTED BY HASH(_id)
BUCKETS 5
PROPERTIES(
“replication_num” = “2”,
“storage_medium” = “HDD”,
“dynamic_partition.enable” = “true”,
“dynamic_partition.time_unit” = “DAY”,
“dynamic_partition.start” = “-3”,
“dynamic_partition.end” = “3”,
“dynamic_partition.prefix” = “p”,
“dynamic_partition.buckets” = “5”
);

您执行下该命令 show variables like “parallel_fragment_exec_instance_num”; show variables like “%cbo%”; 反馈一下执行结果

我减小数据量, 一个 表 1000多万,一个表 400多万 也出不来结果,为啥,是需要设置别的什么参数吗

请您执行一下show data;看一下当前两张表的大小,您目前的分桶数较低,建议增加分桶数。每个桶的数据量建议100M-1G,目前您设置的并行度是16,最优建议一个并行度设置两个分桶,您可以根据以上两点调整分桶数。另外您的sql中可以将关联表的左表设置为小表。

±---------------------------------±---------------±--------------------+
| TableName | Size | ReplicaCount |
±---------------------------------±---------------±--------------------+
| table_b | 34.300 GB | 70 |
| table_a | 2.264 TB | 150 |

您可以开启profile,将query_timeout 时间调整更大一点,使其有查询结果。将profile以附件的形式上传,帮您分析下。

超时时间 设置为 30分钟都没结果生成,分桶数可以修改吗

分桶数调整仅能在新增的分区中进行调整,已经创建的分桶无法调整。而且您的表现在没有创建分区。你可以将表删除掉重新创建导入数据创建时设置好分区分桶。

| TableName | Size | ReplicaCount |
±---------------------------------±---------------±--------------------+
| table_b | 6.402 GB | 500 |
| table_a | 983.114 GB | 1500 |
现在这个数据量join 都跑不出来,table_a 100个bukets ,table_b 50个 bucket