查询 HLL 列时, 报错 large memory alloc

【详述】查询 HLL 列时, 报错 large memory alloc
【背景】
【业务影响】
【是否存算分离】是
【StarRocks版本】3.3.9
【集群规模】3fe + 30 cn
【机器信息】CPU虚拟核/内存/网卡,例如:16C/64G/万兆
【导入或者导出方式】直接查询
【附件】
表结构:

CREATE TABLE `adt_ip_label_hll` (
  `__d` date NOT NULL COMMENT "创建时间",
  `ip` varchar(50) NOT NULL COMMENT "ip",
  `country` varchar(3) NOT NULL COMMENT "国家",
  `label_name` varchar(65533) NOT NULL COMMENT "标签名称",
  `label_value` hll NOT NULL COMMENT "标签值, hll类型"
) ENGINE=OLAP 
UNIQUE KEY(`__d`, `ip`, `country`, `label_name`)
COMMENT "ip 标签表, 值为 hll 类型"
PARTITION BY date_trunc('day', __d)
DISTRIBUTED BY HASH(`ip`, `country`) BUCKETS 16 
PROPERTIES (
"compression" = "LZ4",
"datacache.enable" = "false",
"enable_async_write_back" = "false",
"partition_live_number" = "30",
"replication_num" = "1",
"storage_volume" = "volume_hdfs"
);

CREATE TABLE dmp.adt_ip_ml_export (
  `__dt` datetime NOT NULL COMMENT "创建时间",
  `ip` varchar(50) NOT NULL COMMENT "ip",
  `country` varchar(3) NOT NULL COMMENT "国家",
  `label_name` varchar(65533) NOT NULL COMMENT "标签名称",
  `label_value` bigint(20) NOT NULL COMMENT "标签值"
) ENGINE=OLAP
DUPLICATE KEY(`__dt`, `ip`, `country`, `label_name`)
COMMENT "ip ml 导出表, 临时存储"
PARTITION BY date_trunc('hour', __dt)
DISTRIBUTED BY HASH(`ip`, `country`) BUCKETS  32
PROPERTIES (
"compression" = "LZ4",
"datacache.enable" = "false",
"enable_async_write_back" = "false",
"replication_num" = "1",
"storage_volume" = "volume_s3"
);

查询 SQL

INSERT OVERWRITE dmp.adt_ip_ml_export PARTITION (__dt='2025-03-28 04:00:00') 
SELECT '2025-03-28 04:00:00' as __dt,  
  ip, 
  country, 
  label_name, 
  HLL_UNION_AGG(label_value) AS label_value
FROM dmp.adt_ip_label_hll
WHERE __d >= DATE(DAYS_SUB('2025-03-27', 2))
          AND __d <  DATE(DAYS_ADD('2025-03-27', 1))
GROUP BY ip, country, label_name;

客户端报错信息:

[2025-03-31 17:17:58] [HY000][5025] Internal error: std::bad_alloc: BE:5450298

找到对应的 CN 节点, 查看日志, 错误如下:

W20250331 07:40:47.760180 139955826542144 mem_hook.cpp:90] large memory alloc, query_id:70d5b5cd-0e03-11f0-b0f1-aa5d5176f662 instance: 70d5b5cd-0e03-11f0-b0f1-aa5d5176f6f5 acquire:2533274790395912 bytes, stack:
    @          0x673d4d5  starrocks::get_stack_trace[abi:cxx11]()
    @          0x890e117  malloc
    @          0xeeb018c  operator new(unsigned long)
    @          0x899db2c  starrocks::HyperLogLog::HyperLogLog(starrocks::HyperLogLog const&)
    @          0x543fe7f  starrocks::ObjectColumn<starrocks::HyperLogLog>::append(starrocks::HyperLogLog const*)
    @          0x543ffe1  starrocks::ObjectColumn<starrocks::HyperLogLog>::append(starrocks::Column const&, unsigned long, unsigned long)
    @          0x53cc1ca  starrocks::Chunk::append(starrocks::Chunk const&, unsigned long, unsigned long)
    @          0x58cec59  starrocks::spill::OrderedMemTable::append(std::shared_ptr<starrocks::Chunk>)
    @          0x58411cc  starrocks::Status starrocks::spill::RawSpillerWriter::spill<starrocks::spill::IOTaskExecutor, starrocks::spill::ResourceMemTrackerGuard<std::weak_ptr<starrocks::pipeline::QueryContext>, std::weak_ptr<starrocks::spill::Spiller> >&>(starrocks::RuntimeState*, ...(符号名被截断)
    @          0x58437ed  starrocks::Status starrocks::spill::Spiller::spill<starrocks::spill::IOTaskExecutor, starrocks::spill::ResourceMemTrackerGuard<std::weak_ptr<starrocks::pipeline::QueryContext>, std::weak_ptr<starrocks::spill::Spiller> > >(starrocks::RuntimeState*, std::sha...(符号名被截断)
    @          0x57d5a08  starrocks::Aggregator::spill_aggregate_data(starrocks::RuntimeState*, std::function<starrocks::StatusOr<std::shared_ptr<starrocks::Chunk> > ()>)
    @          0x6520ad6  starrocks::pipeline::SpillableAggregateBlockingSinkOperator::_spill_all_data(starrocks::RuntimeState*, bool)
    @          0x652aedf  starrocks::pipeline::SpillableAggregateBlockingSinkOperator::_try_to_spill_by_auto(starrocks::RuntimeState*, std::shared_ptr<starrocks::Chunk> const&)
    @          0x652b8f6  starrocks::pipeline::SpillableAggregateBlockingSinkOperator::push_chunk(starrocks::RuntimeState*, std::shared_ptr<starrocks::Chunk> const&)
    @          0x53b2da0  starrocks::pipeline::PipelineDriver::process(starrocks::RuntimeState*, int)
    @          0x7dab933  starrocks::pipeline::GlobalDriverExecutor::_worker_thread()
    @          0x8ae1f22  starrocks::ThreadPool::dispatch_thread()
    @          0x8ada259  starrocks::Thread::supervise_thread(void*)
    @     0x7f4a81e53ac3  (/usr/lib/x86_64-linux-gnu/libc.so.6+0x94ac2)
    @     0x7f4a81ee5850  (/usr/lib/x86_64-linux-gnu/libc.so.6+0x12684f)
W20250331 07:40:48.258187 139955826542144 stack_util.cpp:347] 2025-03-31 07:40:48.258160, query_id=70d5b5cd-0e03-11f0-b0f1-aa5d5176f662, fragment_instance_id=70d5b5cd-0e03-11f0-b0f1-aa5d5176f6f5 throws exception: std::bad_alloc, trace:
     @          0x6748efd  __wrap___cxa_throw
    @          0xeeb016a  operator new(unsigned long) [clone .cold]
    @          0x899db2c  starrocks::HyperLogLog::HyperLogLog(starrocks::HyperLogLog const&)
    @          0x543fe7f  starrocks::ObjectColumn<starrocks::HyperLogLog>::append(starrocks::HyperLogLog const*)
    @          0x543ffe1  starrocks::ObjectColumn<starrocks::HyperLogLog>::append(starrocks::Column const&, unsigned long, unsigned long)
    @          0x53cc1ca  starrocks::Chunk::append(starrocks::Chunk const&, unsigned long, unsigned long)
    @          0x58cec59  starrocks::spill::OrderedMemTable::append(std::shared_ptr<starrocks::Chunk>)
    @          0x58411cc  starrocks::Status starrocks::spill::RawSpillerWriter::spill<starrocks::spill::IOTaskExecutor, starrocks::spill::ResourceMemTrackerGuard<std::weak_ptr<starrocks::pipeline::QueryContext>, std::weak_ptr<starrocks::spill::Spiller> >&>(starrocks::RuntimeState*, ...(符号名被截断)
    @          0x58437ed  starrocks::Status starrocks::spill::Spiller::spill<starrocks::spill::IOTaskExecutor, starrocks::spill::ResourceMemTrackerGuard<std::weak_ptr<starrocks::pipeline::QueryContext>, std::weak_ptr<starrocks::spill::Spiller> > >(starrocks::RuntimeState*, std::sha...(符号名被截断)
    @          0x57d5a08  starrocks::Aggregator::spill_aggregate_data(starrocks::RuntimeState*, std::function<starrocks::StatusOr<std::shared_ptr<starrocks::Chunk> > ()>)
    @          0x6520ad6  starrocks::pipeline::SpillableAggregateBlockingSinkOperator::_spill_all_data(starrocks::RuntimeState*, bool)
    @          0x652aedf  starrocks::pipeline::SpillableAggregateBlockingSinkOperator::_try_to_spill_by_auto(starrocks::RuntimeState*, std::shared_ptr<starrocks::Chunk> const&)
    @          0x652b8f6  starrocks::pipeline::SpillableAggregateBlockingSinkOperator::push_chunk(starrocks::RuntimeState*, std::shared_ptr<starrocks::Chunk> const&)
    @          0x53b2da0  starrocks::pipeline::PipelineDriver::process(starrocks::RuntimeState*, int)
    @          0x7dab933  starrocks::pipeline::GlobalDriverExecutor::_worker_thread()
    @          0x8ae1f22  starrocks::ThreadPool::dispatch_thread()
    @          0x8ada259  starrocks::Thread::supervise_thread(void*)
    @     0x7f4a81e53ac3  (/usr/lib/x86_64-linux-gnu/libc.so.6+0x94ac2)
    @     0x7f4a81ee5850  (/usr/lib/x86_64-linux-gnu/libc.so.6+0x12684f)

W20250331 07:40:48.258782 139955826542144 pipeline_driver_executor.cpp:175] [Driver] Process error, query_id=70d5b5cd-0e03-11f0-b0f1-aa5d5176f662, instance_id=70d5b5cd-0e03-11f0-b0f1-aa5d5176f6f5, status=Internal error: Internal error: std::bad_alloc: BE:5450307

目前的测试结论: 感觉问题跟 enable_spill 有关系。

场景一:
只查一个分区, 数据几个 G,
SET SESSION enable_spill = true;
SET session spill_mode = 'force';
必然报错

场景二:
只查一个分区, 数据几个 G,
SET SESSION enable_spill = true;
SET session spill_mode = 'auto';
在没有触发中间结果落盘的情况下, 查询正常

场景三:
只查一个分区, 数据几百个 G,
SET SESSION enable_spill = true;
SET session spill_mode = 'auto';
一旦触发中间结果落盘, 立马报错

场景三运行状态如下图: 当 disk io util 稍微上升一点点 (即开始落盘) 时, SQL 就报错了