【详述】查询 HLL 列时, 报错 large memory alloc
【背景】
【业务影响】
【是否存算分离】是
【StarRocks版本】3.3.9
【集群规模】3fe + 30 cn
【机器信息】CPU虚拟核/内存/网卡,例如:16C/64G/万兆
【导入或者导出方式】直接查询
【附件】
表结构:
CREATE TABLE `adt_ip_label_hll` (
`__d` date NOT NULL COMMENT "创建时间",
`ip` varchar(50) NOT NULL COMMENT "ip",
`country` varchar(3) NOT NULL COMMENT "国家",
`label_name` varchar(65533) NOT NULL COMMENT "标签名称",
`label_value` hll NOT NULL COMMENT "标签值, hll类型"
) ENGINE=OLAP
UNIQUE KEY(`__d`, `ip`, `country`, `label_name`)
COMMENT "ip 标签表, 值为 hll 类型"
PARTITION BY date_trunc('day', __d)
DISTRIBUTED BY HASH(`ip`, `country`) BUCKETS 16
PROPERTIES (
"compression" = "LZ4",
"datacache.enable" = "false",
"enable_async_write_back" = "false",
"partition_live_number" = "30",
"replication_num" = "1",
"storage_volume" = "volume_hdfs"
);
CREATE TABLE dmp.adt_ip_ml_export (
`__dt` datetime NOT NULL COMMENT "创建时间",
`ip` varchar(50) NOT NULL COMMENT "ip",
`country` varchar(3) NOT NULL COMMENT "国家",
`label_name` varchar(65533) NOT NULL COMMENT "标签名称",
`label_value` bigint(20) NOT NULL COMMENT "标签值"
) ENGINE=OLAP
DUPLICATE KEY(`__dt`, `ip`, `country`, `label_name`)
COMMENT "ip ml 导出表, 临时存储"
PARTITION BY date_trunc('hour', __dt)
DISTRIBUTED BY HASH(`ip`, `country`) BUCKETS 32
PROPERTIES (
"compression" = "LZ4",
"datacache.enable" = "false",
"enable_async_write_back" = "false",
"replication_num" = "1",
"storage_volume" = "volume_s3"
);
查询 SQL
INSERT OVERWRITE dmp.adt_ip_ml_export PARTITION (__dt='2025-03-28 04:00:00')
SELECT '2025-03-28 04:00:00' as __dt,
ip,
country,
label_name,
HLL_UNION_AGG(label_value) AS label_value
FROM dmp.adt_ip_label_hll
WHERE __d >= DATE(DAYS_SUB('2025-03-27', 2))
AND __d < DATE(DAYS_ADD('2025-03-27', 1))
GROUP BY ip, country, label_name;
客户端报错信息:
[2025-03-31 17:17:58] [HY000][5025] Internal error: std::bad_alloc: BE:5450298
找到对应的 CN 节点, 查看日志, 错误如下:
W20250331 07:40:47.760180 139955826542144 mem_hook.cpp:90] large memory alloc, query_id:70d5b5cd-0e03-11f0-b0f1-aa5d5176f662 instance: 70d5b5cd-0e03-11f0-b0f1-aa5d5176f6f5 acquire:2533274790395912 bytes, stack:
@ 0x673d4d5 starrocks::get_stack_trace[abi:cxx11]()
@ 0x890e117 malloc
@ 0xeeb018c operator new(unsigned long)
@ 0x899db2c starrocks::HyperLogLog::HyperLogLog(starrocks::HyperLogLog const&)
@ 0x543fe7f starrocks::ObjectColumn<starrocks::HyperLogLog>::append(starrocks::HyperLogLog const*)
@ 0x543ffe1 starrocks::ObjectColumn<starrocks::HyperLogLog>::append(starrocks::Column const&, unsigned long, unsigned long)
@ 0x53cc1ca starrocks::Chunk::append(starrocks::Chunk const&, unsigned long, unsigned long)
@ 0x58cec59 starrocks::spill::OrderedMemTable::append(std::shared_ptr<starrocks::Chunk>)
@ 0x58411cc starrocks::Status starrocks::spill::RawSpillerWriter::spill<starrocks::spill::IOTaskExecutor, starrocks::spill::ResourceMemTrackerGuard<std::weak_ptr<starrocks::pipeline::QueryContext>, std::weak_ptr<starrocks::spill::Spiller> >&>(starrocks::RuntimeState*, ...(日志行在此处被截断)
@ 0x58437ed starrocks::Status starrocks::spill::Spiller::spill<starrocks::spill::IOTaskExecutor, starrocks::spill::ResourceMemTrackerGuard<std::weak_ptr<starrocks::pipeline::QueryContext>, std::weak_ptr<starrocks::spill::Spiller> > >(starrocks::RuntimeState*, std::sha...(日志行在此处被截断)
@ 0x57d5a08 starrocks::Aggregator::spill_aggregate_data(starrocks::RuntimeState*, std::function<starrocks::StatusOr<std::shared_ptr<starrocks::Chunk> > ()>)
@ 0x6520ad6 starrocks::pipeline::SpillableAggregateBlockingSinkOperator::_spill_all_data(starrocks::RuntimeState*, bool)
@ 0x652aedf starrocks::pipeline::SpillableAggregateBlockingSinkOperator::_try_to_spill_by_auto(starrocks::RuntimeState*, std::shared_ptr<starrocks::Chunk> const&)
@ 0x652b8f6 starrocks::pipeline::SpillableAggregateBlockingSinkOperator::push_chunk(starrocks::RuntimeState*, std::shared_ptr<starrocks::Chunk> const&)
@ 0x53b2da0 starrocks::pipeline::PipelineDriver::process(starrocks::RuntimeState*, int)
@ 0x7dab933 starrocks::pipeline::GlobalDriverExecutor::_worker_thread()
@ 0x8ae1f22 starrocks::ThreadPool::dispatch_thread()
@ 0x8ada259 starrocks::Thread::supervise_thread(void*)
@ 0x7f4a81e53ac3 (/usr/lib/x86_64-linux-gnu/libc.so.6+0x94ac2)
@ 0x7f4a81ee5850 (/usr/lib/x86_64-linux-gnu/libc.so.6+0x12684f)
W20250331 07:40:48.258187 139955826542144 stack_util.cpp:347] 2025-03-31 07:40:48.258160, query_id=70d5b5cd-0e03-11f0-b0f1-aa5d5176f662, fragment_instance_id=70d5b5cd-0e03-11f0-b0f1-aa5d5176f6f5 throws exception: std::bad_alloc, trace:
@ 0x6748efd __wrap___cxa_throw
@ 0xeeb016a operator new(unsigned long) [clone .cold]
@ 0x899db2c starrocks::HyperLogLog::HyperLogLog(starrocks::HyperLogLog const&)
@ 0x543fe7f starrocks::ObjectColumn<starrocks::HyperLogLog>::append(starrocks::HyperLogLog const*)
@ 0x543ffe1 starrocks::ObjectColumn<starrocks::HyperLogLog>::append(starrocks::Column const&, unsigned long, unsigned long)
@ 0x53cc1ca starrocks::Chunk::append(starrocks::Chunk const&, unsigned long, unsigned long)
@ 0x58cec59 starrocks::spill::OrderedMemTable::append(std::shared_ptr<starrocks::Chunk>)
@ 0x58411cc starrocks::Status starrocks::spill::RawSpillerWriter::spill<starrocks::spill::IOTaskExecutor, starrocks::spill::ResourceMemTrackerGuard<std::weak_ptr<starrocks::pipeline::QueryContext>, std::weak_ptr<starrocks::spill::Spiller> >&>(starrocks::RuntimeState*, ...(日志行在此处被截断)
@ 0x58437ed starrocks::Status starrocks::spill::Spiller::spill<starrocks::spill::IOTaskExecutor, starrocks::spill::ResourceMemTrackerGuard<std::weak_ptr<starrocks::pipeline::QueryContext>, std::weak_ptr<starrocks::spill::Spiller> > >(starrocks::RuntimeState*, std::sha...(日志行在此处被截断)
@ 0x57d5a08 starrocks::Aggregator::spill_aggregate_data(starrocks::RuntimeState*, std::function<starrocks::StatusOr<std::shared_ptr<starrocks::Chunk> > ()>)
@ 0x6520ad6 starrocks::pipeline::SpillableAggregateBlockingSinkOperator::_spill_all_data(starrocks::RuntimeState*, bool)
@ 0x652aedf starrocks::pipeline::SpillableAggregateBlockingSinkOperator::_try_to_spill_by_auto(starrocks::RuntimeState*, std::shared_ptr<starrocks::Chunk> const&)
@ 0x652b8f6 starrocks::pipeline::SpillableAggregateBlockingSinkOperator::push_chunk(starrocks::RuntimeState*, std::shared_ptr<starrocks::Chunk> const&)
@ 0x53b2da0 starrocks::pipeline::PipelineDriver::process(starrocks::RuntimeState*, int)
@ 0x7dab933 starrocks::pipeline::GlobalDriverExecutor::_worker_thread()
@ 0x8ae1f22 starrocks::ThreadPool::dispatch_thread()
@ 0x8ada259 starrocks::Thread::supervise_thread(void*)
@ 0x7f4a81e53ac3 (/usr/lib/x86_64-linux-gnu/libc.so.6+0x94ac2)
@ 0x7f4a81ee5850 (/usr/lib/x86_64-linux-gnu/libc.so.6+0x12684f)
W20250331 07:40:48.258782 139955826542144 pipeline_driver_executor.cpp:175] [Driver] Process error, query_id=70d5b5cd-0e03-11f0-b0f1-aa5d5176f662, instance_id=70d5b5cd-0e03-11f0-b0f1-aa5d5176f6f5, status=Internal error: Internal error: std::bad_alloc: BE:5450307
目前测试结论: 感觉上跟 enable_spill 有关系, 具体场景如下:
场景一:
只查一个分区, 数据几个 G,
SET SESSION enable_spill = true;
SET session spill_mode = 'force';
必然报错
场景二:
只查一个分区, 数据几个 G,
SET SESSION enable_spill = true;
SET session spill_mode = 'auto';
在没有触发中间结果落盘的情况下, 查询正常
场景三:
只查一个分区, 数据几百个 G,
SET SESSION enable_spill = true;
SET session spill_mode = 'auto';
一旦触发中间结果落盘, 立马报错
场景三运行状态如下图: 当 disk io util 稍微上升一点时, SQL 就报错了