常见 Crash / BUG / 优化 查询

  1. BE TabletSchemaMap 死锁

Thread 1331 (Thread 0x7f697b1fe700 (LWP 98875)):
#0  0x00007fe5a12794ed in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007fe5a1274dcb in _L_lock_883 () from /lib64/libpthread.so.0
#2  0x00007fe5a1274c98 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x00000000042f5efb in starrocks::TabletSchemaMap::emplace(starrocks::TabletSchemaPB const&) ()
#4  0x00000000042d673d in starrocks::TabletMeta::init_from_pb(starrocks::TabletMetaPB*) ()
#5  0x00000000042b4e42 in starrocks::Tablet::generate_tablet_meta_copy_unlocked(std::shared_ptr<starrocks::TabletMeta> const&) const ()
#6  0x000000000429d05e in starrocks::SnapshotManager::snapshot_full[abi:cxx11](std::shared_ptr<starrocks::Tablet> const&, long, long, bool) ()
#7  0x000000000429f223 in starrocks::SnapshotManager::make_snapshot(starrocks::TSnapshotRequest const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*) ()
#8  0x0000000002caccd1 in starrocks::AgentServer::Impl::make_snapshot(starrocks::TAgentResult&, starrocks::TSnapshotRequest const&) ()
#9  0x0000000004c4f94d in starrocks::BackendServiceProcessor::process_make_snapshot(int, apache::thrift::protocol::TProtocol*, apache::thrift::protocol::TProtocol*, void*) ()
#10 0x0000000004c55c92 in starrocks::BackendServiceProcessor::dispatchCall(apache::thrift::protocol::TProtocol*, apache::thrift::protocol::TProtocol*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int, void*) ()
#11 0x0000000004c579a2 in apache::thrift::TDispatchProcessor::process(std::shared_ptr<apache::thrift::protocol::TProtocol>, std::shared_ptr<apache::thrift::protocol::TProtocol>, void*) ()
#12 0x0000000005c081b8 in apache::thrift::server::TConnectedClient::run() ()
#13 0x0000000005c006b4 in apache::thrift::server::TThreadedServer::TConnectedClientRunner::run() ()
#14 0x0000000005c02ebd in apache::thrift::concurrency::Thread::threadMain(std::shared_ptr<apache::thrift::concurrency::Thread>) ()
#15 0x0000000005be8626 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(std::shared_ptr<apache::thrift::concurrency::Thread>), std::shared_ptr<apache::thrift::concurrency::Thread> > > >::_M_run() ()
#16 0x0000000008133fa0 in execute_native_thread_routine ()
#17 0x00007fe5a1272dd5 in start_thread () from /lib64/libpthread.so.0
#18 0x00007fe5a088dead in clone () from /lib64/libc.so.6
  1. Compaction 大文件 crash

   @          0x5e50559 google::LogMessageFatal::~LogMessageFatal()
    @          0x27ecfaf starrocks::BinaryColumnBase<>::_build_slices()
    @          0x2757dd5 starrocks::BinaryColumnBase<>::raw_data()
    @          0x480a0da starrocks::ScalarColumnWriter::append()
    @          0x480a33c starrocks::StringColumnWriter::append()
    @          0x43a2806 starrocks::SegmentWriter::append_chunk()
    @          0x4a98da0 starrocks::VerticalRowsetWriter::add_columns()
    @          0x4a73ac7 starrocks::VerticalCompactionTask::_compact_data()
    @          0x4a74d0f starrocks::VerticalCompactionTask::_compact_column_group()
    @          0x4a75632 starrocks::VerticalCompactionTask::_vertical_compaction_data()
    @          0x4a76129 starrocks::VerticalCompactionTask::run_impl()
    @          0x4a6b6fc starrocks::CompactionTask::run()
    @          0x44d30d3 _ZNSt17_Function_handlerIFvvEZN9starrocks17CompactionManager9_scheduleEvEUlvE_E9_M_invokeERKSt9_Any_data
    @          0x4cc33d2 starrocks::ThreadPool::dispatch_thread()
    @          0x4cbde6a starrocks::Thread::supervise_thread()
    @     0x7f4658c6644b start_thread
    @     0x7f465803f40f __GI___clone
    @                0x0 (unknown)
  1. Persistent index 导致 BE 启动失败

*** Aborted at 1711349618 (unix time) try "date -d @1711349618" if you are using GNU date ***
PC: @     0x7f39cefe5387 __GI_raise
*** SIGABRT (@0x3e80000c50f) received by PID 50447 (TID 0x7f39425ff700) from PID 50447; stack trace: ***
    @          0x5b1ba42 google::(anonymous namespace)::FailureSignalHandler()
    @     0x7f39cfa9a630 (unknown)
    @     0x7f39cefe5387 __GI_raise
    @     0x7f39cefe6a78 __GI_abort
    @          0x2cdf2be starrocks::failure_function()
    @          0x5b0f41d google::LogMessage::Fail()
    @          0x5b1188f google::LogMessage::SendToLog()
    @          0x5b0ef6e google::LogMessage::Flush()
    @          0x5b11e99 google::LogMessageFatal::~LogMessageFatal()
    @          0x4265eaf starrocks::TabletUpdates::_apply_rowset_commit()
    @          0x4266353 starrocks::TabletUpdates::do_apply()
    @          0x4b17465 starrocks::ThreadPool::dispatch_thread()
    @          0x4b11e4a starrocks::Thread::supervise_thread()
    @     0x7f39cfa92ea5 start_thread
    @     0x7f39cf0ad96d __clone
    @                0x0 (unknown)
W0325 14:58:06.988127 52911 rowset.cpp:141] Fail to open /data/starrocks/storage/be/data/1021/321209132/1248468755/0200000000005e48a648e0f47d7f27a81aa03e6bcc4b45b4_0.dat: Corruption: Bad segment file /data/starrocks/storage/be/data/1021/321209132/1248468755/0200000000005e48a648e0f47d7f27a81aa03e6bcc4b45b4_0.dat: file size 0 < 12
/build/starrocks/be/src/storage/rowset/segment.cpp:195 Segment::parse_segment_footer(read_file.get(), &footer, footer_length_hint, partial_rowset_footer)
/build/starrocks/be/src/storage/rowset/segment.cpp:67 segment->_open(footer_length_hint, partial_rowset_footer)
W0325 14:58:06.988687 52911 rowset_update_state.cpp:39] load RowsetUpdateState error: Corruption: Bad segment file /data/starrocks/storage/be/data/1021/321209132/1248468755/0200000000005e48a648e0f47d7f27a81aa03e6bcc4b45b4_0.dat: file size 0 < 12
/build/starrocks/be/src/storage/rowset/segment.cpp:195 Segment::parse_segment_footer(read_file.get(), &footer, footer_length_hint, partial_rowset_footer)
/build/starrocks/be/src/storage/rowset/segment.cpp:67 segment->_open(footer_length_hint, partial_rowset_footer)
/build/starrocks/be/src/storage/rowset/rowset.cpp:75 do_load()
/build/starrocks/be/src/storage/rowset/rowset.cpp:454 load()
/build/starrocks/be/src/storage/rowset_update_state.cpp:161 _load_upserts(rowset, 0, pk_column.get()) tablet:321209132 stack:
    @          0x46a4e19  _ZZSt9call_onceIZN9starrocks17RowsetUpdateState4loadEPNS0_6TabletEPNS0_6RowsetEEUlvE_JEEvRSt9once_flagOT_DpOT0_ENUlvE0_4_FUNEv
    @     0x7fb3c0d7b20b  __pthread_once_slow
    @          0x469fde7  starrocks::RowsetUpdateState::load()
    @          0x4262183  starrocks::TabletUpdates::_apply_rowset_commit()
    @          0x4266353  starrocks::TabletUpdates::do_apply()
    @          0x4b17465  starrocks::ThreadPool::dispatch_thread()
    @          0x4b11e4a  starrocks::Thread::supervise_thread()
    @     0x7fb3c0d7cea5  start_thread
    @     0x7fb3c039796d  __clone
    @              (nil)  (unknown)
get_applied_rowsets failed, tablet updates is in error state: tablet:85018 actual row size changed after compaction 50531 -> 50041tablet:85018 #version:13 [29445 29456.1@12 29456.1] #pending:0 backend
F0423 22:31:29.743636 475679 tablet_updates.cpp:1132] delvec inconsistent tablet:8858730 rssid:5262 #old:1402 #add:4 #new:1402 old_v:10497 v:10498
  • Github Issue:

  • Github Fix PR:

  • Jira

  • 问题版本:

    • 2.5.0 ~ 2.5.20

    • 3.0.0 ~ 3.0.8

    • 3.1.0 ~ 3.1.5

  • 修复版本:

    • 2.5.21+

    • 3.0.9+

    • 3.1.6+

  • 问题原因:

  • 临时解决办法:

    • 使用 ./meta_tool.sh --operation=delete_persistent_index_meta 功能删除有问题 tablet 的 persistent index,并重新启动 BE。如果是 3 副本的话,也可以使用 ./meta_tool.sh --operation=delete_meta 功能删除有问题 tablet 的 meta。
  1. BE manual_compact 线程占用大量磁盘IO

  1. 查询 _finish_late_materialization crash

*** Aborted at 1699870338 (unix time) try "date -d @1699870338" if you are using GNU date ***
PC: @          0x435387b starrocks::vectorized::SegmentIterator::_finish_late_materialization()
*** SIGSEGV (@0xd0) received by PID 25789 (TID 0x7fd423b90700) from PID 208; stack trace: ***
    @          0x5b97b22 google::(anonymous namespace)::FailureSignalHandler()
    @     0x7fd6797af630 (unknown)
    @          0x435387b starrocks::vectorized::SegmentIterator::_finish_late_materialization()
    @          0x435c790 starrocks::vectorized::SegmentIterator::_do_get_next()
    @          0x435f270 starrocks::vectorized::SegmentIterator::do_get_next()
    @          0x43e3f42 starrocks::vectorized::ProjectionIterator::do_get_next()
    @          0x4994e95 starrocks::SegmentIteratorWrapper::do_get_next()
    @          0x47c41c3 starrocks::vectorized::TimedChunkIterator::do_get_next()
    @          0x4415466 starrocks::vectorized::TabletReader::do_get_next()
    @          0x304a28b starrocks::pipeline::OlapChunkSource::_read_chunk_from_storage()
    @          0x304a976 starrocks::pipeline::OlapChunkSource::_read_chunk()
    @          0x303a22f starrocks::pipeline::ChunkSource::buffer_next_batch_chunks_blocking()
    @          0x2db3f14 _ZZN9starrocks8pipeline12ScanOperator18_trigger_next_scanEPNS_12RuntimeStateEiENKUlvE_clEv
    @          0x2dc533e starrocks::workgroup::ScanExecutor::worker_thread()
    @          0x4b968f2 starrocks::ThreadPool::dispatch_thread()
    @          0x4b9138a starrocks::Thread::supervise_thread()
    @     0x7fd6797a7ea5 start_thread
    @     0x7fd678dc2b0d __clone
    @                0x0 (unknown)
  • Github Issue:

  • Github Fix PR:

  • Jira

  • 问题版本:

    • 2.5.0 ~ 2.5.20

    • 3.0.0 ~ 3.0.9

    • 3.1.0 ~ 3.1.9

    • 3.2.0 ~ 3.2.4

  • 修复版本:

    • 2.5.21+

    • 3.0.10+

    • 3.1.10+

    • 3.2.5+

  • 问题原因:

  • 临时解决办法:

    • set global enable_filter_unused_columns_in_scan_stage = false
  1. Join cancel crash

*** SIGSEGV (@0x0) received by PID 555045 (TID 0x7f944a3ad700) from PID 0; stack trace: ***
@ 0x481e332 google::(anonymous namespace)::FailureSignalHandler()
@ 0x7f950f8a73ab os::Linux::chained_handler()
@ 0x7f950f8abefc JVM_handle_linux_signal
@ 0x7f950f89ed48 signalHandler()
@ 0x7f950ef72420 (unknown)
@ 0x2f35d3f starrocks::vectorized::HashJoiner::_has_null()
@ 0x2ec31ea starrocks::pipeline::HashJoinBuildOperator::set_finishing()
@ 0x2e76ff7 starrocks::pipeline::PipelineDriver::_mark_operator_finishing()
@ 0x2e77099 starrocks::pipeline::PipelineDriver::_mark_operator_finished()
@ 0x2e77629 starrocks::pipeline::PipelineDriver::_mark_operator_cancelled()
@ 0x2e779b2 starrocks::pipeline::PipelineDriver::_check_fragment_is_canceled()
@ 0x2e77dd0 starrocks::pipeline::PipelineDriver::process()
@ 0x2e6e5a3 starrocks::pipeline::GlobalDriverExecutor::_worker_thread()
@ 0x2680a05 starrocks::ThreadPool::dispatch_thread()
@ 0x267bf2a starrocks::supervise_thread()
@ 0x7f950ef66609 start_thread
@ 0x7f950ed2c133 clone
@ 0x0 (unknown)
  • Github Issue:

  • Github Fix PR:

  • Jira

  • 问题版本:

    • 2.3.0 ~ 2.3.18

    • 2.4.0 ~ latest

    • 2.5.0 ~ 2.5.16

    • 3.0.0 ~ 3.0.6

    • 3.1.0 ~ 3.1.3

  • 修复版本:

    • 2.3.19+

    • 2.4 没有 fix

    • 2.5.17+

    • 3.0.7+

    • 3.1.4+

  • 问题原因:

  • 临时解决办法:

  1. Schema change 后,unique 模型/agg模型的表查询结果不对

一般在添加或删除索引后,查询的 where 条件中包含 key 列时,容易触发

  • Github Issue:

  • Github Fix PR:

  • Jira

  • 问题版本:

    • 2.5.0 ~ 2.5.20

    • 3.0.0 ~ 3.0.9

    • 3.1.0 ~ 3.1.11

    • 3.2.0 ~ 3.2.6

  • 修复版本:

    • 2.5.21+

    • 3.0.10+

    • 3.1.12+

    • 3.2.7+

  • 问题原因:

  • 解决办法:

    • 需要先升级到修复版本,并且重建表后才能解决。
  1. Unique表或是Agg表,order by Desc limit 查询结果不对

Order by xxx desc limit

  1. FE CPU打满,大量查询超时

Jstack 有如下堆栈

"starrocks-mysql-nio-pool-2791" #119533 daemon prio=5 os_prio=0 tid=0x00007fe65c060800 nid=0x19244 runnable [0x00007fe629881000]
   java.lang.Thread.State: RUNNABLE
        at java.util.Arrays.hashCode(Arrays.java:4146)
        at java.util.Objects.hash(Objects.java:128)
        at com.starrocks.sql.optimizer.base.DistributionCol.hashCode(DistributionCol.java:116)
        at java.util.HashMap.hash(HashMap.java:340)
        at java.util.HashMap.get(HashMap.java:558)
        at com.starrocks.sql.optimizer.base.DistributionDisjointSet.find(DistributionDisjointSet.java:59)
        at com.starrocks.sql.optimizer.base.DistributionDisjointSet.union(DistributionDisjointSet.java:73)
        at com.starrocks.sql.optimizer.base.DistributionSpec$PropertyInfo.unionNullRelaxCols(DistributionSpec.java:98)
        at com.starrocks.sql.optimizer.OutputPropertyDeriver.computeHashJoinDistributionPropertyInfo(OutputPropertyDeriver.java:183)
        at com.starrocks.sql.optimizer.OutputPropertyDeriver.visitPhysicalJoin(OutputPropertyDeriver.java:259)
        at com.starrocks.sql.optimizer.OutputPropertyDeriver.visitPhysicalHashJoin(OutputPropertyDeriver.java:199)
        at com.starrocks.sql.optimizer.OutputPropertyDeriver.visitPhysicalHashJoin(OutputPropertyDeriver.java:76)
        at com.starrocks.sql.optimizer.operator.physical.PhysicalHashJoinOperator.accept(PhysicalHashJoinOperator.java:41)
        at com.starrocks.sql.optimizer.OutputPropertyDeriver.getOutputProperty(OutputPropertyDeriver.java:95)
        at com.starrocks.sql.optimizer.task.EnforceAndCostTask.execute(EnforceAndCostTask.java:206)
        at com.starrocks.sql.optimizer.task.SeriallyTaskScheduler.executeTasks(SeriallyTaskScheduler.java:69)
        at com.starrocks.sql.optimizer.Optimizer.memoOptimize(Optimizer.java:571)
        at com.starrocks.sql.optimizer.Optimizer.optimizeByCost(Optimizer.java:188)
        at com.starrocks.sql.optimizer.Optimizer.optimize(Optimizer.java:126)
        at com.starrocks.sql.StatementPlanner.createQueryPlanWithReTry(StatementPlanner.java:203)
        at com.starrocks.sql.StatementPlanner.planQuery(StatementPlanner.java:119)
        at com.starrocks.sql.StatementPlanner.plan(StatementPlanner.java:88)
        at com.starrocks.sql.StatementPlanner.plan(StatementPlanner.java:57)
        at com.starrocks.qe.StmtExecutor.execute(StmtExecutor.java:436)
        at com.starrocks.qe.ConnectProcessor.handleQuery(ConnectProcessor.java:362)
        at com.starrocks.qe.ConnectProcessor.dispatch(ConnectProcessor.java:476)
        at com.starrocks.qe.ConnectProcessor.processOnce(ConnectProcessor.java:742)
        at com.starrocks.mysql.nio.ReadListener.lambda$handleEvent$0(ReadListener.java:69)
        at com.starrocks.mysql.nio.ReadListener$Lambda$737/1304093818.run(Unknown Source)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:750)

   Locked ownable synchronizers:
        - <0x00000004efd52070> (a java.util.concurrent.ThreadPoolExecutor$Worker)
  1. 3.2 版本,有非等值 on 条件的 join 结果不对

  1. The tablet write operation update metadata take a long time

The tablet write operation update metadata take a long time
  1. Spark load 导致 FE 死锁

Jstack 有如下堆栈

"starrocks-mysql-nio-pool-76":
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x000000030a0b4430> (a java.util.concurrent.locks.ReentrantReadWriteLock$FairSync)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:967)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1283)
        at java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
        at com.starrocks.common.util.QueryableReentrantReadWriteLock.sharedLock(QueryableReentrantReadWriteLock.java:43)
        at com.starrocks.catalog.Database.readLock(Database.java:182)
        at com.starrocks.load.loadv2.BulkLoadJob.checkAndSetDataSourceInfo(BulkLoadJob.java:171)
        at com.starrocks.load.loadv2.BulkLoadJob.fromLoadStmt(BulkLoadJob.java:162)
        at com.starrocks.load.loadv2.LoadMgr.createLoadJobFromStmt(LoadMgr.java:144)
        at com.starrocks.qe.DDLStmtExecutor$StmtExecutorVisitor.lambda$visitLoadStatement$16(DDLStmtExecutor.java:370)
        at com.starrocks.qe.DDLStmtExecutor$StmtExecutorVisitor$Lambda$1580/1903605390.apply(Unknown Source)
        at com.starrocks.common.ErrorReport.wrapWithRuntimeException(ErrorReport.java:112)
        at com.starrocks.qe.DDLStmtExecutor$StmtExecutorVisitor.visitLoadStatement(DDLStmtExecutor.java:360)
        at com.starrocks.qe.DDLStmtExecutor$StmtExecutorVisitor.visitLoadStatement(DDLStmtExecutor.java:163)
        at com.starrocks.sql.ast.LoadStmt.accept(LoadStmt.java:346)
        at com.starrocks.qe.DDLStmtExecutor.execute(DDLStmtExecutor.java:149)
        at com.starrocks.qe.StmtExecutor.handleDdlStmt(StmtExecutor.java:1420)
        at com.starrocks.qe.StmtExecutor.execute(StmtExecutor.java:595)
        at com.starrocks.qe.ConnectProcessor.handleQuery(ConnectProcessor.java:374)
        at com.starrocks.qe.ConnectProcessor.dispatch(ConnectProcessor.java:480)
        at com.starrocks.qe.ConnectProcessor.processOnce(ConnectProcessor.java:756)
        at com.starrocks.mysql.nio.ReadListener.lambda$handleEvent$0(ReadListener.java:69)
        at com.starrocks.mysql.nio.ReadListener$Lambda$1089/609879095.run(Unknown Source)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:750)
        
 "thrift-server-pool-2863":
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x000000030475d590> (a java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:967)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1283)
        at java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
        at com.starrocks.load.loadv2.LoadMgr.readLock(LoadMgr.java:698)
        at com.starrocks.load.loadv2.LoadMgr.getLoadJob(LoadMgr.java:635)
        at com.starrocks.leader.LeaderImpl.finishRealtimePush(LeaderImpl.java:546)
        at com.starrocks.leader.LeaderImpl.finishTask(LeaderImpl.java:275)
        at com.starrocks.service.FrontendServiceImpl.finishTask(FrontendServiceImpl.java:1082)
        at com.starrocks.thrift.FrontendService$Processor$finishTask.getResult(FrontendService.java:3621)
        at com.starrocks.thrift.FrontendService$Processor$finishTask.getResult(FrontendService.java:3601)
        at org.apache.thrift.ProcessFunction.process(ProcessFunction.java:38)
        at org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:38)
        at com.starrocks.common.SRTThreadPoolServer$WorkerProcess.run(SRTThreadPoolServer.java:311)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:750)       
  • Github Issue:

  • Github Fix PR:

  • Jira

  • 问题版本:

    • 2.5.0 ~ 2.5.19

    • 3.0.0 ~ 3.0.9

    • 3.1.0 ~ 3.1.9

    • 3.2.0 ~ 3.2.6

  • 修复版本:

    • 2.5.20+

    • 3.0.10+

    • 3.1.10+

    • 3.2.7+

  • 问题原因:

    • Spark load 任务较多时容易触发
  • 临时解决办法:

  1. Left join 错误的转成 inner join 导致查询结果不对

  1. 低基数改写报错

2024-02-20 09:32:18,029 WARN (starrocks-mysql-nio-pool-331|28388) [StmtExecutor.execute():551] execute Exception, sql SELECT CASE WHEN assignee_id = '' THEN '' ELSE SUBSTR(MD5(assignee_id), 1, 8) END AS sample_value FROM data_center.mart_board_issues_basic LIMIT 10
java.lang.IllegalStateException: null
        at com.google.common.base.Preconditions.checkState(Preconditions.java:496) ~[spark-dpp-1.0.0.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.DictMappingRewriter.rewriteAsDictMapping(DictMappingRewriter.java:67) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.DictMappingRewriter.rewrite(DictMappingRewriter.java:46) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.AddDecodeNodeForDictStringRule$DecodeVisitor.rewriteOneScalarOperatorForProjection(AddDecodeNodeForDictStringRule.java:560) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.AddDecodeNodeForDictStringRule$DecodeVisitor.rewriteProjectOperator(AddDecodeNodeForDictStringRule.java:489) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.AddDecodeNodeForDictStringRule$DecodeVisitor.visitProjectionAfter(AddDecodeNodeForDictStringRule.java:262) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.AddDecodeNodeForDictStringRule$DecodeVisitor.visitPhysicalOlapScan(AddDecodeNodeForDictStringRule.java:458) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.AddDecodeNodeForDictStringRule$DecodeVisitor.visitPhysicalOlapScan(AddDecodeNodeForDictStringRule.java:171) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.operator.physical.PhysicalOlapScanOperator.accept(PhysicalOlapScanOperator.java:138) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.AddDecodeNodeForDictStringRule$DecodeVisitor.visitPhysicalDistribution(AddDecodeNodeForDictStringRule.java:840) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.AddDecodeNodeForDictStringRule$DecodeVisitor.visitPhysicalDistribution(AddDecodeNodeForDictStringRule.java:171) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.operator.physical.PhysicalDistributionOperator.accept(PhysicalDistributionOperator.java:44) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.AddDecodeNodeForDictStringRule$DecodeVisitor.visitPhysicalLimit(AddDecodeNodeForDictStringRule.java:308) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.AddDecodeNodeForDictStringRule$DecodeVisitor.visitPhysicalLimit(AddDecodeNodeForDictStringRule.java:171) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.operator.physical.PhysicalLimitOperator.accept(PhysicalLimitOperator.java:33) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.rule.tree.AddDecodeNodeForDictStringRule.rewrite(AddDecodeNodeForDictStringRule.java:930) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.Optimizer.physicalRuleRewrite(Optimizer.java:484) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.Optimizer.optimizeByCost(Optimizer.java:174) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.optimizer.Optimizer.optimize(Optimizer.java:95) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.StatementPlanner.createQueryPlanWithReTry(StatementPlanner.java:181) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.StatementPlanner.planQuery(StatementPlanner.java:103) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.StatementPlanner.plan(StatementPlanner.java:73) ~[starrocks-fe.jar:?]
        at com.starrocks.sql.StatementPlanner.plan(StatementPlanner.java:44) ~[starrocks-fe.jar:?]
        at com.starrocks.qe.StmtExecutor.execute(StmtExecutor.java:402) ~[starrocks-fe.jar:?]
        at com.starrocks.qe.ConnectProcessor.handleQuery(ConnectProcessor.java:327) ~[starrocks-fe.jar:?]
        at com.starrocks.qe.ConnectProcessor.dispatch(ConnectProcessor.java:444) ~[starrocks-fe.jar:?]
        at com.starrocks.qe.ConnectProcessor.processOnce(ConnectProcessor.java:711) ~[starrocks-fe.jar:?]
        at com.starrocks.mysql.nio.ReadListener.lambda$handleEvent$0(ReadListener.java:55) ~[starrocks-fe.jar:?]
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_392]
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_392]
        at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_392]
  • Github Issue:

  • Github Fix PR:

  • Jira

  • 问题版本:

    • 2.5.0 ~ 2.5.19

    • 3.0.0 ~ 3.0.9

    • 3.1.0 ~ 3.1.9

    • 3.2.0 ~ 3.2.6

  • 修复版本:

    • 2.5.20+

    • 3.0.10+

    • 3.1.10+

    • 3.2.7+

  • 问题原因:

  • 临时解决办法:

    • set global cbo_enable_low_cardinality_optimize =false;
  1. nullable处理有问题导致外表 crash

*** Aborted at 1715391277 (unix time) try "date -d @1715391277" if you are using GNU date ***
PC: @ 0x7f09e938d61a __memcpy_ssse3_back
*** SIGSEGV (@0x7effcbcffffa) received by PID 36590 (TID 0x7f0864581700) from PID 18446744072833990650; stack trace: ***
@ 0x5ae06d2 google::(anonymous namespace)::FailureSignalHandler()
@ 0x7f09ea87bab7 os::Linux::chained_handler()
@ 0x7f09ea883055 JVM_handle_linux_signal
@ 0x7f09ea878383 signalHandler()
@ 0x7f09e9d22630 (unknown)
@ 0x7f09e938d61a __memcpy_ssse3_back
@ 0x4a3f30b starrocks::MysqlRowBuffer::_push_string_normal()
@ 0x5364e14 starrocks::MysqlResultWriter::process_chunk()
@ 0x508ea64 starrocks::pipeline::ResultSinkOperator::push_chunk()
@ 0x2df4b93 starrocks::pipeline::PipelineDriver::process()
@ 0x50a20f3 starrocks::pipeline::GlobalDriverExecutor::_worker_thread()
@ 0x4a69e72 starrocks::ThreadPool::dispatch_thread()
@ 0x4a6496a starrocks::supervise_thread()
@ 0x7f09e9d1aea5 start_thread
@ 0x7f09e93359fd __clone
  • Github Issue:

  • Github Fix PR:

  • Jira

  • 问题版本:

    • 2.5.0 ~ 2.5.7

    • 3.0.0 ~ 3.0.2

  • 修复版本:

    • 2.5.8+

    • 3.0.3+

  • 问题原因:

    • 有些列在建外表时指定为 not null,但是实际数据中是有 null 的
  • 临时解决办法:

    • 建外表时,将对应的列指定为 nullable
  1. Storage Page Cache 实际使用的内存比限制的大

  1. Tablet 比较多的时候,频繁查 information_schema.tables 表,导致查询/导入变慢

[
    {
        "lockState": "readLocked",
        "slowReadLockCount": 3,
        "dumpThreads": "lockHoldTime: 3465 ms;dump thread: thrift-server-pool-16449, id: 17420
    java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1282)
    java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
    com.starrocks.common.CloseableLock.lock(CloseableLock.java:27)
    com.starrocks.common.CloseableLock.lock(CloseableLock.java:37)
    com.starrocks.catalog.LocalTablet.getDataSize(LocalTablet.java:453)
    com.starrocks.catalog.MaterializedIndex.getDataSize(MaterializedIndex.java:218)
    com.starrocks.catalog.Partition.getDataSize(Partition.java:320)
    com.starrocks.catalog.OlapTable.getDataSize(OlapTable.java:1680)
    com.starrocks.service.InformationSchemaDataSource.genNormalTableInfo(InformationSchemaDataSource.java:367)
    com.starrocks.service.InformationSchemaDataSource.generateTablesInfoResponse(InformationSchemaDataSource.java:324)
    com.starrocks.service.FrontendServiceImpl.getTablesInfo(FrontendServiceImpl.java:1492)
    com.starrocks.thrift.FrontendService$Processor$getTablesInfo.getResult(FrontendService.java:2301)
    com.starrocks.thrift.FrontendService$Processor$getTablesInfo.getResult(FrontendService.java:2281)
    org.apache.thrift.ProcessFunction.process(ProcessFunction.java:38)
    org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:38)
    com.starrocks.common.SRTThreadPoolServer$WorkerProcess.run(SRTThreadPoolServer.java:311)
    java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    java.lang.Thread.run(Thread.java:745)
;lockHoldTime: 3460 ms;dump thread: thrift-server-pool-16431, id: 17399
    java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1282)
    java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
    com.starrocks.common.CloseableLock.lock(CloseableLock.java:27)
    com.starrocks.common.CloseableLock.lock(CloseableLock.java:37)
    com.starrocks.catalog.LocalTablet.getDataSize(LocalTablet.java:453)
    com.starrocks.catalog.MaterializedIndex.getDataSize(MaterializedIndex.java:218)
    com.starrocks.service.InformationSchemaDataSource.genNormalTableInfo(InformationSchemaDataSource.java:356)
    com.starrocks.service.InformationSchemaDataSource.generateTablesInfoResponse(InformationSchemaDataSource.java:324)
    com.starrocks.service.FrontendServiceImpl.getTablesInfo(FrontendServiceImpl.java:1492)
    com.starrocks.thrift.FrontendService$Processor$getTablesInfo.getResult(FrontendService.java:2301)
    com.starrocks.thrift.FrontendService$Processor$getTablesInfo.getResult(FrontendService.java:2281)
    org.apache.thrift.ProcessFunction.process(ProcessFunction.java:38)
    org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:38)
    com.starrocks.common.SRTThreadPoolServer$WorkerProcess.run(SRTThreadPoolServer.java:311)
    java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    java.lang.Thread.run(Thread.java:745)
;lockHoldTime: 3485 ms;dump thread: thrift-server-pool-16437, id: 17405
    java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1282)
    java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
    com.starrocks.common.CloseableLock.lock(CloseableLock.java:27)
    com.starrocks.common.CloseableLock.lock(CloseableLock.java:37)
    com.starrocks.catalog.LocalTablet.getDataSize(LocalTablet.java:453)
    com.starrocks.catalog.MaterializedIndex.getDataSize(MaterializedIndex.java:218)
    com.starrocks.service.InformationSchemaDataSource.genNormalTableInfo(InformationSchemaDataSource.java:356)
    com.starrocks.service.InformationSchemaDataSource.generateTablesInfoResponse(InformationSchemaDataSource.java:324)
    com.starrocks.service.FrontendServiceImpl.getTablesInfo(FrontendServiceImpl.java:1492)
    com.starrocks.thrift.FrontendService$Processor$getTablesInfo.getResult(FrontendService.java:2301)
    com.starrocks.thrift.FrontendService$Processor$getTablesInfo.getResult(FrontendService.java:2281)
    org.apache.thrift.ProcessFunction.process(ProcessFunction.java:38)
    org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:38)
    com.starrocks.common.SRTThreadPoolServer$WorkerProcess.run(SRTThreadPoolServer.java:311)
    java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    java.lang.Thread.run(Thread.java:745)
;",
        "lockDbName": "edw_brain_kfc_db",
        "lockWaiters": [

        ]
    }
]
  • Github Issue:

  • Github Fix PR:

  • Jira

  • 问题版本:

    • 2.5.0 ~ 2.5.20

    • 3.0.0 ~ 3.0.9

    • 3.1.0 ~ 3.1.11

    • 3.2.0 ~ 3.2.6

  • 修复版本:

    • 2.5.21+

    • 3.0.10+

    • 3.1.12+

    • 3.2.7+

  • 问题原因:

  • 解决办法:

    • 当前这个方法,只是优化,还不能彻底解决这个问题。
  1. 主键模型 compaction 报错

writer add_columns error, tablet=10087, err=Internal error: column 0 is sort key but not find while init segment writer
  1. 主键模型表产生超大的l1文件,导致IO比较重