【详述】任务一开始运行,3 个 CN 节点就会全部同时宕机
【是否存算分离】存算分离
【StarRocks版本】3.3.4
【集群规模】3fe+3cn
【机器信息】16C/32G
【联系方式】879124944@qq.com
【附件】崩溃前cn.INFO日志片段
Caused by: org.apache.hadoop.ipc.RemoteException(java.io.FileNotFoundException): File does not exist: /user/starrocks/badb2078-cc9e-410f-a560-3c2400355f3a/db10555/12380/12379/meta/000000000000305E_000000000000158F.meta
at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:153)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1946)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:739)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:432)
at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:524)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1025)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:876)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:822)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2682)
at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1584)
at org.apache.hadoop.ipc.Client.call(Client.java:1529)
at org.apache.hadoop.ipc.Client.call(Client.java:1426)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:258)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:139)
at com.sun.proxy.$Proxy12.getBlockLocations(Unknown Source)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.lambda$getBlockLocations$0(ClientNamenodeProtocolTranslatorPB.java:340)
at org.apache.hadoop.ipc.internal.ShadedProtobufHelper.ipc(ShadedProtobufHelper.java:160)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getBlockLocations(ClientNamenodeProtocolTranslatorPB.java:340)
at jdk.internal.reflect.GeneratedMethodAccessor18.invoke(Unknown Source)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:437)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:170)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:162)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:100)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:366)
at com.sun.proxy.$Proxy13.getBlockLocations(Unknown Source)
at org.apache.hadoop.hdfs.DFSClient.callGetBlockLocations(DFSClient.java:931)
... 7 more
Call to AttachCurrentThread failed with error: -1
getJNIEnv: getGlobalJNIEnv failed
Call to AttachCurrentThread failed with error: -1
getJNIEnv: getGlobalJNIEnv failed
Call to AttachCurrentThread failed with error: -1
getJNIEnv: getGlobalJNIEnv failed
Call to AttachCurrentThread failed with error: -1
getJNIEnv: getGlobalJNIEnv failed
3.3.4 RELEASE (build 56bcf6f)
query_id:00000000-0000-0000-0000-000000000000, fragment_instance:00000000-0000-0000-0000-000000000000
tracker:process consumption: 1007643720
tracker:query_pool consumption: 4567560
tracker:query_pool/connector_scan consumption: 0
tracker:load consumption: 0
tracker:metadata consumption: 22988876
tracker:tablet_metadata consumption: 704642
tracker:rowset_metadata consumption: 0
tracker:segment_metadata consumption: 3007674
tracker:column_metadata consumption: 19276560
tracker:tablet_schema consumption: 704642
tracker:segment_zonemap consumption: 1009760
tracker:short_key_index consumption: 2284
tracker:column_zonemap_index consumption: 1341616
tracker:ordinal_index consumption: 3341408
tracker:bitmap_index consumption: 0
tracker:bloom_filter_index consumption: 0
tracker:compaction consumption: 0
tracker:schema_change consumption: 0
tracker:column_pool consumption: 0
tracker:page_cache consumption: 703570352
tracker:jit_cache consumption: 0
tracker:update consumption: 0
tracker:chunk_allocator consumption: 8455256
tracker:clone consumption: 0
tracker:consistency consumption: 0
tracker:datacache consumption: 2316031
tracker:replication consumption: 0
*** Aborted at 1732813207 (unix time) try "date -d @1732813207" if you are using GNU date ***
PC: @ 0x7268ba8 newJavaStr
*** SIGSEGV (@0x64004981) received by PID 990312 (TID 0x7f338c19a640) from PID 1677740417; stack trace: ***
@ 0x7f347e98a160 (/usr/lib64/libc.so.6+0x9015f)
@ 0x79d9f80 google::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*)
@ 0x7f347f87977b os::Linux::chained_handler(int, siginfo*, void*)
@ 0x7f347f87e725 JVM_handle_linux_signal
@ 0x7f347f872108 signalHandler(int, siginfo*, void*)
@ 0x7f347e93aef0 (/usr/lib64/libc.so.6+0x40eef)
@ 0x7268ba8 newJavaStr
@ 0x726a246 constructNewObjectOfPath
@ 0x72719e1 hdfsListDirectory
@ 0x70b2a74 staros::starlet::fslib::HdfsFileSystem::list_dir_internal(std::basic_string_view<char, std::char_traits<char> >, std::basic_string_view<char, std::char_traits<char> >, bool, std::function<bool (staros::starlet::fslib::EntryStat)>) [clone .localalias]
@ 0x70b4036 staros::starlet::fslib::HdfsFileSystem::list_dir(std::basic_string_view<char, std::char_traits<char> >, bool, std::function<bool (staros::starlet::fslib::EntryStat)>)
@ 0x7095aff staros::starlet::fslib::CacheFileSystemImpl::list_dir(std::basic_string_view<char, std::char_traits<char> >, bool, std::function<bool (staros::starlet::fslib::EntryStat)>)
@ 0x70940e5 staros::starlet::fslib::CacheFileSystem::list_dir(std::basic_string_view<char, std::char_traits<char> >, bool, std::function<bool (staros::starlet::fslib::EntryStat)>)
@ 0x5dc63dd starrocks::StarletFileSystem::iterate_dir(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::function<bool (std::basic_string_view<char, std::char_traits<char> >)> const&)
@ 0x61a8a0c starrocks::lake::TabletManager::list_tablet_metadata(long)
@ 0x61ac3f8 starrocks::lake::TabletManager::get_tablet_data_size(long, long*)
@ 0x61ade66 starrocks::lake::TabletManager::add_in_writing_data_size(long, long)
@ 0x61605fa starrocks::lake::DeltaWriterImpl::check_immutable()
@ 0x6160875 starrocks::lake::DeltaWriter::check_immutable()
@ 0x6150f18 starrocks::lake::AsyncDeltaWriter::check_immutable()
@ 0x37095fd starrocks::LakeTabletsChannel::open(starrocks::PTabletWriterOpenRequest const&, starrocks::PTabletWriterOpenResult*, std::shared_ptr<starrocks::OlapTableSchemaParam>, bool)
@ 0x36a24e7 starrocks::LoadChannel::open(brpc::Controller*, starrocks::PTabletWriterOpenRequest const&, starrocks::PTabletWriterOpenResult*, google::protobuf::Closure*)
@ 0x369c118 starrocks::LoadChannelMgr::open(brpc::Controller*, starrocks::PTabletWriterOpenRequest const&, starrocks::PTabletWriterOpenResult*, google::protobuf::Closure*)
@ 0x7c66944 brpc::policy::ProcessRpcRequest(brpc::InputMessageBase*)
@ 0x7b92ce7 brpc::ProcessInputMessage(void*)
@ 0x7b94065 brpc::InputMessenger::OnNewMessages(brpc::Socket*)
@ 0x7b8235e brpc::Socket::ProcessEvent(void*)
@ 0x7b533e2 bthread::TaskGroup::task_runner(long)
@ 0x7ca8871 bthread_make_fcontext