3.3.4 存算分离cn节点频繁崩溃

【详述】一开始跑任务,3个cn节点就会全部同时宕机
【是否存算分离】存算分离
【StarRocks版本】3.3.4
【集群规模】3fe+3cn
【机器信息】16C/32G
【联系方式】879124944@qq.com
【附件】崩溃前cn.INFO日志片段
Caused by: org.apache.hadoop.ipc.RemoteException(java.io.FileNotFoundException): File does not exist: /user/starrocks/badb2078-cc9e-410f-a560-3c2400355f3a/db10555/12380/12379/meta/000000000000305E_000000000000158F.meta

at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)

at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)

at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:153)

at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1946)

at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:739)

at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:432)

at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)

at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:524)

at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1025)

at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:876)

at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:822)

at java.security.AccessController.doPrivileged(Native Method)

at javax.security.auth.Subject.doAs(Subject.java:422)

at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730)

at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2682)

at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1584)

at org.apache.hadoop.ipc.Client.call(Client.java:1529)

at org.apache.hadoop.ipc.Client.call(Client.java:1426)

at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:258)

at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:139)

at com.sun.proxy.$Proxy12.getBlockLocations(Unknown Source)

at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.lambda$getBlockLocations$0(ClientNamenodeProtocolTranslatorPB.java:340)

at org.apache.hadoop.ipc.internal.ShadedProtobufHelper.ipc(ShadedProtobufHelper.java:160)

at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getBlockLocations(ClientNamenodeProtocolTranslatorPB.java:340)

at jdk.internal.reflect.GeneratedMethodAccessor18.invoke(Unknown Source)

at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.base/java.lang.reflect.Method.invoke(Method.java:566)

at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:437)

at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:170)

at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:162)

at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:100)

at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:366)

at com.sun.proxy.$Proxy13.getBlockLocations(Unknown Source)

at org.apache.hadoop.hdfs.DFSClient.callGetBlockLocations(DFSClient.java:931)

... 7 more

Call to AttachCurrentThread failed with error: -1

getJNIEnv: getGlobalJNIEnv failed

Call to AttachCurrentThread failed with error: -1

getJNIEnv: getGlobalJNIEnv failed

Call to AttachCurrentThread failed with error: -1

getJNIEnv: getGlobalJNIEnv failed

Call to AttachCurrentThread failed with error: -1

getJNIEnv: getGlobalJNIEnv failed

3.3.4 RELEASE (build 56bcf6f)

query_id:00000000-0000-0000-0000-000000000000, fragment_instance:00000000-0000-0000-0000-000000000000

tracker:process consumption: 1007643720

tracker:query_pool consumption: 4567560

tracker:query_pool/connector_scan consumption: 0

tracker:load consumption: 0

tracker:metadata consumption: 22988876

tracker:tablet_metadata consumption: 704642

tracker:rowset_metadata consumption: 0

tracker:segment_metadata consumption: 3007674

tracker:column_metadata consumption: 19276560

tracker:tablet_schema consumption: 704642

tracker:segment_zonemap consumption: 1009760

tracker:short_key_index consumption: 2284

tracker:column_zonemap_index consumption: 1341616

tracker:ordinal_index consumption: 3341408

tracker:bitmap_index consumption: 0

tracker:bloom_filter_index consumption: 0

tracker:compaction consumption: 0

tracker:schema_change consumption: 0

tracker:column_pool consumption: 0

tracker:page_cache consumption: 703570352

tracker:jit_cache consumption: 0

tracker:update consumption: 0

tracker:chunk_allocator consumption: 8455256

tracker:clone consumption: 0

tracker:consistency consumption: 0

tracker:datacache consumption: 2316031

tracker:replication consumption: 0

*** Aborted at 1732813207 (unix time) try "date -d @1732813207" if you are using GNU date ***

PC: @ 0x7268ba8 newJavaStr

*** SIGSEGV (@0x64004981) received by PID 990312 (TID 0x7f338c19a640) from PID 1677740417; stack trace: ***

@     0x7f347e98a160 (/usr/lib64/libc.so.6+0x9015f)

@          0x79d9f80 google::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*)

@     0x7f347f87977b os::Linux::chained_handler(int, siginfo*, void*)

@     0x7f347f87e725 JVM_handle_linux_signal

@     0x7f347f872108 signalHandler(int, siginfo*, void*)

@     0x7f347e93aef0 (/usr/lib64/libc.so.6+0x40eef)

@          0x7268ba8 newJavaStr

@          0x726a246 constructNewObjectOfPath

@          0x72719e1 hdfsListDirectory

@          0x70b2a74 staros::starlet::fslib::HdfsFileSystem::list_dir_internal(std::basic_string_view<char, std::char_traits<char> >, std::basic_string_view<char, std::char_traits<char> >, bool, std::function<bool (staros::starlet::fslib::EntryStat)>) [clone .localalias]

@          0x70b4036 staros::starlet::fslib::HdfsFileSystem::list_dir(std::basic_string_view<char, std::char_traits<char> >, bool, std::function<bool (staros::starlet::fslib::EntryStat)>)

@          0x7095aff staros::starlet::fslib::CacheFileSystemImpl::list_dir(std::basic_string_view<char, std::char_traits<char> >, bool, std::function<bool (staros::starlet::fslib::EntryStat)>)

@          0x70940e5 staros::starlet::fslib::CacheFileSystem::list_dir(std::basic_string_view<char, std::char_traits<char> >, bool, std::function<bool (staros::starlet::fslib::EntryStat)>)

@          0x5dc63dd starrocks::StarletFileSystem::iterate_dir(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::function<bool (std::basic_string_view<char, std::char_traits<char> >)> const&)

@          0x61a8a0c starrocks::lake::TabletManager::list_tablet_metadata(long)

@          0x61ac3f8 starrocks::lake::TabletManager::get_tablet_data_size(long, long*)

@          0x61ade66 starrocks::lake::TabletManager::add_in_writing_data_size(long, long)

@          0x61605fa starrocks::lake::DeltaWriterImpl::check_immutable()

@          0x6160875 starrocks::lake::DeltaWriter::check_immutable()

@          0x6150f18 starrocks::lake::AsyncDeltaWriter::check_immutable()

@          0x37095fd starrocks::LakeTabletsChannel::open(starrocks::PTabletWriterOpenRequest const&, starrocks::PTabletWriterOpenResult*, std::shared_ptr<starrocks::OlapTableSchemaParam>, bool)

@          0x36a24e7 starrocks::LoadChannel::open(brpc::Controller*, starrocks::PTabletWriterOpenRequest const&, starrocks::PTabletWriterOpenResult*, google::protobuf::Closure*)

@          0x369c118 starrocks::LoadChannelMgr::open(brpc::Controller*, starrocks::PTabletWriterOpenRequest const&, starrocks::PTabletWriterOpenResult*, google::protobuf::Closure*)

@          0x7c66944 brpc::policy::ProcessRpcRequest(brpc::InputMessageBase*)

@          0x7b92ce7 brpc::ProcessInputMessage(void*)

@          0x7b94065 brpc::InputMessenger::OnNewMessages(brpc::Socket*)

@          0x7b8235e brpc::Socket::ProcessEvent(void*)

@          0x7b533e2 bthread::TaskGroup::task_runner(long)

@          0x7ca8871 bthread_make_fcontext

有人能帮忙看看这个报错吗? 谢谢

能稳定复现吗?

应该是bthread调用JNI导致的crash

可以稳定复现,一直都在崩溃