执行SQL抛出异常:fail to prepare tablet reader

【详述】执行分析SQL时报错:W0613 14:00:55.185508 220228 tablet_scanner.cpp:64] [192.168.226.239] fail to prepare tablet reader 427815112.543525640.614aeee4fccb94a8-1894258e01e88cbd: Not found: has missed versions
【背景】执行SQL,抛出tablet reader异常,后重跑SQL可正常执行完成
【业务影响】重跑SQL解决,暂无影响
【StarRocks版本】2.1.7
【集群规模】3fe+9be(独立部署)
【机器信息】20C/256G/万兆
【附件】
对应be节点be.INFO搜索相关tablet信息如下
(指令:grep 427815112 be.INFO* >tablet_427815112.log)
be.INFO.log.20220613-085751:I0613 14:00:48.607714 15265 task_worker_pool.cpp:194] Submit task success. type=CREATE, signature=427815112, task_count_in_queue=1
be.INFO.log.20220613-085751:I0613 14:00:48.607853 220434 tablet_manager.cpp:137] Creating tablet 427815112
be.INFO.log.20220613-085751:I0613 14:00:48.608788 220434 tablet_manager.cpp:179] Created tablet 427815112
be.INFO.log.20220613-085751:I0613 14:00:52.010315 402388 txn_manager.cpp:208] committed transaction partition_id: 427815095 transaction_id: 10163462 tablet: 427815112 rowsetid: 0200000006f013d41948e23ce85faf0b2db73a061b533ea3 version: 0
be.INFO.log.20220613-085751:I0613 14:00:52.010326 402388 delta_writer.cpp:305] Closed delta writer. tablet_id=427815112 stats=(flush time(ms)=252, flush count=1), flush flush_size_bytes = 8396851
be.INFO.log.20220613-085751:W0613 14:00:55.184846 220495 tablet.cpp:476] tablet427815112 missed version [0-2]
be.INFO.log.20220613-085751:W0613 14:00:55.184859 220495 tablet.cpp:821] 427815112.543525640.614aeee4fccb94a8-1894258e01e88cbd has 1 missed version:[2-2],
be.INFO.log.20220613-085751:W0613 14:00:55.185437 220228 tablet.cpp:476] tablet427815112 missed version [0-2]
be.INFO.log.20220613-085751:W0613 14:00:55.185479 220228 tablet.cpp:821] 427815112.543525640.614aeee4fccb94a8-1894258e01e88cbd has 1 missed version:[2-2],
be.INFO.log.20220613-085751:W0613 14:00:55.185508 220228 tablet_scanner.cpp:64] [10.10.226.239] fail to prepare tablet reader 427815112.543525640.614aeee4fccb94a8-1894258e01e88cbd: Not found: has missed versions
be.INFO.log.20220613-085751:E0613 14:00:55.185532 220228 olap_scan_node.cpp:97] Failed to start scan node: Internal error: [10.10.226.239] fail to prepare tablet reader 427815112.543525640.614aeee4fccb94a8-1894258e01e88cbd: Not found: has missed versions
be.INFO.log.20220613-085751:W0613 14:00:55.185570 220228 plan_fragment_executor.cpp:188] fail to open fragment, instance_id=300ead15-eade-11ec-b924-6c92bf5f858e, status=Internal error: [10.10.226.239] fail to prepare tablet reader 427815112.543525640.614aeee4fccb94a8-1894258e01e88cbd: Not found: has missed versions
be.INFO.log.20220613-085751:W0613 14:00:55.200286 220228 fragment_mgr.cpp:195] Fail to open fragment 300ead15-eade-11ec-b924-6c92bf5f858e: Internal error: [10.10.226.239] fail to prepare tablet reader 427815112.543525640.614aeee4fccb94a8-1894258e01e88cbd: Not found: has missed versions
be.INFO.log.20220613-085751:I0613 14:00:56.253129 220462 txn_manager.cpp:260] publish txn successfully. partition_id: 427815095, txn_id: 10163462, tablet: 427815112.543525640.614aeee4fccb94a8-1894258e01e88cbd, rowsetid: 0200000006f013d41948e23ce85faf0b2db73a061b533ea3, version: 2,2
be.INFO.log.20220613-085751:I0613 14:00:56.738472 15265 task_worker_pool.cpp:194] Submit task success. type=CLONE, signature=427815112, task_count_in_queue=1
be.INFO.log.20220613-085751:I0613 14:00:56.738543 220470 task_worker_pool.cpp:982] get clone task. signature:427815112
be.INFO.log.20220613-085751:I0613 14:00:56.738592 220470 engine_clone_task.cpp:102] Cloning existing tablet. signature=427815112 tablet_id=427815112 schema_hash=543525640 committed_version=2 keys_type=0
be.INFO.log.20220613-085751:I0613 14:00:56.738656 220470 tablet_manager.cpp:787] Reporting tablet info. tablet_id=427815112
be.INFO.log.20220613-085751:I0613 14:00:56.738672 220470 engine_clone_task.cpp:235] clone get tablet info success. tablet_id:427815112, schema_hash:543525640, signature:427815112, version:2
be.INFO.log.20220613-085751:I0613 14:00:56.738688 220470 task_worker_pool.cpp:1039] clone success, set tablet infos. status:0, signature:427815112
be.INFO.log.20220613-085751:I0613 14:01:56.498378 220404 tablet.cpp:955] start to do tablet meta checkpoint, tablet=427815112.543525640.614aeee4fccb94a8-1894258e01e88cbd
be.INFO.log.20220613-085751:I0613 14:07:31.089967 289730 task_worker_pool.cpp:194] Submit task success. type=DROP, signature=427815112, task_count_in_queue=587
be.INFO.log.20220613-085751:I0613 14:07:31.138875 220436 tablet_manager.cpp:346] Dropping tablet 427815112
be.INFO.log.20220613-085751:I0613 14:07:42.145220 220310 tablet_manager.cpp:906] Moved /data09/starrocks/be/storage/data/773/427815112

从日志里看当时应该是缺版本了,后来从其他副本clone了版本过来查询就正常了

这里还有一个问题:导入还没有publish成功时,这个副本就提前收到了查询请求。正常情况下,查询请求不应该发到这个还没准备好的副本上。具体原因需要结合日志分析,麻烦把当时这台BE的日志和FE的日志发一下。

日志文件已上传
be.INFO.log-20220613 (5.3 KB) fe.audit.log-20220613 (14.2 KB) fe.log-20220613 (17.7 KB)

这边环境中有一类SQL(对视图的访问)执行报错的频率很高,访问正常与访问异常交替出现,数据总量为6966128。
异常截图如下:

视图创建语句
-- View REAL_LST_ITV_CUST_CUR_Z
-- Concatenates, with UNION ALL (duplicates are kept), the rows of the eleven
-- tables PRTDATA.REAL_LST_ITV_CUST_CUR_A through PRTDATA.REAL_LST_ITV_CUST_CUR_K.
-- Every branch selects the same four columns, in the same order:
--   CCUST_ROW_ID, CCUST_ROW_ID_NEW, NEW_CCUST_FLG, LATN_ID
CREATE VIEW REAL_LST_ITV_CUST_CUR_Z AS
-- Branch A
SELECT
    CUR_A.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_A.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_A.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_A.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_A AS CUR_A
UNION ALL
-- Branch B
SELECT
    CUR_B.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_B.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_B.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_B.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_B AS CUR_B
UNION ALL
-- Branch C
SELECT
    CUR_C.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_C.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_C.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_C.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_C AS CUR_C
UNION ALL
-- Branch D
SELECT
    CUR_D.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_D.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_D.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_D.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_D AS CUR_D
UNION ALL
-- Branch E
SELECT
    CUR_E.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_E.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_E.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_E.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_E AS CUR_E
UNION ALL
-- Branch F
SELECT
    CUR_F.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_F.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_F.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_F.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_F AS CUR_F
UNION ALL
-- Branch G
SELECT
    CUR_G.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_G.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_G.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_G.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_G AS CUR_G
UNION ALL
-- Branch H
SELECT
    CUR_H.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_H.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_H.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_H.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_H AS CUR_H
UNION ALL
-- Branch I
SELECT
    CUR_I.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_I.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_I.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_I.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_I AS CUR_I
UNION ALL
-- Branch J
SELECT
    CUR_J.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_J.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_J.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_J.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_J AS CUR_J
UNION ALL
-- Branch K
SELECT
    CUR_K.CCUST_ROW_ID     AS CCUST_ROW_ID,
    CUR_K.CCUST_ROW_ID_NEW AS CCUST_ROW_ID_NEW,
    CUR_K.NEW_CCUST_FLG    AS NEW_CCUST_FLG,
    CUR_K.LATN_ID          AS LATN_ID
FROM default_cluster:PRTDATA.REAL_LST_ITV_CUST_CUR_K AS CUR_K;

由于之前访问视图时发现问题发生的频率很高,我对视图涉及的每张表分别做了多次访问,发现其中一张表 PRTDATA.REAL_LST_ITV_CUST_CUR_E 存在问题。
但通过 SHOW TABLET FROM PRTDATA.REAL_LST_ITV_CUST_CUR_E; 查到的副本状态都是NORMAL,按照你前面的解释,不应该出现这类异常。
现已对该表进行了重建,重建后多次访问该表和视图,问题未再出现。