diff --git a/example/session/pom.xml b/example/session/pom.xml index e707c5b25d1ce..331fbf0c46df8 100644 --- a/example/session/pom.xml +++ b/example/session/pom.xml @@ -40,4 +40,17 @@ ${project.version} + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 11 + 11 + + + + diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java new file mode 100644 index 0000000000000..a10d2361067d3 --- /dev/null +++ b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java @@ -0,0 +1,1351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb; + +import org.apache.iotdb.isession.ITableSession; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.session.TableSessionBuilder; +import org.apache.iotdb.session.subscription.ISubscriptionTableSession; +import org.apache.iotdb.session.subscription.SubscriptionTableSessionBuilder; +import org.apache.iotdb.session.subscription.consumer.ISubscriptionTablePullConsumer; +import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumer; +import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumerBuilder; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionSessionDataSet; + +import org.apache.tsfile.enums.ColumnCategory; +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; + +/** TODO: Move these manual tests into ITs */ +public class ConsensusSubscriptionTableTest { + + private static final String HOST = "127.0.0.1"; + private static final int PORT = 6667; + private static final String USER = "root"; + private static final String PASSWORD = "root"; + + private static int testCounter = 0; + private static int passed = 0; + private static int failed = 0; + private static final List failedTests = new ArrayList<>(); + + public static void main(String[] args) throws Exception { + 
System.out.println("=== Consensus-Based Subscription Table Model Test Suite ===\n"); + + String targetTest = args.length > 0 ? args[0] : null; + + if (targetTest == null || "testBasicFlow".equals(targetTest)) { + runTest("testBasicFlow", ConsensusSubscriptionTableTest::testBasicFlow); + } + if (targetTest == null || "testDataTypes".equals(targetTest)) { + runTest("testDataTypes", ConsensusSubscriptionTableTest::testDataTypes); + } + if (targetTest == null || "testPathFiltering".equals(targetTest)) { + runTest("testPathFiltering", ConsensusSubscriptionTableTest::testPathFiltering); + } + if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) { + runTest( + "testSubscribeBeforeRegion", ConsensusSubscriptionTableTest::testSubscribeBeforeRegion); + } + if (targetTest == null || "testMultiEntityIsolation".equals(targetTest)) { + runTest("testMultiEntityIsolation", ConsensusSubscriptionTableTest::testMultiEntityIsolation); + } + if (targetTest == null || "testBurstWriteGapRecovery".equals(targetTest)) { + runTest( + "testBurstWriteGapRecovery", ConsensusSubscriptionTableTest::testBurstWriteGapRecovery); + } + if (targetTest == null || "testCommitAfterUnsubscribe".equals(targetTest)) { + runTest( + "testCommitAfterUnsubscribe", ConsensusSubscriptionTableTest::testCommitAfterUnsubscribe); + } + if (targetTest == null || "testSeek".equals(targetTest)) { + runTest("testSeek", ConsensusSubscriptionTableTest::testSeek); + } + + // Summary + System.out.println("\n=== Test Suite Summary ==="); + System.out.println("Passed: " + passed); + System.out.println("Failed: " + failed); + if (!failedTests.isEmpty()) { + System.out.println("Failed tests: " + failedTests); + } + System.out.println("=== Done ==="); + } + + // ============================ + // Test Infrastructure + // ============================ + + @FunctionalInterface + interface TestMethod { + void run() throws Exception; + } + + private static void runTest(String name, TestMethod test) { + 
System.out.println("\n" + "================================================================="); + System.out.println("Running: " + name); + System.out.println("================================================================="); + try { + test.run(); + passed++; + System.out.println(">>> PASSED: " + name); + } catch (AssertionError e) { + failed++; + failedTests.add(name); + System.out.println(">>> FAILED: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } catch (Exception e) { + failed++; + failedTests.add(name); + System.out.println(">>> ERROR: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } + } + + private static String nextDatabase() { + testCounter++; + return "csub_tbl_" + testCounter; + } + + private static String nextTopic() { + return "topic_tbl_" + testCounter; + } + + private static String nextConsumerGroup() { + return "cg_tbl_" + testCounter; + } + + private static String nextConsumerId() { + return "consumer_tbl_" + testCounter; + } + + private static ITableSession openTableSession() throws Exception { + return new TableSessionBuilder() + .nodeUrls(Collections.singletonList(HOST + ":" + PORT)) + .username(USER) + .password(PASSWORD) + .build(); + } + + private static void createDatabaseAndTable( + ITableSession session, String database, String tableName, String tableSchema) + throws Exception { + session.executeNonQueryStatement("CREATE DATABASE IF NOT EXISTS " + database); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement(String.format("CREATE TABLE %s (%s)", tableName, tableSchema)); + } + + private static void deleteDatabase(String database) { + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("DROP DATABASE IF EXISTS " + database); + } catch (Exception e) { + // ignore + } + } + + private static void dropTopicTable(String topicName) { + try (ISubscriptionTableSession subSession = + new SubscriptionTableSessionBuilder() + 
.host(HOST) + .port(PORT) + .username(USER) + .password(PASSWORD) + .build()) { + subSession.dropTopicIfExists(topicName); + } catch (Exception e) { + // ignore + } + } + + private static void createTopicTable(String topicName, String dbKey, String tableKey) + throws Exception { + try (ISubscriptionTableSession subSession = + new SubscriptionTableSessionBuilder() + .host(HOST) + .port(PORT) + .username(USER) + .password(PASSWORD) + .build()) { + try { + subSession.dropTopicIfExists(topicName); + } catch (Exception e) { + // ignore + } + + Properties topicConfig = new Properties(); + topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE); + topicConfig.put( + TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE); + topicConfig.put(TopicConstant.DATABASE_KEY, dbKey); + topicConfig.put(TopicConstant.TABLE_KEY, tableKey); + subSession.createTopic(topicName, topicConfig); + System.out.println( + " Created topic: " + topicName + " (database=" + dbKey + ", table=" + tableKey + ")"); + } + } + + private static ISubscriptionTablePullConsumer createConsumer( + String consumerId, String consumerGroupId) throws Exception { + ISubscriptionTablePullConsumer consumer = + new SubscriptionTablePullConsumerBuilder() + .host(HOST) + .port(PORT) + .consumerId(consumerId) + .consumerGroupId(consumerGroupId) + .autoCommit(false) + .build(); + consumer.open(); + return consumer; + } + + // ============================ + // Polling & Verification + // ============================ + + /** + * Poll and commit messages. After reaching expectedRows, continues polling for 5 consecutive + * empty rounds to verify no extra data arrives. + */ + private static PollResult pollUntilComplete( + ISubscriptionTablePullConsumer consumer, int expectedRows, int maxPollAttempts) { + return pollUntilComplete(consumer, expectedRows, maxPollAttempts, 1000, true); + } + + /** + * Poll until we accumulate the expected number of rows, then verify no extra data arrives. 
+ * + *
<p>
After reaching expectedRows, continues polling until 5 consecutive empty polls confirm + * quiescence. Any extra rows polled are included in the count (will break assertEquals). + * + * @param commitMessages if false, messages are NOT committed + */ + private static PollResult pollUntilComplete( + ISubscriptionTablePullConsumer consumer, + int expectedRows, + int maxPollAttempts, + long pollTimeoutMs, + boolean commitMessages) { + PollResult result = new PollResult(); + int consecutiveEmpty = 0; + + for (int attempt = 1; attempt <= maxPollAttempts; attempt++) { + List messages = consumer.poll(Duration.ofMillis(pollTimeoutMs)); + + if (messages.isEmpty()) { + consecutiveEmpty++; + // Normal completion: reached expected rows and verified quiescence + if (consecutiveEmpty >= 3 && result.totalRows >= expectedRows) { + System.out.println( + " Verified: " + + consecutiveEmpty + + " consecutive empty polls after " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Stuck: have data but cannot reach expected count + if (consecutiveEmpty >= 5 && result.totalRows > 0) { + System.out.println( + " Stuck: " + + consecutiveEmpty + + " consecutive empty polls at " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Never received anything + if (consecutiveEmpty >= 10 && result.totalRows == 0 && expectedRows > 0) { + System.out.println(" No data received after " + consecutiveEmpty + " polls"); + break; + } + try { + Thread.sleep(1000); + } catch (InterruptedException ignored) { + } + continue; + } + + consecutiveEmpty = 0; + + for (SubscriptionMessage message : messages) { + for (SubscriptionSessionDataSet dataSet : message.getSessionDataSetsHandler()) { + String tableName = dataSet.getTableName(); + String databaseName = dataSet.getDatabaseName(); + List columnNames = dataSet.getColumnNames(); + + while (dataSet.hasNext()) { + org.apache.tsfile.read.common.RowRecord record = dataSet.next(); + 
result.totalRows++; + if (tableName != null) { + result.rowsPerTable.merge(tableName, 1, Integer::sum); + } + if (databaseName != null) { + result.rowsPerDatabase.merge(databaseName, 1, Integer::sum); + } + for (int i = 0; i < columnNames.size(); i++) { + result.seenColumns.add(columnNames.get(i)); + } + if (result.totalRows <= 5) { + System.out.println( + " Row: time=" + + record.getTimestamp() + + ", values=" + + record.getFields() + + ", table=" + + tableName + + ", database=" + + databaseName); + } + } + } + if (commitMessages) { + consumer.commitSync(message); + } + } + + System.out.println( + " Poll attempt " + + attempt + + ": totalRows=" + + result.totalRows + + " / expected=" + + expectedRows); + + // Stop immediately if we exceeded the expected row count + if (expectedRows > 0 && result.totalRows > expectedRows) { + System.out.println( + " EXCEEDED: totalRows=" + result.totalRows + " > expectedRows=" + expectedRows); + break; + } + } + + return result; + } + + // ============================ + // Cleanup + // ============================ + + /** Clean up all test artifacts: unsubscribe, close consumer, drop topic, delete database. */ + private static void cleanup( + ISubscriptionTablePullConsumer consumer, String topicName, String database) { + if (consumer != null) { + try { + consumer.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopicTable(topicName); + deleteDatabase(database); + } + + /** Clean up with multiple databases. */ + private static void cleanup( + ISubscriptionTablePullConsumer consumer, String topicName, String... 
databases) { + if (consumer != null) { + try { + consumer.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopicTable(topicName); + for (String db : databases) { + deleteDatabase(db); + } + } + + // ============================ + // Result & Assertions + // ============================ + + static class PollResult { + int totalRows = 0; + Map rowsPerTable = new HashMap<>(); + Map rowsPerDatabase = new HashMap<>(); + Set seenColumns = new HashSet<>(); + + @Override + public String toString() { + return "PollResult{totalRows=" + + totalRows + + ", rowsPerTable=" + + rowsPerTable + + ", rowsPerDatabase=" + + rowsPerDatabase + + ", seenColumns=" + + seenColumns + + "}"; + } + } + + private static void assertEquals(String msg, int expected, int actual) { + if (expected != actual) { + throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual); + } + } + + private static void assertTrue(String msg, boolean condition) { + if (!condition) { + throw new AssertionError(msg); + } + } + + private static void assertAtLeast(String msg, int min, int actual) { + if (actual < min) { + throw new AssertionError(msg + ": expected at least " + min + ", actual=" + actual); + } + } + + // ====================================================================== + // Test 1: Basic Flow (merged: BasicDataDelivery + MultiTables + Flush) + // ====================================================================== + /** + * Verifies: + * + *

+ */ + private static void testBasicFlow() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + // Step 1: Write initial data to create DataRegion (should NOT be received) + System.out.println(" Step 1: Writing initial data (should NOT be received)"); + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); + session.executeNonQueryStatement("CREATE TABLE t3 (tag1 STRING TAG, s1 INT64 FIELD)"); + for (int i = 0; i < 50; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Step 2: Create topic and subscribe + System.out.println(" Step 2: Creating topic and subscribing"); + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 3: Write to 3 tables (30 rows each = 90 total), then flush + System.out.println(" Step 3: Writing 30 rows x 3 tables AFTER subscribe, then flush"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 100; i < 130; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + session.executeNonQueryStatement( + String.format("INSERT 
INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + session.executeNonQueryStatement( + String.format("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i)); + } + System.out.println(" Flushing..."); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Step 4: Poll and verify + System.out.println(" Step 4: Polling..."); + PollResult result = pollUntilComplete(consumer, 90, 100); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 90 rows (30 per table)", 90, result.totalRows); + if (!result.rowsPerTable.isEmpty()) { + System.out.println(" Rows per table: " + result.rowsPerTable); + for (String tbl : new String[] {"t1", "t2", "t3"}) { + Integer tblRows = result.rowsPerTable.get(tbl); + assertAtLeast("Expected rows from " + tbl, 1, tblRows != null ? tblRows : 0); + } + } + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 2: Data Types (merged: MultipleDataTypes + MultiColumnTypes + CrossPartition) + // ====================================================================== + /** + * Verifies: + * + * + */ + private static void testDataTypes() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + final long GAP = 604_800_001L; // slightly over 1 week + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable( + session, + database, + "t1", + "tag1 STRING TAG, s_int32 INT32 FIELD, s_int64 INT64 FIELD, " + + "s_float FLOAT FIELD, s_double DOUBLE FIELD, s_bool BOOLEAN FIELD, " + + "s_text TEXT FIELD"); + session.executeNonQueryStatement("USE " + database); + // Init row to force DataRegion creation + session.executeNonQueryStatement( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, 
s_bool, s_text, time) " + + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + int totalExpected = 0; + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + + // --- Part A: 6 data types x 20 rows, separate INSERTs --- + System.out.println(" Part A: 6 data types x 20 rows (separate INSERTs)"); + for (int i = 1; i <= 20; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', %d, %d)", i, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int64, time) VALUES ('d1', %d, %d)", + (long) i * 100000L, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_float, time) VALUES ('d1', %f, %d)", i * 1.1f, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_double, time) VALUES ('d1', %f, %d)", i * 2.2, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_bool, time) VALUES ('d1', %s, %d)", + i % 2 == 0 ? "true" : "false", i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_text, time) VALUES ('d1', 'text_%d', %d)", i, i)); + } + totalExpected += 120; // 6 types x 20 rows + + // --- Part B: All-column rows (50 rows) --- + System.out.println(" Part B: 50 all-column rows"); + for (int i = 21; i <= 70; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time)" + + " VALUES ('d1', %d, %d, %f, %f, %s, 'text_%d', %d)", + i, (long) i * 100000L, i * 1.1f, i * 2.2, i % 2 == 0 ? 
"true" : "false", i, i)); + } + totalExpected += 50; + + // --- Part C: Cross-partition writes --- + System.out.println(" Part C: Cross-partition (SQL single, multi, Tablet)"); + long baseTs = 1_000_000_000L; + + // SQL single-row x2 + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 1, 100, 1.1, 1.11, true, 'xp_single_1', %d)", + baseTs)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 2, 200, 2.2, 2.22, false, 'xp_single_2', %d)", + baseTs + GAP)); + totalExpected += 2; + + // SQL multi-row x3 + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 3, 300, 3.3, 3.33, true, 'xp_multi_1', %d), " + + "('d1', 4, 400, 4.4, 4.44, false, 'xp_multi_2', %d), " + + "('d1', 5, 500, 5.5, 5.55, true, 'xp_multi_3', %d)", + baseTs + GAP * 2, baseTs + GAP * 3, baseTs + GAP * 4)); + totalExpected += 3; + + // Tablet x4 + List schemaList = new ArrayList<>(); + schemaList.add(new MeasurementSchema("tag1", TSDataType.STRING)); + schemaList.add(new MeasurementSchema("s_int32", TSDataType.INT32)); + schemaList.add(new MeasurementSchema("s_int64", TSDataType.INT64)); + schemaList.add(new MeasurementSchema("s_float", TSDataType.FLOAT)); + schemaList.add(new MeasurementSchema("s_double", TSDataType.DOUBLE)); + schemaList.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN)); + schemaList.add(new MeasurementSchema("s_text", TSDataType.STRING)); + + List categories = + java.util.Arrays.asList( + ColumnCategory.TAG, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD); + + Tablet tablet = + new Tablet( + "t1", + IMeasurementSchema.getMeasurementNameList(schemaList), + 
IMeasurementSchema.getDataTypeList(schemaList), + categories, + 10); + for (int i = 0; i < 4; i++) { + int row = tablet.getRowSize(); + long ts = baseTs + GAP * (5 + i); + tablet.addTimestamp(row, ts); + tablet.addValue("tag1", row, "d1"); + tablet.addValue("s_int32", row, 6 + i); + tablet.addValue("s_int64", row, (long) (600 + i * 100)); + tablet.addValue("s_float", row, (6 + i) * 1.1f); + tablet.addValue("s_double", row, (6 + i) * 2.22); + tablet.addValue("s_bool", row, i % 2 == 0); + tablet.addValue("s_text", row, "xp_tablet_" + (i + 1)); + } + session.insert(tablet); + totalExpected += 4; + } + + System.out.println(" Total expected rows: " + totalExpected); + Thread.sleep(2000); + + PollResult result = pollUntilComplete(consumer, totalExpected, 200); + System.out.println(" Result: " + result); + + assertAtLeast( + "Expected at least " + totalExpected + " rows", totalExpected, result.totalRows); + assertAtLeast("Expected multiple column types in result", 2, result.seenColumns.size()); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 3: Path Filtering (merged: TableLevel + DatabaseLevel) + // ====================================================================== + /** + * Verifies: + * + *
    + *
  • Table-level: topic on table=t1 does NOT deliver t2 data + *
  • Database-level: topic on db1 does NOT deliver db2 data + *
+ */ + private static void testPathFiltering() throws Exception { + String database1 = nextDatabase(); + String database2 = database1 + "_other"; + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + // db1 with t1 and t2 + createDatabaseAndTable(session, database1, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database1); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); + // db2 with t1 + createDatabaseAndTable(session, database2, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database2); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Topic: only db1, only table t1 + createTopicTable(topicName, database1, "t1"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing to db1.t1, db1.t2, db2.t1 (topic filter: db1.t1 only)"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database1); + for (int i = 100; i < 150; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + session.executeNonQueryStatement( + String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + } + session.executeNonQueryStatement("USE " + database2); + for (int i = 100; i < 150; i++) { + session.executeNonQueryStatement( + String.format("INSERT 
INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling (expecting only db1.t1 data = 50 rows)..."); + PollResult result = pollUntilComplete(consumer, 50, 60); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 50 rows from db1.t1 only", 50, result.totalRows); + if (!result.rowsPerTable.isEmpty()) { + Integer t2Rows = result.rowsPerTable.get("t2"); + assertTrue("Expected NO rows from t2, but got " + t2Rows, t2Rows == null || t2Rows == 0); + System.out.println(" Table filtering verified: t1 only"); + } + if (!result.rowsPerDatabase.isEmpty()) { + Integer db2Rows = result.rowsPerDatabase.get(database2); + assertTrue("Expected NO rows from " + database2, db2Rows == null || db2Rows == 0); + System.out.println(" Database filtering verified: " + database1 + " only"); + } + } finally { + cleanup(consumer, topicName, database1, database2); + } + } + + // ====================================================================== + // Test 4: Subscribe Before Region Creation (kept as-is) + // ====================================================================== + /** + * Subscribe BEFORE the database/region exists, then create database and write. Tests the + * IoTConsensus.onNewPeerCreated auto-binding path with table model. 
+ */ + private static void testSubscribeBeforeRegion() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + System.out.println(" Step 1: Creating topic BEFORE database exists"); + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + System.out.println(" Step 2: Subscribing (no DataRegion exists yet)"); + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Step 3: Creating database, table and writing data (100 rows)"); + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + for (int i = 0; i < 100; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(5000); + + System.out.println(" Step 4: Polling..."); + PollResult result = pollUntilComplete(consumer, 100, 100); + System.out.println(" Result: " + result); + + if (result.totalRows >= 100) { + System.out.println(" Auto-binding works! All " + result.totalRows + " rows received."); + } else if (result.totalRows > 0) { + System.out.println( + " Partial: " + result.totalRows + "/100 rows. First writes may precede binding."); + } else { + System.out.println(" No data received. 
Check logs for auto-binding messages."); + } + assertAtLeast( + "Expected some rows from subscribe-before-region (auto-binding)", 1, result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // testRedelivery removed — will be re-added with proper timeout-based nack testing + + // ====================================================================== + // Test 6: Multi-Entity Isolation (merged: MultiConsumerGroup + MultiTopic) + // ====================================================================== + /** + * Verifies: + * + *
    + *
  • Two consumer groups on same topic: each group gets ALL data independently + *
  • One consumer subscribes to two topics with different TABLE_KEY filters: each topic + * delivers only matching data + *
+ */ + private static void testMultiEntityIsolation() throws Exception { + String database = nextDatabase(); + String topicName1 = "topic_tbl_multi_" + testCounter + "_a"; + String topicName2 = "topic_tbl_multi_" + testCounter + "_b"; + String consumerGroupId1 = "cg_tbl_multi_" + testCounter + "_a"; + String consumerId1 = "consumer_tbl_multi_" + testCounter + "_a"; + String consumerGroupId2 = "cg_tbl_multi_" + testCounter + "_b"; + String consumerId2 = "consumer_tbl_multi_" + testCounter + "_b"; + ISubscriptionTablePullConsumer consumer1 = null; + ISubscriptionTablePullConsumer consumer2 = null; + + try { + // Setup: database with t1 and t2 + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Topic 1: covers t1 only, Topic 2: covers t2 only + createTopicTable(topicName1, database, "t1"); + createTopicTable(topicName2, database, "t2"); + Thread.sleep(1000); + + // Consumer 1 (group A): subscribes to BOTH topics + consumer1 = createConsumer(consumerId1, consumerGroupId1); + consumer1.subscribe(topicName1, topicName2); + // Consumer 2 (group B): subscribes to BOTH topics + consumer2 = createConsumer(consumerId2, consumerGroupId2); + consumer2.subscribe(topicName1, topicName2); + Thread.sleep(3000); + + // Write 30 rows to t1, 40 rows to t2 + System.out.println(" Writing 30 rows to t1, 40 rows to t2"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 40; i++) { + if (i <= 30) { + 
session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + session.executeNonQueryStatement( + String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + } + } + Thread.sleep(2000); + + // Part A: Both groups should get 70 rows independently + System.out.println(" Part A: Multi-group isolation"); + System.out.println(" Polling from group 1..."); + PollResult result1 = pollUntilComplete(consumer1, 70, 80); + System.out.println(" Group 1 result: " + result1); + + System.out.println(" Polling from group 2..."); + PollResult result2 = pollUntilComplete(consumer2, 70, 80); + System.out.println(" Group 2 result: " + result2); + + assertEquals("Group 1 should receive all 70 rows", 70, result1.totalRows); + assertEquals("Group 2 should receive all 70 rows", 70, result2.totalRows); + + // Part B: Verify per-topic table isolation + if (!result1.rowsPerTable.isEmpty()) { + Integer t1Rows = result1.rowsPerTable.get("t1"); + Integer t2Rows = result1.rowsPerTable.get("t2"); + assertEquals("Expected 30 rows from t1 (topic1)", 30, t1Rows != null ? t1Rows : 0); + assertEquals("Expected 40 rows from t2 (topic2)", 40, t2Rows != null ? 
t2Rows : 0); + System.out.println(" Multi-topic isolation verified: t1=" + t1Rows + ", t2=" + t2Rows); + } + System.out.println( + " Multi-group isolation verified: group1=" + + result1.totalRows + + ", group2=" + + result2.totalRows); + } finally { + if (consumer1 != null) { + try { + consumer1.unsubscribe(topicName1, topicName2); + } catch (Exception e) { + /* ignore */ + } + try { + consumer1.close(); + } catch (Exception e) { + /* ignore */ + } + } + if (consumer2 != null) { + try { + consumer2.unsubscribe(topicName1, topicName2); + } catch (Exception e) { + /* ignore */ + } + try { + consumer2.close(); + } catch (Exception e) { + /* ignore */ + } + } + dropTopicTable(topicName1); + dropTopicTable(topicName2); + deleteDatabase(database); + } + } + + // ====================================================================== + // Test 7: Burst Write Gap Recovery (NEW — tests C2 fix) + // ====================================================================== + /** + * Tests that burst writing beyond the pending queue capacity (4096) does not cause data loss. The + * pending queue overflow triggers gaps, which should be recovered from WAL. + * + *

Mechanism: Each {@code IoTConsensusServerImpl.write()} call produces exactly one + * {@code pendingEntries.offer()}. A single {@code session.insert(tablet)} with N rows in one time + * partition = 1 write() call = 1 offer, so Tablet batches rarely overflow the queue. To actually + * overflow, we need 4096+ individual write() calls arriving faster than the prefetch + * thread can drain. We achieve this with multiple concurrent writer threads, each performing + * individual SQL INSERTs, to maximize the aggregate write rate vs. drain rate. + * + *

Note: Gap occurrence is inherently timing-dependent (race between writers and the + * prefetch drain loop). This test maximizes the probability by using concurrent threads, but + * cannot guarantee gap occurrence on every run. Check server logs for "gap detected" / "Filling + * from WAL" messages to confirm the gap path was exercised. + * + *

Fix verified: C2 — gap entries are not skipped when WAL fill times out; they are deferred to + * the next prefetch iteration. + */ + private static void testBurstWriteGapRecovery() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Use multiple concurrent writer threads with individual SQL INSERTs. + // Each INSERT → 1 IoTConsensusServerImpl.write() → 1 pendingEntries.offer(). + // With N threads writing concurrently, aggregate rate should exceed drain rate + // and overflow the 4096-capacity queue, creating gaps. 
+ final int writerThreads = 4; + final int rowsPerThread = 1500; // 4 * 1500 = 6000 total write() calls > 4096 + final int totalRows = writerThreads * rowsPerThread; + final AtomicInteger errorCount = new AtomicInteger(0); + final CountDownLatch startLatch = new CountDownLatch(1); + final CountDownLatch doneLatch = new CountDownLatch(writerThreads); + + System.out.println( + " Burst writing " + + totalRows + + " rows via " + + writerThreads + + " concurrent threads (" + + rowsPerThread + + " individual SQL INSERTs each)"); + System.out.println( + " (Each INSERT = 1 WAL entry = 1 pendingEntries.offer(); " + "queue capacity = 4096)"); + + ExecutorService executor = Executors.newFixedThreadPool(writerThreads); + for (int t = 0; t < writerThreads; t++) { + final int threadId = t; + final int startTs = threadId * rowsPerThread + 1; + executor.submit( + () -> { + try { + startLatch.await(); // all threads start at the same time + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 0; i < rowsPerThread; i++) { + int ts = startTs + i; + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", + (long) ts * 10, ts)); + } + } + } catch (Exception e) { + System.out.println(" Writer thread " + threadId + " error: " + e.getMessage()); + errorCount.incrementAndGet(); + } finally { + doneLatch.countDown(); + } + }); + } + + // Fire all threads simultaneously + startLatch.countDown(); + doneLatch.await(); + executor.shutdown(); + + if (errorCount.get() > 0) { + System.out.println(" WARNING: " + errorCount.get() + " writer threads encountered errors"); + } + + // Do NOT add artificial delay — let the consumer compete with ongoing WAL writes + System.out.println( + " Polling (expecting " + totalRows + " rows, may need WAL gap recovery)..."); + System.out.println( + " (Check server logs for 'gap detected' to confirm gap recovery was triggered)"); + PollResult 
result = pollUntilComplete(consumer, totalRows, 6000, 2000, true); + System.out.println(" Result: " + result); + + assertEquals( + "Expected exactly " + totalRows + " rows (no data loss despite pending queue overflow)", + totalRows, + result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 8: Commit After Unsubscribe (NEW — tests H7 fix) + // ====================================================================== + /** + * Tests that commit still works correctly after the consumer has unsubscribed (queue has been + * torn down). The commit routing should use metadata-based topic config check instead of runtime + * queue state. + * + *

Fix verified: H7 — commit routes via isConsensusBasedTopic() instead of hasQueue(). + */ + private static void testCommitAfterUnsubscribe() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Write data + System.out.println(" Writing 50 rows"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 50; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(2000); + + // Poll WITHOUT commit + System.out.println(" Polling WITHOUT commit..."); + List uncommittedMessages = new ArrayList<>(); + int polledRows = 0; + for (int attempt = 0; attempt < 60 && polledRows < 50; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(2000)); + if (msgs.isEmpty()) { + if (polledRows > 0) break; + Thread.sleep(500); + continue; + } + for (SubscriptionMessage msg : msgs) { + uncommittedMessages.add(msg); + for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) { + while (ds.hasNext()) { + ds.next(); + polledRows++; + } + } + } + } + System.out.println( + " Polled " + + polledRows + + " rows, holding " + + uncommittedMessages.size() + + " uncommitted messages"); + 
assertAtLeast("Should have polled some rows before unsubscribe", 1, polledRows); + + // Unsubscribe (tears down the consensus queue) + System.out.println(" Unsubscribing (queue teardown)..."); + consumer.unsubscribe(topicName); + Thread.sleep(2000); + + // Now commit the previously polled messages — should NOT throw + System.out.println( + " Committing " + uncommittedMessages.size() + " messages AFTER unsubscribe..."); + boolean commitSucceeded = true; + for (SubscriptionMessage msg : uncommittedMessages) { + try { + consumer.commitSync(msg); + } catch (Exception e) { + System.out.println(" Commit threw exception: " + e.getMessage()); + commitSucceeded = false; + } + } + + System.out.println(" Commit after unsubscribe completed. Success=" + commitSucceeded); + System.out.println(" (Key: no exception crash, routing handled gracefully)"); + } finally { + if (consumer != null) { + try { + consumer.close(); + } catch (Exception e) { + /* ignore */ + } + } + dropTopicTable(topicName); + deleteDatabase(database); + } + } + + // ====================================================================== + // Test 8: Seek (seekToBeginning, seekToEnd, seek by timestamp) + // ====================================================================== + /** + * Verifies all three seek operations in a single flow: + * + *

    + *
  • seekToBeginning — re-delivers previously committed data from earliest available position + *
  • seekToEnd — skips all existing data, only new writes are received + *
  • seek(timestamp) — positions at the approximate WAL entry matching the given timestamp + *
+ */ + private static void testSeek() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTablePullConsumer consumer = null; + + try { + // Step 0: Create DataRegion + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + } + Thread.sleep(2000); + + // Step 1: Create topic + consumer + subscribe + System.out.println(" Step 1: Create topic and subscribe"); + createTopicTable(topicName, database, "t1"); + Thread.sleep(1000); + + consumer = (SubscriptionTablePullConsumer) createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 2: Write 1000 rows with timestamps 1000..1999 and poll+commit all + System.out.println(" Step 2: Write 1000 rows (timestamps 1000..1999) and poll+commit"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 0; i < 1000; i++) { + long ts = 1000 + i; + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", ts * 10, ts)); + } + } + Thread.sleep(2000); + + PollResult firstPoll = pollUntilComplete(consumer, 1000, 120); + System.out.println(" First poll: " + firstPoll.totalRows + " rows"); + assertAtLeast("First poll should get rows", 1, firstPoll.totalRows); + + // ------------------------------------------------------------------ + // Step 3: seekToBeginning — should re-deliver data from the start + // ------------------------------------------------------------------ + System.out.println(" Step 3: seekToBeginning → expect re-delivery"); + consumer.seekToBeginning(topicName); + Thread.sleep(2000); + + // No initial INSERT in table test (Step 0 only creates DB+table), so expectedRows=1000 + PollResult beginningPoll = pollUntilComplete(consumer, 1000, 
120); + System.out.println(" After seekToBeginning: " + beginningPoll); + assertAtLeast( + "seekToBeginning should re-deliver rows (WAL retention permitting)", + 1, + beginningPoll.totalRows); + + // ------------------------------------------------------------------ + // Step 4: seekToEnd — should receive nothing until new writes + // ------------------------------------------------------------------ + System.out.println(" Step 4: seekToEnd → expect no old data"); + consumer.seekToEnd(topicName); + Thread.sleep(2000); + + PollResult endPoll = new PollResult(); + int consecutiveEmpty = 0; + for (int attempt = 0; attempt < 15; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(1000)); + if (msgs.isEmpty()) { + consecutiveEmpty++; + if (consecutiveEmpty >= 5) break; + Thread.sleep(500); + continue; + } + consecutiveEmpty = 0; + for (SubscriptionMessage msg : msgs) { + for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) { + while (ds.hasNext()) { + ds.next(); + endPoll.totalRows++; + } + } + consumer.commitSync(msg); + } + } + System.out.println(" After seekToEnd (no new writes): " + endPoll.totalRows + " rows"); + // May occasionally be 1 due to prefetch thread race; tolerate small values + assertTrue( + "seekToEnd should yield at most 1 row (race tolerance)", endPoll.totalRows <= 1); + + // Write 200 new rows — they should be received + System.out.println(" Writing 200 new rows after seekToEnd"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 2000; i < 2200; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(2000); + + PollResult afterEndPoll = pollUntilComplete(consumer, 200, 120); + System.out.println(" After seekToEnd + new writes: " + afterEndPoll); + assertEquals("Should receive exactly 200 new rows after seekToEnd", 200, afterEndPoll.totalRows); + + // 
------------------------------------------------------------------ + // Step 5: seek(timestamp) — seek to timestamp 1500 + // ------------------------------------------------------------------ + System.out.println(" Step 5: seek(1500) → expect rows from near ts=1500"); + consumer.seek(topicName, 1500); + Thread.sleep(2000); + + // Sparse mapping (interval=100) positions near ts=1500. + // Expect: ~500 rows from ts≥1500 in original data (1500..1999) + // + 200 rows from new writes (2000..2199) = ~700 minimum + PollResult afterSeek = pollUntilComplete(consumer, 1200, 120); + System.out.println(" After seek(1500): " + afterSeek.totalRows + " rows"); + assertAtLeast("seek(1500) should deliver at least 700 rows (ts >= 1500)", 700, afterSeek.totalRows); + + // ------------------------------------------------------------------ + // Step 6: seek(future timestamp) — expect 0 rows + // ------------------------------------------------------------------ + System.out.println(" Step 6: seek(99999) → expect no data"); + consumer.seek(topicName, 99999); + Thread.sleep(2000); + + PollResult futurePoll = new PollResult(); + consecutiveEmpty = 0; + for (int attempt = 0; attempt < 10; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(1000)); + if (msgs.isEmpty()) { + consecutiveEmpty++; + if (consecutiveEmpty >= 5) break; + Thread.sleep(500); + continue; + } + consecutiveEmpty = 0; + for (SubscriptionMessage msg : msgs) { + for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) { + while (ds.hasNext()) { + ds.next(); + futurePoll.totalRows++; + } + } + consumer.commitSync(msg); + } + } + System.out.println(" After seek(99999): " + futurePoll.totalRows + " rows"); + // seek(99999) should behave like seekToEnd — 0 rows normally, + // but may yield up to 1 row due to prefetch thread race (same as seekToEnd) + assertTrue("seek(future) should yield at most 1 row (race tolerance)", + futurePoll.totalRows <= 1); + + System.out.println(" testSeek passed all 
sub-tests!"); + } finally { + cleanup(consumer, topicName, database); + } + } +} diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java new file mode 100644 index 0000000000000..c8584f7d99d8b --- /dev/null +++ b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java @@ -0,0 +1,1327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb; + +import org.apache.iotdb.isession.ISession; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.session.Session; +import org.apache.iotdb.session.subscription.SubscriptionTreeSession; +import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionSessionDataSet; + +import org.apache.tsfile.common.conf.TSFileConfig; +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; + +/** TODO: move these manual tests into ITs */ +public class ConsensusSubscriptionTest { + + private static final String HOST = "127.0.0.1"; + private static final int PORT = 6667; + private static final String USER = "root"; + private static final String PASSWORD = "root"; + + private static int testCounter = 0; + private static int passed = 0; + private static int failed = 0; + private static final List failedTests = new ArrayList<>(); + + public static void main(String[] args) throws Exception { + System.out.println("=== Consensus-Based Subscription Test Suite ===\n"); + + String targetTest = args.length > 0 ? 
args[0] : null; + + if (targetTest == null || "testBasicFlow".equals(targetTest)) { + runTest("testBasicFlow", ConsensusSubscriptionTest::testBasicFlow); + } + if (targetTest == null || "testDataTypes".equals(targetTest)) { + runTest("testDataTypes", ConsensusSubscriptionTest::testDataTypes); + } + if (targetTest == null || "testPathFiltering".equals(targetTest)) { + runTest("testPathFiltering", ConsensusSubscriptionTest::testPathFiltering); + } + if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) { + runTest("testSubscribeBeforeRegion", ConsensusSubscriptionTest::testSubscribeBeforeRegion); + } + if (targetTest == null || "testMultiEntityIsolation".equals(targetTest)) { + runTest("testMultiEntityIsolation", ConsensusSubscriptionTest::testMultiEntityIsolation); + } + if (targetTest == null || "testBurstWriteGapRecovery".equals(targetTest)) { + runTest("testBurstWriteGapRecovery", ConsensusSubscriptionTest::testBurstWriteGapRecovery); + } + if (targetTest == null || "testCommitAfterUnsubscribe".equals(targetTest)) { + runTest("testCommitAfterUnsubscribe", ConsensusSubscriptionTest::testCommitAfterUnsubscribe); + } + if (targetTest == null || "testSeek".equals(targetTest)) { + runTest("testSeek", ConsensusSubscriptionTest::testSeek); + } + + // Summary + System.out.println("\n=== Test Suite Summary ==="); + System.out.println("Passed: " + passed); + System.out.println("Failed: " + failed); + if (!failedTests.isEmpty()) { + System.out.println("Failed tests: " + failedTests); + } + System.out.println("=== Done ==="); + } + + // ============================ + // Test Infrastructure + // ============================ + + @FunctionalInterface + interface TestMethod { + void run() throws Exception; + } + + private static void runTest(String name, TestMethod test) { + System.out.println("\n" + "================================================================="); + System.out.println("Running: " + name); + 
System.out.println("================================================================="); + try { + test.run(); + passed++; + System.out.println(">>> PASSED: " + name); + } catch (AssertionError e) { + failed++; + failedTests.add(name); + System.out.println(">>> FAILED: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } catch (Exception e) { + failed++; + failedTests.add(name); + System.out.println(">>> ERROR: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } + } + + private static String nextDatabase() { + testCounter++; + return "root.csub_test_" + testCounter; + } + + private static String nextTopic() { + return "topic_csub_" + testCounter; + } + + private static String nextConsumerGroup() { + return "cg_csub_" + testCounter; + } + + private static String nextConsumerId() { + return "consumer_csub_" + testCounter; + } + + private static ISession openSession() throws Exception { + ISession session = + new Session.Builder().host(HOST).port(PORT).username(USER).password(PASSWORD).build(); + session.open(); + return session; + } + + private static void createDatabase(ISession session, String database) throws Exception { + try { + session.executeNonQueryStatement("CREATE DATABASE " + database); + } catch (Exception e) { + // ignore if already exists + } + } + + private static void deleteDatabase(String database) { + try (ISession session = openSession()) { + session.executeNonQueryStatement("DELETE DATABASE " + database); + } catch (Exception e) { + // ignore + } + } + + private static void dropTopic(String topicName) { + try (SubscriptionTreeSession subSession = new SubscriptionTreeSession(HOST, PORT)) { + subSession.open(); + subSession.dropTopic(topicName); + } catch (Exception e) { + // ignore + } + } + + private static void createTopic(String topicName, String path) throws Exception { + try (SubscriptionTreeSession subSession = new SubscriptionTreeSession(HOST, PORT)) { + subSession.open(); + try { + 
subSession.dropTopic(topicName); + } catch (Exception e) { + // ignore + } + + Properties topicConfig = new Properties(); + topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE); + topicConfig.put( + TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE); + topicConfig.put(TopicConstant.PATH_KEY, path); + subSession.createTopic(topicName, topicConfig); + System.out.println(" Created topic: " + topicName + " (path=" + path + ")"); + } + } + + private static SubscriptionTreePullConsumer createConsumer( + String consumerId, String consumerGroupId) throws Exception { + SubscriptionTreePullConsumer consumer = + new SubscriptionTreePullConsumer.Builder() + .host(HOST) + .port(PORT) + .consumerId(consumerId) + .consumerGroupId(consumerGroupId) + .autoCommit(false) + .buildPullConsumer(); + consumer.open(); + return consumer; + } + + // ============================ + // Polling & Verification + // ============================ + + /** + * Poll and commit messages. After reaching expectedRows, continues polling for 5 consecutive + * empty rounds to verify no extra data arrives. 
+ */ + private static PollResult pollUntilComplete( + SubscriptionTreePullConsumer consumer, int expectedRows, int maxPollAttempts) { + return pollUntilComplete(consumer, expectedRows, maxPollAttempts, 1000, true); + } + + private static PollResult pollUntilComplete( + SubscriptionTreePullConsumer consumer, + int expectedRows, + int maxPollAttempts, + long pollTimeoutMs, + boolean commitMessages) { + PollResult result = new PollResult(); + int consecutiveEmpty = 0; + + for (int attempt = 1; attempt <= maxPollAttempts; attempt++) { + List messages = consumer.poll(Duration.ofMillis(pollTimeoutMs)); + + if (messages.isEmpty()) { + consecutiveEmpty++; + // Normal completion: reached expected rows and verified quiescence + if (consecutiveEmpty >= 3 && result.totalRows >= expectedRows) { + System.out.println( + " Verified: " + + consecutiveEmpty + + " consecutive empty polls after " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Stuck: have data but cannot reach expected count + if (consecutiveEmpty >= 5 && result.totalRows > 0) { + System.out.println( + " Stuck: " + + consecutiveEmpty + + " consecutive empty polls at " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Never received anything + if (consecutiveEmpty >= 10 && result.totalRows == 0 && expectedRows > 0) { + System.out.println(" No data received after " + consecutiveEmpty + " polls"); + break; + } + try { + Thread.sleep(1000); + } catch (InterruptedException ignored) { + } + continue; + } + + consecutiveEmpty = 0; + + for (SubscriptionMessage message : messages) { + for (SubscriptionSessionDataSet dataSet : message.getSessionDataSetsHandler()) { + String device = null; + List columnNames = dataSet.getColumnNames(); + if (columnNames.size() > 1) { + String fullPath = columnNames.get(1); + int lastDot = fullPath.lastIndexOf('.'); + device = lastDot > 0 ? 
fullPath.substring(0, lastDot) : fullPath; + } + + while (dataSet.hasNext()) { + org.apache.tsfile.read.common.RowRecord record = dataSet.next(); + result.totalRows++; + if (device != null) { + result.rowsPerDevice.merge(device, 1, Integer::sum); + } + for (int i = 1; i < columnNames.size(); i++) { + result.seenColumns.add(columnNames.get(i)); + } + if (result.totalRows <= 5) { + System.out.println( + " Row: time=" + + record.getTimestamp() + + ", values=" + + record.getFields() + + ", device=" + + device); + } + } + } + if (commitMessages) { + consumer.commitSync(message); + } + } + + System.out.println( + " Poll attempt " + + attempt + + ": totalRows=" + + result.totalRows + + " / expected=" + + expectedRows); + + // Stop immediately if we exceeded the expected row count + if (expectedRows > 0 && result.totalRows > expectedRows) { + System.out.println( + " EXCEEDED: totalRows=" + result.totalRows + " > expectedRows=" + expectedRows); + break; + } + } + + return result; + } + + // ============================ + // Cleanup + // ============================ + + /** Clean up all test artifacts: unsubscribe, close consumer, drop topic, delete database. 
*/ + private static void cleanup( + SubscriptionTreePullConsumer consumer, String topicName, String database) { + if (consumer != null) { + try { + consumer.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopic(topicName); + deleteDatabase(database); + } + + // ============================ + // Result & Assertions + // ============================ + + static class PollResult { + int totalRows = 0; + Map rowsPerDevice = new HashMap<>(); + Set seenColumns = new HashSet<>(); + + @Override + public String toString() { + return "PollResult{totalRows=" + + totalRows + + ", rowsPerDevice=" + + rowsPerDevice + + ", seenColumns=" + + seenColumns + + "}"; + } + } + + private static void assertEquals(String msg, int expected, int actual) { + if (expected != actual) { + throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual); + } + } + + private static void assertTrue(String msg, boolean condition) { + if (!condition) { + throw new AssertionError(msg); + } + } + + private static void assertAtLeast(String msg, int min, int actual) { + if (actual < min) { + throw new AssertionError(msg + ": expected at least " + min + ", actual=" + actual); + } + } + + // ====================================================================== + // Test 1: Basic Flow (merged: BasicDataDelivery + MultiDevices + Flush) + // ====================================================================== + /** + * Verifies: + * + *
    + *
  • Data written BEFORE subscribe is NOT received + *
  • Multiple devices (d1, d2, d3) written AFTER subscribe are all received + *
  • Flush does not cause data loss (WAL pinning keeps entries available) + *
  • Exact row count matches expectation + *
+ */ + private static void testBasicFlow() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + // Step 1: Write initial data to create DataRegion (should NOT be received) + System.out.println(" Step 1: Writing initial data (should NOT be received)"); + try (ISession session = openSession()) { + createDatabase(session, database); + for (int i = 0; i < 50; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + // Also write to d2, d3 for multi-device readiness + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d3(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Step 2: Create topic and subscribe + System.out.println(" Step 2: Creating topic and subscribing"); + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 3: Write to 3 devices (30 rows each = 90 total), then flush + System.out.println(" Step 3: Writing 30 rows x 3 devices AFTER subscribe, then flush"); + try (ISession session = openSession()) { + for (int i = 100; i < 130; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d3(time, s1) VALUES (%d, %d)", database, i, i * 30)); + } + System.out.println(" Flushing..."); + session.executeNonQueryStatement("flush"); 
+ } + Thread.sleep(2000); + + // Step 4: Poll and verify + System.out.println(" Step 4: Polling..."); + PollResult result = pollUntilComplete(consumer, 90, 100); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 90 rows (30 per device)", 90, result.totalRows); + if (!result.rowsPerDevice.isEmpty()) { + System.out.println(" Rows per device: " + result.rowsPerDevice); + for (String dev : new String[] {"d1", "d2", "d3"}) { + Integer devRows = result.rowsPerDevice.get(database + "." + dev); + assertAtLeast("Expected rows from " + dev, 1, devRows != null ? devRows : 0); + } + } + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 2: Data Types (merged: MultipleDataTypes + Aligned + CrossPartition) + // ====================================================================== + /** + * Verifies: + * + *
    + *
  • Non-aligned: 6 data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) + *
  • Aligned: 6 data types, cross-partition timestamps (>1 week apart) + *
   *   <li>6 write methods: SQL single/multi-row, insertAlignedRecord/Records/Tablet/Tablets
   * </ul>
   */
  private static void testDataTypes() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;
    // Gap between timestamps forces rows into distinct time partitions (slightly over 1 week).
    final long GAP = 604_800_001L; // slightly over 1 week

    try {
      try (ISession session = openSession()) {
        createDatabase(session, database);
        // Create aligned timeseries
        session.executeNonQueryStatement(
            String.format(
                "CREATE ALIGNED TIMESERIES %s.d_aligned"
                    + "(s_int32 INT32, s_int64 INT64, s_float FLOAT,"
                    + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)",
                database));
        // Init rows to force DataRegion creation
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s_int32) VALUES (0, 0)", database));
        session.executeNonQueryStatement(
            String.format(
                "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
                    + " s_double, s_bool, s_text)"
                    + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')",
                database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      int totalExpected = 0;
      final String device = database + ".d_aligned";
      // NOTE(review): generic type arguments below were reconstructed from usage — confirm vs VCS.
      List<String> measurements =
          Arrays.asList("s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text");
      List<TSDataType> types =
          Arrays.asList(
              TSDataType.INT32,
              TSDataType.INT64,
              TSDataType.FLOAT,
              TSDataType.DOUBLE,
              TSDataType.BOOLEAN,
              TSDataType.TEXT);
      List<IMeasurementSchema> schemas = new ArrayList<>();
      schemas.add(new MeasurementSchema("s_int32", TSDataType.INT32));
      schemas.add(new MeasurementSchema("s_int64", TSDataType.INT64));
      schemas.add(new MeasurementSchema("s_float", TSDataType.FLOAT));
      schemas.add(new MeasurementSchema("s_double", TSDataType.DOUBLE));
      schemas.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN));
      schemas.add(new MeasurementSchema("s_text", TSDataType.TEXT));

      try (ISession session = openSession()) {
        // --- Part A: Non-aligned, 6 types x 20 rows ---
        System.out.println(" Part A: Non-aligned 6 data types x 20 rows");
        for (int i = 1; i <= 20; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d1(time, s_int32) VALUES (%d, %d)", database, i, i));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s_int64) VALUES (%d, %d)",
                  database, i, (long) i * 100000L));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s_float) VALUES (%d, %f)", database, i, i * 1.1f));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s_double) VALUES (%d, %f)", database, i, i * 2.2));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s_bool) VALUES (%d, %s)",
                  database, i, i % 2 == 0 ? "true" : "false"));
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s_text) VALUES (%d, 'text_%d')", database, i, i));
        }
        totalExpected += 120; // 6 types x 20 rows

        // --- Part B: Aligned cross-partition, 6 write methods ---
        System.out.println(" Part B: Aligned cross-partition, 6 write methods");

        // Method 1: SQL single row
        long t1 = 1;
        session.executeNonQueryStatement(
            String.format(
                "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
                    + " s_double, s_bool, s_text)"
                    + " VALUES (%d, 1, 100, 1.1, 1.11, true, 'sql_single')",
                database, t1));
        totalExpected += 1;

        // Method 2: SQL multi-row (cross-partition)
        long t2a = 1 + GAP;
        long t2b = 1 + 2 * GAP;
        session.executeNonQueryStatement(
            String.format(
                "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float,"
                    + " s_double, s_bool, s_text)"
                    + " VALUES (%d, 2, 200, 2.2, 2.22, false, 'sql_multi_a'),"
                    + " (%d, 3, 300, 3.3, 3.33, true, 'sql_multi_b')",
                database, t2a, t2b));
        totalExpected += 2;

        // Method 3: insertAlignedRecord
        long t3 = 1 + 3 * GAP;
        session.insertAlignedRecord(
            device,
            t3,
            measurements,
            types,
            Arrays.asList(4, 400L, 4.4f, 4.44, true, "record_single"));
        totalExpected += 1;

        // Method 4: insertAlignedRecordsOfOneDevice (cross-partition)
        long t4a = 1 + 4 * GAP;
        long t4b = 1 + 5 * GAP;
        session.insertAlignedRecordsOfOneDevice(
            device,
            Arrays.asList(t4a, t4b),
            Arrays.asList(measurements, measurements),
            Arrays.asList(types, types),
            Arrays.asList(
                Arrays.asList(5, 500L, 5.5f, 5.55, false, "records_a"),
                Arrays.asList(6, 600L, 6.6f, 6.66, true, "records_b")));
        totalExpected += 2;

        // Method 5: insertAlignedTablet (cross-partition)
        long t5a = 1 + 6 * GAP;
        long t5b = 1 + 7 * GAP;
        Tablet tablet5 = new Tablet(device, schemas, 2);
        addAlignedTabletRow(tablet5, 0, t5a, 7, 700L, 7.7f, 7.77, false, "tablet_a");
        addAlignedTabletRow(tablet5, 1, t5b, 8, 800L, 8.8f, 8.88, true, "tablet_b");
        session.insertAlignedTablet(tablet5);
        totalExpected += 2;

        // Method 6: insertAlignedTablets (cross-partition)
        long t6a = 1 + 8 * GAP;
        long t6b = 1 + 9 * GAP;
        Tablet tablet6 = new Tablet(device, schemas, 2);
        addAlignedTabletRow(tablet6, 0, t6a, 9, 900L, 9.9f, 9.99, false, "tablets_a");
        addAlignedTabletRow(tablet6, 1, t6b, 10, 1000L, 10.1f, 10.10, true, "tablets_b");
        Map<String, Tablet> tabletMap = new HashMap<>();
        tabletMap.put(device, tablet6);
        session.insertAlignedTablets(tabletMap);
        totalExpected += 2;
      }

      System.out.println(" Total expected rows: " + totalExpected);
      Thread.sleep(2000);

      PollResult result = pollUntilComplete(consumer, totalExpected, 150);
      System.out.println(" Result: " + result);

      assertAtLeast(
          "Expected at least " + totalExpected + " rows", totalExpected, result.totalRows);
      assertAtLeast("Expected multiple column types in result", 2, result.seenColumns.size());
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
  // ======================================================================
  // Test 3: Path Filtering (merged: DeviceLevel + TimeseriesLevel)
  // ======================================================================
  /**
   * Verifies:
   *
   * <ul>
   *   <li>Device-level: topic on d1.** does NOT deliver d2 data
   *   <li>Timeseries-level: topic on d1.s1 — lenient check for s2 filtering
   * </ul>
   */
  private static void testPathFiltering() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      try (ISession session = openSession()) {
        createDatabase(session, database);
        // Init rows force DataRegion creation before the topic/consumer are set up.
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1, s2) VALUES (0, 0, 0)", database));
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      // Topic filters d1.s1 only (timeseries-level)
      String filterPath = database + ".d1.s1";
      createTopic(topicName, filterPath);
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      System.out.println(" Writing to d1 (s1 + s2) and d2 (s1)");
      try (ISession session = openSession()) {
        for (int i = 100; i < 150; i++) {
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %d)",
                  database, i, i * 10, i * 20));
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 30));
        }
      }
      Thread.sleep(2000);

      System.out.println(" Polling (expecting d1 data only, ideally s1 only)...");
      PollResult result = pollUntilComplete(consumer, 50, 60);
      System.out.println(" Result: " + result);

      // Device-level: d2 must NOT appear
      if (!result.rowsPerDevice.isEmpty()) {
        Integer d2Rows = result.rowsPerDevice.get(database + ".d2");
        assertTrue("Expected NO rows from d2, but got " + d2Rows, d2Rows == null || d2Rows == 0);
        Integer d1Rows = result.rowsPerDevice.get(database + ".d1");
        assertAtLeast("Expected d1 rows", 1, d1Rows != null ? d1Rows : 0);
        System.out.println(" Device filtering verified: d1=" + d1Rows + ", d2=" + d2Rows);
      }

      // Timeseries-level: lenient check — some converters filter only at device granularity,
      // so receiving s2 is tolerated (downgraded to an informational message).
      boolean hasS2 = result.seenColumns.stream().anyMatch(c -> c.contains(".s2"));
      if (hasS2) {
        System.out.println(
            " INFO: Both s1 and s2 received — converter uses device-level filtering only.");
        assertAtLeast("Should have received d1 rows", 50, result.totalRows);
      } else {
        System.out.println(" Timeseries-level filtering verified: only s1 data received");
        assertEquals("Expected exactly 50 rows from d1.s1 only", 50, result.totalRows);
      }
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
+ */ + private static void testSubscribeBeforeRegion() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + System.out.println(" Step 1: Creating topic BEFORE database exists"); + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + System.out.println(" Step 2: Subscribing (no DataRegion exists yet)"); + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Step 3: Creating database and writing data (100 rows)"); + try (ISession session = openSession()) { + createDatabase(session, database); + for (int i = 0; i < 100; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + } + Thread.sleep(5000); + + System.out.println(" Step 4: Polling..."); + PollResult result = pollUntilComplete(consumer, 100, 100); + System.out.println(" Result: " + result); + + if (result.totalRows >= 100) { + System.out.println(" Auto-binding works! All " + result.totalRows + " rows received."); + } else if (result.totalRows > 0) { + System.out.println( + " Partial: " + result.totalRows + "/100 rows. First writes may precede binding."); + } else { + System.out.println(" No data received. Check logs for auto-binding messages."); + } + assertAtLeast( + "Expected some rows from subscribe-before-region (auto-binding)", 1, result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 6: Multi-Entity Isolation (merged: MultiConsumerGroup + MultiTopic) + // ====================================================================== + /** + * Verifies: + * + *
  // ======================================================================
  // Test 6: Multi-Entity Isolation (merged: MultiConsumerGroup + MultiTopic)
  // ======================================================================
  /**
   * Verifies:
   *
   * <ul>
   *   <li>Two consumer groups on same topic: each group gets ALL data independently
   *   <li>One consumer subscribes to two topics with different path filters: each topic delivers
   *       only matching data
   * </ul>
   */
  private static void testMultiEntityIsolation() throws Exception {
    String database = nextDatabase();
    String topicName1 = "topic_multi_" + testCounter + "_a";
    String topicName2 = "topic_multi_" + testCounter + "_b";
    String consumerGroupId1 = "cg_multi_" + testCounter + "_a";
    String consumerId1 = "consumer_multi_" + testCounter + "_a";
    String consumerGroupId2 = "cg_multi_" + testCounter + "_b";
    String consumerId2 = "consumer_multi_" + testCounter + "_b";
    SubscriptionTreePullConsumer consumer1 = null;
    SubscriptionTreePullConsumer consumer2 = null;

    try {
      // Setup: database with d1 and d2
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      // Topic 1: covers d1 only, Topic 2: covers d2 only
      createTopic(topicName1, database + ".d1.**");
      createTopic(topicName2, database + ".d2.**");
      Thread.sleep(1000);

      // Consumer 1 (group A): subscribes to BOTH topics
      consumer1 = createConsumer(consumerId1, consumerGroupId1);
      consumer1.subscribe(topicName1, topicName2);
      // Consumer 2 (group B): subscribes to BOTH topics
      consumer2 = createConsumer(consumerId2, consumerGroupId2);
      consumer2.subscribe(topicName1, topicName2);
      Thread.sleep(3000);

      // Write 30 rows to d1, 40 rows to d2
      System.out.println(" Writing 30 rows to d1, 40 rows to d2");
      try (ISession session = openSession()) {
        for (int i = 1; i <= 40; i++) {
          if (i <= 30) {
            session.executeNonQueryStatement(
                String.format(
                    "INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
          }
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20));
        }
      }
      Thread.sleep(2000);

      // Part A: Both groups should get 70 rows independently
      System.out.println(" Part A: Multi-group isolation");
      System.out.println(" Polling from group 1...");
      PollResult result1 = pollUntilComplete(consumer1, 70, 80);
      System.out.println(" Group 1 result: " + result1);

      System.out.println(" Polling from group 2...");
      PollResult result2 = pollUntilComplete(consumer2, 70, 80);
      System.out.println(" Group 2 result: " + result2);

      assertEquals("Group 1 should receive all 70 rows", 70, result1.totalRows);
      assertEquals("Group 2 should receive all 70 rows", 70, result2.totalRows);

      // Part B: Verify per-topic device isolation
      if (!result1.rowsPerDevice.isEmpty()) {
        Integer d1Rows = result1.rowsPerDevice.get(database + ".d1");
        Integer d2Rows = result1.rowsPerDevice.get(database + ".d2");
        assertEquals("Expected 30 rows from d1 (topic1)", 30, d1Rows != null ? d1Rows : 0);
        assertEquals("Expected 40 rows from d2 (topic2)", 40, d2Rows != null ? d2Rows : 0);
        System.out.println(" Multi-topic isolation verified: d1=" + d1Rows + ", d2=" + d2Rows);
      }
      System.out.println(
          " Multi-group isolation verified: group1="
              + result1.totalRows
              + ", group2="
              + result2.totalRows);
    } finally {
      // Manual teardown: two consumers/topics, so the shared cleanup() helper doesn't fit.
      // Each step is best-effort so a failure in one doesn't leak the others.
      if (consumer1 != null) {
        try {
          consumer1.unsubscribe(topicName1, topicName2);
        } catch (Exception e) {
          /* ignore */
        }
        try {
          consumer1.close();
        } catch (Exception e) {
          /* ignore */
        }
      }
      if (consumer2 != null) {
        try {
          consumer2.unsubscribe(topicName1, topicName2);
        } catch (Exception e) {
          /* ignore */
        }
        try {
          consumer2.close();
        } catch (Exception e) {
          /* ignore */
        }
      }
      dropTopic(topicName1);
      dropTopic(topicName2);
      deleteDatabase(database);
    }
  }
  // ======================================================================
  // Test 7: Burst Write Gap Recovery (NEW — tests C2 fix)
  // ======================================================================
  /**
   * Tests that burst writing beyond the pending queue capacity (4096) does not cause data loss. The
   * pending queue overflow triggers gaps, which should be recovered from WAL.
   *
   * <p>Mechanism: Each {@code IoTConsensusServerImpl.write()} call produces exactly one {@code
   * pendingEntries.offer()}. A single {@code session.insertTablet(tablet)} with N rows in one time
   * partition = 1 write() call = 1 offer, so Tablet batches rarely overflow the queue. To actually
   * overflow, we need 4096+ individual write() calls arriving faster than the prefetch thread can
   * drain. We achieve this with multiple concurrent writer threads, each performing individual SQL
   * INSERTs, to maximize the aggregate write rate vs. drain rate.
   *
   * <p>Note: Gap occurrence is inherently timing-dependent (race between writers and the prefetch
   * drain loop). This test maximizes the probability by using concurrent threads, but cannot
   * guarantee gap occurrence on every run. Check server logs for "gap detected" / "Filling from
   * WAL" messages to confirm the gap path was exercised.
   *
   * <p>Fix verified: C2 — gap entries are not skipped when WAL fill times out; they are deferred to
   * the next prefetch iteration.
   */
  private static void testBurstWriteGapRecovery() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      // Use multiple concurrent writer threads with individual SQL INSERTs.
      // Each INSERT → 1 IoTConsensusServerImpl.write() → 1 pendingEntries.offer().
      // With N threads writing concurrently, aggregate rate should exceed drain rate
      // and overflow the 4096-capacity queue, creating gaps.
      final int writerThreads = 4;
      final int rowsPerThread = 1500; // 4 * 1500 = 6000 total write() calls > 4096
      final int totalRows = writerThreads * rowsPerThread;
      final AtomicInteger errorCount = new AtomicInteger(0);
      final CountDownLatch startLatch = new CountDownLatch(1);
      final CountDownLatch doneLatch = new CountDownLatch(writerThreads);

      System.out.println(
          " Burst writing "
              + totalRows
              + " rows via "
              + writerThreads
              + " concurrent threads ("
              + rowsPerThread
              + " individual SQL INSERTs each)");
      System.out.println(
          " (Each INSERT = 1 WAL entry = 1 pendingEntries.offer(); " + "queue capacity = 4096)");

      ExecutorService executor = Executors.newFixedThreadPool(writerThreads);
      for (int t = 0; t < writerThreads; t++) {
        final int threadId = t;
        // Disjoint timestamp ranges per thread so total distinct rows == totalRows.
        final int startTs = threadId * rowsPerThread + 1;
        executor.submit(
            () -> {
              try {
                startLatch.await(); // all threads start at the same time
                try (ISession session = openSession()) {
                  for (int i = 0; i < rowsPerThread; i++) {
                    int ts = startTs + i;
                    session.executeNonQueryStatement(
                        String.format(
                            "INSERT INTO %s.d1(time, s1) VALUES (%d, %d)",
                            database, ts, (long) ts * 10));
                  }
                }
              } catch (Exception e) {
                System.out.println(" Writer thread " + threadId + " error: " + e.getMessage());
                errorCount.incrementAndGet();
              } finally {
                // Always counted down, even on failure, so doneLatch.await() cannot hang.
                doneLatch.countDown();
              }
            });
      }

      // Fire all threads simultaneously
      startLatch.countDown();
      doneLatch.await();
      executor.shutdown();

      if (errorCount.get() > 0) {
        System.out.println(" WARNING: " + errorCount.get() + " writer threads encountered errors");
      }

      // Do NOT add artificial delay — let the consumer compete with ongoing WAL writes
      System.out.println(
          " Polling (expecting " + totalRows + " rows, may need WAL gap recovery)...");
      System.out.println(
          " (Check server logs for 'gap detected' to confirm gap recovery was triggered)");
      // NOTE(review): extended pollUntilComplete overload — confirm the meaning of the
      // (6000, 2000, true) arguments against the helper's signature.
      PollResult result = pollUntilComplete(consumer, totalRows, 6000, 2000, true);
      System.out.println(" Result: " + result);

      assertEquals(
          "Expected exactly " + totalRows + " rows (no data loss despite pending queue overflow)",
          totalRows,
          result.totalRows);
    } finally {
      cleanup(consumer, topicName, database);
    }
  }
  // ======================================================================
  // Test 8: Commit After Unsubscribe (NEW — tests H7 fix)
  // ======================================================================
  /**
   * Tests that commit still works correctly after the consumer has unsubscribed (queue has been
   * torn down). The commit routing should use metadata-based topic config check instead of runtime
   * queue state.
   *
   * <p>Fix verified: H7 — commit routes via isConsensusBasedTopic() instead of hasQueue().
   */
  private static void testCommitAfterUnsubscribe() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      // Write data
      System.out.println(" Writing 50 rows");
      try (ISession session = openSession()) {
        for (int i = 1; i <= 50; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
        }
      }
      Thread.sleep(2000);

      // Poll WITHOUT commit — messages are retained so they can be committed later,
      // after the queue backing them has been torn down.
      System.out.println(" Polling WITHOUT commit...");
      List<SubscriptionMessage> uncommittedMessages = new ArrayList<>();
      int polledRows = 0;
      for (int attempt = 0; attempt < 60 && polledRows < 50; attempt++) {
        List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(2000));
        if (msgs.isEmpty()) {
          // Stop at the first empty poll once data has been seen; otherwise keep waiting.
          if (polledRows > 0) break;
          Thread.sleep(500);
          continue;
        }
        for (SubscriptionMessage msg : msgs) {
          uncommittedMessages.add(msg);
          for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
            while (ds.hasNext()) {
              ds.next();
              polledRows++;
            }
          }
        }
      }
      System.out.println(
          " Polled "
              + polledRows
              + " rows, holding "
              + uncommittedMessages.size()
              + " uncommitted messages");
      assertAtLeast("Should have polled some rows before unsubscribe", 1, polledRows);

      // Unsubscribe (tears down the consensus queue)
      System.out.println(" Unsubscribing (queue teardown)...");
      consumer.unsubscribe(topicName);
      Thread.sleep(2000);

      // Now commit the previously polled messages — should NOT throw
      System.out.println(
          " Committing " + uncommittedMessages.size() + " messages AFTER unsubscribe...");
      boolean commitSucceeded = true;
      for (SubscriptionMessage msg : uncommittedMessages) {
        try {
          consumer.commitSync(msg);
        } catch (Exception e) {
          System.out.println(" Commit threw exception: " + e.getMessage());
          commitSucceeded = false;
        }
      }

      // The commit may silently succeed or fail gracefully — the key is no crash
      System.out.println(" Commit after unsubscribe completed. Success=" + commitSucceeded);
      System.out.println(" (Key: no exception crash, routing handled gracefully)");
    } finally {
      // Already unsubscribed above, so only close the consumer and drop the fixtures.
      if (consumer != null) {
        try {
          consumer.close();
        } catch (Exception e) {
          /* ignore */
        }
      }
      dropTopic(topicName);
      deleteDatabase(database);
    }
  }
  // ======================================================================
  // Test 9: Seek (seekToBeginning, seekToEnd, seek by timestamp)
  // ======================================================================
  /**
   * Verifies all three seek operations in a single flow:
   *
   * <ul>
   *   <li>seekToBeginning — re-delivers previously committed data from earliest available position
   *   <li>seekToEnd — skips all existing data, only new writes are received
   *   <li>seek(timestamp) — positions at the approximate WAL entry matching the given timestamp
   * </ul>
   */
  private static void testSeek() throws Exception {
    String database = nextDatabase();
    String topicName = nextTopic();
    String consumerGroupId = nextConsumerGroup();
    String consumerId = nextConsumerId();
    SubscriptionTreePullConsumer consumer = null;

    try {
      // Step 0: Create DataRegion
      try (ISession session = openSession()) {
        createDatabase(session, database);
        session.executeNonQueryStatement(
            String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database));
        session.executeNonQueryStatement("flush");
      }
      Thread.sleep(2000);

      // Step 1: Create topic + consumer + subscribe
      System.out.println(" Step 1: Create topic and subscribe");
      createTopic(topicName, database + ".**");
      Thread.sleep(1000);

      consumer = createConsumer(consumerId, consumerGroupId);
      consumer.subscribe(topicName);
      Thread.sleep(3000);

      // Step 2: Write 1000 rows with timestamps 1000..1999 and poll+commit all
      System.out.println(" Step 2: Write 1000 rows (timestamps 1000..1999) and poll+commit");
      try (ISession session = openSession()) {
        for (int i = 0; i < 1000; i++) {
          long ts = 1000 + i;
          session.executeNonQueryStatement(
              String.format(
                  "INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, ts, ts * 10));
        }
      }
      Thread.sleep(2000);

      PollResult firstPoll = pollUntilComplete(consumer, 1000, 120);
      System.out.println(" First poll: " + firstPoll.totalRows + " rows");
      assertAtLeast("First poll should get rows", 1, firstPoll.totalRows);

      // ------------------------------------------------------------------
      // Step 3: seekToBeginning — should re-deliver data from the start
      // ------------------------------------------------------------------
      System.out.println(" Step 3: seekToBeginning → expect re-delivery");
      consumer.seekToBeginning(topicName);
      Thread.sleep(2000);

      // expectedRows=1001: 1000 from Step 2 + 1 from Step 0 initial INSERT (if WAL not yet cleaned)
      PollResult beginningPoll = pollUntilComplete(consumer, 1001, 120);
      System.out.println(" After seekToBeginning: " + beginningPoll);
      assertAtLeast(
          "seekToBeginning should re-deliver rows (WAL retention permitting)",
          1,
          beginningPoll.totalRows);

      // ------------------------------------------------------------------
      // Step 4: seekToEnd — should receive nothing until new writes
      // ------------------------------------------------------------------
      System.out.println(" Step 4: seekToEnd → expect no old data");
      consumer.seekToEnd(topicName);
      Thread.sleep(2000);

      // Drain loop: stop after 5 consecutive empty polls (steady state reached).
      PollResult endPoll = new PollResult();
      int consecutiveEmpty = 0;
      for (int attempt = 0; attempt < 15; attempt++) {
        List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(1000));
        if (msgs.isEmpty()) {
          consecutiveEmpty++;
          if (consecutiveEmpty >= 5) break;
          Thread.sleep(500);
          continue;
        }
        consecutiveEmpty = 0;
        for (SubscriptionMessage msg : msgs) {
          for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
            while (ds.hasNext()) {
              ds.next();
              endPoll.totalRows++;
            }
          }
          consumer.commitSync(msg);
        }
      }
      System.out.println(" After seekToEnd (no new writes): " + endPoll.totalRows + " rows");
      // May occasionally be 1 due to prefetch thread race; tolerate small values
      assertTrue("seekToEnd should yield at most 1 row (race tolerance)", endPoll.totalRows <= 1);

      // Write 200 new rows — they should be received
      System.out.println(" Writing 200 new rows after seekToEnd");
      try (ISession session = openSession()) {
        for (int i = 2000; i < 2200; i++) {
          session.executeNonQueryStatement(
              String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10));
        }
      }
      Thread.sleep(2000);

      PollResult afterEndPoll = pollUntilComplete(consumer, 200, 120);
      System.out.println(" After seekToEnd + new writes: " + afterEndPoll);
      assertEquals(
          "Should receive exactly 200 new rows after seekToEnd", 200, afterEndPoll.totalRows);

      // ------------------------------------------------------------------
      // Step 5: seek(timestamp) — seek to midpoint timestamp 1500
      // ------------------------------------------------------------------
      System.out.println(" Step 5: seek(1500) → expect rows from near midpoint");
      consumer.seek(topicName, 1500);
      Thread.sleep(2000);

      // With 1000 rows (ts=1000..1999) + 200 rows (ts=2000..2199), sparse mapping (interval=100)
      // produces ~12 samples. seek(1500) should position near ts=1500.
      // Minimum expected: 500 rows (ts=1500..1999) + 200 rows (ts=2000..2199) = 700
      // May get more due to sparse mapping imprecision (up to ~100 extra rows)
      PollResult afterSeek = pollUntilComplete(consumer, 1201, 120);
      System.out.println(" After seek(1500): " + afterSeek.totalRows + " rows");
      assertAtLeast(
          "seek(1500) should deliver at least 700 rows (ts >= 1500)", 700, afterSeek.totalRows);

      // ------------------------------------------------------------------
      // Step 6: seek(future timestamp) — expect 0 rows
      // ------------------------------------------------------------------
      System.out.println(" Step 6: seek(99999) → expect no data");
      consumer.seek(topicName, 99999);
      Thread.sleep(2000);

      PollResult futurePoll = new PollResult();
      consecutiveEmpty = 0;
      for (int attempt = 0; attempt < 10; attempt++) {
        List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(1000));
        if (msgs.isEmpty()) {
          consecutiveEmpty++;
          if (consecutiveEmpty >= 5) break;
          Thread.sleep(500);
          continue;
        }
        consecutiveEmpty = 0;
        for (SubscriptionMessage msg : msgs) {
          for (SubscriptionSessionDataSet ds : msg.getSessionDataSetsHandler()) {
            while (ds.hasNext()) {
              ds.next();
              futurePoll.totalRows++;
            }
          }
          consumer.commitSync(msg);
        }
      }
      System.out.println(" After seek(99999): " + futurePoll.totalRows + " rows");
      // seek(99999) should behave like seekToEnd — 0 rows normally,
      // but may yield up to 1 row due to prefetch thread race (same as seekToEnd)
      assertTrue(
          "seek(future) should yield at most 1 row (race tolerance)", futurePoll.totalRows <= 1);

      System.out.println(" testSeek passed all sub-tests!");
    } finally {
      cleanup(consumer, topicName, database);
    }
  }

  /** Helper: populate one row of an aligned Tablet with all 6 data types. */
  private static void addAlignedTabletRow(
      Tablet tablet,
      int rowIndex,
      long timestamp,
      int intVal,
      long longVal,
      float floatVal,
      double doubleVal,
      boolean boolVal,
      String textVal) {
    tablet.addTimestamp(rowIndex, timestamp);
    tablet.addValue("s_int32", rowIndex, intVal);
    tablet.addValue("s_int64", rowIndex, longVal);
    tablet.addValue("s_float", rowIndex, floatVal);
    tablet.addValue("s_double", rowIndex, doubleVal);
    tablet.addValue("s_bool", rowIndex, boolVal);
    // TEXT values are wrapped as Binary using the TsFile string charset.
    tablet.addValue("s_text", rowIndex, new Binary(textVal, TSFileConfig.STRING_CHARSET));
  }
}
PipeSubscribeRequestType { CLOSE((short) 4), SUBSCRIBE((short) 5), UNSUBSCRIBE((short) 6), + SEEK((short) 7), ; private final short type; diff --git a/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReq.java b/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReq.java new file mode 100644 index 0000000000000..3cfb8cc6dad03 --- /dev/null +++ b/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReq.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.request; + +import org.apache.iotdb.service.rpc.thrift.TPipeSubscribeReq; + +import org.apache.tsfile.utils.PublicBAOS; +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; + +public class PipeSubscribeSeekReq extends TPipeSubscribeReq { + + /** Seek type constants. 
   */
  public static final short SEEK_TO_BEGINNING = 1;

  public static final short SEEK_TO_END = 2;
  public static final short SEEK_TO_TIMESTAMP = 3;

  // transient: these fields travel only through the serialized 'body' buffer below,
  // never through thrift field serialization of the superclass.
  private transient String topicName;
  private transient short seekType;
  private transient long timestamp; // only meaningful when seekType == SEEK_TO_TIMESTAMP

  public String getTopicName() {
    return topicName;
  }

  public short getSeekType() {
    return seekType;
  }

  public long getTimestamp() {
    return timestamp;
  }

  /////////////////////////////// Thrift ///////////////////////////////

  /**
   * Serialize the incoming parameters into {@code PipeSubscribeSeekReq}, called by the subscription
   * client.
   *
   * @param topicName topic whose consumption position is repositioned
   * @param seekType one of {@link #SEEK_TO_BEGINNING}, {@link #SEEK_TO_END}, {@link
   *     #SEEK_TO_TIMESTAMP}
   * @param timestamp target position; written to the wire only when seekType == SEEK_TO_TIMESTAMP
   * @throws IOException if writing to the in-memory buffer fails
   */
  public static PipeSubscribeSeekReq toTPipeSubscribeReq(
      final String topicName, final short seekType, final long timestamp) throws IOException {
    final PipeSubscribeSeekReq req = new PipeSubscribeSeekReq();

    req.topicName = topicName;
    req.seekType = seekType;
    req.timestamp = timestamp;

    req.version = PipeSubscribeRequestVersion.VERSION_1.getVersion();
    req.type = PipeSubscribeRequestType.SEEK.getType();
    try (final PublicBAOS byteArrayOutputStream = new PublicBAOS();
        final DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) {
      ReadWriteIOUtils.write(topicName, outputStream);
      ReadWriteIOUtils.write(seekType, outputStream);
      // Timestamp is written conditionally; fromTPipeSubscribeReq mirrors this check,
      // keeping the wire format minimal for the two positional seek types.
      if (seekType == SEEK_TO_TIMESTAMP) {
        ReadWriteIOUtils.write(timestamp, outputStream);
      }
      req.body = ByteBuffer.wrap(byteArrayOutputStream.getBuf(), 0, byteArrayOutputStream.size());
    }

    return req;
  }

  /**
   * Deserialize {@code TPipeSubscribeReq} to obtain parameters, called by the subscription server.
*/ + public static PipeSubscribeSeekReq fromTPipeSubscribeReq(final TPipeSubscribeReq seekReq) { + final PipeSubscribeSeekReq req = new PipeSubscribeSeekReq(); + + if (Objects.nonNull(seekReq.body) && seekReq.body.hasRemaining()) { + req.topicName = ReadWriteIOUtils.readString(seekReq.body); + req.seekType = ReadWriteIOUtils.readShort(seekReq.body); + if (req.seekType == SEEK_TO_TIMESTAMP) { + req.timestamp = ReadWriteIOUtils.readLong(seekReq.body); + } + } + + req.version = seekReq.version; + req.type = seekReq.type; + req.body = seekReq.body; + + return req; + } + + /////////////////////////////// Object /////////////////////////////// + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + final PipeSubscribeSeekReq that = (PipeSubscribeSeekReq) obj; + return Objects.equals(this.topicName, that.topicName) + && this.seekType == that.seekType + && this.timestamp == that.timestamp + && this.version == that.version + && this.type == that.type + && Objects.equals(this.body, that.body); + } + + @Override + public int hashCode() { + return Objects.hash(topicName, seekType, timestamp, version, type, body); + } +} diff --git a/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/subscription/payload/response/PipeSubscribeSeekResp.java b/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/subscription/payload/response/PipeSubscribeSeekResp.java new file mode 100644 index 0000000000000..fc85ad71ced64 --- /dev/null +++ b/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/subscription/payload/response/PipeSubscribeSeekResp.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.response; + +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.service.rpc.thrift.TPipeSubscribeResp; + +import java.util.Objects; + +public class PipeSubscribeSeekResp extends TPipeSubscribeResp { + + /////////////////////////////// Thrift /////////////////////////////// + + /** + * Serialize the incoming parameters into {@code PipeSubscribeSeekResp}, called by the + * subscription server. + */ + public static PipeSubscribeSeekResp toTPipeSubscribeResp(final TSStatus status) { + final PipeSubscribeSeekResp resp = new PipeSubscribeSeekResp(); + + resp.status = status; + resp.version = PipeSubscribeResponseVersion.VERSION_1.getVersion(); + resp.type = PipeSubscribeResponseType.ACK.getType(); + + return resp; + } + + /** Deserialize {@code TPipeSubscribeResp} to obtain parameters, called by the subscription client. 
*/ + public static PipeSubscribeSeekResp fromTPipeSubscribeResp( + final TPipeSubscribeResp seekResp) { + final PipeSubscribeSeekResp resp = new PipeSubscribeSeekResp(); + + resp.status = seekResp.status; + resp.version = seekResp.version; + resp.type = seekResp.type; + resp.body = seekResp.body; + + return resp; + } + + /////////////////////////////// Object /////////////////////////////// + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + final PipeSubscribeSeekResp that = (PipeSubscribeSeekResp) obj; + return Objects.equals(this.status, that.status) + && this.version == that.version + && this.type == that.type + && Objects.equals(this.body, that.body); + } + + @Override + public int hashCode() { + return Objects.hash(status, version, type, body); + } +} diff --git a/iotdb-client/session/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java b/iotdb-client/session/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java index a12340e9d7662..6cdf4e8288760 100644 --- a/iotdb-client/session/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java +++ b/iotdb-client/session/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java @@ -39,6 +39,7 @@ import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponse; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType; import org.apache.iotdb.rpc.subscription.payload.poll.TabletsPayload; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.apache.iotdb.session.subscription.consumer.AsyncCommitCallback; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import 
org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; @@ -374,6 +375,44 @@ private void unsubscribe(Set topicNames, final boolean needParse) } } + /////////////////////////////// seek /////////////////////////////// + + /** + * Seeks to the earliest available WAL position. Actual position depends on WAL retention — old + * segments may have been reclaimed. + */ + public void seekToBeginning(final String topicName) throws SubscriptionException { + checkIfOpened(); + seekInternal(topicName, PipeSubscribeSeekReq.SEEK_TO_BEGINNING, 0); + } + + /** Seeks to the current WAL tail. Only newly written data will be consumed after this. */ + public void seekToEnd(final String topicName) throws SubscriptionException { + checkIfOpened(); + seekInternal(topicName, PipeSubscribeSeekReq.SEEK_TO_END, 0); + } + + /** + * Seeks to the earliest WAL entry whose data timestamp >= targetTimestamp. Each node independently + * locates its own position, so this works correctly across multi-leader replicas. + */ + public void seek(final String topicName, final long targetTimestamp) + throws SubscriptionException { + checkIfOpened(); + seekInternal(topicName, PipeSubscribeSeekReq.SEEK_TO_TIMESTAMP, targetTimestamp); + } + + private void seekInternal( + final String topicName, final short seekType, final long timestamp) + throws SubscriptionException { + providers.acquireReadLock(); + try { + seekWithRedirection(topicName, seekType, timestamp); + } finally { + providers.releaseReadLock(); + } + } + /////////////////////////////// subscription provider /////////////////////////////// protected abstract AbstractSubscriptionProvider constructSubscriptionProvider( @@ -1373,6 +1412,44 @@ private void unsubscribeWithRedirection(final Set topicNames) throw new SubscriptionRuntimeCriticalException(errorMessage); } + /** + * Sends seek request to ALL available providers. 
Unlike subscribe/unsubscribe, seek must reach + * every node because data regions for the topic may be distributed across different nodes. + */ + private void seekWithRedirection( + final String topicName, final short seekType, final long timestamp) + throws SubscriptionException { + final List providers = this.providers.getAllAvailableProviders(); + if (providers.isEmpty()) { + throw new SubscriptionConnectionException( + String.format( + "Cluster has no available subscription providers when %s seek topic %s", + this, topicName)); + } + boolean anySuccess = false; + for (final AbstractSubscriptionProvider provider : providers) { + try { + provider.seek(topicName, seekType, timestamp); + anySuccess = true; + } catch (final Exception e) { + LOGGER.warn( + "{} failed to seek topic {} from subscription provider {}, continuing with other providers...", + this, + topicName, + provider, + e); + } + } + if (!anySuccess) { + final String errorMessage = + String.format( + "%s failed to seek topic %s from all available subscription providers %s", + this, topicName, providers); + LOGGER.warn(errorMessage); + throw new SubscriptionRuntimeCriticalException(errorMessage); + } + } + Map fetchAllEndPointsWithRedirection() throws SubscriptionException { final List providers = this.providers.getAllAvailableProviders(); if (providers.isEmpty()) { diff --git a/iotdb-client/session/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java b/iotdb-client/session/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java index 9bf119c76c428..67b752a5930a7 100644 --- a/iotdb-client/session/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java +++ b/iotdb-client/session/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java @@ -42,6 +42,7 @@ import 
org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeHandshakeReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeHeartbeatReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribePollReq; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSubscribeReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeUnsubscribeReq; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeHandshakeResp; @@ -316,6 +317,34 @@ Map unsubscribe(final Set topicNames) throws Subscr return unsubscribeResp.getTopics(); } + void seek(final String topicName, final short seekType, final long timestamp) + throws SubscriptionException { + final PipeSubscribeSeekReq req; + try { + req = PipeSubscribeSeekReq.toTPipeSubscribeReq(topicName, seekType, timestamp); + } catch (final IOException e) { + LOGGER.warn( + "IOException occurred when SubscriptionProvider {} serialize seek request for topic {}", + this, + topicName, + e); + throw new SubscriptionRuntimeNonCriticalException(e.getMessage(), e); + } + final TPipeSubscribeResp resp; + try { + resp = getSessionConnection().pipeSubscribe(req); + } catch (final TException | IoTDBConnectionException e) { + LOGGER.warn( + "TException/IoTDBConnectionException occurred when SubscriptionProvider {} seek with request for topic {}, set SubscriptionProvider unavailable", + this, + topicName, + e); + setUnavailable(); + throw new SubscriptionConnectionException(e.getMessage(), e); + } + verifyPipeSubscribeSuccess(resp.status); + } + List poll(final Set topicNames, final long timeoutMs) throws SubscriptionException { return poll( diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java index cb5edd8cd91a3..6b71d5b16f79a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java @@ -39,6 +39,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TSubscribeReq; import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -52,6 +53,7 @@ import java.util.HashSet; import java.util.List; import java.util.Objects; +import java.util.Set; import java.util.stream.Collectors; public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndPipeProcedure { @@ -66,6 +68,8 @@ public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndP private AlterConsumerGroupProcedure alterConsumerGroupProcedure; private List createPipeProcedures = new ArrayList<>(); + private Set consensusTopicNames = new HashSet<>(); + // TODO: remove this variable later private final List alterTopicProcedures = new ArrayList<>(); // unused now @@ -103,15 +107,41 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env) alterConsumerGroupProcedure = new AlterConsumerGroupProcedure(updatedConsumerGroupMeta, subscriptionInfo); - // Construct CreatePipeProcedureV2s + // Construct CreatePipeProcedureV2s (for non-consensus topics) for (final String topicName : subscribeReq.getTopicNames()) { + final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName); + + // Check if this topic should use consensus subscription: mode is live, 
format is not tsfile-handler + final String topicMode = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE); + final String topicFormat = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE); + final boolean isConsensusBasedTopic = + TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat); + + if (isConsensusBasedTopic) { + // skip pipe creation + consensusTopicNames.add(topicName); + LOGGER.info( + "CreateSubscriptionProcedure: topic [{}] uses consensus-based subscription " + + "(mode={}, format={}), skipping pipe creation", + topicName, + topicMode, + topicFormat); + continue; + } + + final String pipeName = PipeStaticMeta.generateSubscriptionPipeName(topicName, consumerGroupId); + if (!subscriptionInfo.get().isTopicSubscribedByConsumerGroup(topicName, consumerGroupId) // even if there existed subscription meta, if there is no corresponding pipe meta, it // will try to create the pipe || !pipeTaskInfo.get().isPipeExisted(pipeName)) { - final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName); createPipeProcedures.add( new CreatePipeProcedureV2( new TCreatePipeReq() 
"Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.", - pipeNames, subscribeReq, exceptionMessage)); + if (!consensusTopicNames.isEmpty()) { + LOGGER.info( + "CreateSubscriptionProcedure: consensus-based topics {} will be handled by DataNode " + + "via consumer group meta push (no pipe creation needed)", + consensusTopicNames); + } + + // Push pipe meta to data nodes (only for non-consensus pipe-based topics) + if (!createPipeProcedures.isEmpty()) { + final List pipeNames = + createPipeProcedures.stream() + .map(CreatePipeProcedureV2::getPipeName) + .collect(Collectors.toList()); + final String exceptionMessage = + AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe( + null, pushMultiPipeMetaToDataNodes(pipeNames, env)); + if (!exceptionMessage.isEmpty()) { + // throw exception instead of logging warn, do not rely on metadata synchronization + throw new SubscriptionException( + String.format( + "Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.", + pipeNames, subscribeReq, exceptionMessage)); + } } } @@ -297,6 +336,12 @@ public void serialize(final DataOutputStream stream) throws IOException { } else { ReadWriteIOUtils.write(false, stream); } + + // Serialize consensus topic names + ReadWriteIOUtils.write(consensusTopicNames.size(), stream); + for (final String consensusTopicName : consensusTopicNames) { + ReadWriteIOUtils.write(consensusTopicName, stream); + } } @Override @@ -348,6 +393,14 @@ public void deserialize(final ByteBuffer byteBuffer) { } } } + + // Deserialize consensus topic names + if (byteBuffer.hasRemaining()) { + size = ReadWriteIOUtils.readInt(byteBuffer); + for (int i = 0; i < size; ++i) { + consensusTopicNames.add(ReadWriteIOUtils.readString(byteBuffer)); + } + } } @Override @@ -364,7 +417,8 @@ public boolean equals(final Object o) { && getCycles() == that.getCycles() && 
Objects.equals(subscribeReq, that.subscribeReq) && Objects.equals(alterConsumerGroupProcedure, that.alterConsumerGroupProcedure) - && Objects.equals(createPipeProcedures, that.createPipeProcedures); + && Objects.equals(createPipeProcedures, that.createPipeProcedures) + && Objects.equals(consensusTopicNames, that.consensusTopicNames); } @Override @@ -375,7 +429,8 @@ public int hashCode() { getCycles(), subscribeReq, alterConsumerGroupProcedure, - createPipeProcedures); + createPipeProcedures, + consensusTopicNames); } @TestOnly diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java index 6741a6c1e2a84..99f8ed649d852 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java @@ -22,6 +22,7 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.pipe.agent.task.meta.PipeStaticMeta; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta; +import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; import org.apache.iotdb.commons.utils.TestOnly; import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.task.DropPipePlanV2; @@ -36,6 +37,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq; import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; import 
org.apache.tsfile.utils.ReadWriteIOUtils; @@ -100,6 +102,31 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env) for (final String topic : unsubscribeReq.getTopicNames()) { if (topicsUnsubByGroup.contains(topic)) { + // Check if this topic uses consensus-based subscription (same detection as + // CreateSubscriptionProcedure). Consensus topics have no pipe to drop. + final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topic); + final String topicMode = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE); + final String topicFormat = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE); + final boolean isConsensusBasedTopic = + TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat); + + if (isConsensusBasedTopic) { + LOGGER.info( + "DropSubscriptionProcedure: topic [{}] is consensus-based (mode={}, format={}), " + + "skipping pipe removal", + topic, + topicMode, + topicFormat); + continue; + } + // Topic will be subscribed by no consumers in this group dropPipeProcedures.add( new DropPipeProcedureV2( diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java index 32c4664b60dfd..738a72c4bc4ec 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java @@ -323,6 +323,7 @@ public static class Replication { private final IMemoryBlock consensusMemoryBlock; private final double maxMemoryRatioForQueue; private final long regionMigrationSpeedLimitBytesPerSecond; + private final long subscriptionWalRetentionSizeInBytes; private Replication( int 
maxLogEntriesNumPerBatch, @@ -338,7 +339,8 @@ private Replication( long checkpointGap, IMemoryBlock consensusMemoryBlock, double maxMemoryRatioForQueue, - long regionMigrationSpeedLimitBytesPerSecond) { + long regionMigrationSpeedLimitBytesPerSecond, + long subscriptionWalRetentionSizeInBytes) { this.maxLogEntriesNumPerBatch = maxLogEntriesNumPerBatch; this.maxSizePerBatch = maxSizePerBatch; this.maxPendingBatchesNum = maxPendingBatchesNum; @@ -353,6 +355,7 @@ private Replication( this.consensusMemoryBlock = consensusMemoryBlock; this.maxMemoryRatioForQueue = maxMemoryRatioForQueue; this.regionMigrationSpeedLimitBytesPerSecond = regionMigrationSpeedLimitBytesPerSecond; + this.subscriptionWalRetentionSizeInBytes = subscriptionWalRetentionSizeInBytes; } public int getMaxLogEntriesNumPerBatch() { @@ -411,6 +414,10 @@ public long getRegionMigrationSpeedLimitBytesPerSecond() { return regionMigrationSpeedLimitBytesPerSecond; } + public long getSubscriptionWalRetentionSizeInBytes() { + return subscriptionWalRetentionSizeInBytes; + } + public static Replication.Builder newBuilder() { return new Replication.Builder(); } @@ -434,6 +441,7 @@ public static class Builder { "Consensus-Default", null, Runtime.getRuntime().maxMemory() / 10); private double maxMemoryRatioForQueue = 0.6; private long regionMigrationSpeedLimitBytesPerSecond = 32 * 1024 * 1024L; + private long subscriptionWalRetentionSizeInBytes = 0; public Replication.Builder setMaxLogEntriesNumPerBatch(int maxLogEntriesNumPerBatch) { this.maxLogEntriesNumPerBatch = maxLogEntriesNumPerBatch; @@ -508,6 +516,12 @@ public Builder setRegionMigrationSpeedLimitBytesPerSecond( return this; } + public Builder setSubscriptionWalRetentionSizeInBytes( + long subscriptionWalRetentionSizeInBytes) { + this.subscriptionWalRetentionSizeInBytes = subscriptionWalRetentionSizeInBytes; + return this; + } + public Replication build() { return new Replication( maxLogEntriesNumPerBatch, @@ -523,7 +537,8 @@ public Replication build() { 
checkpointGap, consensusMemoryBlock, maxMemoryRatioForQueue, - regionMigrationSpeedLimitBytesPerSecond); + regionMigrationSpeedLimitBytesPerSecond, + subscriptionWalRetentionSizeInBytes); } } } diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java index 959191ca2d6d3..8cb168272b295 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java @@ -82,6 +82,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.BiConsumer; +import java.util.function.Consumer; import java.util.stream.Collectors; public class IoTConsensus implements IConsensus { @@ -98,6 +99,19 @@ public class IoTConsensus implements IConsensus { private final IoTConsensusRPCService service; private final RegisterManager registerManager = new RegisterManager(); private IoTConsensusConfig config; + + /** + * Optional callback invoked after a new local peer is created via {@link #createLocalPeer}. Used + * by the subscription system to auto-bind prefetching queues to new DataRegions. + */ + public static volatile BiConsumer onNewPeerCreated; + + /** + * Optional callback invoked before a local peer is deleted via {@link #deleteLocalPeer}. Used by + * the subscription system to unbind and clean up prefetching queues before the region is removed. 
+ */ + public static volatile Consumer onPeerRemoved; + private final IClientManager clientManager; private final IClientManager syncClientManager; private final ScheduledExecutorService backgroundTaskService; @@ -299,11 +313,33 @@ public void createLocalPeer(ConsensusGroupId groupId, List peers) if (exist.get()) { throw new ConsensusGroupAlreadyExistException(groupId); } + + // Notify subscription system about new peer creation for auto-binding + final BiConsumer callback = onNewPeerCreated; + if (callback != null) { + try { + callback.accept(groupId, stateMachineMap.get(groupId)); + } catch (final Exception e) { + logger.warn("onNewPeerCreated callback failed for group {}", groupId, e); + } + } } @Override public void deleteLocalPeer(ConsensusGroupId groupId) throws ConsensusException { KillPoint.setKillPoint(IoTConsensusDeleteLocalPeerKillPoints.BEFORE_DELETE); + + // Notify subscription system before stopping the peer, so that subscription queues can + // properly unregister from the still-alive serverImpl. 
+ final Consumer removeCallback = onPeerRemoved; + if (removeCallback != null) { + try { + removeCallback.accept(groupId); + } catch (final Exception e) { + logger.warn("onPeerRemoved callback failed for group {}", groupId, e); + } + } + AtomicBoolean exist = new AtomicBoolean(false); stateMachineMap.computeIfPresent( groupId, diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java index 567261efffffa..7dfef6a71372a 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java @@ -89,7 +89,9 @@ import java.util.PriorityQueue; import java.util.TreeSet; import java.util.UUID; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; @@ -128,6 +130,11 @@ public class IoTConsensusServerImpl { IoTConsensusRateLimiter.getInstance(); private IndexedConsensusRequest lastConsensusRequest; + // Subscription queues receive IndexedConsensusRequest in real-time from write(), + // similar to LogDispatcher, enabling in-memory data delivery without waiting for WAL flush. + private final List> subscriptionQueues = + new CopyOnWriteArrayList<>(); + public IoTConsensusServerImpl( String storageDir, Peer thisNode, @@ -236,6 +243,44 @@ public TSStatus write(IConsensusRequest request) { // in one transaction. synchronized (searchIndex) { logDispatcher.offer(indexedConsensusRequest); + // Deliver to subscription queues for real-time in-memory consumption. 
+ // Offer AFTER stateMachine.write() so that InsertNode has inferred types + // and properly typed values (same timing as LogDispatcher). + final int sqCount = subscriptionQueues.size(); + if (sqCount > 0) { + logger.debug( + "write() offering to {} subscription queue(s), " + + "group={}, searchIndex={}, requestType={}", + sqCount, + consensusGroupId, + indexedConsensusRequest.getSearchIndex(), + indexedConsensusRequest.getRequests().isEmpty() + ? "EMPTY" + : indexedConsensusRequest.getRequests().get(0).getClass().getSimpleName()); + for (final BlockingQueue sq : subscriptionQueues) { + final boolean offered = sq.offer(indexedConsensusRequest); + logger.debug( + "offer result={}, queueSize={}, queueRemaining={}", + offered, + sq.size(), + sq.remainingCapacity()); + if (!offered) { + logger.warn( + "Subscription queue full, dropped entry searchIndex={}", + indexedConsensusRequest.getSearchIndex()); + } + } + } else { + // Log periodically when no subscription queues are registered + if (indexedConsensusRequest.getSearchIndex() % 50 == 0) { + logger.debug( + "write() no subscription queues registered, " + + "group={}, searchIndex={}, this={}", + consensusGroupId, + indexedConsensusRequest.getSearchIndex(), + System.identityHashCode(this)); + } + } searchIndex.incrementAndGet(); } // statistic the time of offering request into queue @@ -243,10 +288,13 @@ public TSStatus write(IConsensusRequest request) { System.nanoTime() - writeToStateMachineEndTime); } else { logger.debug( - "{}: write operation failed. searchIndex: {}. Code: {}", + "write operation FAILED. 
group={}, searchIndex={}, code={}, " + + "subscriptionQueues={}, this={}", thisNode.getGroupId(), indexedConsensusRequest.getSearchIndex(), - result.getCode()); + result.getCode(), + subscriptionQueues.size(), + System.identityHashCode(this)); } // statistic the time of total write process ioTConsensusServerMetrics.recordConsensusWriteTime( @@ -757,6 +805,41 @@ public long getSearchIndex() { return searchIndex.get(); } + public ConsensusReqReader getConsensusReqReader() { + return consensusReqReader; + } + + /** + * Registers a subscription pending queue for real-time in-memory data delivery. When {@link + * #write(IConsensusRequest)} succeeds, the IndexedConsensusRequest is offered to all registered + * subscription queues, enabling subscription consumers to receive data without waiting for WAL + * flush. + * + * @param queue the blocking queue to receive IndexedConsensusRequest entries + */ + public void registerSubscriptionQueue(final BlockingQueue queue) { + subscriptionQueues.add(queue); + // Immediately re-evaluate the safe delete index with new subscription awareness + checkAndUpdateSafeDeletedSearchIndex(); + logger.info( + "Registered subscription queue for group {}, " + + "total subscription queues: {}, currentSearchIndex={}, this={}", + consensusGroupId, + subscriptionQueues.size(), + searchIndex.get(), + System.identityHashCode(this)); + } + + public void unregisterSubscriptionQueue(final BlockingQueue queue) { + subscriptionQueues.remove(queue); + // Re-evaluate: with fewer subscribers, more WAL may be deletable + checkAndUpdateSafeDeletedSearchIndex(); + logger.info( + "Unregistered subscription queue for group {}, remaining subscription queues: {}", + consensusGroupId, + subscriptionQueues.size()); + } + public long getSyncLag() { long minSyncIndex = getMinSyncIndex(); return getSearchIndex() - minSyncIndex; @@ -872,17 +955,41 @@ void checkAndUpdateIndex() { } /** - * If there is only one replica, set it to Long.MAX_VALUE. 
If there are multiple replicas, get the - * latest SafelyDeletedSearchIndex again. This enables wal to be deleted in a timely manner. + * Computes and updates the safe-to-delete WAL search index based on replication progress and + * subscription WAL retention policy. When no subscriptions exist, WAL is cleaned normally. */ - void checkAndUpdateSafeDeletedSearchIndex() { + public void checkAndUpdateSafeDeletedSearchIndex() { if (configuration.isEmpty()) { logger.error( "Configuration is empty, which is unexpected. Safe deleted search index won't be updated this time."); - } else if (configuration.size() == 1) { + return; + } + + final boolean hasSubscriptions = !subscriptionQueues.isEmpty(); + final long retentionSizeLimit = + config.getReplication().getSubscriptionWalRetentionSizeInBytes(); + + if (configuration.size() == 1 && !hasSubscriptions) { + // Single replica, no subscription consumers => delete all WAL freely consensusReqReader.setSafelyDeletedSearchIndex(Long.MAX_VALUE); } else { - consensusReqReader.setSafelyDeletedSearchIndex(getMinFlushedSyncIndex()); + final long replicationIndex = + configuration.size() > 1 ? getMinFlushedSyncIndex() : Long.MAX_VALUE; + + // Subscription WAL retention: if subscriptions exist and retention is configured, + // prevent WAL deletion when total WAL size is within the retention limit. 
+ long subscriptionRetentionBound = Long.MAX_VALUE; + if (hasSubscriptions && retentionSizeLimit > 0) { + final long totalWalSize = consensusReqReader.getTotalSize(); + if (totalWalSize <= retentionSizeLimit) { + // WAL size is within retention limit — preserve all WAL for subscribers + subscriptionRetentionBound = ConsensusReqReader.DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + // else: WAL exceeds retention limit — allow normal cleanup (bound stays MAX_VALUE) + } + + consensusReqReader.setSafelyDeletedSearchIndex( + Math.min(replicationIndex, subscriptionRetentionBound)); } } diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java index 374691bf38bf1..51704a24c74a5 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java @@ -167,15 +167,16 @@ public synchronized OptionalLong getMinFlushedSyncIndex() { return threads.stream().mapToLong(LogDispatcherThread::getLastFlushedSyncIndex).min(); } - public void checkAndFlushIndex() { + public synchronized void checkAndFlushIndex() { if (!threads.isEmpty()) { threads.forEach( thread -> { IndexController controller = thread.getController(); controller.update(controller.getCurrentIndex(), true); }); - // do not set SafelyDeletedSearchIndex as it is Long.MAX_VALUE when replica is 1 - reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex()); + // Use subscription-aware safe-delete to avoid deleting WAL entries + // still needed by subscription consumers. + impl.checkAndUpdateSafeDeletedSearchIndex(); } } @@ -397,8 +398,9 @@ public void updateSafelyDeletedSearchIndex() { // indicating that insert nodes whose search index are before this value can be deleted // safely. 
// - // Use minFlushedSyncIndex here to reserve the WAL which are not flushed and support kill -9. - reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex()); + // Use subscription-aware safe-delete to avoid deleting WAL entries + // still needed by subscription consumers. + impl.checkAndUpdateSafeDeletedSearchIndex(); // notify if (impl.unblockWrite()) { impl.signal(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java index 700fd79e5eb84..18461d2ece3bd 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java @@ -160,6 +160,8 @@ private static ConsensusConfig buildConsensusConfig() { .setMaxMemoryRatioForQueue(CONF.getMaxMemoryRatioForQueue()) .setRegionMigrationSpeedLimitBytesPerSecond( CONF.getRegionMigrationSpeedLimitBytesPerSecond()) + .setSubscriptionWalRetentionSizeInBytes( + COMMON_CONF.getSubscriptionConsensusWalRetentionSizeInBytes()) .build()) .build()) .setPipeConsensusConfig( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java index 510f8559bc147..01cf926dfdef8 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java @@ -19,7 +19,12 @@ package org.apache.iotdb.db.subscription.agent; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.db.subscription.broker.ConsensusSubscriptionBroker; import org.apache.iotdb.db.subscription.broker.SubscriptionBroker; +import 
org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.db.subscription.event.SubscriptionEvent; import org.apache.iotdb.db.subscription.resource.SubscriptionDataNodeResourceManager; import org.apache.iotdb.db.subscription.task.subtask.SubscriptionSinkSubtask; @@ -30,6 +35,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -43,7 +50,12 @@ public class SubscriptionBrokerAgent { private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionBrokerAgent.class); - private final Map consumerGroupIdToSubscriptionBroker = + /** Pipe-based subscription brokers, one per consumer group. */ + private final Map consumerGroupIdToPipeBroker = + new ConcurrentHashMap<>(); + + /** Consensus-based subscription brokers, one per consumer group. 
*/ + private final Map consumerGroupIdToConsensusBroker = new ConcurrentHashMap<>(); private final Cache prefetchingQueueCount = @@ -54,17 +66,54 @@ public class SubscriptionBrokerAgent { public List poll( final ConsumerConfig consumerConfig, final Set topicNames, final long maxBytes) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final List allEvents = new ArrayList<>(); + long remainingBytes = maxBytes; + + // Poll from pipe-based broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.nonNull(pipeBroker)) { + final List pipeEvents = + pipeBroker.poll(consumerId, topicNames, remainingBytes); + allEvents.addAll(pipeEvents); + for (final SubscriptionEvent event : pipeEvents) { + try { + remainingBytes -= event.getCurrentResponseSize(); + } catch (final IOException ignored) { + // best effort + } + } + } + + // Poll from consensus-based broker + if (remainingBytes > 0) { + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker)) { + LOGGER.debug( + "SubscriptionBrokerAgent: polling consensus broker for consumer group [{}], " + + "topicNames={}, remainingBytes={}", + consumerGroupId, + topicNames, + remainingBytes); + allEvents.addAll(consensusBroker.poll(consumerId, topicNames, remainingBytes)); + } else { + LOGGER.debug( + "SubscriptionBrokerAgent: no consensus broker for consumer group [{}]", + consumerGroupId); + } + } + + if (allEvents.isEmpty() + && Objects.isNull(pipeBroker) + && Objects.isNull(consumerGroupIdToConsensusBroker.get(consumerGroupId))) { final String errorMessage = - String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + 
String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - // TODO: currently we fetch messages from all topics - final String consumerId = consumerConfig.getConsumerId(); - return broker.poll(consumerId, topicNames, maxBytes); + + return allEvents; } public List pollTsFile( @@ -72,16 +121,18 @@ public List pollTsFile( final SubscriptionCommitContext commitContext, final long writingOffset) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // TsFile polling can only be called by pipe-based subscriptions + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { final String errorMessage = String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + "Subscription: pipe broker bound to consumer group [%s] does not exist", + consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } final String consumerId = consumerConfig.getConsumerId(); - return broker.pollTsFile(consumerId, commitContext, writingOffset); + return pipeBroker.pollTsFile(consumerId, commitContext, writingOffset); } public List pollTablets( @@ -89,16 +140,26 @@ public List pollTablets( final SubscriptionCommitContext commitContext, final int offset) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final String topicName = commitContext.getTopicName(); + + // Try consensus-based broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + 
if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.pollTablets(consumerId, commitContext, offset); + } + + // Fall back to pipe-based broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { final String errorMessage = String.format( "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - final String consumerId = consumerConfig.getConsumerId(); - return broker.pollTablets(consumerId, commitContext, offset); + return pipeBroker.pollTablets(consumerId, commitContext, offset); } /** @@ -109,46 +170,122 @@ public List commit( final List commitContexts, final boolean nack) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final List allSuccessful = new ArrayList<>(); + + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + + if (Objects.isNull(pipeBroker) && Objects.isNull(consensusBroker)) { final String errorMessage = - String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - final String consumerId = consumerConfig.getConsumerId(); - return broker.commit(consumerId, commitContexts, nack); + + // Partition commit contexts by which broker owns the topic. 
+ final List pipeContexts = new ArrayList<>(); + final List consensusContexts = new ArrayList<>(); + for (final SubscriptionCommitContext ctx : commitContexts) { + final String topicName = ctx.getTopicName(); + if (Objects.nonNull(consensusBroker) + && ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) { + consensusContexts.add(ctx); + } else { + pipeContexts.add(ctx); + } + } + + if (Objects.nonNull(pipeBroker) && !pipeContexts.isEmpty()) { + allSuccessful.addAll(pipeBroker.commit(consumerId, pipeContexts, nack)); + } + if (Objects.nonNull(consensusBroker) && !consensusContexts.isEmpty()) { + allSuccessful.addAll(consensusBroker.commit(consumerId, consensusContexts, nack)); + } + + return allSuccessful; + } + + public void seek( + final ConsumerConfig consumerConfig, + final String topicName, + final short seekType, + final long timestamp) { + final String consumerGroupId = consumerConfig.getConsumerGroupId(); + + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + consensusBroker.seek(topicName, seekType, timestamp); + return; + } + + final String errorMessage = + String.format( + "Subscription: seek is only supported for consensus-based subscriptions, " + + "consumerGroup=%s, topic=%s", + consumerGroupId, topicName); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); } public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { final String consumerGroupId = commitContext.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String topicName = commitContext.getTopicName(); + + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) 
&& consensusBroker.hasQueue(topicName)) { + return consensusBroker.isCommitContextOutdated(commitContext); + } + + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { return true; } - return broker.isCommitContextOutdated(commitContext); + return pipeBroker.isCommitContextOutdated(commitContext); } public List fetchTopicNamesToUnsubscribe( final ConsumerConfig consumerConfig, final Set topicNames) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + + // Consensus-based subscription topics are unbounded streams, so they do not trigger + // auto-unsubscribe. + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + final Set pipeOnlyTopicNames; + if (Objects.nonNull(consensusBroker)) { + pipeOnlyTopicNames = new java.util.HashSet<>(topicNames); + pipeOnlyTopicNames.removeIf(consensusBroker::hasQueue); + } else { + pipeOnlyTopicNames = topicNames; + } + + if (pipeOnlyTopicNames.isEmpty()) { return Collections.emptyList(); } - return broker.fetchTopicNamesToUnsubscribe(topicNames); + + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { + return Collections.emptyList(); + } + return pipeBroker.fetchTopicNamesToUnsubscribe(pipeOnlyTopicNames); } /////////////////////////////// broker /////////////////////////////// public boolean isBrokerExist(final String consumerGroupId) { - return consumerGroupIdToSubscriptionBroker.containsKey(consumerGroupId); + return consumerGroupIdToPipeBroker.containsKey(consumerGroupId) + || consumerGroupIdToConsensusBroker.containsKey(consumerGroupId); } public void createBrokerIfNotExist(final String consumerGroupId) { - 
consumerGroupIdToSubscriptionBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new); - LOGGER.info("Subscription: create broker bound to consumer group [{}]", consumerGroupId); + consumerGroupIdToPipeBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new); + LOGGER.info("Subscription: create pipe broker bound to consumer group [{}]", consumerGroupId); } /** @@ -156,26 +293,46 @@ public void createBrokerIfNotExist(final String consumerGroupId) { */ public boolean dropBroker(final String consumerGroupId) { final AtomicBoolean dropped = new AtomicBoolean(false); - consumerGroupIdToSubscriptionBroker.compute( + + // Drop pipe broker + consumerGroupIdToPipeBroker.compute( consumerGroupId, (id, broker) -> { if (Objects.isNull(broker)) { + dropped.set(true); + return null; + } + if (!broker.isEmpty()) { LOGGER.warn( - "Subscription: broker bound to consumer group [{}] does not exist", + "Subscription: pipe broker bound to consumer group [{}] is not empty when dropping", consumerGroupId); - dropped.set(true); + return broker; + } + dropped.set(true); + LOGGER.info( + "Subscription: drop pipe broker bound to consumer group [{}]", consumerGroupId); + return null; + }); + + // Drop consensus broker + consumerGroupIdToConsensusBroker.compute( + consumerGroupId, + (id, broker) -> { + if (Objects.isNull(broker)) { return null; } if (!broker.isEmpty()) { LOGGER.warn( - "Subscription: broker bound to consumer group [{}] is not empty when dropping", + "Subscription: consensus broker bound to consumer group [{}] is not empty when dropping", consumerGroupId); return broker; } dropped.set(true); - LOGGER.info("Subscription: drop broker bound to consumer group [{}]", consumerGroupId); - return null; // remove this entry + LOGGER.info( + "Subscription: drop consensus broker bound to consumer group [{}]", consumerGroupId); + return null; }); + return dropped.get(); } @@ -183,15 +340,14 @@ public boolean dropBroker(final String consumerGroupId) { public void 
bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) { final String consumerGroupId = subtask.getConsumerGroupId(); - consumerGroupIdToSubscriptionBroker + consumerGroupIdToPipeBroker .compute( consumerGroupId, (id, broker) -> { if (Objects.isNull(broker)) { LOGGER.info( - "Subscription: broker bound to consumer group [{}] does not exist, create new for binding prefetching queue", + "Subscription: pipe broker bound to consumer group [{}] does not exist, create new for binding prefetching queue", consumerGroupId); - // TODO: consider more robust metadata semantics return new SubscriptionBroker(consumerGroupId); } return broker; @@ -200,41 +356,119 @@ public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) { prefetchingQueueCount.invalidate(); } - public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); + public void bindConsensusPrefetchingQueue( + final String consumerGroupId, + final String topicName, + final String consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final long startSearchIndex) { + consumerGroupIdToConsensusBroker + .compute( + consumerGroupId, + (id, broker) -> { + if (Objects.isNull(broker)) { + LOGGER.info( + "Subscription: consensus broker bound to consumer group [{}] does not exist, create new for binding consensus prefetching queue", + consumerGroupId); + return new ConsensusSubscriptionBroker(consumerGroupId); + } + return broker; + }) + .bindConsensusPrefetchingQueue( + topicName, consensusGroupId, serverImpl, converter, commitManager, startSearchIndex); + prefetchingQueueCount.invalidate(); + } + + public void unbindConsensusPrefetchingQueue( + final String consumerGroupId, final String topicName) { + final ConsensusSubscriptionBroker broker = + 
consumerGroupIdToConsensusBroker.get(consumerGroupId); if (Objects.isNull(broker)) { LOGGER.warn( - "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); + "Subscription: consensus broker bound to consumer group [{}] does not exist", + consumerGroupId); + return; + } + broker.unbindConsensusPrefetchingQueue(topicName); + prefetchingQueueCount.invalidate(); + } + + public void unbindByRegion(final String regionId) { + int totalClosed = 0; + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + totalClosed += broker.unbindByRegion(regionId); + } + if (totalClosed > 0) { + prefetchingQueueCount.invalidate(); + LOGGER.info( + "Subscription: unbound {} consensus prefetching queue(s) for removed region [{}]", + totalClosed, + regionId); + } + } + + public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) { + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { + LOGGER.warn( + "Subscription: pipe broker bound to consumer group [{}] does not exist", consumerGroupId); return; } - broker.updateCompletedTopicNames(topicName); + pipeBroker.updateCompletedTopicNames(topicName); } public void unbindPrefetchingQueue(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + consensusBroker.removeQueue(topicName); + prefetchingQueueCount.invalidate(); + return; + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to 
consumer group [{}] does not exist", consumerGroupId); return; } - broker.unbindPrefetchingQueue(topicName); + pipeBroker.unbindPrefetchingQueue(topicName); prefetchingQueueCount.invalidate(); } public void removePrefetchingQueue(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + consensusBroker.removeQueue(topicName); + prefetchingQueueCount.invalidate(); + return; + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return; } - broker.removePrefetchingQueue(topicName); + pipeBroker.removePrefetchingQueue(topicName); prefetchingQueueCount.invalidate(); } public boolean executePrefetch(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.executePrefetch(topicName); + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { SubscriptionDataNodeResourceManager.log() .schedule(SubscriptionBrokerAgent.class, consumerGroupId, topicName) .ifPresent( @@ -244,17 +478,24 @@ public boolean executePrefetch(final String consumerGroupId, final String topicN 
consumerGroupId)); return false; } - return broker.executePrefetch(topicName); + return pipeBroker.executePrefetch(topicName); } public int getPipeEventCount(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.getEventCount(topicName); + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return 0; } - return broker.getPipeEventCount(topicName); + return pipeBroker.getPipeEventCount(topicName); } public int getPrefetchingQueueCount() { @@ -262,9 +503,15 @@ public int getPrefetchingQueueCount() { } private int getPrefetchingQueueCountInternal() { - return consumerGroupIdToSubscriptionBroker.values().stream() - .map(SubscriptionBroker::getPrefetchingQueueCount) - .reduce(0, Integer::sum); + int count = + consumerGroupIdToPipeBroker.values().stream() + .map(SubscriptionBroker::getPrefetchingQueueCount) + .reduce(0, Integer::sum); + count += + consumerGroupIdToConsensusBroker.values().stream() + .map(ConsensusSubscriptionBroker::getQueueCount) + .reduce(0, Integer::sum); + return count; } /////////////////////////////// Cache /////////////////////////////// @@ -272,14 +519,15 @@ private int getPrefetchingQueueCountInternal() { /** * A simple generic cache that computes and stores a value on demand. * - *

Note that since the get() and invalidate() methods are not modified with synchronized, the - * value obtained may not be entirely accurate. + *

Both {@code value} and {@code valid} are volatile to ensure visibility across threads. The + * {@code get()} method uses a local snapshot of {@code valid} to avoid double-read reordering. + * Concurrent recomputation by multiple threads is benign (idempotent supplier). * * @param the type of the cached value */ private static class Cache { - private T value; + private volatile T value; private volatile boolean valid = false; private final Supplier supplier; @@ -304,8 +552,10 @@ private void invalidate() { */ private T get() { if (!valid) { - value = supplier.get(); + final T computed = supplier.get(); + value = computed; valid = true; + return computed; } return value; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java index fee23cf6af4cb..9c54497b6f468 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java @@ -21,6 +21,7 @@ import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMetaKeeper; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaRespExceptionMessage; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; @@ -132,11 +133,34 @@ private void handleSingleConsumerGroupMetaChangesInternal( for (final String topicName : topicsUnsubByGroup) { SubscriptionAgent.broker().removePrefetchingQueue(consumerGroupId, topicName); } + // Tear down consensus-based subscriptions for unsubscribed topics + if (!topicsUnsubByGroup.isEmpty()) { + ConsensusSubscriptionSetupHandler.teardownConsensusSubscriptions( + 
consumerGroupId, topicsUnsubByGroup); + } + + // Detect newly subscribed topics (present in new meta but not in old meta) + final Set newlySubscribedTopics = + ConsumerGroupMeta.getTopicsNewlySubByGroup(metaInAgent, metaFromCoordinator); + + LOGGER.info( + "Subscription: consumer group [{}] meta change detected, " + + "topicsUnsubByGroup={}, newlySubscribedTopics={}", + consumerGroupId, + topicsUnsubByGroup, + newlySubscribedTopics); // TODO: Currently we fully replace the entire ConsumerGroupMeta without carefully checking the // changes in its fields. consumerGroupMetaKeeper.removeConsumerGroupMeta(consumerGroupId); consumerGroupMetaKeeper.addConsumerGroupMeta(consumerGroupId, metaFromCoordinator); + + // Set up consensus-based subscription for newly subscribed live-mode topics. + // This must happen after the meta is updated so that the broker can find the topic config. + if (!newlySubscribedTopics.isEmpty()) { + ConsensusSubscriptionSetupHandler.handleNewSubscriptions( + consumerGroupId, newlySubscribedTopics); + } } public TPushConsumerGroupMetaRespExceptionMessage handleConsumerGroupMetaChanges( @@ -222,4 +246,24 @@ public Set getTopicNamesSubscribedByConsumer( releaseReadLock(); } } + + /** + * Get all active subscriptions: consumerGroupId → set of subscribed topic names. Used by + * consensus subscription auto-binding when a new DataRegion is created. 
+ */ + public java.util.Map> getAllSubscriptions() { + acquireReadLock(); + try { + final java.util.Map> result = new java.util.HashMap<>(); + for (final ConsumerGroupMeta meta : consumerGroupMetaKeeper.getAllConsumerGroupMeta()) { + final Set topics = meta.getSubscribedTopicNames(); + if (!topics.isEmpty()) { + result.put(meta.getConsumerGroupId(), new java.util.HashSet<>(topics)); + } + } + return result; + } finally { + releaseReadLock(); + } + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java new file mode 100644 index 0000000000000..0c09e28765bd4 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java @@ -0,0 +1,432 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker; + +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager; +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +/** + * Consensus-based subscription broker that reads data directly from IoTConsensus WAL. Each instance + * manages consensus prefetching queues for a single consumer group. + */ +public class ConsensusSubscriptionBroker implements ISubscriptionBroker { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusSubscriptionBroker.class); + + private final String brokerId; // consumer group id + + /** Maps topic name to a list of ConsensusPrefetchingQueues, one per data region. */ + private final Map> topicNameToConsensusPrefetchingQueues; + + /** Shared commit ID generators per topic. 
*/ + private final Map topicNameToCommitIdGenerator; + + public ConsensusSubscriptionBroker(final String brokerId) { + this.brokerId = brokerId; + this.topicNameToConsensusPrefetchingQueues = new ConcurrentHashMap<>(); + this.topicNameToCommitIdGenerator = new ConcurrentHashMap<>(); + } + + @Override + public boolean isEmpty() { + return topicNameToConsensusPrefetchingQueues.isEmpty(); + } + + @Override + public boolean hasQueue(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + return Objects.nonNull(queues) + && !queues.isEmpty() + && queues.stream().anyMatch(q -> !q.isClosed()); + } + + //////////////////////////// poll //////////////////////////// + + @Override + public List poll( + final String consumerId, final Set topicNames, final long maxBytes) { + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: poll called, consumerId={}, topicNames={}, " + + "queueCount={}, maxBytes={}", + brokerId, + consumerId, + topicNames, + topicNameToConsensusPrefetchingQueues.size(), + maxBytes); + + final List eventsToPoll = new ArrayList<>(); + final List eventsToNack = new ArrayList<>(); + long totalSize = 0; + + for (final String topicName : topicNames) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + continue; + } + + // Poll from all region queues for this topic + for (final ConsensusPrefetchingQueue consensusQueue : queues) { + if (consensusQueue.isClosed()) { + continue; + } + + final SubscriptionEvent event = consensusQueue.poll(consumerId); + if (Objects.isNull(event)) { + continue; + } + + final long currentSize; + try { + currentSize = event.getCurrentResponseSize(); + } catch (final IOException e) { + eventsToNack.add(event); + continue; + } + + eventsToPoll.add(event); + totalSize += currentSize; + + if (totalSize >= maxBytes) { + break; + } + } + + if (totalSize >= maxBytes) { + break; + } + } + + // Nack any events that had 
errors + if (!eventsToNack.isEmpty()) { + commit( + consumerId, + eventsToNack.stream() + .map(SubscriptionEvent::getCommitContext) + .collect(Collectors.toList()), + true); + } + + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: poll result, consumerId={}, eventsPolled={}, eventsNacked={}", + brokerId, + consumerId, + eventsToPoll.size(), + eventsToNack.size()); + + return eventsToPoll; + } + + @Override + public List pollTablets( + final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return Collections.emptyList(); + } + + // Try each region queue until one returns a match + for (final ConsensusPrefetchingQueue consensusQueue : queues) { + if (consensusQueue.isClosed()) { + continue; + } + final SubscriptionEvent event = consensusQueue.pollTablets(consumerId, commitContext, offset); + if (Objects.nonNull(event)) { + return Collections.singletonList(event); + } + } + return Collections.emptyList(); + } + + //////////////////////////// commit //////////////////////////// + + @Override + public List commit( + final String consumerId, + final List commitContexts, + final boolean nack) { + final List successfulCommitContexts = new ArrayList<>(); + for (final SubscriptionCommitContext commitContext : commitContexts) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to commit", + brokerId, + topicName); + continue; + } + + // Try each region queue for this topic (the event belongs to exactly one region). + // Don't warn per-queue miss — only warn if NO queue handled the commit. 
+ boolean handled = false; + for (final ConsensusPrefetchingQueue consensusQueue : queues) { + if (consensusQueue.isClosed()) { + continue; + } + final boolean success; + if (!nack) { + success = consensusQueue.ackSilent(consumerId, commitContext); + } else { + success = consensusQueue.nackSilent(consumerId, commitContext); + } + if (success) { + successfulCommitContexts.add(commitContext); + handled = true; + break; // committed in the right queue, no need to try others + } + } + if (!handled) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: commit context {} not found in any of {} region queue(s) for topic [{}]", + brokerId, + commitContext, + queues.size(), + topicName); + } + } + return successfulCommitContexts; + } + + @Override + public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return true; + } + // Any queue that considers it NOT outdated means it's not outdated + for (final ConsensusPrefetchingQueue q : queues) { + if (!q.isCommitContextOutdated(commitContext)) { + return false; + } + } + return true; + } + + //////////////////////////// seek //////////////////////////// + + public void seek(final String topicName, final short seekType, final long timestamp) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to seek", + brokerId, + topicName); + return; + } + + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + switch (seekType) { + case PipeSubscribeSeekReq.SEEK_TO_BEGINNING: + queue.seekToBeginning(); + break; + case PipeSubscribeSeekReq.SEEK_TO_END: + queue.seekToEnd(); + break; + case 
PipeSubscribeSeekReq.SEEK_TO_TIMESTAMP: + queue.seekToTimestamp(timestamp); + break; + default: + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: unknown seekType {} for topic [{}]", + brokerId, + seekType, + topicName); + break; + } + } + } + + //////////////////////////// prefetching //////////////////////////// + + @Override + public boolean executePrefetch(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return false; + } + boolean anyPrefetched = false; + for (final ConsensusPrefetchingQueue q : queues) { + if (!q.isClosed() && q.executePrefetch()) { + anyPrefetched = true; + } + } + return anyPrefetched; + } + + @Override + public int getEventCount(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues)) { + return 0; + } + return queues.stream().mapToInt(ConsensusPrefetchingQueue::getPrefetchedEventCount).sum(); + } + + @Override + public int getQueueCount() { + return topicNameToConsensusPrefetchingQueues.size(); + } + + //////////////////////////// queue management //////////////////////////// + + public void bindConsensusPrefetchingQueue( + final String topicName, + final String consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final long startSearchIndex) { + // Get or create the list of queues for this topic + final List queues = + topicNameToConsensusPrefetchingQueues.computeIfAbsent( + topicName, k -> new CopyOnWriteArrayList<>()); + + // Check for duplicate region binding + for (final ConsensusPrefetchingQueue existing : queues) { + if (consensusGroupId.equals(existing.getConsensusGroupId()) && !existing.isClosed()) { + LOGGER.info( + "Subscription: consensus prefetching queue for topic [{}], region [{}] " + + "in consumer group [{}] already 
exists, skipping", + topicName, + consensusGroupId, + brokerId); + return; + } + } + + // Get or create the shared commit ID generator for this topic + final AtomicLong sharedCommitIdGenerator = + topicNameToCommitIdGenerator.computeIfAbsent(topicName, k -> new AtomicLong(0)); + + final ConsensusPrefetchingQueue consensusQueue = + new ConsensusPrefetchingQueue( + brokerId, + topicName, + consensusGroupId, + serverImpl, + converter, + commitManager, + startSearchIndex, + sharedCommitIdGenerator); + queues.add(consensusQueue); + LOGGER.info( + "Subscription: create consensus prefetching queue bound to topic [{}] for consumer group [{}], " + + "consensusGroupId={}, startSearchIndex={}, totalRegionQueues={}", + topicName, + brokerId, + consensusGroupId, + startSearchIndex, + queues.size()); + } + + public void unbindConsensusPrefetchingQueue(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "Subscription: consensus prefetching queues bound to topic [{}] for consumer group [{}] do not exist", + topicName, + brokerId); + return; + } + + for (final ConsensusPrefetchingQueue q : queues) { + q.close(); + } + topicNameToConsensusPrefetchingQueues.remove(topicName); + topicNameToCommitIdGenerator.remove(topicName); + LOGGER.info( + "Subscription: drop all {} consensus prefetching queue(s) bound to topic [{}] for consumer group [{}]", + queues.size(), + topicName, + brokerId); + } + + public int unbindByRegion(final String regionId) { + int closedCount = 0; + for (final Map.Entry> entry : + topicNameToConsensusPrefetchingQueues.entrySet()) { + final List queues = entry.getValue(); + final Iterator iterator = queues.iterator(); + while (iterator.hasNext()) { + final ConsensusPrefetchingQueue q = iterator.next(); + if (regionId.equals(q.getConsensusGroupId())) { + q.close(); + iterator.remove(); + closedCount++; + LOGGER.info( + "Subscription: closed 
consensus prefetching queue for topic [{}] region [{}] " + + "in consumer group [{}] due to region removal", + entry.getKey(), + regionId, + brokerId); + } + } + } + return closedCount; + } + + @Override + public void removeQueue(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.nonNull(queues) && !queues.isEmpty()) { + LOGGER.info( + "Subscription: consensus prefetching queue(s) bound to topic [{}] for consumer group [{}] still exist, unbind before closing", + topicName, + brokerId); + unbindConsensusPrefetchingQueue(topicName); + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java new file mode 100644 index 0000000000000..aaa88a5f84777 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker; + +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; + +import java.util.List; +import java.util.Set; + +public interface ISubscriptionBroker { + + List poll(String consumerId, Set topicNames, long maxBytes); + + List pollTablets( + String consumerId, SubscriptionCommitContext commitContext, int offset); + + List commit( + String consumerId, List commitContexts, boolean nack); + + boolean isCommitContextOutdated(SubscriptionCommitContext commitContext); + + boolean executePrefetch(String topicName); + + int getEventCount(String topicName); + + int getQueueCount(); + + void removeQueue(String topicName); + + boolean isEmpty(); + + boolean hasQueue(String topicName); +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java index cc03f7261419b..8f9d05324e905 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java @@ -56,7 +56,7 @@ import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID; -public class SubscriptionBroker { +public class SubscriptionBroker implements ISubscriptionBroker { private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionBroker.class); @@ -83,14 +83,23 @@ public SubscriptionBroker(final String brokerId) { .build(consumerId -> new SubscriptionStates()); } + @Override public boolean isEmpty() { return topicNameToPrefetchingQueue.isEmpty() && completedTopicNames.isEmpty() && topicNameToCommitIdGenerator.isEmpty(); } + @Override + public boolean hasQueue(final String topicName) { + final SubscriptionPrefetchingQueue 
prefetchingQueue = + topicNameToPrefetchingQueue.get(topicName); + return Objects.nonNull(prefetchingQueue) && !prefetchingQueue.isClosed(); + } + //////////////////////////// provided for SubscriptionBrokerAgent //////////////////////////// + @Override public List poll( final String consumerId, final Set topicNames, final long maxBytes) { final List eventsToPoll = new ArrayList<>(); @@ -112,9 +121,10 @@ public List poll( // Iterate over each sorted topic name and poll the corresponding events int remainingTopicSize = sortedTopicNames.size(); for (final String topicName : sortedTopicNames) { + remainingTopicSize -= 1; + // Check pipe-based queue final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); - remainingTopicSize -= 1; // Recheck if (Objects.isNull(prefetchingQueue) || prefetchingQueue.isClosed()) { @@ -182,6 +192,7 @@ private Set prepareCandidateTopicNames( final List eventsToPoll /* output parameter */) { final Set candidateTopicNames = new HashSet<>(); for (final String topicName : topicNames) { + // Check pipe-based queue final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); // If there is no prefetching queue for the topic, check if it's completed @@ -271,6 +282,7 @@ public List pollTsFile( return Collections.emptyList(); } + @Override public List pollTablets( final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { final String topicName = commitContext.getTopicName(); @@ -312,6 +324,7 @@ public List pollTablets( /** * @return list of successful commit contexts */ + @Override public List commit( final String consumerId, final List commitContexts, @@ -348,6 +361,7 @@ public List commit( return successfulCommitContexts; } + @Override public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { final String topicName = commitContext.getTopicName(); final SubscriptionPrefetchingQueue prefetchingQueue = @@ 
-457,6 +471,11 @@ public void unbindPrefetchingQueue(final String topicName) { brokerId); } + @Override + public void removeQueue(final String topicName) { + removePrefetchingQueue(topicName); + } + public void removePrefetchingQueue(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -473,6 +492,7 @@ public void removePrefetchingQueue(final String topicName) { topicNameToCommitIdGenerator.remove(topicName); } + @Override public boolean executePrefetch(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -505,6 +525,11 @@ public boolean executePrefetch(final String topicName) { : prefetchingQueue.executePrefetchV2(); } + @Override + public int getEventCount(final String topicName) { + return getPipeEventCount(topicName); + } + public int getPipeEventCount(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -525,6 +550,11 @@ public int getPipeEventCount(final String topicName) { return prefetchingQueue.getPipeEventCount(); } + @Override + public int getQueueCount() { + return getPrefetchingQueueCount(); + } + public int getPrefetchingQueueCount() { return topicNameToPrefetchingQueue.size(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java new file mode 100644 index 0000000000000..9d3f2b283c556 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertMultiTabletsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsOfOneDeviceNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertTabletNode; + +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.file.metadata.IDeviceID; +import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.utils.BitMap; +import 
org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +/** Converts IoTConsensus WAL log entries (InsertNode) to Tablet format for subscription. */ +public class ConsensusLogToTabletConverter { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusLogToTabletConverter.class); + + private final TreePattern treePattern; + private final TablePattern tablePattern; + + /** + * The actual database name of the DataRegion this converter processes (table-model format without + * "root." prefix). Null for tree-model topics. + */ + private final String databaseName; + + public ConsensusLogToTabletConverter( + final TreePattern treePattern, final TablePattern tablePattern, final String databaseName) { + this.treePattern = treePattern; + this.tablePattern = tablePattern; + this.databaseName = databaseName; + } + + public String getDatabaseName() { + return databaseName; + } + + static String safeDeviceIdForLog(final InsertNode node) { + try { + final Object deviceId = node.getDeviceID(); + return deviceId != null ? 
deviceId.toString() : "null"; + } catch (final Exception e) { + return "N/A(" + node.getType() + ")"; + } + } + + public List convert(final InsertNode insertNode) { + if (Objects.isNull(insertNode)) { + return Collections.emptyList(); + } + + final PlanNodeType nodeType = insertNode.getType(); + if (nodeType == null) { + LOGGER.warn("InsertNode type is null, skipping conversion"); + return Collections.emptyList(); + } + + LOGGER.debug( + "ConsensusLogToTabletConverter: converting InsertNode type={}, deviceId={}", + nodeType, + safeDeviceIdForLog(insertNode)); + + switch (nodeType) { + case INSERT_ROW: + return convertInsertRowNode((InsertRowNode) insertNode); + case INSERT_TABLET: + return convertInsertTabletNode((InsertTabletNode) insertNode); + case INSERT_ROWS: + return convertInsertRowsNode((InsertRowsNode) insertNode); + case INSERT_ROWS_OF_ONE_DEVICE: + return convertInsertRowsOfOneDeviceNode((InsertRowsOfOneDeviceNode) insertNode); + case INSERT_MULTI_TABLET: + return convertInsertMultiTabletsNode((InsertMultiTabletsNode) insertNode); + case RELATIONAL_INSERT_ROW: + return convertRelationalInsertRowNode((RelationalInsertRowNode) insertNode); + case RELATIONAL_INSERT_TABLET: + return convertRelationalInsertTabletNode((RelationalInsertTabletNode) insertNode); + case RELATIONAL_INSERT_ROWS: + return convertRelationalInsertRowsNode((RelationalInsertRowsNode) insertNode); + default: + LOGGER.debug("Unsupported InsertNode type for subscription: {}", nodeType); + return Collections.emptyList(); + } + } + + // ======================== Tree Model Conversion ======================== + + private List convertInsertRowNode(final InsertRowNode node) { + final IDeviceID deviceId = node.getDeviceID(); + + // Device-level path filtering + if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) { + return Collections.emptyList(); + } + + final long time = node.getTime(); + + // Determine which columns match the pattern + final String[] measurements = 
node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final Object[] values = node.getValues(); + final List matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements); + + if (matchedColumnIndices.isEmpty()) { + return Collections.emptyList(); + } + + // Build Tablet with matched columns + final int columnCount = matchedColumnIndices.size(); + final List schemas = new ArrayList<>(columnCount); + for (final int colIdx : matchedColumnIndices) { + schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx])); + } + + final Tablet tablet = new Tablet(deviceId.toString(), schemas, 1 /* maxRowNumber */); + tablet.addTimestamp(0, time); + + for (int i = 0; i < columnCount; i++) { + final int originalColIdx = matchedColumnIndices.get(i); + final Object value = values[originalColIdx]; + if (value == null) { + if (tablet.getBitMaps() == null) { + tablet.initBitMaps(); + } + tablet.getBitMaps()[i].mark(0); + } else { + addValueToTablet(tablet, 0, i, dataTypes[originalColIdx], value); + } + } + tablet.setRowSize(1); + + return Collections.singletonList(tablet); + } + + private List convertInsertTabletNode(final InsertTabletNode node) { + final IDeviceID deviceId = node.getDeviceID(); + + // Device-level path filtering + if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) { + return Collections.emptyList(); + } + + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final long[] times = node.getTimes(); + final Object[] columns = node.getColumns(); + final BitMap[] bitMaps = node.getBitMaps(); + final int rowCount = node.getRowCount(); + + // Column filtering + final List matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements); + if (matchedColumnIndices.isEmpty()) { + return Collections.emptyList(); + } + + final int columnCount = matchedColumnIndices.size(); + final boolean allColumnsMatch = (columnCount == 
measurements.length); + + // Build schemas (always needed) + final List schemas = new ArrayList<>(columnCount); + for (final int colIdx : matchedColumnIndices) { + schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx])); + } + + // Build column arrays and bitmaps using bulk copy + final long[] newTimes = Arrays.copyOf(times, rowCount); + final Object[] newColumns = new Object[columnCount]; + final BitMap[] newBitMaps = new BitMap[columnCount]; + + for (int i = 0; i < columnCount; i++) { + final int originalColIdx = allColumnsMatch ? i : matchedColumnIndices.get(i); + newColumns[i] = copyColumnArray(dataTypes[originalColIdx], columns[originalColIdx], rowCount); + if (bitMaps != null && bitMaps[originalColIdx] != null) { + newBitMaps[i] = new BitMap(rowCount); + BitMap.copyOfRange(bitMaps[originalColIdx], 0, newBitMaps[i], 0, rowCount); + } + } + + final Tablet tablet = + new Tablet(deviceId.toString(), schemas, newTimes, newColumns, newBitMaps, rowCount); + + return Collections.singletonList(tablet); + } + + private List convertInsertRowsNode(final InsertRowsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + // Handle merge bug: RelationalInsertRowNode.mergeInsertNode() is not overridden, + // so merged relational nodes arrive as InsertRowsNode (tree) with RelationalInsertRowNode + // children. Dispatch correctly by checking the actual child type. 
+ if (rowNode instanceof RelationalInsertRowNode) { + tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode)); + } else { + tablets.addAll(convertInsertRowNode(rowNode)); + } + } + return tablets; + } + + private List convertInsertRowsOfOneDeviceNode(final InsertRowsOfOneDeviceNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + tablets.addAll(convertInsertRowNode(rowNode)); + } + return tablets; + } + + private List convertInsertMultiTabletsNode(final InsertMultiTabletsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertTabletNode tabletNode : node.getInsertTabletNodeList()) { + tablets.addAll(convertInsertTabletNode(tabletNode)); + } + return tablets; + } + + // ======================== Table Model Conversion ======================== + + private List convertRelationalInsertRowNode(final RelationalInsertRowNode node) { + final String tableName = node.getTableName(); + + // Table-level pattern filtering + if (tablePattern != null) { + if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) { + return Collections.emptyList(); + } + if (tableName != null && !tablePattern.matchesTable(tableName)) { + return Collections.emptyList(); + } + } + + final long time = node.getTime(); + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final Object[] values = node.getValues(); + + final int columnCount = measurements.length; + final List schemas = new ArrayList<>(columnCount); + for (int i = 0; i < columnCount; i++) { + schemas.add(new MeasurementSchema(measurements[i], dataTypes[i])); + } + + final Tablet tablet = new Tablet(tableName != null ? 
tableName : "", schemas, 1); + tablet.addTimestamp(0, time); + + for (int i = 0; i < columnCount; i++) { + final Object value = values[i]; + if (value == null) { + if (tablet.getBitMaps() == null) { + tablet.initBitMaps(); + } + tablet.getBitMaps()[i].mark(0); + } else { + addValueToTablet(tablet, 0, i, dataTypes[i], value); + } + } + tablet.setRowSize(1); + + return Collections.singletonList(tablet); + } + + private List convertRelationalInsertTabletNode(final RelationalInsertTabletNode node) { + final String tableName = node.getTableName(); + + // Table-level pattern filtering + if (tablePattern != null) { + if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) { + return Collections.emptyList(); + } + if (tableName != null && !tablePattern.matchesTable(tableName)) { + return Collections.emptyList(); + } + } + + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final long[] times = node.getTimes(); + final Object[] columns = node.getColumns(); + final BitMap[] bitMaps = node.getBitMaps(); + final int rowCount = node.getRowCount(); + + final int columnCount = measurements.length; + final List schemas = new ArrayList<>(columnCount); + for (int i = 0; i < columnCount; i++) { + schemas.add(new MeasurementSchema(measurements[i], dataTypes[i])); + } + + // Build column arrays and bitmaps using bulk copy + final long[] newTimes = Arrays.copyOf(times, rowCount); + final Object[] newColumns = new Object[columnCount]; + final BitMap[] newBitMaps = new BitMap[columnCount]; + + for (int colIdx = 0; colIdx < columnCount; colIdx++) { + newColumns[colIdx] = copyColumnArray(dataTypes[colIdx], columns[colIdx], rowCount); + if (bitMaps != null && bitMaps[colIdx] != null) { + newBitMaps[colIdx] = new BitMap(rowCount); + BitMap.copyOfRange(bitMaps[colIdx], 0, newBitMaps[colIdx], 0, rowCount); + } + } + + final Tablet tablet = + new Tablet( + tableName != null ? 
tableName : "", + schemas, + newTimes, + newColumns, + newBitMaps, + rowCount); + + return Collections.singletonList(tablet); + } + + private List convertRelationalInsertRowsNode(final RelationalInsertRowsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode)); + } + return tablets; + } + + // ======================== Helper Methods ======================== + + /** + * Returns indices of columns that match the tree pattern. If no tree pattern is specified, all + * column indices are returned. + */ + private List getMatchedTreeColumnIndices( + final IDeviceID deviceId, final String[] measurements) { + if (treePattern == null || treePattern.isRoot() || treePattern.coversDevice(deviceId)) { + // All columns match + final List allIndices = new ArrayList<>(measurements.length); + for (int i = 0; i < measurements.length; i++) { + if (measurements[i] != null) { + allIndices.add(i); + } + } + return allIndices; + } + + final List matchedIndices = new ArrayList<>(); + for (int i = 0; i < measurements.length; i++) { + if (measurements[i] != null && treePattern.matchesMeasurement(deviceId, measurements[i])) { + matchedIndices.add(i); + } + } + return matchedIndices; + } + + /** + * Bulk-copies a typed column array using System.arraycopy. Returns a new array of the same type + * containing the first {@code rowCount} elements. 
+ */ + private Object copyColumnArray( + final TSDataType dataType, final Object sourceColumn, final int rowCount) { + switch (dataType) { + case BOOLEAN: + { + final boolean[] src = (boolean[]) sourceColumn; + final boolean[] dst = new boolean[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case INT32: + case DATE: + { + final int[] src = (int[]) sourceColumn; + final int[] dst = new int[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case INT64: + case TIMESTAMP: + { + final long[] src = (long[]) sourceColumn; + final long[] dst = new long[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case FLOAT: + { + final float[] src = (float[]) sourceColumn; + final float[] dst = new float[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case DOUBLE: + { + final double[] src = (double[]) sourceColumn; + final double[] dst = new double[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case TEXT: + case BLOB: + case STRING: + { + final Binary[] src = (Binary[]) sourceColumn; + final Binary[] dst = new Binary[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + default: + LOGGER.warn("Unsupported data type for bulk copy: {}", dataType); + return sourceColumn; + } + } + + /** + * Adds a single value to the tablet at the specified position. + * + *

IMPORTANT: In tsfile-2.2.1, Tablet.addTimestamp() calls initBitMapsWithApiUsage() which + * creates bitMaps and marks ALL positions as null via markAll(). Since we write values directly + * to the underlying typed arrays (bypassing the Tablet.addValue() API which would call + * updateBitMap to unmark), we must explicitly unmark the bitmap position to indicate the value is + * NOT null. + */ + private void addValueToTablet( + final Tablet tablet, + final int rowIndex, + final int columnIndex, + final TSDataType dataType, + final Object value) { + switch (dataType) { + case BOOLEAN: + ((boolean[]) tablet.getValues()[columnIndex])[rowIndex] = (boolean) value; + break; + case INT32: + case DATE: + ((int[]) tablet.getValues()[columnIndex])[rowIndex] = (int) value; + break; + case INT64: + case TIMESTAMP: + ((long[]) tablet.getValues()[columnIndex])[rowIndex] = (long) value; + break; + case FLOAT: + ((float[]) tablet.getValues()[columnIndex])[rowIndex] = (float) value; + break; + case DOUBLE: + ((double[]) tablet.getValues()[columnIndex])[rowIndex] = (double) value; + break; + case TEXT: + case BLOB: + case STRING: + ((Binary[]) tablet.getValues()[columnIndex])[rowIndex] = (Binary) value; + break; + default: + LOGGER.warn("Unsupported data type: {}", dataType); + return; + } + // Unmark the bitmap position to indicate this value is NOT null. + // addTimestamp() triggers initBitMapsWithApiUsage() which marks all positions as null. + final BitMap[] bitMaps = tablet.getBitMaps(); + if (bitMaps != null && bitMaps[columnIndex] != null) { + bitMaps[columnIndex].unmark(rowIndex); + } + } + + /** Copies a single column value from the source column array to the tablet. 
*/ + private void copyColumnValue( + final Tablet tablet, + final int targetRowIndex, + final int targetColumnIndex, + final TSDataType dataType, + final Object sourceColumn, + final int sourceRowIndex) { + switch (dataType) { + case BOOLEAN: + ((boolean[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((boolean[]) sourceColumn)[sourceRowIndex]; + break; + case INT32: + case DATE: + ((int[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((int[]) sourceColumn)[sourceRowIndex]; + break; + case INT64: + case TIMESTAMP: + ((long[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((long[]) sourceColumn)[sourceRowIndex]; + break; + case FLOAT: + ((float[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((float[]) sourceColumn)[sourceRowIndex]; + break; + case DOUBLE: + ((double[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((double[]) sourceColumn)[sourceRowIndex]; + break; + case TEXT: + case BLOB: + case STRING: + ((Binary[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((Binary[]) sourceColumn)[sourceRowIndex]; + break; + default: + LOGGER.warn("Unsupported data type for copy: {}", dataType); + return; + } + // Unmark the bitmap position to indicate this value is NOT null. 
+ final BitMap[] bitMaps = tablet.getBitMaps(); + if (bitMaps != null && bitMaps[targetColumnIndex] != null) { + bitMaps[targetColumnIndex].unmark(targetRowIndex); + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java new file mode 100644 index 0000000000000..83d13d1474bf5 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java @@ -0,0 +1,1420 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.consensus.common.request.IConsensusRequest; +import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; +import org.apache.iotdb.consensus.common.request.IoTConsensusRequest; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.consensus.iot.log.ConsensusReqReader; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.pipe.agent.PipeDataNodeAgent; +import org.apache.iotdb.db.pipe.resource.memory.PipeMemoryWeightUtil; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertMultiTabletsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.SearchNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode; +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.rpc.subscription.payload.poll.ErrorPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType; +import org.apache.iotdb.rpc.subscription.payload.poll.TabletsPayload; + +import org.apache.tsfile.utils.Pair; +import org.apache.tsfile.write.record.Tablet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import 
java.util.Objects; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.PriorityBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.ReentrantReadWriteLock; + +import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID; + +/** + * A prefetching queue that reads data from IoTConsensus using a hybrid approach: + * + *

    + *
  1. In-memory pending queue: Registered with {@link IoTConsensusServerImpl}, receives + * {@link IndexedConsensusRequest} in real-time from the write path (same mechanism as + * LogDispatcher). This avoids waiting for WAL flush to disk. + *
  2. WAL fallback: Uses {@link ConsensusReqReader.ReqIterator} to read from WAL files for + * gap-filling (pending queue overflow) or catch-up scenarios. + *
  3. WAL pinning: Supplies the earliest outstanding (uncommitted) search index to {@link + * IoTConsensusServerImpl}, preventing WAL deletion of entries not yet consumed by the + * subscription. + *
+ * + *

A background prefetch thread continuously drains the pending queue, converts InsertNode + * entries to Tablets via {@link ConsensusLogToTabletConverter}, and enqueues {@link + * SubscriptionEvent} objects into the prefetchingQueue for consumer polling. + * + *

This design mirrors LogDispatcher's dual-path (pendingEntries + WAL reader) but targets + * subscription delivery instead of replication. + * + *

Thread safety: Uses a fair {@link ReentrantReadWriteLock} to ensure mutual exclusion between + * cleanup and other operations (poll, ack, nack), consistent with the existing prefetching queue + * design. + */ +public class ConsensusPrefetchingQueue { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusPrefetchingQueue.class); + + private final String brokerId; // consumer group id + private final String topicName; + private final String consensusGroupId; + + private final IoTConsensusServerImpl serverImpl; + + private final ConsensusReqReader consensusReqReader; + + private volatile ConsensusReqReader.ReqIterator reqIterator; + + /** + * In-memory pending queue registered with {@link IoTConsensusServerImpl#write}. Receives + * IndexedConsensusRequest in real-time without waiting for WAL flush. Capacity is bounded to + * apply back-pressure; overflows are filled from WAL. + */ + private final BlockingQueue pendingEntries; + + private static final int PENDING_QUEUE_CAPACITY = 4096; + + private final ConsensusLogToTabletConverter converter; + + private final ConsensusSubscriptionCommitManager commitManager; + + /** Commit ID generator, monotonically increasing within this queue's lifetime. */ + private final AtomicLong commitIdGenerator; + + /** + * Commit IDs less than or equal to this threshold are considered outdated. Updated on creation + * and on seek to invalidate all pre-seek events. + */ + private volatile long outdatedCommitIdThreshold; + + private final AtomicLong nextExpectedSearchIndex; + + private final PriorityBlockingQueue prefetchingQueue; + + /** + * Tracks in-flight events that have been polled but not yet committed. Key: (consumerId, + * commitContext) -> event. + */ + private final Map, SubscriptionEvent> inFlightEvents; + + /** + * Tracks outstanding (uncommitted) events for WAL pinning. Maps commitId to the startSearchIndex + * of that event batch. 
The earliest entry's value is supplied to IoTConsensusServerImpl to pin + * WAL files from deletion. + */ + private final ConcurrentSkipListMap outstandingCommitIdToStartIndex; + + private static final int MAX_PREFETCHING_QUEUE_SIZE = 256; + + /** Counter of WAL gap entries that could not be filled (data loss). */ + private final AtomicLong walGapSkippedEntries = new AtomicLong(0); + + /** + * Sparse in-memory mapping from data timestamp to searchIndex, used by {@link + * #seekToTimestamp(long)} to approximate a searchIndex for a given timestamp. Sampled every + * {@link #TIMESTAMP_SAMPLE_INTERVAL} entries during prefetch. Cleared on seek. + * + *

TODO: For a more robust long-term solution, consider extending WALMetaData to store per-entry timestamps + * so that timestamp-based seek can use file-level min/max filtering + in-file binary search without + * full InsertNode deserialization. + */ + private final NavigableMap timestampToSearchIndex = new ConcurrentSkipListMap<>(); + + private static final int TIMESTAMP_SAMPLE_INTERVAL = 100; + + private long timestampSampleCounter = 0; + + private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true); + + private volatile boolean isClosed = false; + + /** + * Background thread that drains pendingEntries and fills prefetchingQueue. TODO: manage thread + * count + */ + private final Thread prefetchThread; + + public ConsensusPrefetchingQueue( + final String brokerId, + final String topicName, + final String consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final long startSearchIndex, + final AtomicLong sharedCommitIdGenerator) { + this.brokerId = brokerId; + this.topicName = topicName; + this.consensusGroupId = consensusGroupId; + this.serverImpl = serverImpl; + this.consensusReqReader = serverImpl.getConsensusReqReader(); + this.converter = converter; + this.commitManager = commitManager; + + this.commitIdGenerator = sharedCommitIdGenerator; + this.outdatedCommitIdThreshold = commitIdGenerator.get(); + this.nextExpectedSearchIndex = new AtomicLong(startSearchIndex); + this.reqIterator = consensusReqReader.getReqIterator(startSearchIndex); + + this.prefetchingQueue = new PriorityBlockingQueue<>(); + this.inFlightEvents = new ConcurrentHashMap<>(); + this.outstandingCommitIdToStartIndex = new ConcurrentSkipListMap<>(); + + // Create and register the in-memory pending queue with IoTConsensusServerImpl. 
+ this.pendingEntries = new ArrayBlockingQueue<>(PENDING_QUEUE_CAPACITY); + serverImpl.registerSubscriptionQueue(pendingEntries); + + // Start background prefetch thread + this.prefetchThread = + new Thread(this::prefetchLoop, "ConsensusPrefetch-" + brokerId + "-" + topicName); + this.prefetchThread.setDaemon(true); + this.prefetchThread.start(); + + LOGGER.info( + "ConsensusPrefetchingQueue created: brokerId={}, topicName={}, consensusGroupId={}, " + + "startSearchIndex={}", + brokerId, + topicName, + consensusGroupId, + startSearchIndex); + } + + // ======================== Lock Operations ======================== + + private void acquireReadLock() { + lock.readLock().lock(); + } + + private void releaseReadLock() { + lock.readLock().unlock(); + } + + private void acquireWriteLock() { + lock.writeLock().lock(); + } + + private void releaseWriteLock() { + lock.writeLock().unlock(); + } + + // ======================== Poll ======================== + + public SubscriptionEvent poll(final String consumerId) { + acquireReadLock(); + try { + return isClosed ? 
null : pollInternal(consumerId); + } finally { + releaseReadLock(); + } + } + + private SubscriptionEvent pollInternal(final String consumerId) { + final long size = prefetchingQueue.size(); + if (size == 0) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: prefetching queue is empty for consumerId={}, " + + "pendingEntriesSize={}, nextExpected={}, isClosed={}, threadAlive={}", + this, + consumerId, + pendingEntries.size(), + nextExpectedSearchIndex.get(), + isClosed, + prefetchThread.isAlive()); + return null; + } + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: polling, queue size={}, consumerId={}", + this, + size, + consumerId); + long count = 0; + + SubscriptionEvent event; + try { + while (count++ < size + && Objects.nonNull( + event = + prefetchingQueue.poll( + SubscriptionConfig.getInstance().getSubscriptionPollMaxBlockingTimeMs(), + TimeUnit.MILLISECONDS))) { + if (event.isCommitted()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {} poll committed event {} (broken invariant), remove it", + this, + event); + continue; + } + + if (!event.pollable()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {} poll non-pollable event {} (broken invariant), nack it", + this, + event); + event.nack(); + continue; + } + + // Mark as polled before updating inFlightEvents + event.recordLastPolledTimestamp(); + inFlightEvents.put(new Pair<>(consumerId, event.getCommitContext()), event); + event.recordLastPolledConsumerId(consumerId); + return event; + } + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + LOGGER.warn("ConsensusPrefetchingQueue {} interrupted while polling", this, e); + } + + return null; + } + + public SubscriptionEvent pollTablets( + final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { + acquireReadLock(); + try { + if (isClosed) { + return null; + } + final SubscriptionEvent event = inFlightEvents.get(new Pair<>(consumerId, commitContext)); + if (Objects.isNull(event)) { + if 
(isCommitContextOutdated(commitContext)) { + return generateOutdatedErrorResponse(); + } + return generateErrorResponse( + String.format( + "ConsensusPrefetchingQueue %s: no in-flight event for consumer %s, commit context %s", + this, consumerId, commitContext)); + } + return event; + } finally { + releaseReadLock(); + } + } + + // ======================== Background Prefetch ======================== + + public boolean executePrefetch() { + acquireReadLock(); + try { + if (isClosed) { + return false; + } + // Recycle pollable events from inFlightEvents back to prefetchingQueue + recycleInFlightEvents(); + return !prefetchingQueue.isEmpty(); + } finally { + releaseReadLock(); + } + } + + private static final long PENDING_DRAIN_TIMEOUT_MS = 10; + + private static final long WAL_WAIT_TIMEOUT_SECONDS = 2; + + /** + * Background prefetch loop. Continuously drains from pendingEntries (in-memory, real-time), + * detects gaps and fills from WAL reader, converts to Tablets, and enqueues SubscriptionEvents. + * + *

Batching strategy (linger): Tablets are accumulated across loop iterations until one of + * three thresholds is met: + * + *

    + *
  • Tablet count exceeds {@code subscriptionConsensusBatchMaxTabletCount} + *
  • Estimated byte size exceeds {@code subscriptionConsensusBatchMaxSizeInBytes} + *
  • Time since first tablet in current batch exceeds {@code + * subscriptionConsensusBatchMaxDelayInMs} + *
+ */ + private void prefetchLoop() { + LOGGER.info("ConsensusPrefetchingQueue {}: prefetch thread started", this); + + final List lingerTablets = new ArrayList<>(); + long lingerEstimatedBytes = 0; + long lingerBatchStartSearchIndex = nextExpectedSearchIndex.get(); + long lingerBatchEndSearchIndex = lingerBatchStartSearchIndex; + long lingerFirstTabletTimeMs = 0; // 0 means no tablets accumulated yet + + try { + while (!isClosed && !Thread.currentThread().isInterrupted()) { + try { + // Back-pressure: wait if prefetchingQueue is full + if (prefetchingQueue.size() >= MAX_PREFETCHING_QUEUE_SIZE) { + Thread.sleep(50); + continue; + } + + final SubscriptionConfig config = SubscriptionConfig.getInstance(); + final int maxWalEntries = config.getSubscriptionConsensusBatchMaxWalEntries(); + final int batchMaxDelayMs = config.getSubscriptionConsensusBatchMaxDelayInMs(); + final int maxTablets = config.getSubscriptionConsensusBatchMaxTabletCount(); + final long maxBatchBytes = config.getSubscriptionConsensusBatchMaxSizeInBytes(); + + // Try to drain from pending entries (in-memory, fast path) + final List batch = new ArrayList<>(); + final IndexedConsensusRequest first = + pendingEntries.poll(PENDING_DRAIN_TIMEOUT_MS, TimeUnit.MILLISECONDS); + if (first != null) { + batch.add(first); + int drained = 0; + IndexedConsensusRequest next; + while (drained < maxWalEntries - 1 && (next = pendingEntries.poll()) != null) { + batch.add(next); + drained++; + } + } + + if (!batch.isEmpty()) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: drained {} entries from pendingEntries, " + + "first searchIndex={}, last searchIndex={}, nextExpected={}, " + + "prefetchingQueueSize={}", + this, + batch.size(), + batch.get(0).getSearchIndex(), + batch.get(batch.size() - 1).getSearchIndex(), + nextExpectedSearchIndex.get(), + prefetchingQueue.size()); + + // Accumulate tablets from pending entries into linger buffer + final int tabletsBefore = lingerTablets.size(); + lingerBatchEndSearchIndex = + 
accumulateFromPending(batch, lingerTablets, lingerBatchEndSearchIndex); + + // Update byte estimates for newly added tablets + for (int i = tabletsBefore; i < lingerTablets.size(); i++) { + lingerEstimatedBytes += estimateTabletSize(lingerTablets.get(i)); + } + + // Flush sub-batches that exceeded thresholds during accumulation + while (lingerTablets.size() >= maxTablets || lingerEstimatedBytes >= maxBatchBytes) { + final int flushCount = Math.min(lingerTablets.size(), maxTablets); + final List toFlush = new ArrayList<>(lingerTablets.subList(0, flushCount)); + createAndEnqueueEvent( + toFlush, lingerBatchStartSearchIndex, lingerBatchEndSearchIndex); + lingerTablets.subList(0, flushCount).clear(); + // Recalculate byte estimate for remaining tablets + lingerEstimatedBytes = 0; + for (final Tablet t : lingerTablets) { + lingerEstimatedBytes += estimateTabletSize(t); + } + lingerBatchStartSearchIndex = nextExpectedSearchIndex.get(); + lingerFirstTabletTimeMs = lingerTablets.isEmpty() ? 0 : lingerFirstTabletTimeMs; + } + + // Record first tablet time if we just started accumulating + if (!lingerTablets.isEmpty() && lingerFirstTabletTimeMs == 0) { + lingerFirstTabletTimeMs = System.currentTimeMillis(); + } + } else if (lingerTablets.isEmpty()) { + // Pending queue was empty and no lingering tablets — try catch-up from WAL + tryCatchUpFromWAL(); + } + // If we have lingering tablets but pending was empty, fall through to time check below + + // Time-based flush: if tablets have been lingering longer than batchMaxDelayMs, flush now + if (!lingerTablets.isEmpty() + && lingerFirstTabletTimeMs > 0 + && (System.currentTimeMillis() - lingerFirstTabletTimeMs) >= batchMaxDelayMs) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: time-based flush, {} tablets lingered for {}ms " + + "(threshold={}ms)", + this, + lingerTablets.size(), + System.currentTimeMillis() - lingerFirstTabletTimeMs, + batchMaxDelayMs); + createAndEnqueueEvent( + new ArrayList<>(lingerTablets), + 
lingerBatchStartSearchIndex, + lingerBatchEndSearchIndex); + lingerTablets.clear(); + lingerEstimatedBytes = 0; + lingerBatchStartSearchIndex = nextExpectedSearchIndex.get(); + lingerFirstTabletTimeMs = 0; + } + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } catch (final Throwable t) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: CRITICAL error in prefetch loop " + + "(type={}, message={})", + this, + t.getClass().getName(), + t.getMessage(), + t); + if (t instanceof VirtualMachineError) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: caught VirtualMachineError, stopping thread", this); + markClosed(); + break; + } + try { + Thread.sleep(100); + } catch (final InterruptedException ie) { + Thread.currentThread().interrupt(); + break; + } + } + } + + if (!lingerTablets.isEmpty()) { + LOGGER.info( + "ConsensusPrefetchingQueue {}: flushing {} lingering tablets on loop exit", + this, + lingerTablets.size()); + createAndEnqueueEvent( + lingerTablets, lingerBatchStartSearchIndex, lingerBatchEndSearchIndex); + } + } catch (final Throwable fatal) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: FATAL uncaught throwable escaped prefetch loop " + + "(type={}, message={})", + this, + fatal.getClass().getName(), + fatal.getMessage(), + fatal); + } + LOGGER.info("ConsensusPrefetchingQueue {}: prefetch thread stopped", this); + } + + /** + * Accumulates tablets from pending entries into the linger buffer. Handles gap detection and + * filling from WAL. Does NOT flush — the caller is responsible for flush decisions. 
+ * + * @return the updated batchEndSearchIndex + */ + private long accumulateFromPending( + final List batch, + final List lingerTablets, + long batchEndSearchIndex) { + + int processedCount = 0; + int skippedCount = 0; + + for (final IndexedConsensusRequest request : batch) { + final long searchIndex = request.getSearchIndex(); + + // Detect gap: if searchIndex > nextExpected, entries were dropped from pending queue. + final long expected = nextExpectedSearchIndex.get(); + if (searchIndex > expected) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: gap detected, expected={}, got={}. " + + "Filling {} entries from WAL.", + this, + expected, + searchIndex, + searchIndex - expected); + final long gapMaxIndex = fillGapFromWAL(expected, searchIndex, lingerTablets); + if (gapMaxIndex > batchEndSearchIndex) { + batchEndSearchIndex = gapMaxIndex; + } + } + + if (searchIndex < nextExpectedSearchIndex.get()) { + skippedCount++; + continue; + } + + // Process this entry + final InsertNode insertNode = deserializeToInsertNode(request); + if (insertNode != null) { + recordTimestampSample(insertNode, searchIndex); + final List tablets = converter.convert(insertNode); + if (!tablets.isEmpty()) { + lingerTablets.addAll(tablets); + batchEndSearchIndex = searchIndex; + processedCount++; + } + } + nextExpectedSearchIndex.set(searchIndex + 1); + } + + // Update WAL reader position to stay in sync + syncReqIteratorPosition(); + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: accumulate complete, batchSize={}, processed={}, " + + "skipped={}, lingerTablets={}, nextExpected={}", + this, + batch.size(), + processedCount, + skippedCount, + lingerTablets.size(), + nextExpectedSearchIndex.get()); + + return batchEndSearchIndex; + } + + /** + * Fills a gap in the pending queue by reading entries from WAL. Called when gap is detected + * between nextExpectedSearchIndex and an incoming entry's searchIndex. 
+ * + * @return the maximum searchIndex processed during gap filling, or -1 if no entries processed + */ + private long fillGapFromWAL( + final long fromIndex, final long toIndex, final List batchedTablets) { + // Re-position WAL reader to the gap start + reqIterator = consensusReqReader.getReqIterator(fromIndex); + long maxProcessedIndex = -1; + + while (nextExpectedSearchIndex.get() < toIndex && reqIterator.hasNext()) { + try { + final IndexedConsensusRequest walEntry = reqIterator.next(); + final long walIndex = walEntry.getSearchIndex(); + if (walIndex < nextExpectedSearchIndex.get()) { + continue; // already processed + } + + final InsertNode insertNode = deserializeToInsertNode(walEntry); + if (insertNode != null) { + recordTimestampSample(insertNode, walIndex); + final List tablets = converter.convert(insertNode); + batchedTablets.addAll(tablets); + } + nextExpectedSearchIndex.set(walIndex + 1); + if (walIndex > maxProcessedIndex) { + maxProcessedIndex = walIndex; + } + } catch (final Exception e) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: error filling gap from WAL at index {}", + this, + nextExpectedSearchIndex.get(), + e); + break; + } + } + + // If WAL doesn't have the gap entries yet (still in memory buffer), wait briefly + if (nextExpectedSearchIndex.get() < toIndex) { + try { + reqIterator.waitForNextReady(WAL_WAIT_TIMEOUT_SECONDS, TimeUnit.SECONDS); + while (nextExpectedSearchIndex.get() < toIndex && reqIterator.hasNext()) { + final IndexedConsensusRequest walEntry = reqIterator.next(); + final long walIndex = walEntry.getSearchIndex(); + if (walIndex < nextExpectedSearchIndex.get()) { + continue; + } + final InsertNode insertNode = deserializeToInsertNode(walEntry); + if (insertNode != null) { + recordTimestampSample(insertNode, walIndex); + final List tablets = converter.convert(insertNode); + batchedTablets.addAll(tablets); + } + nextExpectedSearchIndex.set(walIndex + 1); + if (walIndex > maxProcessedIndex) { + maxProcessedIndex = walIndex; 
+ } + } + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + } catch (final TimeoutException e) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: timeout waiting for WAL gap fill [{}, {})", + this, + nextExpectedSearchIndex.get(), + toIndex); + } + } + + // If entries are in the current-writing WAL file (excluded by PlanNodeIterator for + // concurrency safety), trigger a WAL file roll to make them readable. + if (nextExpectedSearchIndex.get() < toIndex && consensusReqReader instanceof WALNode) { + final long currentWALIndex = consensusReqReader.getCurrentSearchIndex(); + if (nextExpectedSearchIndex.get() <= currentWALIndex) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: gap fill incomplete (at {} vs WAL {}), " + + "triggering WAL file roll", + this, + nextExpectedSearchIndex.get(), + currentWALIndex); + ((WALNode) consensusReqReader).rollWALFile(); + syncReqIteratorPosition(); + // Retry reading after roll + while (nextExpectedSearchIndex.get() < toIndex && reqIterator.hasNext()) { + try { + final IndexedConsensusRequest walEntry = reqIterator.next(); + final long walIndex = walEntry.getSearchIndex(); + if (walIndex < nextExpectedSearchIndex.get()) { + continue; + } + final InsertNode insertNode = deserializeToInsertNode(walEntry); + if (insertNode != null) { + recordTimestampSample(insertNode, walIndex); + final List tablets = converter.convert(insertNode); + batchedTablets.addAll(tablets); + } + nextExpectedSearchIndex.set(walIndex + 1); + if (walIndex > maxProcessedIndex) { + maxProcessedIndex = walIndex; + } + } catch (final Exception e) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: error reading WAL after roll at index {}", + this, + nextExpectedSearchIndex.get(), + e); + break; + } + } + } + } + + // If the gap still cannot be filled, WAL is corrupted/truncated + if (nextExpectedSearchIndex.get() < toIndex) { + final long skipped = toIndex - nextExpectedSearchIndex.get(); + walGapSkippedEntries.addAndGet(skipped); + 
LOGGER.warn( + "ConsensusPrefetchingQueue {}: WAL gap [{}, {}) cannot be filled - {} entries lost. " + + "Total skipped entries so far: {}. " + + "Possible causes: WAL retention policy reclaimed files, or WAL corruption/truncation.", + this, + nextExpectedSearchIndex.get(), + toIndex, + skipped, + walGapSkippedEntries.get()); + nextExpectedSearchIndex.set(toIndex); + } + + return maxProcessedIndex; + } + + /** + * Try catch-up from WAL when the pending queue was empty. This handles cold-start or scenarios + * where the subscription started after data was already written. + */ + private void tryCatchUpFromWAL() { + // Re-position WAL reader + syncReqIteratorPosition(); + + if (!reqIterator.hasNext()) { + // The WAL iterator excludes the current-writing WAL file for concurrency safety. + // If entries exist in WAL but are all in the current file (e.g., after pending queue + // overflow), we need to trigger a WAL file roll to make them readable. + final long currentWALIndex = consensusReqReader.getCurrentSearchIndex(); + if (nextExpectedSearchIndex.get() <= currentWALIndex + && consensusReqReader instanceof WALNode) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: subscription behind (at {} vs WAL {}), " + + "triggering WAL file roll to make entries readable", + this, + nextExpectedSearchIndex.get(), + currentWALIndex); + ((WALNode) consensusReqReader).rollWALFile(); + syncReqIteratorPosition(); + } + if (!reqIterator.hasNext()) { + // Data loss detection: if we expected earlier entries but WAL has advanced past them, + // the retention policy has reclaimed WAL files before we consumed them. + // Auto-seek to the current WAL position (similar to Kafka's auto.offset.reset=latest). + if (nextExpectedSearchIndex.get() < currentWALIndex) { + final long skipped = currentWALIndex - nextExpectedSearchIndex.get(); + LOGGER.warn( + "ConsensusPrefetchingQueue {}: WAL data loss detected. Expected searchIndex={} " + + "but earliest available is {}. 
{} entries were reclaimed by WAL retention " + + "policy before consumption. Auto-seeking to current position.", + this, + nextExpectedSearchIndex.get(), + currentWALIndex, + skipped); + walGapSkippedEntries.addAndGet(skipped); + nextExpectedSearchIndex.set(currentWALIndex); + syncReqIteratorPosition(); + } + if (!reqIterator.hasNext()) { + return; + } + } + } + + final SubscriptionConfig config = SubscriptionConfig.getInstance(); + final int maxTablets = config.getSubscriptionConsensusBatchMaxTabletCount(); + final long maxBatchBytes = config.getSubscriptionConsensusBatchMaxSizeInBytes(); + final int maxWalEntries = config.getSubscriptionConsensusBatchMaxWalEntries(); + + final List batchedTablets = new ArrayList<>(); + long batchStartSearchIndex = nextExpectedSearchIndex.get(); + long batchEndSearchIndex = batchStartSearchIndex; + long estimatedBatchBytes = 0; + int entriesRead = 0; + + while (entriesRead < maxWalEntries + && reqIterator.hasNext() + && prefetchingQueue.size() < MAX_PREFETCHING_QUEUE_SIZE) { + try { + final IndexedConsensusRequest walEntry = reqIterator.next(); + final long walIndex = walEntry.getSearchIndex(); + entriesRead++; + + if (walIndex < nextExpectedSearchIndex.get()) { + continue; + } + + final InsertNode insertNode = deserializeToInsertNode(walEntry); + if (insertNode != null) { + recordTimestampSample(insertNode, walIndex); + final List tablets = converter.convert(insertNode); + if (!tablets.isEmpty()) { + batchedTablets.addAll(tablets); + for (final Tablet t : tablets) { + estimatedBatchBytes += estimateTabletSize(t); + } + batchEndSearchIndex = walIndex; + } + } + nextExpectedSearchIndex.set(walIndex + 1); + + if (batchedTablets.size() >= maxTablets || estimatedBatchBytes >= maxBatchBytes) { + createAndEnqueueEvent( + new ArrayList<>(batchedTablets), batchStartSearchIndex, batchEndSearchIndex); + batchedTablets.clear(); + estimatedBatchBytes = 0; + // Reset start index for the next sub-batch + batchStartSearchIndex = 
nextExpectedSearchIndex.get(); + } + } catch (final Exception e) { + LOGGER.warn("ConsensusPrefetchingQueue {}: error reading WAL for catch-up", this, e); + break; + } + } + + if (!batchedTablets.isEmpty()) { + createAndEnqueueEvent(batchedTablets, batchStartSearchIndex, batchEndSearchIndex); + } + + if (entriesRead > 0) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: WAL catch-up read {} entries, " + + "nextExpectedSearchIndex={}", + this, + entriesRead, + nextExpectedSearchIndex.get()); + } + } + + /** + * Re-positions the WAL reader to the current nextExpectedSearchIndex. Called before reading from + * WAL to ensure the iterator is in sync with tracking position. + */ + private void syncReqIteratorPosition() { + reqIterator = consensusReqReader.getReqIterator(nextExpectedSearchIndex.get()); + } + + /** + * Deserializes the IConsensusRequest entries within an IndexedConsensusRequest to produce an + * InsertNode. WAL entries are typically stored as IoTConsensusRequest (serialized ByteBuffers), + * and a single logical write may be split across multiple fragments (SearchNode). This method + * handles both cases. + * + *

The deserialization follows the same pattern as {@code + * DataRegionStateMachine.grabPlanNode()}. + */ + private InsertNode deserializeToInsertNode(final IndexedConsensusRequest indexedRequest) { + final List searchNodes = new ArrayList<>(); + PlanNode nonSearchNode = null; + + for (final IConsensusRequest req : indexedRequest.getRequests()) { + PlanNode planNode; + try { + if (req instanceof IoTConsensusRequest) { + // WAL entries read from file are wrapped as IoTConsensusRequest (ByteBuffer) + planNode = WALEntry.deserializeForConsensus(req.serializeToByteBuffer()); + } else if (req instanceof InsertNode) { + // In-memory entries (not yet flushed to WAL file) may already be PlanNode + planNode = (PlanNode) req; + } else { + // ByteBufferConsensusRequest or unknown + planNode = PlanNodeType.deserialize(req.serializeToByteBuffer()); + } + } catch (final Exception e) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: failed to deserialize IConsensusRequest " + + "(type={}) in searchIndex={}: {}", + this, + req.getClass().getSimpleName(), + indexedRequest.getSearchIndex(), + e.getMessage(), + e); + continue; + } + + if (planNode instanceof SearchNode) { + ((SearchNode) planNode).setSearchIndex(indexedRequest.getSearchIndex()); + searchNodes.add((SearchNode) planNode); + } else { + nonSearchNode = planNode; + } + } + + // Merge split SearchNode fragments (same pattern as DataRegionStateMachine.grabPlanNode) + if (!searchNodes.isEmpty()) { + final PlanNode merged = searchNodes.get(0).merge(searchNodes); + if (merged instanceof InsertNode) { + final InsertNode mergedInsert = (InsertNode) merged; + LOGGER.debug( + "ConsensusPrefetchingQueue {}: deserialized merged InsertNode for searchIndex={}, " + + "type={}, deviceId={}, searchNodeCount={}", + this, + indexedRequest.getSearchIndex(), + mergedInsert.getType(), + ConsensusLogToTabletConverter.safeDeviceIdForLog(mergedInsert), + searchNodes.size()); + + return mergedInsert; + } + } + + if (nonSearchNode != null) { + 
LOGGER.debug( + "ConsensusPrefetchingQueue {}: searchIndex={} contains non-InsertNode PlanNode: {}", + this, + indexedRequest.getSearchIndex(), + nonSearchNode.getClass().getSimpleName()); + } + + return null; + } + + private static long estimateTabletSize(final Tablet tablet) { + return PipeMemoryWeightUtil.calculateTabletSizeInBytes(tablet); + } + + private void createAndEnqueueEvent( + final List tablets, final long startSearchIndex, final long endSearchIndex) { + if (tablets.isEmpty()) { + return; + } + + final long commitId = commitIdGenerator.getAndIncrement(); + + // Record the mapping from commitId to the end searchIndex + // so that when the client commits, we know which WAL position has been consumed + commitManager.recordCommitMapping( + brokerId, topicName, consensusGroupId, commitId, endSearchIndex); + + // Track outstanding event for WAL pinning + outstandingCommitIdToStartIndex.put(commitId, startSearchIndex); + + final SubscriptionCommitContext commitContext = + new SubscriptionCommitContext( + IoTDBDescriptor.getInstance().getConfig().getDataNodeId(), + PipeDataNodeAgent.runtime().getRebootTimes(), + topicName, + brokerId, + commitId); + + // nextOffset <= 0 means all tablets delivered in single batch + // -tablets.size() indicates total count + // Use Map> constructor with actual database name for table model; + final TabletsPayload payload = + new TabletsPayload( + Collections.singletonMap(converter.getDatabaseName(), tablets), -tablets.size()); + + final SubscriptionEvent event = + new SubscriptionEvent( + SubscriptionPollResponseType.TABLETS.getType(), payload, commitContext); + + prefetchingQueue.add(event); + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: ENQUEUED event with {} tablets, " + + "searchIndex range [{}, {}], commitId={}, prefetchQueueSize={}", + this, + tablets.size(), + startSearchIndex, + endSearchIndex, + commitId, + prefetchingQueue.size()); + } + + // ======================== Commit (Ack/Nack) ======================== + + 
public boolean ack(final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + return !isClosed && ackInternal(consumerId, commitContext); + } finally { + releaseReadLock(); + } + } + + private boolean ackInternal( + final String consumerId, final SubscriptionCommitContext commitContext) { + final AtomicBoolean acked = new AtomicBoolean(false); + final long commitId = commitContext.getCommitId(); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: commit context {} does not exist for ack", + this, + commitContext); + return null; + } + + if (ev.isCommitted()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: event {} already committed", this, commitContext); + ev.cleanUp(false); + return null; + } + + ev.ack(); + ev.recordCommittedTimestamp(); + acked.set(true); + + ev.cleanUp(false); + return null; + }); + + if (acked.get()) { + commitManager.commit(brokerId, topicName, consensusGroupId, commitId); + outstandingCommitIdToStartIndex.remove(commitId); + } + + return acked.get(); + } + + public boolean nack(final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + return !isClosed && nackInternal(consumerId, commitContext); + } finally { + releaseReadLock(); + } + } + + /** + * Silent version of ack: returns false without logging if the commit context is not found. Used + * in multi-region iteration where only one queue owns the event. 
+ */ + public boolean ackSilent(final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + if (isClosed) { + return false; + } + final AtomicBoolean acked = new AtomicBoolean(false); + final long commitId = commitContext.getCommitId(); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + return null; + } + if (ev.isCommitted()) { + ev.cleanUp(false); + return null; + } + ev.ack(); + ev.recordCommittedTimestamp(); + acked.set(true); + ev.cleanUp(false); + return null; + }); + if (acked.get()) { + commitManager.commit(brokerId, topicName, consensusGroupId, commitId); + outstandingCommitIdToStartIndex.remove(commitId); + } + return acked.get(); + } finally { + releaseReadLock(); + } + } + + /** + * Silent version of nack: returns false without logging if the commit context is not found. Used + * in multi-region iteration where only one queue owns the event. + */ + public boolean nackSilent( + final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + if (isClosed) { + return false; + } + final AtomicBoolean nacked = new AtomicBoolean(false); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + return null; + } + ev.nack(); + nacked.set(true); + prefetchingQueue.add(ev); + return null; + }); + return nacked.get(); + } finally { + releaseReadLock(); + } + } + + private boolean nackInternal( + final String consumerId, final SubscriptionCommitContext commitContext) { + final AtomicBoolean nacked = new AtomicBoolean(false); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: commit context {} does not exist for nack", + this, + commitContext); + return null; + } + + ev.nack(); + nacked.set(true); + prefetchingQueue.add(ev); + return null; + }); + + return 
nacked.get(); + } + + // ======================== Recycle ======================== + + /** Recycles in-flight events that are pollable (timed out) back to the prefetching queue. */ + private void recycleInFlightEvents() { + for (final Pair key : + new ArrayList<>(inFlightEvents.keySet())) { + inFlightEvents.compute( + key, + (k, ev) -> { + if (Objects.isNull(ev)) { + return null; + } + if (ev.isCommitted()) { + ev.cleanUp(false); + return null; + } + if (ev.pollable()) { + ev.nack(); + prefetchingQueue.add(ev); + LOGGER.debug( + "ConsensusPrefetchingQueue {}: recycled timed-out event {} back to prefetching queue", + this, + ev); + return null; + } + return ev; + }); + } + } + + // ======================== Cleanup ======================== + + public void cleanUp() { + acquireWriteLock(); + try { + prefetchingQueue.forEach(event -> event.cleanUp(true)); + prefetchingQueue.clear(); + + inFlightEvents.values().forEach(event -> event.cleanUp(true)); + inFlightEvents.clear(); + + outstandingCommitIdToStartIndex.clear(); + } finally { + releaseWriteLock(); + } + } + + // ======================== Seek ======================== + + /** + * Seeks the subscription to a specific WAL search index. Clears all pending, prefetched, and + * in-flight events, resets the WAL reader, and invalidates all pre-seek commit contexts. + * + *

After seek, the consumer will receive data starting from {@code targetSearchIndex}. If the + * target is beyond available WAL (reclaimed by retention), the consumer will start from the + * earliest available position. + */ + public void seekToSearchIndex(final long targetSearchIndex) { + acquireWriteLock(); + try { + if (isClosed) { + return; + } + + // 1. Invalidate all pre-seek commit contexts + outdatedCommitIdThreshold = commitIdGenerator.get(); + + // 2. Clean up all queued and in-flight events + prefetchingQueue.forEach(event -> event.cleanUp(true)); + prefetchingQueue.clear(); + inFlightEvents.values().forEach(event -> event.cleanUp(true)); + inFlightEvents.clear(); + outstandingCommitIdToStartIndex.clear(); + + // 3. Discard stale pending entries from in-memory queue + pendingEntries.clear(); + + // 4. Reset WAL read position + nextExpectedSearchIndex.set(targetSearchIndex); + reqIterator = consensusReqReader.getReqIterator(targetSearchIndex); + + // 5. Reset commit state in CommitManager + commitManager.resetState(brokerId, topicName, consensusGroupId, targetSearchIndex); + + LOGGER.info( + "ConsensusPrefetchingQueue {}: seek to searchIndex={}, " + + "outdatedCommitIdThreshold={}", + this, + targetSearchIndex, + outdatedCommitIdThreshold); + } finally { + releaseWriteLock(); + } + } + + /** + * Seeks to the earliest available WAL position. The actual position depends on WAL retention — if + * old files have been reclaimed, the earliest available position may be later than 0. + */ + public void seekToBeginning() { + // ConsensusReqReader.DEFAULT_SAFELY_DELETED_SEARCH_INDEX is Long.MIN_VALUE; + // getReqIterator will clamp to the earliest available file. + seekToSearchIndex(0); + } + + /** + * Seeks to the current WAL write position. After this, only newly written data will be consumed. 
+ */ + public void seekToEnd() { + seekToSearchIndex(consensusReqReader.getCurrentSearchIndex()); + } + + /** + * Seeks to the earliest WAL entry whose data timestamp >= targetTimestamp. Uses the in-memory + * sparse mapping ({@link #timestampToSearchIndex}) to approximate the searchIndex, then seeks to + * that position. If no mapping entry exists (targetTimestamp earlier than all samples), falls back + * to seekToBeginning. If targetTimestamp is beyond the latest sample, seeks to the current WAL + * write position (equivalent to seekToEnd). + */ + public void seekToTimestamp(final long targetTimestamp) { + final Map.Entry floor = timestampToSearchIndex.floorEntry(targetTimestamp); + final long approxSearchIndex; + if (floor == null) { + // targetTimestamp is earlier than all known samples — seek to beginning + approxSearchIndex = 0; + } else { + final Map.Entry lastEntry = timestampToSearchIndex.lastEntry(); + if (lastEntry != null && floor.getKey().equals(lastEntry.getKey()) + && targetTimestamp > lastEntry.getKey()) { + // targetTimestamp is beyond the latest known sample — seek to end + approxSearchIndex = consensusReqReader.getCurrentSearchIndex(); + } else { + approxSearchIndex = floor.getValue(); + } + } + LOGGER.info( + "ConsensusPrefetchingQueue {}: seekToTimestamp={}, approxSearchIndex={} (from sparse map, size={})", + this, + targetTimestamp, + approxSearchIndex, + timestampToSearchIndex.size()); + seekToSearchIndex(approxSearchIndex); + } + + /** + * Records a sparse timestamp→searchIndex sample for {@link #seekToTimestamp(long)}. Called during + * prefetch for every successfully deserialized InsertNode. 
+ */ + private void recordTimestampSample(final InsertNode insertNode, final long searchIndex) { + if (timestampSampleCounter++ % TIMESTAMP_SAMPLE_INTERVAL == 0) { + final long minTime = extractMinTime(insertNode); + if (minTime != Long.MAX_VALUE) { + timestampToSearchIndex.put(minTime, searchIndex); + } + } + } + + /** + * Extracts the minimum timestamp from an InsertNode. For InsertMultiTabletsNode (whose + * getMinTime() throws NotImplementedException), iterates over inner InsertTabletNodes. + * + * @return the minimum timestamp, or Long.MAX_VALUE if extraction fails + */ + private long extractMinTime(final InsertNode insertNode) { + try { + return insertNode.getMinTime(); + } catch (final Exception e) { + // InsertMultiTabletsNode.getMinTime() is not implemented + if (insertNode instanceof InsertMultiTabletsNode) { + long min = Long.MAX_VALUE; + for (final InsertTabletNode child : + ((InsertMultiTabletsNode) insertNode).getInsertTabletNodeList()) { + try { + min = Math.min(min, child.getMinTime()); + } catch (final Exception ignored) { + } + } + return min; + } + return Long.MAX_VALUE; + } + } + + public void close() { + markClosed(); + // Stop background prefetch thread + prefetchThread.interrupt(); + try { + prefetchThread.join(5000); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + } + try { + // Unregister from IoTConsensusServerImpl (stop receiving in-memory data). 
+ serverImpl.unregisterSubscriptionQueue(pendingEntries); + } catch (final Exception e) { + LOGGER.warn("ConsensusPrefetchingQueue {}: error during unregister", this, e); + } finally { + try { + cleanUp(); + } finally { + // Persist progress before closing + commitManager.persistAll(); + } + } + } + + private SubscriptionEvent generateErrorResponse(final String errorMessage) { + return new SubscriptionEvent( + SubscriptionPollResponseType.ERROR.getType(), + new ErrorPayload(errorMessage, false), + new SubscriptionCommitContext( + IoTDBDescriptor.getInstance().getConfig().getDataNodeId(), + PipeDataNodeAgent.runtime().getRebootTimes(), + topicName, + brokerId, + INVALID_COMMIT_ID)); + } + + private SubscriptionEvent generateOutdatedErrorResponse() { + return new SubscriptionEvent( + SubscriptionPollResponseType.ERROR.getType(), + ErrorPayload.OUTDATED_ERROR_PAYLOAD, + new SubscriptionCommitContext( + IoTDBDescriptor.getInstance().getConfig().getDataNodeId(), + PipeDataNodeAgent.runtime().getRebootTimes(), + topicName, + brokerId, + INVALID_COMMIT_ID)); + } + + public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { + return PipeDataNodeAgent.runtime().getRebootTimes() > commitContext.getRebootTimes() + || outdatedCommitIdThreshold > commitContext.getCommitId(); + } + + // ======================== Status ======================== + + public boolean isClosed() { + return isClosed; + } + + public void markClosed() { + isClosed = true; + } + + public String getPrefetchingQueueId() { + return brokerId + "_" + topicName; + } + + public long getSubscriptionUncommittedEventCount() { + return inFlightEvents.size(); + } + + public long getCurrentCommitId() { + return commitIdGenerator.get(); + } + + public int getPrefetchedEventCount() { + return prefetchingQueue.size(); + } + + public long getCurrentReadSearchIndex() { + return nextExpectedSearchIndex.get(); + } + + public String getBrokerId() { + return brokerId; + } + + public String 
getTopicName() { + return topicName; + } + + public String getConsensusGroupId() { + return consensusGroupId; + } + + // ======================== Stringify ======================== + + public Map coreReportMessage() { + final Map result = new HashMap<>(); + result.put("brokerId", brokerId); + result.put("topicName", topicName); + result.put("consensusGroupId", consensusGroupId); + result.put("currentReadSearchIndex", String.valueOf(nextExpectedSearchIndex.get())); + result.put("prefetchingQueueSize", String.valueOf(prefetchingQueue.size())); + result.put("inFlightEventsSize", String.valueOf(inFlightEvents.size())); + result.put("outstandingEventsSize", String.valueOf(outstandingCommitIdToStartIndex.size())); + result.put("pendingEntriesSize", String.valueOf(pendingEntries.size())); + result.put("commitIdGenerator", commitIdGenerator.toString()); + result.put("walGapSkippedEntries", String.valueOf(walGapSkippedEntries.get())); + result.put("isClosed", String.valueOf(isClosed)); + return result; + } + + @Override + public String toString() { + return "ConsensusPrefetchingQueue" + coreReportMessage(); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java new file mode 100644 index 0000000000000..049e9154a9448 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java @@ -0,0 +1,465 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.db.conf.IoTDBDescriptor; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Manages commit state for consensus-based subscriptions. + * + *

This manager tracks which events have been committed by consumers and maps commit IDs back to + * WAL search indices. It maintains the progress for each (consumerGroup, topic, region) triple and + * supports persistence and recovery. + * + *

Progress is tracked per-region because searchIndex is region-local — each DataRegion + * has its own independent WAL with its own searchIndex namespace. Using a single state per topic + * would cause TreeSet deduplication bugs when different regions emit the same searchIndex value. + * + *

Key responsibilities: + * + *

    + *
  • Track the mapping from commitId to searchIndex + *
  • Handle commit/ack from consumers + *
  • Persist and recover progress state + *
+ */ +public class ConsensusSubscriptionCommitManager { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionCommitManager.class); + + private static final String PROGRESS_FILE_PREFIX = "consensus_subscription_progress_"; + private static final String PROGRESS_FILE_SUFFIX = ".dat"; + + /** Key: "consumerGroupId_topicName_regionId" -> progress tracking state */ + private final Map commitStates = + new ConcurrentHashMap<>(); + + private final String persistDir; + + private ConsensusSubscriptionCommitManager() { + this.persistDir = + IoTDBDescriptor.getInstance().getConfig().getSystemDir() + + File.separator + + "subscription" + + File.separator + + "consensus_progress"; + final File dir = new File(persistDir); + if (!dir.exists()) { + dir.mkdirs(); + } + } + + /** + * Gets or creates the commit state for a specific (consumerGroup, topic, region) triple. + * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID string + * @return the commit state + */ + public ConsensusSubscriptionCommitState getOrCreateState( + final String consumerGroupId, final String topicName, final String regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + return commitStates.computeIfAbsent( + key, + k -> { + // Try to recover from persisted state + final ConsensusSubscriptionCommitState recovered = tryRecover(key); + if (recovered != null) { + return recovered; + } + return new ConsensusSubscriptionCommitState(new SubscriptionConsensusProgress(0L, 0L)); + }); + } + + /** + * Records commitId to searchIndex mapping for later commit handling. 
+ * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID string + * @param commitId the assigned commit ID + * @param searchIndex the WAL search index corresponding to this event + */ + public void recordCommitMapping( + final String consumerGroupId, + final String topicName, + final String regionId, + final long commitId, + final long searchIndex) { + final ConsensusSubscriptionCommitState state = + getOrCreateState(consumerGroupId, topicName, regionId); + state.recordMapping(commitId, searchIndex); + } + + /** + * Handles commit (ack) for an event. Updates the progress and potentially advances the committed + * search index. + * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID string + * @param commitId the committed event's commit ID + * @return true if commit handled successfully + */ + public boolean commit( + final String consumerGroupId, + final String topicName, + final String regionId, + final long commitId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + LOGGER.warn( + "ConsensusSubscriptionCommitManager: Cannot commit for unknown state, " + + "consumerGroupId={}, topicName={}, regionId={}, commitId={}", + consumerGroupId, + topicName, + regionId, + commitId); + return false; + } + final boolean success = state.commit(commitId); + if (success) { + // Periodically persist progress + persistProgressIfNeeded(key, state); + } + return success; + } + + /** + * Gets the current committed search index for a specific region's state. 
+ * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID string + * @return the committed search index, or -1 if no state exists + */ + public long getCommittedSearchIndex( + final String consumerGroupId, final String topicName, final String regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + return -1; + } + return state.getCommittedSearchIndex(); + } + + /** + * Removes state for a specific (consumerGroup, topic, region) triple. + * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID string + */ + public void removeState( + final String consumerGroupId, final String topicName, final String regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + commitStates.remove(key); + // Clean up persisted file + final File file = getProgressFile(key); + if (file.exists()) { + file.delete(); + } + } + + /** + * Removes all states for a given (consumerGroup, topic) pair across all regions. Used during + * subscription teardown when the individual regionIds may not be readily available. 
+ * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + */ + public void removeAllStatesForTopic(final String consumerGroupId, final String topicName) { + final String prefix = consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR; + final Iterator> it = + commitStates.entrySet().iterator(); + while (it.hasNext()) { + final Map.Entry entry = it.next(); + if (entry.getKey().startsWith(prefix)) { + it.remove(); + final File file = getProgressFile(entry.getKey()); + if (file.exists()) { + file.delete(); + } + } + } + } + + /** + * Resets the commit state for a specific (consumerGroup, topic, region) triple to a new search + * index. Used by seek operations to discard all outstanding commit tracking and restart from the + * specified position. + */ + public void resetState( + final String consumerGroupId, + final String topicName, + final String regionId, + final long newSearchIndex) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + LOGGER.warn( + "ConsensusSubscriptionCommitManager: Cannot reset unknown state, " + + "consumerGroupId={}, topicName={}, regionId={}", + consumerGroupId, + topicName, + regionId); + return; + } + state.resetForSeek(newSearchIndex); + persistProgress(key, state); + } + + /** Persists all states. Should be called during graceful shutdown. */ + public void persistAll() { + for (final Map.Entry entry : + commitStates.entrySet()) { + persistProgress(entry.getKey(), entry.getValue()); + } + } + + // ======================== Helper Methods ======================== + + // Use a separator that cannot appear in consumerGroupId, topicName, or regionId + // to prevent key collisions (e.g., "a_b" + "c" vs "a" + "b_c"). 
+ private static final String KEY_SEPARATOR = "##"; + + private String generateKey( + final String consumerGroupId, final String topicName, final String regionId) { + return consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR + regionId; + } + + private File getProgressFile(final String key) { + return new File(persistDir, PROGRESS_FILE_PREFIX + key + PROGRESS_FILE_SUFFIX); + } + + private ConsensusSubscriptionCommitState tryRecover(final String key) { + final File file = getProgressFile(key); + if (!file.exists()) { + return null; + } + try (final FileInputStream fis = new FileInputStream(file)) { + final byte[] bytes = new byte[(int) file.length()]; + fis.read(bytes); + final ByteBuffer buffer = ByteBuffer.wrap(bytes); + return ConsensusSubscriptionCommitState.deserialize(buffer); + } catch (final IOException e) { + LOGGER.warn("Failed to recover consensus subscription progress from {}", file, e); + return null; + } + } + + private void persistProgressIfNeeded( + final String key, final ConsensusSubscriptionCommitState state) { + // Persist every 100 commits to reduce disk IO + if (state.getProgress().getCommitIndex() % 100 == 0) { + persistProgress(key, state); + } + } + + private void persistProgress(final String key, final ConsensusSubscriptionCommitState state) { + final File file = getProgressFile(key); + try (final FileOutputStream fos = new FileOutputStream(file); + final DataOutputStream dos = new DataOutputStream(fos)) { + state.serialize(dos); + dos.flush(); + } catch (final IOException e) { + LOGGER.warn("Failed to persist consensus subscription progress to {}", file, e); + } + } + + // ======================== Inner State Class ======================== + + /** + * Tracks commit state for a single (consumerGroup, topic, region) triple. Maintains the mapping + * from commitId to searchIndex and tracks committed progress within one region's WAL. 
+ */ + public static class ConsensusSubscriptionCommitState { + + private final SubscriptionConsensusProgress progress; + + /** + * Maps commitId -> searchIndex. Records which WAL search index corresponds to each committed + * event. Entries are removed once committed. + */ + private final Map commitIdToSearchIndex = new ConcurrentHashMap<>(); + + /** + * Tracks the safe recovery position: the highest search index where all prior dispatched events + * have been committed. Only advances contiguously — never jumps over uncommitted gaps. + */ + private volatile long committedSearchIndex; + + /** + * Tracks the maximum search index among all committed events (may be ahead of + * committedSearchIndex when out-of-order commits exist). Used to update committedSearchIndex + * once all outstanding events are committed. + */ + private long maxCommittedSearchIndex; + + /** + * Tracks search indices of dispatched but not-yet-committed events. Used to prevent + * committedSearchIndex from jumping over uncommitted gaps. On commit, the frontier advances to + * min(outstanding) - 1 (or maxCommittedSearchIndex if empty). + * + *

Since state is now per-region, searchIndex values within this set are guaranteed unique + * (they come from a single region's monotonically increasing WAL searchIndex). + */ + private final TreeSet outstandingSearchIndices = new TreeSet<>(); + + public ConsensusSubscriptionCommitState(final SubscriptionConsensusProgress progress) { + this.progress = progress; + this.committedSearchIndex = progress.getSearchIndex(); + this.maxCommittedSearchIndex = progress.getSearchIndex(); + } + + public SubscriptionConsensusProgress getProgress() { + return progress; + } + + public long getCommittedSearchIndex() { + return committedSearchIndex; + } + + /** Threshold for warning about outstanding (uncommitted) search indices accumulation. */ + private static final int OUTSTANDING_SIZE_WARN_THRESHOLD = 10000; + + public void recordMapping(final long commitId, final long searchIndex) { + synchronized (this) { + commitIdToSearchIndex.put(commitId, searchIndex); + outstandingSearchIndices.add(searchIndex); + final int size = outstandingSearchIndices.size(); + if (size > OUTSTANDING_SIZE_WARN_THRESHOLD && size % OUTSTANDING_SIZE_WARN_THRESHOLD == 1) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: outstandingSearchIndices size ({}) exceeds " + + "threshold ({}), consumers may not be committing. committedSearchIndex={}, " + + "maxCommittedSearchIndex={}, commitIdToSearchIndex size={}", + size, + OUTSTANDING_SIZE_WARN_THRESHOLD, + committedSearchIndex, + maxCommittedSearchIndex, + commitIdToSearchIndex.size()); + } + } + } + + /** + * Commits the specified event and advances the committed search index contiguously. + * + *

The committed search index only advances to a position where all prior dispatched events + * have been committed. This prevents the recovery position from jumping over uncommitted gaps, + * ensuring at-least-once delivery even after crash recovery. + * + * @param commitId the commit ID to commit + * @return true if successfully committed + */ + public boolean commit(final long commitId) { + progress.incrementCommitIndex(); + + // Advance committed search index contiguously (gap-aware). + // Both remove from commitIdToSearchIndex and outstandingSearchIndices must be + // inside the same synchronized block to prevent a race with recordMapping(): + // recordMapping: put(commitId, si) -> add(si) + // commit: remove(commitId) -> remove(si) + // Without atomicity, commit could remove from map between put and add, + // leaving si permanently in outstandingSearchIndices (WAL leak). + synchronized (this) { + final Long searchIndex = commitIdToSearchIndex.remove(commitId); + if (searchIndex == null) { + LOGGER.warn("ConsensusSubscriptionCommitState: unknown commitId {} for commit", commitId); + return false; + } + outstandingSearchIndices.remove(searchIndex); + if (searchIndex > maxCommittedSearchIndex) { + maxCommittedSearchIndex = searchIndex; + } + + if (outstandingSearchIndices.isEmpty()) { + // All dispatched events have been committed — advance to the max + committedSearchIndex = maxCommittedSearchIndex; + } else { + // Advance to just below the earliest uncommitted event + // (never go backward) + committedSearchIndex = + Math.max(committedSearchIndex, outstandingSearchIndices.first() - 1); + } + progress.setSearchIndex(committedSearchIndex); + } + + return true; + } + + /** + * Resets all commit tracking state for a seek operation. Clears all outstanding mappings and + * resets progress to the new search index position. 
+ */ + public void resetForSeek(final long newSearchIndex) { + synchronized (this) { + commitIdToSearchIndex.clear(); + outstandingSearchIndices.clear(); + final long baseIndex = newSearchIndex - 1; + committedSearchIndex = baseIndex; + maxCommittedSearchIndex = baseIndex; + progress.setSearchIndex(baseIndex); + } + } + + public void serialize(final DataOutputStream stream) throws IOException { + progress.serialize(stream); + stream.writeLong(committedSearchIndex); + } + + public static ConsensusSubscriptionCommitState deserialize(final ByteBuffer buffer) { + final SubscriptionConsensusProgress progress = + SubscriptionConsensusProgress.deserialize(buffer); + final ConsensusSubscriptionCommitState state = new ConsensusSubscriptionCommitState(progress); + state.committedSearchIndex = buffer.getLong(); + state.maxCommittedSearchIndex = state.committedSearchIndex; + return state; + } + } + + // ======================== Singleton ======================== + + private static class Holder { + private static final ConsensusSubscriptionCommitManager INSTANCE = + new ConsensusSubscriptionCommitManager(); + } + + public static ConsensusSubscriptionCommitManager getInstance() { + return Holder.INSTANCE; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java new file mode 100644 index 0000000000000..7a6605dcda2ea --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java @@ -0,0 +1,462 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.consensus.DataRegionId; +import org.apache.iotdb.commons.pipe.datastructure.pattern.IoTDBTreePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.PrefixTreePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern; +import org.apache.iotdb.consensus.IConsensus; +import org.apache.iotdb.consensus.iot.IoTConsensus; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.db.conf.IoTDBConfig; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.consensus.DataRegionConsensusImpl; +import org.apache.iotdb.db.storageengine.StorageEngine; +import org.apache.iotdb.db.storageengine.dataregion.DataRegion; +import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; +import org.apache.iotdb.rpc.subscription.config.TopicConfig; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Handles the setup and teardown of consensus-based subscription queues on DataNode. 
When a + * real-time subscription is detected, this handler finds the local IoTConsensus data regions, + * creates the appropriate converter, and binds prefetching queues to the subscription broker. + */ +public class ConsensusSubscriptionSetupHandler { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionSetupHandler.class); + + private static final IoTDBConfig IOTDB_CONFIG = IoTDBDescriptor.getInstance().getConfig(); + + private ConsensusSubscriptionSetupHandler() { + // utility class + } + + /** + * Ensures that the IoTConsensus new-peer and peer-removed callbacks are set, so that when a new + * DataRegion is created, all active consensus subscriptions are automatically bound to the new + * region, and when a DataRegion is removed, all subscription queues are properly cleaned up. + */ + public static void ensureNewRegionListenerRegistered() { + if (IoTConsensus.onNewPeerCreated == null) { + IoTConsensus.onNewPeerCreated = ConsensusSubscriptionSetupHandler::onNewRegionCreated; + LOGGER.info( + "Set IoTConsensus.onNewPeerCreated callback for consensus subscription auto-binding"); + } + if (IoTConsensus.onPeerRemoved == null) { + IoTConsensus.onPeerRemoved = ConsensusSubscriptionSetupHandler::onRegionRemoved; + LOGGER.info("Set IoTConsensus.onPeerRemoved callback for consensus subscription cleanup"); + } + } + + /** + * Callback invoked when a new DataRegion (IoTConsensusServerImpl) is created locally. Queries + * existing subscription metadata to find all active consensus subscriptions and binds prefetching + * queues to the new region. 
+ */ + private static void onNewRegionCreated( + final ConsensusGroupId groupId, final IoTConsensusServerImpl serverImpl) { + if (!(groupId instanceof DataRegionId)) { + return; + } + + // Query existing metadata keepers for all active subscriptions + final Map> allSubscriptions = + SubscriptionAgent.consumer().getAllSubscriptions(); + if (allSubscriptions.isEmpty()) { + return; + } + + final ConsensusSubscriptionCommitManager commitManager = + ConsensusSubscriptionCommitManager.getInstance(); + + LOGGER.info( + "New DataRegion {} created, checking {} consumer group(s) for auto-binding, " + + "currentSearchIndex={}", + groupId, + allSubscriptions.size(), + serverImpl.getSearchIndex()); + + for (final Map.Entry> groupEntry : allSubscriptions.entrySet()) { + final String consumerGroupId = groupEntry.getKey(); + for (final String topicName : groupEntry.getValue()) { + if (!isConsensusBasedTopic(topicName)) { + continue; + } + try { + final Map topicConfigs = + SubscriptionAgent.topic().getTopicConfigs(java.util.Collections.singleton(topicName)); + final TopicConfig topicConfig = topicConfigs.get(topicName); + if (topicConfig == null) { + continue; + } + + // Resolve the new DataRegion's actual database name + final DataRegion dataRegion = + StorageEngine.getInstance().getDataRegion((DataRegionId) groupId); + if (dataRegion == null) { + continue; + } + final String dbRaw = dataRegion.getDatabaseName(); + final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw; + + // For table topics, skip if this region's database doesn't match the topic filter + if (topicConfig.isTableTopic()) { + final String topicDb = + topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE); + if (topicDb != null + && !topicDb.isEmpty() + && !TopicConstant.DATABASE_DEFAULT_VALUE.equals(topicDb) + && !topicDb.equalsIgnoreCase(dbTableModel)) { + continue; + } + } + + final String actualDbName = topicConfig.isTableTopic() ? 
dbTableModel : null; + final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName); + + // Use persisted committedSearchIndex for restart recovery; fall back to WAL tail + // for brand-new regions that have no prior subscription progress. + final long persistedIndex = + commitManager.getCommittedSearchIndex(consumerGroupId, topicName, groupId.toString()); + final long startSearchIndex = + (persistedIndex > 0) ? persistedIndex + 1 : serverImpl.getSearchIndex() + 1; + + LOGGER.info( + "Auto-binding consensus queue for topic [{}] in group [{}] to new region {} " + + "(database={}, startSearchIndex={}, persistedIndex={})", + topicName, + consumerGroupId, + groupId, + dbTableModel, + startSearchIndex, + persistedIndex); + + SubscriptionAgent.broker() + .bindConsensusPrefetchingQueue( + consumerGroupId, + topicName, + groupId.toString(), + serverImpl, + converter, + commitManager, + startSearchIndex); + } catch (final Exception e) { + LOGGER.error( + "Failed to auto-bind topic [{}] in group [{}] to new region {}", + topicName, + consumerGroupId, + groupId, + e); + } + } + } + } + + /** + * Callback invoked before a DataRegion (IoTConsensusServerImpl) is deleted locally. Unbinds and + * cleans up all subscription prefetching queues associated with the removed region across all + * consumer groups. 
+ */ + private static void onRegionRemoved(final ConsensusGroupId groupId) { + if (!(groupId instanceof DataRegionId)) { + return; + } + final String regionIdStr = groupId.toString(); + LOGGER.info( + "DataRegion {} being removed, unbinding all consensus subscription queues", regionIdStr); + try { + SubscriptionAgent.broker().unbindByRegion(regionIdStr); + } catch (final Exception e) { + LOGGER.error( + "Failed to unbind consensus subscription queues for removed region {}", regionIdStr, e); + } + } + + public static boolean isConsensusBasedTopic(final String topicName) { + try { + final String topicMode = SubscriptionAgent.topic().getTopicMode(topicName); + final String topicFormat = SubscriptionAgent.topic().getTopicFormat(topicName); + final boolean result = + TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat); + LOGGER.debug( + "isConsensusBasedTopic check for topic [{}]: mode={}, format={}, result={}", + topicName, + topicMode, + topicFormat, + result); + return result; + } catch (final Exception e) { + LOGGER.warn( + "Failed to check if topic [{}] is consensus-based, defaulting to false", topicName, e); + return false; + } + } + + public static void setupConsensusSubscriptions( + final String consumerGroupId, final Set topicNames) { + final IConsensus dataRegionConsensus = DataRegionConsensusImpl.getInstance(); + if (!(dataRegionConsensus instanceof IoTConsensus)) { + LOGGER.warn( + "Data region consensus is not IoTConsensus (actual: {}), " + + "cannot set up consensus-based subscription for consumer group [{}]", + dataRegionConsensus.getClass().getSimpleName(), + consumerGroupId); + return; + } + + // Ensure the new-region listener is registered (idempotent) + ensureNewRegionListenerRegistered(); + + final IoTConsensus ioTConsensus = (IoTConsensus) dataRegionConsensus; + final ConsensusSubscriptionCommitManager commitManager = + 
ConsensusSubscriptionCommitManager.getInstance(); + + LOGGER.info( + "Setting up consensus subscriptions for consumer group [{}], topics={}, " + + "total consensus groups={}", + consumerGroupId, + topicNames, + ioTConsensus.getAllConsensusGroupIds().size()); + + for (final String topicName : topicNames) { + if (!isConsensusBasedTopic(topicName)) { + continue; + } + + try { + setupConsensusQueueForTopic(consumerGroupId, topicName, ioTConsensus, commitManager); + } catch (final Exception e) { + LOGGER.error( + "Failed to set up consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId, + e); + } + } + } + + /** + * Set up consensus queue for a single topic. Discovers all local data region consensus groups and + * binds a ConsensusReqReader-based prefetching queue to every matching region. + * + *

For table-model topics, only regions whose database matches the topic's {@code DATABASE_KEY} + * filter are bound. For tree-model topics, all local data regions are bound. Additionally, the + * {@link #onNewRegionCreated} callback ensures that regions created after this method runs are + * also automatically bound. + */ + private static void setupConsensusQueueForTopic( + final String consumerGroupId, + final String topicName, + final IoTConsensus ioTConsensus, + final ConsensusSubscriptionCommitManager commitManager) { + + // Get topic config for building the converter + final Map topicConfigs = + SubscriptionAgent.topic().getTopicConfigs(java.util.Collections.singleton(topicName)); + final TopicConfig topicConfig = topicConfigs.get(topicName); + if (topicConfig == null) { + LOGGER.warn( + "Topic config not found for topic [{}], cannot set up consensus queue", topicName); + return; + } + + // Build the converter based on topic config (path pattern, time range, tree/table model) + LOGGER.info( + "Setting up consensus queue for topic [{}]: isTableTopic={}, config={}", + topicName, + topicConfig.isTableTopic(), + topicConfig.getAttribute()); + + // For table topics, extract the database filter from topic config + final String topicDatabaseFilter = + topicConfig.isTableTopic() + ? 
topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE) + : null; + + final List allGroupIds = ioTConsensus.getAllConsensusGroupIds(); + LOGGER.info( + "Discovered {} consensus group(s) for topic [{}] in consumer group [{}]: {}", + allGroupIds.size(), + topicName, + consumerGroupId, + allGroupIds); + boolean bound = false; + + for (final ConsensusGroupId groupId : allGroupIds) { + if (!(groupId instanceof DataRegionId)) { + continue; + } + + final IoTConsensusServerImpl serverImpl = ioTConsensus.getImpl(groupId); + if (serverImpl == null) { + continue; + } + + // Resolve the DataRegion's actual database name + final DataRegion dataRegion = + StorageEngine.getInstance().getDataRegion((DataRegionId) groupId); + if (dataRegion == null) { + continue; + } + final String dbRaw = dataRegion.getDatabaseName(); + final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw; + + if (topicDatabaseFilter != null + && !topicDatabaseFilter.isEmpty() + && !TopicConstant.DATABASE_DEFAULT_VALUE.equals(topicDatabaseFilter) + && !topicDatabaseFilter.equalsIgnoreCase(dbTableModel)) { + LOGGER.info( + "Skipping region {} (database={}) for table topic [{}] (DATABASE_KEY={})", + groupId, + dbTableModel, + topicName, + topicDatabaseFilter); + continue; + } + + final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null; + final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName); + + // Use persisted committedSearchIndex for restart recovery; fall back to WAL tail + // for brand-new regions that have no prior subscription progress. + final long persistedIndex = + commitManager.getCommittedSearchIndex(consumerGroupId, topicName, groupId.toString()); + final long startSearchIndex = + (persistedIndex > 0) ? 
persistedIndex + 1 : serverImpl.getSearchIndex() + 1; + + LOGGER.info( + "Binding consensus prefetching queue for topic [{}] in consumer group [{}] " + + "to data region consensus group [{}] (database={}, startSearchIndex={}, " + + "persistedIndex={})", + topicName, + consumerGroupId, + groupId, + dbTableModel, + startSearchIndex, + persistedIndex); + + SubscriptionAgent.broker() + .bindConsensusPrefetchingQueue( + consumerGroupId, + topicName, + groupId.toString(), + serverImpl, + converter, + commitManager, + startSearchIndex); + + bound = true; + } + + if (!bound) { + LOGGER.warn( + "No local IoTConsensus data region found for topic [{}] in consumer group [{}]. " + + "Consensus subscription will be set up when a matching data region becomes available.", + topicName, + consumerGroupId); + } + } + + private static ConsensusLogToTabletConverter buildConverter( + final TopicConfig topicConfig, final String actualDatabaseName) { + // Determine tree or table model + final boolean isTableTopic = topicConfig.isTableTopic(); + + TreePattern treePattern = null; + TablePattern tablePattern = null; + + if (isTableTopic) { + // Table model: database + table name pattern + final String database = + topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE); + final String table = + topicConfig.getStringOrDefault( + TopicConstant.TABLE_KEY, TopicConstant.TABLE_DEFAULT_VALUE); + tablePattern = new TablePattern(true, database, table); + } else { + // Tree model: path or pattern + if (topicConfig.getAttribute().containsKey(TopicConstant.PATTERN_KEY)) { + final String pattern = topicConfig.getAttribute().get(TopicConstant.PATTERN_KEY); + treePattern = new PrefixTreePattern(pattern); + } else { + final String path = + topicConfig.getStringOrDefault( + TopicConstant.PATH_KEY, TopicConstant.PATH_DEFAULT_VALUE); + treePattern = new IoTDBTreePattern(path); + } + } + + return new ConsensusLogToTabletConverter(treePattern, tablePattern, 
actualDatabaseName); + } + + public static void teardownConsensusSubscriptions( + final String consumerGroupId, final Set topicNames) { + for (final String topicName : topicNames) { + try { + SubscriptionAgent.broker().unbindConsensusPrefetchingQueue(consumerGroupId, topicName); + + // Clean up commit state for all regions of this topic + ConsensusSubscriptionCommitManager.getInstance() + .removeAllStatesForTopic(consumerGroupId, topicName); + + LOGGER.info( + "Tore down consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId); + } catch (final Exception e) { + LOGGER.warn( + "Failed to tear down consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId, + e); + } + } + } + + public static void handleNewSubscriptions( + final String consumerGroupId, final Set newTopicNames) { + if (newTopicNames == null || newTopicNames.isEmpty()) { + return; + } + + LOGGER.info( + "Checking new subscriptions in consumer group [{}] for consensus-based topics: {}", + consumerGroupId, + newTopicNames); + + setupConsensusSubscriptions(consumerGroupId, newTopicNames); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java new file mode 100644 index 0000000000000..9e45f8a160127 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Tracks consensus subscription consumption progress for a single (consumerGroup, topic, region) + * combination. + * + *

Since searchIndex is region-local (each DataRegion has its own independent WAL and searchIndex + * namespace), progress is tracked per-region: + * + *

    + *
  • searchIndex: The committed WAL search index — the highest position where all prior + * dispatched events have been acknowledged. Used as the recovery start point after crash. + *
  • commitIndex: Monotonically increasing count of committed events. Used for + * persistence throttling and diagnostics. + *
+ */ +public class SubscriptionConsensusProgress { + + private final AtomicLong searchIndex; + + private final AtomicLong commitIndex; + + public SubscriptionConsensusProgress() { + this(0L, 0L); + } + + public SubscriptionConsensusProgress(final long searchIndex, final long commitIndex) { + this.searchIndex = new AtomicLong(searchIndex); + this.commitIndex = new AtomicLong(commitIndex); + } + + public long getSearchIndex() { + return searchIndex.get(); + } + + public void setSearchIndex(final long searchIndex) { + this.searchIndex.set(searchIndex); + } + + public long getCommitIndex() { + return commitIndex.get(); + } + + public void setCommitIndex(final long commitIndex) { + this.commitIndex.set(commitIndex); + } + + public void incrementCommitIndex() { + this.commitIndex.incrementAndGet(); + } + + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(searchIndex.get(), stream); + ReadWriteIOUtils.write(commitIndex.get(), stream); + } + + public static SubscriptionConsensusProgress deserialize(final ByteBuffer buffer) { + final long searchIndex = ReadWriteIOUtils.readLong(buffer); + final long commitIndex = ReadWriteIOUtils.readLong(buffer); + return new SubscriptionConsensusProgress(searchIndex, commitIndex); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final SubscriptionConsensusProgress that = (SubscriptionConsensusProgress) o; + return searchIndex.get() == that.searchIndex.get() + && commitIndex.get() == that.commitIndex.get(); + } + + @Override + public int hashCode() { + return Objects.hash(searchIndex.get(), commitIndex.get()); + } + + @Override + public String toString() { + return "SubscriptionConsensusProgress{" + + "searchIndex=" + + searchIndex.get() + + ", commitIndex=" + + commitIndex.get() + + '}'; + } +} diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java index dfadee5908fa5..9ede61fbffe74 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java @@ -248,6 +248,11 @@ public void nack() { } } + /** Returns the current nack count for this event. */ + public long getNackCount() { + return nackCount.get(); + } + public void recordLastPolledConsumerId(final String consumerId) { lastPolledConsumerId = consumerId; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java index 203b93ef1e4bd..9605bd4aaea13 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java @@ -61,6 +61,7 @@ import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribePollReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeRequestType; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeRequestVersion; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSubscribeReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeUnsubscribeReq; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeCloseResp; @@ -70,6 +71,7 @@ import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribePollResp; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeResponseType; import 
org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeResponseVersion; +import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeSeekResp; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeSubscribeResp; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeUnsubscribeResp; import org.apache.iotdb.service.rpc.thrift.TPipeSubscribeReq; @@ -135,6 +137,8 @@ public final TPipeSubscribeResp handle(final TPipeSubscribeReq req) { return handlePipeSubscribeCommit(PipeSubscribeCommitReq.fromTPipeSubscribeReq(req)); case CLOSE: return handlePipeSubscribeClose(PipeSubscribeCloseReq.fromTPipeSubscribeReq(req)); + case SEEK: + return handlePipeSubscribeSeek(PipeSubscribeSeekReq.fromTPipeSubscribeReq(req)); default: break; } @@ -662,6 +666,45 @@ private TPipeSubscribeResp handlePipeSubscribeCloseInternal(final PipeSubscribeC return PipeSubscribeCloseResp.toTPipeSubscribeResp(RpcUtils.SUCCESS_STATUS); } + private TPipeSubscribeResp handlePipeSubscribeSeek(final PipeSubscribeSeekReq req) { + try { + return handlePipeSubscribeSeekInternal(req); + } catch (final Exception e) { + LOGGER.warn("Exception occurred when seeking with request {}", req, e); + final String exceptionMessage = + String.format( + "Subscription: something unexpected happened when seeking with request %s: %s", + req, e); + return PipeSubscribeSeekResp.toTPipeSubscribeResp( + RpcUtils.getStatus(TSStatusCode.SUBSCRIPTION_SEEK_ERROR, exceptionMessage)); + } + } + + private TPipeSubscribeResp handlePipeSubscribeSeekInternal(final PipeSubscribeSeekReq req) { + // check consumer config thread local + final ConsumerConfig consumerConfig = consumerConfigThreadLocal.get(); + if (Objects.isNull(consumerConfig)) { + LOGGER.warn( + "Subscription: missing consumer config when handling PipeSubscribeSeekReq: {}", req); + return SUBSCRIPTION_MISSING_CUSTOMER_RESP; + } + + final String topicName = req.getTopicName(); + final short seekType = req.getSeekType(); + + 
SubscriptionAgent.broker() + .seek(consumerConfig, topicName, seekType, req.getTimestamp()); + + LOGGER.info( + "Subscription: consumer {} seek topic {} with seekType={}, timestamp={}", + consumerConfig, + topicName, + seekType, + req.getTimestamp()); + + return PipeSubscribeSeekResp.toTPipeSubscribeResp(RpcUtils.SUCCESS_STATUS); + } + private void closeConsumer(final ConsumerConfig consumerConfig) { // unsubscribe all subscribed topics final Set topicNames = diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java index cf68da89553c0..cde968ae3c701 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonConfig.java @@ -389,6 +389,13 @@ public class CommonConfig { private long subscriptionMetaSyncerInitialSyncDelayMinutes = 3; private long subscriptionMetaSyncerSyncIntervalMinutes = 3; + private int subscriptionConsensusBatchMaxDelayInMs = 50; + private long subscriptionConsensusBatchMaxSizeInBytes = 8 * MB; + private int subscriptionConsensusBatchMaxTabletCount = 64; + private int subscriptionConsensusBatchMaxWalEntries = 128; + + private long subscriptionConsensusWalRetentionSizeInBytes = 512 * MB; + /** Whether to use persistent schema mode. 
*/ private String schemaEngineMode = "Memory"; @@ -2477,6 +2484,52 @@ public long getSubscriptionMetaSyncerSyncIntervalMinutes() { return subscriptionMetaSyncerSyncIntervalMinutes; } + public int getSubscriptionConsensusBatchMaxDelayInMs() { + return subscriptionConsensusBatchMaxDelayInMs; + } + + public void setSubscriptionConsensusBatchMaxDelayInMs( + final int subscriptionConsensusBatchMaxDelayInMs) { + this.subscriptionConsensusBatchMaxDelayInMs = subscriptionConsensusBatchMaxDelayInMs; + } + + public long getSubscriptionConsensusBatchMaxSizeInBytes() { + return subscriptionConsensusBatchMaxSizeInBytes; + } + + public void setSubscriptionConsensusBatchMaxSizeInBytes( + final long subscriptionConsensusBatchMaxSizeInBytes) { + this.subscriptionConsensusBatchMaxSizeInBytes = subscriptionConsensusBatchMaxSizeInBytes; + } + + public int getSubscriptionConsensusBatchMaxTabletCount() { + return subscriptionConsensusBatchMaxTabletCount; + } + + public void setSubscriptionConsensusBatchMaxTabletCount( + final int subscriptionConsensusBatchMaxTabletCount) { + this.subscriptionConsensusBatchMaxTabletCount = subscriptionConsensusBatchMaxTabletCount; + } + + public int getSubscriptionConsensusBatchMaxWalEntries() { + return subscriptionConsensusBatchMaxWalEntries; + } + + public void setSubscriptionConsensusBatchMaxWalEntries( + final int subscriptionConsensusBatchMaxWalEntries) { + this.subscriptionConsensusBatchMaxWalEntries = subscriptionConsensusBatchMaxWalEntries; + } + + public long getSubscriptionConsensusWalRetentionSizeInBytes() { + return subscriptionConsensusWalRetentionSizeInBytes; + } + + public void setSubscriptionConsensusWalRetentionSizeInBytes( + final long subscriptionConsensusWalRetentionSizeInBytes) { + this.subscriptionConsensusWalRetentionSizeInBytes = + subscriptionConsensusWalRetentionSizeInBytes; + } + public void setSubscriptionMetaSyncerSyncIntervalMinutes( long subscriptionMetaSyncerSyncIntervalMinutes) { 
this.subscriptionMetaSyncerSyncIntervalMinutes = subscriptionMetaSyncerSyncIntervalMinutes; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java index 8483d1425cfec..156b054e7e533 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java @@ -420,6 +420,27 @@ private void loadSubscriptionProps(TrimProperties properties) { properties.getProperty( "subscription_meta_syncer_sync_interval_minutes", String.valueOf(config.getSubscriptionMetaSyncerSyncIntervalMinutes())))); + + config.setSubscriptionConsensusBatchMaxDelayInMs( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_delay_in_ms", + String.valueOf(config.getSubscriptionConsensusBatchMaxDelayInMs())))); + config.setSubscriptionConsensusBatchMaxSizeInBytes( + Long.parseLong( + properties.getProperty( + "subscription_consensus_batch_max_size_in_bytes", + String.valueOf(config.getSubscriptionConsensusBatchMaxSizeInBytes())))); + config.setSubscriptionConsensusBatchMaxTabletCount( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_tablet_count", + String.valueOf(config.getSubscriptionConsensusBatchMaxTabletCount())))); + config.setSubscriptionConsensusBatchMaxWalEntries( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_wal_entries", + String.valueOf(config.getSubscriptionConsensusBatchMaxWalEntries())))); } public void loadRetryProperties(TrimProperties properties) throws IOException { diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java index c7e7fea8d12f8..d709457372a82 
100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java @@ -30,7 +30,7 @@ public class SubscriptionConfig { private static final CommonConfig COMMON_CONFIG = CommonDescriptor.getInstance().getConfig(); public boolean getSubscriptionEnabled() { - return false; + return true; // TODO: make it configurable after subscription is stable } public float getSubscriptionCacheMemoryUsagePercentage() { @@ -137,6 +137,23 @@ public long getSubscriptionMetaSyncerSyncIntervalMinutes() { return COMMON_CONFIG.getSubscriptionMetaSyncerSyncIntervalMinutes(); } + // Consensus subscription batching parameters + public int getSubscriptionConsensusBatchMaxDelayInMs() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxDelayInMs(); + } + + public long getSubscriptionConsensusBatchMaxSizeInBytes() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxSizeInBytes(); + } + + public int getSubscriptionConsensusBatchMaxTabletCount() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxTabletCount(); + } + + public int getSubscriptionConsensusBatchMaxWalEntries() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxWalEntries(); + } + /////////////////////////////// Utils /////////////////////////////// private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionConfig.class); @@ -207,6 +224,18 @@ public void printAllConfigs() { LOGGER.info( "SubscriptionMetaSyncerSyncIntervalMinutes: {}", getSubscriptionMetaSyncerSyncIntervalMinutes()); + + LOGGER.info( + "SubscriptionConsensusBatchMaxDelayInMs: {}", getSubscriptionConsensusBatchMaxDelayInMs()); + LOGGER.info( + "SubscriptionConsensusBatchMaxSizeInBytes: {}", + getSubscriptionConsensusBatchMaxSizeInBytes()); + LOGGER.info( + "SubscriptionConsensusBatchMaxTabletCount: {}", + getSubscriptionConsensusBatchMaxTabletCount()); + LOGGER.info( + 
"SubscriptionConsensusBatchMaxWalEntries: {}", + getSubscriptionConsensusBatchMaxWalEntries()); } /////////////////////////////// Singleton /////////////////////////////// diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java index 4393ef8a6cf61..9f66b48210bc2 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/ConsumerGroupMeta.java @@ -115,6 +115,26 @@ private boolean shouldRecordSubscriptionCreationTime() { return unsubscribedTopicNames; } + public static Set getTopicsNewlySubByGroup( + final ConsumerGroupMeta currentMeta, final ConsumerGroupMeta updatedMeta) { + if (!Objects.equals(currentMeta.consumerGroupId, updatedMeta.consumerGroupId) + || !Objects.equals(currentMeta.creationTime, updatedMeta.creationTime)) { + return Collections.emptySet(); + } + + final Set newlySubscribedTopicNames = new HashSet<>(); + updatedMeta + .topicNameToSubscribedConsumerIdSet + .keySet() + .forEach( + topicName -> { + if (!currentMeta.topicNameToSubscribedConsumerIdSet.containsKey(topicName)) { + newlySubscribedTopicNames.add(topicName); + } + }); + return newlySubscribedTopicNames; + } + /////////////////////////////// consumer /////////////////////////////// public void checkAuthorityBeforeJoinConsumerGroup(final ConsumerMeta consumerMeta) @@ -171,6 +191,11 @@ public ConsumerMeta getConsumerMeta(final String consumerId) { ////////////////////////// subscription ////////////////////////// + /** Get all topic names subscribed by this consumer group. 
*/ + public Set getSubscribedTopicNames() { + return Collections.unmodifiableSet(topicNameToSubscribedConsumerIdSet.keySet()); + } + /** * Get the consumers subscribing the given topic in this group. *