InfraSail · Sigma711 · Jan 6, 2026 · Jan 29, 2026 · Feb 3, 2026 · Feb 4, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -7,7 +7,7 @@ SET(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 OPTION(TAOTU_ENABLE_CLANG_FORMAT "Enable clang-format checks." OFF)
 OPTION(TAOTU_ENABLE_CLANG_TIDY "Enable clang-tidy checks." OFF)
 
-SET(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+SET(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -march=native -flto")
 
 SET(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -fsanitize=address -DTAOTU_DEBUG")
 
@@ -49,6 +49,7 @@ IF(TAOTU_ENABLE_CLANG_FORMAT)
        "${CMAKE_SOURCE_DIR}/example/*.c"
        "${CMAKE_SOURCE_DIR}/example/*.cc"
        "${CMAKE_SOURCE_DIR}/example/*.cpp")
+  LIST(FILTER TAOTU_CLANG_FORMAT_FILES EXCLUDE REGEX ".*\\.pb\\.(cc|h)$")
   ADD_CUSTOM_TARGET(
     clang-format
     COMMAND ${CLANG_FORMAT_EXE} --dry-run --Werror -style=file

diff --git a/README.md b/README.md
@@ -23,7 +23,67 @@ cmake --build build_release -j
 Notes:
 - Requires a C++17 compiler, CMake, and liburing.
 - RPC demo uses protobuf.
-- You can tune io_uring entries with `TAOTU_IORING_ENTRIES` if memory is tight.
+
+## Configuration
+
+### CMake options
+
+You can configure build behavior with:
+
+- `-DCMAKE_BUILD_TYPE=Release|Debug` (or other CMake build types)
+- `-DTAOTU_ENABLE_CLANG_TIDY=ON|OFF` (default `OFF`)
+- `-DTAOTU_ENABLE_CLANG_FORMAT=ON|OFF` (default `OFF`)
+
+### Runtime environment variables (core io_uring backend)
+
+These are read at process startup by `Poller`:
+
+- `TAOTU_ENABLE_BUF_RING`
+  - Default: enabled (best effort)
+  - Effect: enable io_uring buffer ring (`buf_ring`) for provided buffers
+    (values non-empty and not `'0'` enable). When enabled and supported by the
+    running kernel/liburing, returning a recv-multishot provided buffer becomes
+    a user-space operation (no extra `IORING_OP_PROVIDE_BUFFERS` SQE).
+- `TAOTU_DISABLE_BUF_RING`
+  - Default: disabled
+  - Effect: force-disable `buf_ring` provided-buffer path and fallback to the
+    legacy `IORING_OP_PROVIDE_BUFFERS` path (`non-empty` and not `'0'` disables),
+    takes precedence over `TAOTU_ENABLE_BUF_RING`.
+- `TAOTU_IORING_ENTRIES`
+  - Default: `32768`
+  - Clamp range: `[1024, 32768]`
+  - Effect: io_uring queue depth (can reduce memory footprint when lowered).
+- `TAOTU_ENABLE_SQPOLL`
+  - Default: disabled
+  - Effect: request io_uring SQPOLL mode (`non-empty` and not `'0'` enables).
+- `TAOTU_DISABLE_SQPOLL`
+  - Default: disabled
+  - Effect: force-disable SQPOLL (`non-empty` and not `'0'` disables), takes precedence over `TAOTU_ENABLE_SQPOLL`.
+- `TAOTU_DISABLE_RECV_MULTISHOT`
+  - Default: disabled
+  - Effect: disable recv-multishot + provided-buffer registration path.
+- `TAOTU_IORING_SUBMIT_BATCH`
+  - Default: `16`
+  - Max clamp: `256`
+  - Effect: submit SQEs when pending count reaches this threshold (or when forced by the loop).
+- `TAOTU_IORING_OP_POOL_LIMIT`
+  - Default: `65536`
+  - Max clamp: `1048576`
+  - Effect: max cached io_uring operation objects per `Poller` for allocation reuse.
+- `TAOTU_IORING_BORROWED_BUFFER_LIMIT`
+  - Default: `kBufCount/2` (currently `128`)
+  - Clamp range: `[0, kBufCount]`
+  - Effect: max number of recv-multishot provided buffers that can be leased
+    (held past the read CQE) for the borrowed-send fast path; `0` disables
+    leasing (borrowed send falls back to copy).
- `TAOTU_IORING_BORROWED_BUFFER_LIMIT`
-  - Default: `kBufCount/2` (currently `128`)
-  - Clamp range: `[0, kBufCount]`
-  - Effect: max number of recv-multishot provided buffers that can be leased
-    (held past the read CQE) for the borrowed-send fast path; `0` disables
-    leasing (borrowed send falls back to copy).
- `TAOTU_IORING_BORROWED_BUFFER_LIMIT`
-  - Default: `kBufCount/2` (currently `128`)
-  - Clamp range: `[0, kBufCount]`
-  - Effect: max number of recv-multishot provided buffers that can be leased
-    (held past the read CQE) for the borrowed-send fast path; `0` disables
-    leasing (borrowed send falls back to copy).
+
+Example:
+
+```bash
+TAOTU_IORING_ENTRIES=16384 \
+TAOTU_IORING_SUBMIT_BATCH=16 \
+./pingpong_server 4567 8
+```
 
 ## Run demos
 
@@ -36,6 +96,27 @@ cd build/output/bin
 ./simple_echo 4567 4
 ```
 
+### Demo command-line options
+
+Server demos:
+
+- `simple_echo [port [io_threads]]`
+- `simple_discard [port [io_threads]]`
+- `simple_time [port [io_threads]]`
+- `http_server [port [io_threads]]`
+- `chat_server [port [io_threads]]`
+- `pingpong_server [port [io_threads]]`
+
+Client demos:
+
+- `chat_client [host_ip] <port>` (if only `host_ip` is provided, port defaults to `4567`)
+- `pingpong_client <host_ip> <port> <threads> <block_size> <sessions> <time_sec>`
+- `time_service_sync_client` (no CLI args)
+
+RPC server demo:
+
+- `time_service_server` (no CLI args)
+
 ## Basic usage
 
 High-level flow:

diff --git a/README_zh-Hans.md b/README_zh-Hans.md
@@ -23,7 +23,65 @@ cmake --build build_release -j
 说明：
 - 需要 C++17 编译器、CMake 和 liburing。
 - RPC 示例需要 protobuf。
-- 如果内存吃紧，可以通过 `TAOTU_IORING_ENTRIES` 调小 io_uring 队列大小。
+
+## 配置项
+
+### CMake 选项
+
+可通过以下参数配置构建行为：
+
+- `-DCMAKE_BUILD_TYPE=Release|Debug`（或其他 CMake 构建类型）
+- `-DTAOTU_ENABLE_CLANG_TIDY=ON|OFF`（默认 `OFF`）
+- `-DTAOTU_ENABLE_CLANG_FORMAT=ON|OFF`（默认 `OFF`）
+
+### 运行时环境变量（核心 io_uring 后端）
+
+以下变量会在进程启动时由 `Poller` 读取：
+
+- `TAOTU_ENABLE_BUF_RING`
+  - 默认：开启（尽力启用）
+  - 作用：启用 io_uring 的 buffer ring（`buf_ring`）作为 provided-buffer 的回收机制
+    （值非空且不为 `'0'` 时启用）。当开启且系统内核/liburing 支持时，recv-multishot
+    的 buffer 归还会变成纯用户态操作（不再需要额外提交 `IORING_OP_PROVIDE_BUFFERS` SQE）。
+- `TAOTU_DISABLE_BUF_RING`
+  - 默认：关闭
+  - 作用：强制关闭 `buf_ring` 的 provided-buffer 路径，回退到旧的
+    `IORING_OP_PROVIDE_BUFFERS` 机制（值非空且不为 `'0'` 时禁用），优先级高于 `TAOTU_ENABLE_BUF_RING`。
+- `TAOTU_IORING_ENTRIES`
+  - 默认值：`32768`
+  - 限制范围：`[1024, 32768]`
+  - 作用：设置 io_uring 队列深度（调小可降低内存占用）。
+- `TAOTU_ENABLE_SQPOLL`
+  - 默认：关闭
+  - 作用：请求启用 io_uring 的 SQPOLL 模式（值非空且不为 `'0'` 时启用）。
+- `TAOTU_DISABLE_SQPOLL`
+  - 默认：关闭
+  - 作用：强制关闭 SQPOLL（值非空且不为 `'0'` 时关闭），优先级高于 `TAOTU_ENABLE_SQPOLL`。
+- `TAOTU_DISABLE_RECV_MULTISHOT`
+  - 默认：关闭
+  - 作用：关闭 recv-multishot + provided-buffer 注册路径。
+- `TAOTU_IORING_SUBMIT_BATCH`
+  - 默认值：`16`
+  - 最大限制：`256`
+  - 作用：当待提交 SQE 数达到该阈值时触发提交（事件循环中也可能被强制提交）。
+- `TAOTU_IORING_OP_POOL_LIMIT`
+  - 默认值：`65536`
+  - 最大限制：`1048576`
+  - 作用：每个 `Poller` 可缓存复用的 io_uring 操作对象上限。
+- `TAOTU_IORING_BORROWED_BUFFER_LIMIT`
+  - 默认值：`kBufCount/2`（当前为 `128`）
+  - 限制范围：`[0, kBufCount]`
+  - 作用：recv-multishot 的 provided-buffer 最多允许被“借用/持有”的数量上限
+    （读取 CQE 回调结束后暂不归还，用于借用直发）；设为 `0` 可禁用借用
+    （借用发送会自动回退为拷贝发送）。
- `TAOTU_IORING_BORROWED_BUFFER_LIMIT`
-  - 默认值：`kBufCount/2`（当前为 `128`）
-  - 限制范围：`[0, kBufCount]`
-  - 作用：recv-multishot 的 provided-buffer 最多允许被“借用/持有”的数量上限
-    （读取 CQE 回调结束后暂不归还，用于借用直发）；设为 `0` 可禁用借用
-    （借用发送会自动回退为拷贝发送）。
- `TAOTU_IORING_BORROWED_BUFFER_LIMIT`
-  - 默认值：`kBufCount/2`（当前为 `128`）
-  - 限制范围：`[0, kBufCount]`
-  - 作用：recv-multishot 的 provided-buffer 最多允许被“借用/持有”的数量上限
-    （读取 CQE 回调结束后暂不归还，用于借用直发）；设为 `0` 可禁用借用
-    （借用发送会自动回退为拷贝发送）。
+
+示例：
+
+```bash
+TAOTU_IORING_ENTRIES=16384 \
+TAOTU_IORING_SUBMIT_BATCH=16 \
+./pingpong_server 4567 8
+```
 
 ## 运行示例
 
@@ -36,6 +94,27 @@ cd build/output/bin
 ./simple_echo 4567 4
 ```
 
+### Demo 命令行参数
+
+服务端示例：
+
+- `simple_echo [port [io_threads]]`
+- `simple_discard [port [io_threads]]`
+- `simple_time [port [io_threads]]`
+- `http_server [port [io_threads]]`
+- `chat_server [port [io_threads]]`
+- `pingpong_server [port [io_threads]]`
+
+客户端示例：
+
+- `chat_client [host_ip] <port>`（仅提供 `host_ip` 时，端口默认为 `4567`）
+- `pingpong_client <host_ip> <port> <threads> <block_size> <sessions> <time_sec>`
+- `time_service_sync_client`（无命令行参数）
+
+RPC 服务端示例：
+
+- `time_service_server`（无命令行参数）
+
 ## 基本使用
 
 一般流程：

diff --git a/example/chat_room/chat_server.cc b/example/chat_room/chat_server.cc
@@ -14,13 +14,19 @@
 
 ChatServer::ChatServer(const taotu::NetAddress& listen_address,
                        bool should_reuse_port, size_t io_thread_amount)
-    : event_managers_(io_thread_amount, new taotu::EventManager),
-      server_(std::make_unique<taotu::Server>(&event_managers_, listen_address,
-                                              should_reuse_port)),
+    : event_managers_(),
+      server_(nullptr),
       codec_([this](taotu::Connecting& connection, const std::string& message,
                     taotu::TimePoint time_point) {
         this->OnCodecMessage(connection, message, time_point);
       }) {
+  size_t thread_count = io_thread_amount > 0 ? io_thread_amount : 1;
+  event_managers_.reserve(thread_count);
+  for (size_t i = 0; i < thread_count; ++i) {
+    event_managers_.push_back(new taotu::EventManager);
+  }
+  server_ = std::make_unique<taotu::Server>(&event_managers_, listen_address,
+                                            should_reuse_port);
   server_->SetConnectionCallback([this](taotu::Connecting& connection) {
     this->OnConnectionCallback(connection);
   });

diff --git a/example/http_server/http_server.cc b/example/http_server/http_server.cc
@@ -17,9 +17,14 @@
 
 HttpServer::HttpServer(const taotu::NetAddress& listen_address,
                        bool should_reuse_port, size_t io_thread_amount)
-    : event_managers_(io_thread_amount, new taotu::EventManager),
-      server_(std::make_unique<taotu::Server>(&event_managers_, listen_address,
-                                              should_reuse_port)) {
+    : event_managers_(), server_(nullptr) {
+  size_t thread_count = io_thread_amount > 0 ? io_thread_amount : 1;
+  event_managers_.reserve(thread_count);
+  for (size_t i = 0; i < thread_count; ++i) {
+    event_managers_.push_back(new taotu::EventManager);
+  }
+  server_ = std::make_unique<taotu::Server>(&event_managers_, listen_address,
+                                            should_reuse_port);
   server_->SetConnectionCallback([this](taotu::Connecting& connection) {
     this->OnConnectionCallback(connection);
   });

diff --git a/example/pingpong/pingpong_client.cc b/example/pingpong/pingpong_client.cc
@@ -95,12 +95,10 @@ void PingpongClient::DoWithTimeout() {
   taotu::LOG_INFO("All stopped!");
   // Print stats now, before attempting to disconnect (which may stall)
   ReportStatsOnce();
-
   // Stop sessions (best effort)
   for (auto& session : sessions_) {
     session->Stop();
   }
-  // Force quit immediately - don't wait for disconnect callbacks
   RequestQuit();
 }
 
@@ -143,9 +141,7 @@ Session::Session(taotu::EventManager* event_manager,
                  const taotu::NetAddress& server_address,
                  const std::shared_ptr<PingpongClient>& master_client)
     : client_(event_manager, server_address, true),
-      master_client_(master_client),
-      bytes_read_(0),
-      messages_read_(0) {
+      master_client_(master_client) {
   client_.SetConnectionCallback([this](taotu::Connecting& connection) {
     this->OnConnectionCallback(connection);
   });
@@ -158,7 +154,7 @@ Session::Session(taotu::EventManager* event_manager,
 
 void Session::Start() { client_.Connect(); }
 
-void Session::Stop() { client_.Stop(); }
+void Session::Stop() { client_.StopWithoutQuit(); }
 
 void Session::OnConnectionCallback(taotu::Connecting& connection) {
   if (connection.IsConnected()) {
@@ -179,7 +175,8 @@ void Session::OnConnectionCallback(taotu::Connecting& connection) {
 
 void Session::OnMessageCallback(taotu::Connecting& connection,
                                 taotu::IoBuffer* io_buffer, taotu::TimePoint) {
-  ++messages_read_;
-  bytes_read_ += static_cast<int64_t>(io_buffer->GetReadableBytes());
+  messages_read_.fetch_add(1, std::memory_order_relaxed);
+  bytes_read_.fetch_add(static_cast<int64_t>(io_buffer->GetReadableBytes()),
+                        std::memory_order_relaxed);
   connection.Send(io_buffer);
 }
diff --git a/example/pingpong/pingpong_client.h b/example/pingpong/pingpong_client.h
@@ -71,8 +71,8 @@ class Session : taotu::NonCopyableMovable {
   // Stop the session
   void Stop();
 
-  int64_t GetBytesRead() const { return bytes_read_; }
-  int64_t GetMessagesRead() const { return messages_read_; }
+  int64_t GetBytesRead() const { return bytes_read_.load(); }
+  int64_t GetMessagesRead() const { return messages_read_.load(); }
 
  private:
   // Called after the connection creating and before the connection destroying
@@ -84,8 +84,8 @@ class Session : taotu::NonCopyableMovable {
 
   taotu::Client client_;
   std::weak_ptr<PingpongClient> master_client_;
-  int64_t bytes_read_;
-  int64_t messages_read_;
+  std::atomic<int64_t> bytes_read_{0};
+  std::atomic<int64_t> messages_read_{0};
 };
 
 #endif  // !TAOTU_EXAMPLE_PINGPONG_PINGPONG_CLIENT_H_
diff --git a/example/rpc_demo/time_service/sync_client_main.cc b/example/rpc_demo/time_service/sync_client_main.cc
@@ -19,14 +19,17 @@
 
 int main() {
   taotu::START_LOG("time_service_sync_client_log.txt");
-  taotu::RpcSyncChannel rpc_sync_channel(taotu::NetAddress{"127.0.0.1", 4567});
-  timeservice::TimeService::Stub stub(&rpc_sync_channel);
-  timeservice::TimeRequest request;
-  request.set_client_id("1234");
-  timeservice::TimeResponse response;
-  stub.GetTime(nullptr, &request, &response, nullptr);
-  ::printf("TimeService RPC Server time: %s\n",
-           response.current_time().c_str());
+  {
+    taotu::RpcSyncChannel rpc_sync_channel(
+        taotu::NetAddress{"127.0.0.1", 4567});
+    timeservice::TimeService::Stub stub(&rpc_sync_channel);
+    timeservice::TimeRequest request;
+    request.set_client_id("1234");
+    timeservice::TimeResponse response;
+    stub.GetTime(nullptr, &request, &response, nullptr);
+    ::printf("TimeService RPC Server time: %s\n",
+             response.current_time().c_str());
+  }
   taotu::END_LOG();
   return 0;
 }
diff --git a/example/simple_discard/discard.cc b/example/simple_discard/discard.cc
@@ -17,9 +17,14 @@
 
 DiscardServer::DiscardServer(const taotu::NetAddress& listen_address,
                              bool should_reuse_port, size_t io_thread_amount)
-    : event_managers_(io_thread_amount, new taotu::EventManager),
-      server_(std::make_unique<taotu::Server>(&event_managers_, listen_address,
-                                              should_reuse_port)) {
+    : event_managers_(), server_(nullptr) {
+  size_t thread_count = io_thread_amount > 0 ? io_thread_amount : 1;
+  event_managers_.reserve(thread_count);
+  for (size_t i = 0; i < thread_count; ++i) {
+    event_managers_.push_back(new taotu::EventManager);
+  }
+  server_ = std::make_unique<taotu::Server>(&event_managers_, listen_address,
+                                            should_reuse_port);
   server_->SetMessageCallback([this](taotu::Connecting& connection,
                                      taotu::IoBuffer* io_buffer,
                                      taotu::TimePoint time_point) {

diff --git a/example/simple_echo/echo.cc b/example/simple_echo/echo.cc
@@ -15,9 +15,14 @@
 
 EchoServer::EchoServer(const taotu::NetAddress& listen_address,
                        bool should_reuse_port, size_t io_thread_amount)
-    : event_managers_(io_thread_amount, new taotu::EventManager),
-      server_(std::make_unique<taotu::Server>(&event_managers_, listen_address,
-                                              should_reuse_port)) {
+    : event_managers_(), server_(nullptr) {
+  size_t thread_count = io_thread_amount > 0 ? io_thread_amount : 1;
+  event_managers_.reserve(thread_count);
+  for (size_t i = 0; i < thread_count; ++i) {
+    event_managers_.push_back(new taotu::EventManager);
+  }
+  server_ = std::make_unique<taotu::Server>(&event_managers_, listen_address,
+                                            should_reuse_port);
   server_->SetMessageCallback([this](taotu::Connecting& connection,
                                      taotu::IoBuffer* io_buffer,
                                      taotu::TimePoint time_point) {

diff --git a/example/simple_time/time.cc b/example/simple_time/time.cc
@@ -16,9 +16,14 @@
 
 TimeServer::TimeServer(const taotu::NetAddress& listen_address,
                        bool should_reuse_port, size_t io_thread_amount)
-    : event_managers_(io_thread_amount, new taotu::EventManager),
-      server_(std::make_unique<taotu::Server>(&event_managers_, listen_address,
-                                              should_reuse_port)) {
+    : event_managers_(), server_(nullptr) {
+  size_t thread_count = io_thread_amount > 0 ? io_thread_amount : 1;
+  event_managers_.reserve(thread_count);
+  for (size_t i = 0; i < thread_count; ++i) {
+    event_managers_.push_back(new taotu::EventManager);
+  }
+  server_ = std::make_unique<taotu::Server>(&event_managers_, listen_address,
+                                            should_reuse_port);
   server_->SetMessageCallback([this](taotu::Connecting& connection,
                                      taotu::IoBuffer* io_buffer,
                                      taotu::TimePoint time_point) {