From fd547d0107117d6ea635a6500fff55e7a7232469 Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Tue, 18 Feb 2025 12:11:49 +0800 Subject: [PATCH 1/5] 1 --- .../current/query-acceleration/dictionary.md | 434 ++++++++++++++++++ sidebars.json | 1 + 2 files changed, 435 insertions(+) create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md new file mode 100644 index 0000000000000..2c741f96a6565 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md @@ -0,0 +1,434 @@ +--- +{ + "title": "字典表(实验性功能)", + "language": "zh-CN" +} +--- + + + +## 概述 + +字典表(Dictionary)是 Doris 提供的一种用于加速 JOIN 操作的特殊数据结构。它通过将常用的键值对预先加载到内存中,实现快速的查找操作,从而提升查询性能。字典表特别适用于需要频繁进行键值查找的场景。 + +## 使用场景 + +字典表主要适用于以下场景: + +1. 需要频繁进行键值查找的场景 +2. 维度表较小,可以完全加载到内存中 +3. 数据更新频率相对较低的场景 +4. 需要优化 JOIN 操作性能的场景 + +## 字典表定义 + +### 基本语法 + +```sql +CREATE DICTIONARY USING +( + KEY[, + ..., + VALUE] + VALUE[, + ..., + VALUE] +) +LAYOUT() +PROPERTIES( + "" = ""[, + ..., + "" = ""] +); +``` + +其中: + +- ``:字典表的名字 +- ``:源数据表 +- ``:作为键的列在源表中的列名 +- ``:作为值的列在源表中的列名 +- ``:字典表的存储布局类型,详见后文。 +- ``:表的某项属性名 +- ``:表的某项属性取值 + +`` 不必出现在 `` 前。 + +### 布局类型 + +目前支持两种布局类型: + +- `HASH_MAP`:基于哈希表的实现,适用于一般的键值查找场景 +- `IP_TRIE`:基于 Trie 树的实现,专门优化用于 IP 地址类型的查找。Key 列需要为 CIDR 表示法表示的 IP 地址,查询时依 CIDR 表示法匹配。 + +### 属性 + +当前字典仅有一项允许且必须出现的属性: + +|属性名|值类型|含义| +|-|-|-| +|`date_lifetime`|整数,单位为秒|数据有效期。当该字典上次更新距今时间超过该值时,将会自动发起重新导入| + +### 示例 + +```sql +-- 创建源数据表 +CREATE TABLE source_table ( + id INT NOT NULL, + city VARCHAR(32) NOT NULL, + code VARCHAR(32) NOT NULL +) ENGINE=OLAP +DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES("replication_num" = "1"); + +-- 创建字典表 +CREATE DICTIONARY city_dict USING source_table +( + city KEY, + id VALUE +) +LAYOUT(HASH_MAP) +PROPERTIES('data_lifetime' = '600'); +``` + +基于该表,我们可以使用字典 `city_dict` 通过 `dict_get` 函数,基于 `source_table` 的 `city` 值查询对应的 `id`。 + +### 使用限制 + +1. Key 列 + + - IP_TRIE 类型字典的 Key 列必须为 Varchar 或 String 类型,**Key 列中的值必须为 CIDR 格式**。 + - HASH_MAP 类型字典的 Key 列支持所有简单类型(即排除所有 Map、Array 等嵌套类型)。 + - 作为 Key 列的列,**在源表中不得存在重复值**,否则字典导入数据时将报错。 + +2. Nullable 属性 + + - 所有 Key 列必须为 NOT NULLABLE, Value 列无限制。 + +## 使用与管理 + +### 导入(刷新)数据 + +字典支持自动与手动导入。 + +#### 自动导入 + +自动导入发生在以下时机: + +1. 字典建立以后 +2. 字典数据过期时(见[属性](#属性)) +3. BE 状态显示丢失该字典数据(有新 BE 上线,或旧 BE 重启等均有可能造成) + +#### 手动导入 + +Doris 支持通过以下命令手动刷新字典的数据: + +```sql +REFRESH DICTIONARY ; +``` + +其中 `` 为要导入数据的字典名。 + +#### 导入注意事项 + +1. 只有导入数据后的字典才可以查询。 +2. 如果导入时 Key 列具有重复值,导入事务会失败。 +3. 如果导入的数据版本早于 BE 已有的版本,则事务会失败。 +4. 
如果当前已经有导入事务正在进行(字典 Status 为 `LOADING` ),则手动进行的导入会失败。请等待正在进行的导入完成后操作。 + +### 查询字典 + +可以分别使用 `dict_get` 和 `dict_get_many` 函数进行单一 Key、Value 列和多 Key、Value 列的字典表查询。 + +#### 语法 + +```sql +VALUE_TYPE dict_get(".", "", ); +STRUCT dict_get_many(".", ARRAY , STRUCT ); +``` + +其中: + +- `` 为字典所在的 database 名 +- `` 为字典名 +- `` 为要查询的 value 列列名 +- `` 为用来查询的 key 列数据 +- `` 为一个包含要查询的 value 列列名的常量数组 +- `` 为一个包含该字典所有 key 列对应数据的 STRUCT + +`dict_get` 的返回类型为 `` 对应的字典列类型。 +`dict_get_many` 的返回类型为 `` 对应的各个字典列类型所组成的 STRUCT。 + +#### 查询示例 + +该语句查询 `test_db` database 内的字典 `city_dict`,查询 key 列值为 "Beijing" 时的对应 `id` 列值: + +```sql +SELECT dict_get("test_db.city_dict", "id", "Beijing"); +``` + +该语句查询 `test_db` database 内的字典 `single_key_dict`,查询 key 列值为 1 时的对应 `k1` 和 `k3` 列值: + +```sql +SELECT dict_get_many("test_db.single_key_dict", ["k1", "k3"], struct(1)); +``` + +该语句查询 `test_db` database 内的字典 `multi_key_dict`,查询 2 个 key 列值依次为 2 和 'ABC' 时的对应 `k1` 和 `k3` 列值: + +```sql +SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); +``` + +例如建表语句如下: + +```sql +create table if not exists multi_key_table( + k0 int not null, + k1 varchar not null, + k2 float not null, + k3 varchar not null +) +DISTRIBUTED BY HASH(`k0`) BUCKETS auto +properties("replication_num" = "1"); + +create dictionary multi_key_dict using multi_key_table +( + k0 KEY, + k1 KEY, + k2 VALUE, + k3 VALUE +) +LAYOUT(HASH_MAP) +PROPERTIES('data_lifetime' = '600'); +``` + +则上述语句 + +```sql +SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); +``` + +的返回值类型为 `STRUCT`。 + +#### 查询注意事项 + +1. 当查询的 Key 数据不存在于字典表内时,返回 null。 +2. IP_TRIE 类型进行查询时,**`` 类型必须为 `IPV4` 或 `IPV6`**。 +3. 使用 IP_TRIE 类型字典时,key 列 `` 内的数据和查询时使用的 `` 同时支持 `IPV4` 和 `IPV6` 格式数据。 +4. 当查询的值不存在时,返回值为 null。 + +### 字典表管理 + +字典表支持以下管理和查看语句: + +1. 查看当前 database 内所有字典表状态 + + ```sql + SHOW DICTIONARIES [LIKE ]; + ``` + +2. 查看特定字典定义 + + ```sql + DESC DICTIONARY ; + ``` + +3. 删除字典表 + + ```sql + DROP DICTIONARY ; + ``` + +### 状态显示 + +通过 `SHOW DICTIONARIES` 语句,可以查看字典对应的基表,当前数据版本号,以及对应在 FE 和 BE 的状态。 + +```sql +> SHOW DICTIONARIES; ++--------------+----------------+----------------------------------------------+---------+--------+------------------------------------+------------------------------+ +| DictionaryId | DictionaryName | BaseTableName | Version | Status | DataDistribution | LastUpdateResult | ++--------------+----------------+----------------------------------------------+---------+--------+------------------------------------+------------------------------+ +| 51 | precision_dict | internal.test_refresh_dict.precision_test | 2 | NORMAL | {10.16.10.2:9767 ver=2 memory=368} | 2025-02-18 09:58:12: succeed | +| 48 | product_dict | internal.test_refresh_dict.product_info | 2 | NORMAL | {10.16.10.2:9767 ver=2 memory=240} | 2025-02-18 09:58:12: succeed | +| 49 | ip_dict | internal.test_refresh_dict.ip_info | 2 | NORMAL | {10.16.10.2:9767 ver=2 memory=194} | 2025-02-18 09:58:12: succeed | +| 52 | order_dict | internal.test_refresh_dict.column_order_test | 2 | NORMAL | {10.16.10.2:9767 ver=2 memory=432} | 2025-02-18 09:58:12: succeed | +| 50 | user_dict | internal.test_refresh_dict.user_info | 2 | NORMAL | {10.16.10.2:9767 ver=2 memory=240} | 2025-02-18 09:58:12: succeed | ++--------------+----------------+----------------------------------------------+---------+--------+------------------------------------+------------------------------+ +``` + +其中: + +1. `Version` 代表数据版本号,每次数据导入时将会自增 1。 + +2. 
`Status` 代表字典状态,含义如下: + + |状态名|含义|可以进行的操作| + |-|-|-| + |NORMAL|字典当前正常|查询、导入、删除| + |LOADING|字典当前正在进行导入|查询| + |OUT_OF_DATE|字典当前数据已过期|查询、导入、删除| + +3. `DataDistribution` 表示在各个 BE 的当前状态,包括版本号及内存占用大小(KB)。 + +4. `LastUpdateResult` 表示上一次导入(包括自动及手动)的结果,如果有异常,将会在此处显示详细信息。 + +如需查看字典表的列定义,可以通过 `DESC DICTIONARY` 进行。例如: + +```sql +> DESC DICTIONARY city_code_dict; ++-------------+-------------+------+-------+ +| Field | Type | Null | Key | ++-------------+-------------+------+-------+ +| city_name | varchar(32) | NO | true | +| region_code | varchar(32) | NO | false | ++-------------+-------------+------+-------+ +``` + +#### 状态注意事项 + +1. 每次 `SHOW DICTIONARIES` 都会实时拉取所有 BE 的对应字典状态,如当前 Database 内字典过多,推荐通过 LIKE 子句依 `DictionaryName` 进行过滤。 + +## 注意事项 + +1. 数据一致性 + + - 字典每次刷新都将产生新的版本,查询时如 BE 记录的版本与 FE 版本不一致,查询将会失败。 + - Doris 不会监控字典表与基表的数据一致性。用户需要通过业务逻辑适时更新。 + - 当源表被删除时,对应的字典表也会被自动删除。 + - 删除数据库时,其中的字典表也会被删除。 + +2. 性能考虑 + + - 字典表适用于相对静态的数据,如维表数据等。 + - 字典表为纯内存表,全量数据存储于所有 BE 内存中,占用较大,需要权衡内存使用和查询性能,选择合适的表派生字典。 + +3. 最佳实践 + + 1. 合理选择键值列: + + - 选择基数适中的列作为键 + + 2. 布局选择: + + - 对于一般场景使用 HASH_MAP 布局 + - 对于 IP 地址的范围匹配场景使用 IP_TRIE 布局 + + 3. 状态管理: + + - 定期监控字典表的内存使用情况 + - 适时更新字典表数据以保持数据的最新状态 + +## 完整示例 + +1. HASH_MAP + + ```sql + -- 创建源数据表 + CREATE TABLE cities ( + city_id INT NOT NULL, + city_name VARCHAR(32) NOT NULL, + region_code VARCHAR(32) NOT NULL + ) ENGINE=OLAP + DISTRIBUTED BY HASH(city_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + + -- 插入数据 + INSERT INTO cities VALUES + (1, 'Beijing', 'BJ'), + (2, 'Shanghai', 'SH'), + (3, 'Guangzhou', 'GZ'); + + -- 创建字典表 + CREATE DICTIONARY city_code_dict USING cities + ( + city_name KEY, + region_code VALUE + ) + LAYOUT(HASH_MAP) + PROPERTIES('data_lifetime' = '600'); + + -- 刷新字典 + REFERSH DICTIONARY city_code_dict; + + -- 使用字典表查询 + SELECT dict_get("test_refresh_dict.city_code_dict", "region_code", "Beijing"); + +------------------------------------------------------------------------+ + | dict_get('test_refresh_dict.city_code_dict', 'region_code', 'Beijing') | + +------------------------------------------------------------------------+ + | BJ | + +------------------------------------------------------------------------+ + ``` + +2. IP_TRIE + + ```sql + -- 创建源数据表 + CREATE TABLE ip_locations ( + ip_range VARCHAR(30) NOT NULL, + country VARCHAR(64) NOT NULL, + region VARCHAR(64) NOT NULL, + city VARCHAR(64) NOT NULL + ) ENGINE=OLAP + DISTRIBUTED BY HASH(ip_range) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + + -- 插入一些示例数据 + INSERT INTO ip_locations VALUES + ('1.0.0.0/24', 'United States', 'California', 'Los Angeles'), + ('1.0.1.0/24', 'China', 'Beijing', 'Beijing'), + ('1.0.4.0/24', 'Japan', 'Tokyo', 'Tokyo'); + + -- 创建 IP 地址字典表 + CREATE DICTIONARY ip_location_dict USING ip_locations + ( + ip_range KEY, + country VALUE, + region VALUE, + city VALUE + ) + LAYOUT(IP_TRIE) + PROPERTIES('data_lifetime' = '600'); + + -- 刷新字典表数据 + REFRESH DICTIONARY ip_location_dict; + + -- 查询 IP 地址对应的位置信息,依 CIDR 匹配。 + SELECT + dict_get("test_refresh_dict.ip_location_dict", "country", cast('1.0.0.1' as ipv4)) AS country, + dict_get("test_refresh_dict.ip_location_dict", "region", cast('1.0.0.2' as ipv4)) AS region, + dict_get("test_refresh_dict.ip_location_dict", "city", cast('1.0.0.3' as ipv4)) AS city; + +---------------+------------+-------------+ + | country | region | city | + +---------------+------------+-------------+ + | United States | California | Los Angeles | + +---------------+------------+-------------+ + ``` + +## 错误排查 + +1. 
查询时报错 "can not find dict name" + + 首先通过 `SHOW DICTIONARIES` 确认字典是否存在。如存在,重新刷新对应字典数据。 + +2. 查询报错 "dict_get() only support IP type for IP_TRIE" + + 确认 IP_TRIE 类型字典的 Key 列是否严格满足 CIDR 格式。 + +3. 导入报错 "Version ID is not greater than the existing version ID for the dictionary." + + 通过 `DROP DICTIONARY` 命令删除对应字典后重新建立并导入数据。 diff --git a/sidebars.json b/sidebars.json index f860030eb10a4..f236de75c3917 100644 --- a/sidebars.json +++ b/sidebars.json @@ -298,6 +298,7 @@ }, "query-acceleration/sql-cache-manual", "query-acceleration/high-concurrent-point-query", + "query-acceleration/dictionary", { "type": "category", "label": "Distincting Counts", From 6f5663cf936777b32692e6c92b72607ffe98d7af Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Wed, 19 Feb 2025 18:07:37 +0800 Subject: [PATCH 2/5] skip null key --- .../current/query-acceleration/dictionary.md | 284 ++++++++++++++++-- 1 file changed, 251 insertions(+), 33 deletions(-) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md index 2c741f96a6565..55fa14f4f0a5b 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md @@ -24,7 +24,9 @@ under the License. ## 概述 -字典表(Dictionary)是 Doris 提供的一种用于加速 JOIN 操作的特殊数据结构。它通过将常用的键值对预先加载到内存中,实现快速的查找操作,从而提升查询性能。字典表特别适用于需要频繁进行键值查找的场景。 +字典表(Dictionary) 是 Doris 提供的一种用于加速 JOIN 操作的特殊数据结构。它在普通表的基础上建立,将原表的对应列视为键值关系,将这些列的全部数据预先加载到内存中,实现快速的查找操作,从而提升查询性能。特别适用于需要频繁进行键值查找的场景。 + +自然地,作为键值查找解决方案,字典表不容许重复 Key 的出现。 ## 使用场景 @@ -33,7 +35,121 @@ under the License. 1. 需要频繁进行键值查找的场景 2. 维度表较小,可以完全加载到内存中 3. 数据更新频率相对较低的场景 -4. 需要优化 JOIN 操作性能的场景 + +原本需要使用 LEFT OUTER JOIN 实现的键值查找,在字典表的帮助下可以完全省去 JOIN 的开销,转变为普通的函数调用。以下是一个完整的场景示例: + +### 场景示例 + +在电商系统中,订单表(`orders`, 事实表)记录了大量交易数据,需要经常关联商品表(`products`, 维度表)来获取商品的详细信息。 + +```sql +-- 商品维度表 +CREATE TABLE products ( + product_id BIGINT NOT NULL COMMENT "商品ID", + product_name VARCHAR(128) NOT NULL COMMENT "商品名称", + brand_name VARCHAR(64) NOT NULL COMMENT "品牌名称", + category_name VARCHAR(64) NOT NULL COMMENT "品类名称", + retail_price DECIMAL(10,2) NOT NULL COMMENT "零售价", + update_time DATETIME NOT NULL COMMENT "更新时间" +) +DISTRIBUTED BY HASH(`product_id`) BUCKETS 10; + +-- 订单事实表 +CREATE TABLE orders ( + order_id BIGINT NOT NULL COMMENT "订单ID", + product_id BIGINT NOT NULL COMMENT "商品ID", + user_id BIGINT NOT NULL COMMENT "用户ID", + quantity INT NOT NULL COMMENT "购买数量", + actual_price DECIMAL(10,2) NOT NULL COMMENT "实际成交价", + order_time DATETIME NOT NULL COMMENT "下单时间" +) +DISTRIBUTED BY HASH(`order_id`) BUCKETS 32; + +-- 插入示例数据 +INSERT INTO products VALUES +(1001, 'iPhone 15 Pro 256G 黑色', 'Apple', '手机数码', 8999.00, '2024-01-01 00:00:00'), +(1002, 'MacBook Pro M3 Max', 'Apple', '电脑办公', 19999.00, '2024-01-01 00:00:00'), +(1003, 'AirPods Pro 2', 'Apple', '手机配件', 1999.00, '2024-01-01 00:00:00'); + +INSERT INTO orders VALUES +(10001, 1001, 88001, 1, 8899.00, '2024-02-22 10:15:00'), +(10002, 1002, 88002, 1, 19599.00, '2024-02-22 11:30:00'), +(10003, 1003, 88001, 2, 1899.00, '2024-02-22 14:20:00'); +``` + +以下是一组典型的查询,为了统计各品类的订单量和销售额,以往我们需要使用 LEFT OUTER JOIN 来完成从商品表中提取商品信息的功能。 + +```sql +-- 统计各品类的订单量和销售额 +SELECT + p.category_name, + p.brand_name, + COUNT(DISTINCT o.order_id) as order_count, + SUM(o.quantity) as total_quantity, + SUM(o.actual_price * o.quantity) as total_amount +FROM orders o +LEFT JOIN products p ON o.product_id = p.product_id +WHERE o.order_time >= 
'2024-02-22 00:00:00' +GROUP BY p.category_name, p.brand_name +ORDER BY total_amount DESC; +``` + +```text ++---------------+------------+-------------+----------------+--------------+ +| category_name | brand_name | order_count | total_quantity | total_amount | ++---------------+------------+-------------+----------------+--------------+ +| 电脑办公 | Apple | 1 | 1 | 19599.00 | +| 手机数码 | Apple | 1 | 1 | 8899.00 | +| 手机配件 | Apple | 1 | 2 | 3798.00 | ++---------------+------------+-------------+----------------+--------------+ +``` + +在这类查询中,我们需要频繁地通过 `product_id` 查询商品的其他信息,这本质上是一种 KV 查找操作。 + +设定好键值对关系,预先构建对应的字典表,可以完全将之前的 JOIN 操作转换为更轻的键值查找,提升 SQL 执行效率: + +```sql +-- 创建商品信息字典 +CREATE DICTIONARY product_info_dict USING products +( + product_id KEY, + product_name VALUE, + brand_name VALUE, + category_name VALUE, + retail_price VALUE +) +LAYOUT(HASH_MAP) +PROPERTIES( + 'data_lifetime'='300' -- 考虑到商品信息变更频率,设置5分钟更新一次 +); +``` + +原始查询借助字典表将 JOIN 操作转换为了 `dict_get` 函数查找,该函数为较轻的 KV 查找操作: + +```sql +SELECT + dict_get("test.product_info_dict", "category_name", o.product_id) as category_name, + dict_get("test.product_info_dict", "brand_name", o.product_id) as brand_name, + COUNT(DISTINCT o.order_id) as order_count, + SUM(o.quantity) as total_quantity, + SUM(o.actual_price * o.quantity) as total_amount +FROM orders o +WHERE o.order_time >= '2024-02-22 00:00:00' +GROUP BY + dict_get("test.product_info_dict", "category_name", o.product_id), + dict_get("test.product_info_dict", "brand_name", o.product_id) +ORDER BY total_amount DESC; +``` + +```text ++---------------+------------+-------------+----------------+--------------+ +| category_name | brand_name | order_count | total_quantity | total_amount | ++---------------+------------+-------------+----------------+--------------+ +| 电脑办公 | Apple | 1 | 1 | 19599.00 | +| 手机数码 | Apple | 1 | 1 | 8899.00 | +| 手机配件 | Apple | 1 | 2 | 3798.00 | ++---------------+------------+-------------+----------------+--------------+ +``` ## 字典表定义 @@ -67,7 +183,7 @@ PROPERTIES( - ``:表的某项属性名 - ``:表的某项属性取值 -`` 不必出现在 `` 前。 +`` 和 `` 至少各有一个。`` 不必出现在 `` 前。 ### 布局类型 @@ -78,11 +194,10 @@ PROPERTIES( ### 属性 -当前字典仅有一项允许且必须出现的属性: - -|属性名|值类型|含义| -|-|-|-| -|`date_lifetime`|整数,单位为秒|数据有效期。当该字典上次更新距今时间超过该值时,将会自动发起重新导入| +|属性名|值类型|含义|必须项| +|-|-|-|-| +|`date_lifetime`|整数,单位为秒|数据有效期。当该字典上次更新距今时间超过该值时,将会自动发起重新导入,导入逻辑详见[自动导入](#自动导入)|是| +|`skip_null_key`|布尔值|向字典导入时如果 Key 列中出现 null 值,如果该值为 `true`,跳过该行数据,否则报错。缺省值为 `false`|否| ### 示例 @@ -93,8 +208,7 @@ CREATE TABLE source_table ( city VARCHAR(32) NOT NULL, code VARCHAR(32) NOT NULL ) ENGINE=OLAP -DISTRIBUTED BY HASH(id) BUCKETS 1 -PROPERTIES("replication_num" = "1"); +DISTRIBUTED BY HASH(id) BUCKETS 1; -- 创建字典表 CREATE DICTIONARY city_dict USING source_table @@ -113,12 +227,13 @@ PROPERTIES('data_lifetime' = '600'); 1. Key 列 - IP_TRIE 类型字典的 Key 列必须为 Varchar 或 String 类型,**Key 列中的值必须为 CIDR 格式**。 + - IP_TRIE 类型的字典只允许出现一个 Key 列。 - HASH_MAP 类型字典的 Key 列支持所有简单类型(即排除所有 Map、Array 等嵌套类型)。 - 作为 Key 列的列,**在源表中不得存在重复值**,否则字典导入数据时将报错。 -2. Nullable 属性 +2. Null 值处理 - - 所有 Key 列必须为 NOT NULLABLE, Value 列无限制。 + - 字典的所有列都可以是 Nullable 列,但 Key 列不应当实际出现 null 值。如果出现,行为取决于[属性](#属性)当中的 `skip_null_key`。 ## 使用与管理 @@ -148,31 +263,32 @@ REFRESH DICTIONARY ; 1. 只有导入数据后的字典才可以查询。 2. 如果导入时 Key 列具有重复值,导入事务会失败。 -3. 如果导入的数据版本早于 BE 已有的版本,则事务会失败。 -4. 如果当前已经有导入事务正在进行(字典 Status 为 `LOADING` ),则手动进行的导入会失败。请等待正在进行的导入完成后操作。 +3. 
如果当前已经有导入事务正在进行(字典 Status 为 `LOADING` ),则手动进行的导入会失败。请等待正在进行的导入完成后操作。 ### 查询字典 可以分别使用 `dict_get` 和 `dict_get_many` 函数进行单一 Key、Value 列和多 Key、Value 列的字典表查询。 +首次查询请待字典导入完成以后进行。 + #### 语法 ```sql -VALUE_TYPE dict_get(".", "", ); -STRUCT dict_get_many(".", ARRAY , STRUCT ); +dict_get(".", "", ); +dict_get_many(".", , ); ``` 其中: - `` 为字典所在的 database 名 - `` 为字典名 -- `` 为要查询的 value 列列名 +- `` 为要查询的 value 列列名,类型为 `VARCHAR`,**必须为常量** +- `` 为要查询的所有 value 列列名,类型为 `ARRAY`,**必须为常量** - `` 为用来查询的 key 列数据 -- `` 为一个包含要查询的 value 列列名的常量数组 -- `` 为一个包含该字典所有 key 列对应数据的 STRUCT +- `` 为一个包含该字典**所有 key 列**的需查询数据的 STRUCT `dict_get` 的返回类型为 `` 对应的字典列类型。 -`dict_get_many` 的返回类型为 `` 对应的各个字典列类型所组成的 STRUCT。 +`dict_get_many` 的返回类型为 `` 对应的各个字典列类型所组成的 [STRUCT](../sql-manual/sql-data-types/semi-structured/STRUCT)。 #### 查询示例 @@ -188,7 +304,7 @@ SELECT dict_get("test_db.city_dict", "id", "Beijing"); SELECT dict_get_many("test_db.single_key_dict", ["k1", "k3"], struct(1)); ``` -该语句查询 `test_db` database 内的字典 `multi_key_dict`,查询 2 个 key 列值依次为 2 和 'ABC' 时的对应 `k1` 和 `k3` 列值: +该语句查询 `test_db` database 内的字典 `multi_key_dict`,查询 2 个 key 列值依次为 2 和 'ABC' 时的对应 `k2` 和 `k3` 列值: ```sql SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); @@ -203,8 +319,7 @@ create table if not exists multi_key_table( k2 float not null, k3 varchar not null ) -DISTRIBUTED BY HASH(`k0`) BUCKETS auto -properties("replication_num" = "1"); +DISTRIBUTED BY HASH(`k0`) BUCKETS auto; create dictionary multi_key_dict using multi_key_table ( @@ -227,10 +342,9 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); #### 查询注意事项 -1. 当查询的 Key 数据不存在于字典表内时,返回 null。 +1. 当查询的 Key 数据不存在于字典表内,**或 Key 数据为 null 时**,返回 null。 2. IP_TRIE 类型进行查询时,**`` 类型必须为 `IPV4` 或 `IPV6`**。 3. 使用 IP_TRIE 类型字典时,key 列 `` 内的数据和查询时使用的 `` 同时支持 `IPV4` 和 `IPV6` 格式数据。 -4. 当查询的值不存在时,返回值为 null。 ### 字典表管理 @@ -254,6 +368,8 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); DROP DICTIONARY ; ``` + 删除字典表后,被删除的字典可能不会立即从 BE 中移除。 + ### 状态显示 通过 `SHOW DICTIONARIES` 语句,可以查看字典对应的基表,当前数据版本号,以及对应在 FE 和 BE 的状态。 @@ -299,10 +415,6 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); +-------------+-------------+------+-------+ ``` -#### 状态注意事项 - -1. 每次 `SHOW DICTIONARIES` 都会实时拉取所有 BE 的对应字典状态,如当前 Database 内字典过多,推荐通过 LIKE 子句依 `DictionaryName` 进行过滤。 - ## 注意事项 1. 
数据一致性 @@ -344,8 +456,7 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); city_name VARCHAR(32) NOT NULL, region_code VARCHAR(32) NOT NULL ) ENGINE=OLAP - DISTRIBUTED BY HASH(city_id) BUCKETS 1 - PROPERTIES("replication_num" = "1"); + DISTRIBUTED BY HASH(city_id) BUCKETS 1; -- 插入数据 INSERT INTO cities VALUES @@ -367,6 +478,9 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); -- 使用字典表查询 SELECT dict_get("test_refresh_dict.city_code_dict", "region_code", "Beijing"); + ``` + + ```text +------------------------------------------------------------------------+ | dict_get('test_refresh_dict.city_code_dict', 'region_code', 'Beijing') | +------------------------------------------------------------------------+ @@ -384,8 +498,7 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); region VARCHAR(64) NOT NULL, city VARCHAR(64) NOT NULL ) ENGINE=OLAP - DISTRIBUTED BY HASH(ip_range) BUCKETS 1 - PROPERTIES("replication_num" = "1"); + DISTRIBUTED BY HASH(ip_range) BUCKETS 1; -- 插入一些示例数据 INSERT INTO ip_locations VALUES @@ -412,6 +525,9 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); dict_get("test_refresh_dict.ip_location_dict", "country", cast('1.0.0.1' as ipv4)) AS country, dict_get("test_refresh_dict.ip_location_dict", "region", cast('1.0.0.2' as ipv4)) AS region, dict_get("test_refresh_dict.ip_location_dict", "city", cast('1.0.0.3' as ipv4)) AS city; + ``` + + ```text +---------------+------------+-------------+ | country | region | city | +---------------+------------+-------------+ @@ -419,6 +535,108 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); +---------------+------------+-------------+ ``` +3. 
HASH_MAP 多 Key / 多 Value + + ```sql + -- 商品SKU维度表:包含了商品的基本属性 + CREATE TABLE product_sku_info ( + product_id INT NOT NULL COMMENT "商品ID", + color_code VARCHAR(32) NOT NULL COMMENT "颜色编码", + size_code VARCHAR(32) NOT NULL COMMENT "尺码编码", + product_name VARCHAR(128) NOT NULL COMMENT "商品名称", + color_name VARCHAR(32) NOT NULL COMMENT "颜色名称", + size_name VARCHAR(32) NOT NULL COMMENT "尺码名称", + stock INT NOT NULL COMMENT "库存", + price DECIMAL(10,2) NOT NULL COMMENT "价格", + update_time DATETIME NOT NULL COMMENT "更新时间" + ) + DISTRIBUTED BY HASH(`product_id`) BUCKETS 10; + + -- 订单明细表:记录实际的销售数据 + CREATE TABLE order_details ( + order_id BIGINT NOT NULL COMMENT "订单ID", + product_id INT NOT NULL COMMENT "商品ID", + color_code VARCHAR(32) NOT NULL COMMENT "颜色编码", + size_code VARCHAR(32) NOT NULL COMMENT "尺码编码", + quantity INT NOT NULL COMMENT "购买数量", + order_time DATETIME NOT NULL COMMENT "下单时间" + ) + DISTRIBUTED BY HASH(`order_id`) BUCKETS 10; + + -- 插入商品SKU数据 + INSERT INTO product_sku_info VALUES + (1001, 'BLK', 'M', 'Nike运动T恤', '黑色', 'M码', 100, 199.00, '2024-02-23 10:00:00'), + (1001, 'BLK', 'L', 'Nike运动T恤', '黑色', 'L码', 80, 199.00, '2024-02-23 10:00:00'), + (1001, 'WHT', 'M', 'Nike运动T恤', '白色', 'M码', 90, 199.00, '2024-02-23 10:00:00'), + (1001, 'WHT', 'L', 'Nike运动T恤', '白色', 'L码', 70, 199.00, '2024-02-23 10:00:00'), + (1002, 'RED', 'S', 'Adidas运动裤', '红色', 'S码', 50, 299.00, '2024-02-23 10:00:00'), + (1002, 'RED', 'M', 'Adidas运动裤', '红色', 'M码', 60, 299.00, '2024-02-23 10:00:00'), + (1002, 'BLU', 'S', 'Adidas运动裤', '蓝色', 'S码', 55, 299.00, '2024-02-23 10:00:00'), + (1002, 'BLU', 'M', 'Adidas运动裤', '蓝色', 'M码', 65, 299.00, '2024-02-23 10:00:00'); + + -- 插入订单数据 + INSERT INTO order_details VALUES + (10001, 1001, 'BLK', 'M', 2, '2024-02-23 12:01:00'), + (10002, 1001, 'WHT', 'L', 1, '2024-02-23 12:05:00'), + (10003, 1002, 'RED', 'S', 1, '2024-02-23 12:10:00'), + (10004, 1001, 'BLK', 'L', 3, '2024-02-23 12:15:00'), + (10005, 1002, 'BLU', 'M', 2, '2024-02-23 12:20:00'); + + -- 创建多键多值字典 + CREATE DICTIONARY sku_dict USING product_sku_info + ( + product_id KEY, + color_code KEY, + size_code KEY, + product_name VALUE, + color_name VALUE, + size_name VALUE, + price VALUE, + stock VALUE + ) + LAYOUT(HASH_MAP) + PROPERTIES('data_lifetime'='300'); + + -- 使用dict_get_many的查询示例:获取订单详情及SKU信息 + WITH order_sku_info AS ( + SELECT + o.order_id, + o.quantity, + o.order_time, + dict_get_many("test.sku_dict", + ["product_name", "color_name", "size_name", "price", "stock"], + struct(o.product_id, o.color_code, o.size_code) + ) as sku_info + FROM order_details o + WHERE o.order_time >= '2024-02-23 12:00:00' + AND o.order_time < '2024-02-23 13:00:00' + ) + SELECT + order_id, + order_time, + struct_element(sku_info, 'product_name') as product_name, + struct_element(sku_info, 'color_name') as color_name, + struct_element(sku_info, 'size_name') as size_name, + quantity, + struct_element(sku_info, 'price') as unit_price, + quantity * struct_element(sku_info, 'price') as total_amount, + struct_element(sku_info, 'stock') as current_stock + FROM order_sku_info + ORDER BY order_time; + ``` + + ```text + +----------+---------------------+-----------------+------------+-----------+----------+------------+--------------+---------------+ + | order_id | order_time | product_name | color_name | size_name | quantity | unit_price | total_amount | current_stock | + +----------+---------------------+-----------------+------------+-----------+----------+------------+--------------+---------------+ + | 10001 | 2024-02-23 12:01:00 | Nike运动T恤 | 黑色 | M码 | 2 | 
199.00 | 398.00 | 100 | + | 10002 | 2024-02-23 12:05:00 | Nike运动T恤 | 白色 | L码 | 1 | 199.00 | 199.00 | 70 | + | 10003 | 2024-02-23 12:10:00 | Adidas运动裤 | 红色 | S码 | 1 | 299.00 | 299.00 | 50 | + | 10004 | 2024-02-23 12:15:00 | Nike运动T恤 | 黑色 | L码 | 3 | 199.00 | 597.00 | 80 | + | 10005 | 2024-02-23 12:20:00 | Adidas运动裤 | 蓝色 | M码 | 2 | 299.00 | 598.00 | 65 | + +----------+---------------------+-----------------+------------+-----------+----------+------------+--------------+---------------+ + ``` + ## 错误排查 1. 查询时报错 "can not find dict name" From c002120ae57afa81cf3f9a435a8dc351b8bfa33a Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Mon, 24 Feb 2025 00:40:27 +0800 Subject: [PATCH 3/5] english --- docs/query-acceleration/dictionary.md | 648 ++++++++++++++++++ .../current/query-acceleration/dictionary.md | 30 +- 2 files changed, 662 insertions(+), 16 deletions(-) create mode 100644 docs/query-acceleration/dictionary.md diff --git a/docs/query-acceleration/dictionary.md b/docs/query-acceleration/dictionary.md new file mode 100644 index 0000000000000..aa70f383f0a86 --- /dev/null +++ b/docs/query-acceleration/dictionary.md @@ -0,0 +1,648 @@ +--- +{ + "title": "Dictionary Table(Experimental)", + "language": "en" +} +--- + + + +## Overview + +Dictionary is a special data structure provided by Doris to speed up JOIN operations. It is built on the basis of ordinary tables, treating the corresponding columns of the original table as key-value relationships, and pre-loading all the data of these columns into memory to achieve fast lookup operations, thus improving query performance. It is especially suitable for scenarios that require frequent key-value lookups. + +Naturally, as a key-value lookup solution, dictionary tables do not allow duplicate keys. + +## Usage Scenario + +The dictionary table is mainly suitable for the following scenarios: + +1. Scenarios where frequent key-value lookups are required +2. Dimension tables are small and can be fully loaded into memory +3. Scenarios with relatively low frequency of data updates + +The key-value lookup that originally needed to be implemented using LEFT OUTER JOIN can be completely eliminated with the help of the dictionary table, transforming into a normal function call. Here is a complete scenario example: + +### Scenario Example + +In e-commerce systems, the order table (`orders`, fact table) records a large amount of transaction data, and it needs to frequently associate with the product table (`products`, dimension table) to obtain detailed product information. 
+ +```sql +-- Product Dimension Table +CREATE TABLE products ( + product_id BIGINT NOT NULL COMMENT "商品ID", + product_name VARCHAR(128) NOT NULL COMMENT "商品名称", + brand_name VARCHAR(64) NOT NULL COMMENT "品牌名称", + category_name VARCHAR(64) NOT NULL COMMENT "品类名称", + retail_price DECIMAL(10,2) NOT NULL COMMENT "零售价", + update_time DATETIME NOT NULL COMMENT "更新时间" +) +DISTRIBUTED BY HASH(`product_id`) BUCKETS 10; + +-- Order Fact Table +CREATE TABLE orders ( + order_id BIGINT NOT NULL COMMENT "订单ID", + product_id BIGINT NOT NULL COMMENT "商品ID", + user_id BIGINT NOT NULL COMMENT "用户ID", + quantity INT NOT NULL COMMENT "购买数量", + actual_price DECIMAL(10,2) NOT NULL COMMENT "实际成交价", + order_time DATETIME NOT NULL COMMENT "下单时间" +) +DISTRIBUTED BY HASH(`order_id`) BUCKETS 32; + +INSERT INTO products VALUES +(1001, 'iPhone 15 Pro 256G 黑色', 'Apple', '手机数码', 8999.00, '2024-01-01 00:00:00'), +(1002, 'MacBook Pro M3 Max', 'Apple', '电脑办公', 19999.00, '2024-01-01 00:00:00'), +(1003, 'AirPods Pro 2', 'Apple', '手机配件', 1999.00, '2024-01-01 00:00:00'); + +INSERT INTO orders VALUES +(10001, 1001, 88001, 1, 8899.00, '2024-02-22 10:15:00'), +(10002, 1002, 88002, 1, 19599.00, '2024-02-22 11:30:00'), +(10003, 1003, 88001, 2, 1899.00, '2024-02-22 14:20:00'); +``` + +The following is a set of typical queries. In order to count the order volume and sales of each category, in the past, we needed to use LEFT OUTER JOIN to achieve the function of extracting product information from the product table. + +```sql +-- Analyze the order volume and sales revenue of each category +SELECT + p.category_name, + p.brand_name, + COUNT(DISTINCT o.order_id) as order_count, + SUM(o.quantity) as total_quantity, + SUM(o.actual_price * o.quantity) as total_amount +FROM orders o +LEFT JOIN products p ON o.product_id = p.product_id +WHERE o.order_time >= '2024-02-22 00:00:00' +GROUP BY p.category_name, p.brand_name +ORDER BY total_amount DESC; +``` + +```text ++---------------+------------+-------------+----------------+--------------+ +| category_name | brand_name | order_count | total_quantity | total_amount | ++---------------+------------+-------------+----------------+--------------+ +| 电脑办公 | Apple | 1 | 1 | 19599.00 | +| 手机数码 | Apple | 1 | 1 | 8899.00 | +| 手机配件 | Apple | 1 | 2 | 3798.00 | ++---------------+------------+-------------+----------------+--------------+ +``` + +In such queries, we need to frequently retrieve other information about products using the `product_id`, which essentially involves a KV lookup operation. + +By setting up the key-value pair relationships and pre-building the corresponding dictionary tables, we can completely convert previous JOIN operations into lighter key-value lookups, thereby improving SQL execution efficiency: + +```sql +-- Create product information dictionary +CREATE DICTIONARY product_info_dict USING products +( + product_id KEY, + product_name VALUE, + brand_name VALUE, + category_name VALUE, + retail_price VALUE +) +LAYOUT(HASH_MAP) +PROPERTIES( + 'data_lifetime'='300' -- Considering the frequency of changes in product information, set the update interval to 5 minutes. 
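    -- 'skip_null_key' = 'true' could also be set here (comma-separated) to skip rows whose key is NULL instead of failing the refresh; see the Property section below.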
+); +``` + +The original query converts the JOIN operation into a `dict_get` function lookup using a dictionary table, which is a lighter KV lookup operation: + +```sql +SELECT + dict_get("test.product_info_dict", "category_name", o.product_id) as category_name, + dict_get("test.product_info_dict", "brand_name", o.product_id) as brand_name, + COUNT(DISTINCT o.order_id) as order_count, + SUM(o.quantity) as total_quantity, + SUM(o.actual_price * o.quantity) as total_amount +FROM orders o +WHERE o.order_time >= '2024-02-22 00:00:00' +GROUP BY + dict_get("test.product_info_dict", "category_name", o.product_id), + dict_get("test.product_info_dict", "brand_name", o.product_id) +ORDER BY total_amount DESC; +``` + +```text ++---------------+------------+-------------+----------------+--------------+ +| category_name | brand_name | order_count | total_quantity | total_amount | ++---------------+------------+-------------+----------------+--------------+ +| 电脑办公 | Apple | 1 | 1 | 19599.00 | +| 手机数码 | Apple | 1 | 1 | 8899.00 | +| 手机配件 | Apple | 1 | 2 | 3798.00 | ++---------------+------------+-------------+----------------+--------------+ +``` + +## Dictionary Table Definition + +### Basic Grammar + +```sql +CREATE DICTIONARY USING +( + KEY[, + ..., + VALUE] + VALUE[, + ..., + VALUE] +) +LAYOUT() +PROPERTIES( + "" = ""[, + ..., + "" = ""] +); +``` + +Among: + +- ``: The name of the dictionary table +- ``: Source data table +- ``: The column name in the source table that serves as a key +- ``: The column name in the source table that serves as a value +- ``: The storage layout type of the dictionary table, see later for details. +- ``: The name of a certain property of a table +- ``: The value of a certain property of a table + +`` and `` each must have at least one. `` does not have to appear before ``. + +### Layout Type + +Currently, two layout types are supported: + +- `HASH_MAP`: An implementation based on a hash table, suitable for general key-value lookup scenarios. + +- `IP_TRIE`: An implementation based on a Trie tree, specifically optimized for IP address type lookups. The Key column needs to be represented in CIDR notation for IP addresses, and queries are matched according to CIDR notation. + +### Property + +|Property Name|Value Type|Meaning|Required| +|-|-|-|-| +|`date_lifetime`|Integer, unit in seconds|Data validity period. When the time since the last update of this dictionary exceeds this value, it will automatically initiate a import. The import logic is detailed in [Automatic Import](#automatic-import)|Yes| +|`skip_null_key`|Boolean|If the Key column contains null values when load to a dictionary, skip the row if the value is `true`, otherwise raise an error. The default value is `false`|No| + +### Example + +```sql +-- Create source data table +CREATE TABLE source_table ( + id INT NOT NULL, + city VARCHAR(32) NOT NULL, + code VARCHAR(32) NOT NULL +) ENGINE=OLAP +DISTRIBUTED BY HASH(id) BUCKETS 1; + +-- Create dictionary table +CREATE DICTIONARY city_dict USING source_table +( + city KEY, + id VALUE +) +LAYOUT(HASH_MAP) +PROPERTIES('data_lifetime' = '600'); +``` + +Based on the table, we can use the dictionary `city_dict` through the `dict_get` function to query the corresponding `id` based on the `city` value in `source_table`. + +### Usage Restrictions + +1. Key Columns + + - The Key column of the IP_TRIE type dictionary must be of Varchar or String type, **the values in the Key column must be in CIDR format**. + - The dictionary of the IP_TRIE type allows only one Key column. 
+ - The Key column of the HASH_MAP type dictionary supports all simple types (i.e., excluding all nested types such as Map, Array, etc.). + - As a Key column, **there must not be duplicate values in the source table**, otherwise an error will be reported when importing dictionary data. + +2. Null Value Handling + + - All columns in the dictionary can be nullable columns, but the Key column should not actually appear with null values. If it does, the behavior depends on the `skip_null_key` in the [Property](#property). + +## Use and Management + +### Import (Refresh) Data + +The dictionary supports automatic and manual import. "import" is also called "refresh" here. + +#### Automatic Import + +Automatic import occurs at the following times: + +1. After the dictionary is established +2. When the dictionary data expires (see [Property](#property)) +3. When the BE state shows the loss of the dictionary data (new BE going online, or old BE restarting, etc.) + +#### Manual Import + +Doris supports manually refreshing dictionary data through the following commands: + +```sql +REFRESH DICTIONARY ; +``` + +Among them, `` is the name of the dictionary to be imported. + +#### Attention Points of Import + +1. Only dictionaries that have imported data can be queried. +2. If the Key column has duplicate values during import, the import transaction will fail. +3. If there is already an ongoing import transaction at the moment (dictionary Status is `LOADING`), the manual import will fail. Please wait until the ongoing import is completed before proceeding. + +### Query Dictionary + +You can use the `dict_get` and `dict_get_many` functions for dictionary table queries of single Key, Value list and multi Key, Value list respectively. + +Please wait until the dictionary is imported before performing the first query to a dictionary. + +#### Grammar + +```sql +dict_get(".", "", ); +dict_get_many(".", , ); +``` + +Among: + +- `` is the name of the database where the dictionary is located. +- `` is the name of the dictionary +- `` is the column name for the value column to be queried, with a type of `VARCHAR`, **must be a constant** +- `` are the column names for all value columns to be queried, with a type of `ARRAY`, **must be constants**. +- `` is data for key columns used in queries +- `` is a STRUCT that contains all Key columns of the data to be queried in a dictionary. + +The return type of `dict_get` is the dictionary column type corresponding to ``. 
+The return type of `dict_get_many` is a [STRUCT](../sql-manual/sql-data-types/semi-structured/STRUCT) corresponding to the types of various dictionary columns in ``。 + +#### Query Example + +The statement queries the dictionary `city_dict` within the `test_db` database, for the corresponding `id` value when the `key` column value is "Beijing": + +```sql +SELECT dict_get("test_db.city_dict", "id", "Beijing"); +``` + +The statement queries the dictionary `single_key_dict` within the `test_db` database, for the corresponding values of `k1` and `k3` when the value of the `key` column is 1: + +```sql +SELECT dict_get_many("test_db.single_key_dict", ["k1", "k3"], struct(1)); +``` + +The statement queries the dictionary `multi_key_dict` within the `test_db` database, for the corresponding `k2` and `k3` column values when the 2 key column values are 2 and 'ABC' in sequence: + +```sql +SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); +``` + +For example, the table creation statement is as follows: + +```sql +create table if not exists multi_key_table( + k0 int not null, + k1 varchar not null, + k2 float not null, + k3 varchar not null +) +DISTRIBUTED BY HASH(`k0`) BUCKETS auto; + +create dictionary multi_key_dict using multi_key_table +( + k0 KEY, + k1 KEY, + k2 VALUE, + k3 VALUE +) +LAYOUT(HASH_MAP) +PROPERTIES('data_lifetime' = '600'); +``` + +Then the above statement + +```sql +SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); +``` + +returns type of `STRUCT`。 + +#### Attention Points of Query + +1. When the query Key data does not exist in the dictionary table, **or the Key data is null**, return null. +2. For IP_TRIE type queries, **`` type must be `IPV4` or `IPV6`**. +3. When using an IP_TRIE type dictionary, the data in the Key column `` and the `` used for querying both support `IPV4` and `IPV6` format data. + +### Dictionary Management + +The dictionary table supports the following management and viewing statements: + +1. Check the status of all dictionary tables in the current database. + + ```sql + SHOW DICTIONARIES [LIKE ]; + ``` + +2. Check the definition of a specific dictionary + + ```sql + DESC DICTIONARY ; + ``` + +3. Delete dictionary table + + ```sql + DROP DICTIONARY ; + ``` + + After deleting the dictionary table, the deleted dictionary may not be removed from BE immediately. 
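Taken together, a routine inspection-and-cleanup pass over the dictionaries in the current database might look like the following sketch (the dictionary name `city_code_dict` is reused from the examples on this page purely for illustration):

```sql
-- List only the dictionaries whose names contain "city"
SHOW DICTIONARIES LIKE '%city%';

-- Inspect the key/value column definition of one of them
DESC DICTIONARY city_code_dict;

-- Drop it once it is no longer needed; BE memory may be released slightly later
DROP DICTIONARY city_code_dict;
```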
+ +### Status Display + +By using the `SHOW DICTIONARIES` statement, you can view the base table corresponding to the dictionary, the current data version number, and the corresponding status in FE and BE: + +```sql +> SHOW DICTIONARIES; +``` + +```text ++--------------+----------------+----------------------------------------------+---------+--------+------------------------------------+------------------------------+ +| DictionaryId | DictionaryName | BaseTableName | Version | Status | DataDistribution | LastUpdateResult | ++--------------+----------------+----------------------------------------------+---------+--------+------------------------------------+------------------------------+ +| 51 | precision_dict | internal.test_refresh_dict.precision_test | 2 | NORMAL | {10.16.10.2:9767 ver=2 memory=368} | 2025-02-18 09:58:12: succeed | +| 48 | product_dict | internal.test_refresh_dict.product_info | 2 | NORMAL | {10.16.10.2:9767 ver=2 memory=240} | 2025-02-18 09:58:12: succeed | +| 49 | ip_dict | internal.test_refresh_dict.ip_info | 2 | NORMAL | {10.16.10.2:9767 ver=2 memory=194} | 2025-02-18 09:58:12: succeed | +| 52 | order_dict | internal.test_refresh_dict.column_order_test | 2 | NORMAL | {10.16.10.2:9767 ver=2 memory=432} | 2025-02-18 09:58:12: succeed | +| 50 | user_dict | internal.test_refresh_dict.user_info | 2 | NORMAL | {10.16.10.2:9767 ver=2 memory=240} | 2025-02-18 09:58:12: succeed | ++--------------+----------------+----------------------------------------------+---------+--------+------------------------------------+------------------------------+ +``` + +Among: + +1. `Version` represents the data version number, which will increment by 1 each time data is imported. + +2. `Status` represents the dictionary status, meaning as follows: + + |Status Name|Meaning| + |-|-| + |NORMAL|The dictionary is currently normal| + |LOADING|The dictionary is currently importing| + |OUT_OF_DATE|The current dictionary data has expired| + + The dictionary cannot be imported again while it is being imported. + +3. `DataDistribution` represents the current status of each BE, including the version number and memory usage size (KB). + +4. `LastUpdateResult` indicates the result of the last import (including automatic and manual), and detailed error message will be displayed here if there are any exceptions. + +To view the column definitions of the dictionary table, you can use `DESC DICTIONARY`. For example: + +```sql +> DESC DICTIONARY city_code_dict; ++-------------+-------------+------+-------+ +| Field | Type | Null | Key | ++-------------+-------------+------+-------+ +| city_name | varchar(32) | NO | true | +| region_code | varchar(32) | NO | false | ++-------------+-------------+------+-------+ +``` + +## Cautionary Notes + +1. Data consistency + + - Each refresh of the dictionary will generate a new version. If the version of the BE record doesn't match the FE version during the query, the query will fail. + - Doris does not maintain strong data consistency between dictionary tables and base tables. Users need to properly set the `data_lifetime` of the dictionary to achieve automatic updates, and manually update when necessary based on business logic. + - When the source table is deleted by any way, the corresponding dictionary table will also be automatically deleted. + +2. Performance Considerations + + - Dictionary tables are suitable for relatively static data, such as dimension table data. 
+ + - Dictionary tables are pure in-memory tables, with full data stored in the memory of all BEs, may occupying a large amount, and it is necessary to weigh memory usage and query performance to choose an appropriate table to derive dictionary. + +3. Best Practices + + 1. Reasonable Selection of Key Value Columns: + + - Choose columns with a moderate cardinality as keys + + 2. Layout Selection: + + - Use HASH_MAP layout for general scenarios + - Use IP_TRIE layout for IP address range matching scenarios + + 3. State Management: + + - Regularly monitor the memory usage of dictionary tables + - Select an appropriate data update interval and manually refresh the dictionary when data expires on the business side + +## Complete Example + +1. HASH_MAP + + ```sql + -- Create source data table + CREATE TABLE cities ( + city_id INT NOT NULL, + city_name VARCHAR(32) NOT NULL, + region_code VARCHAR(32) NOT NULL + ) ENGINE=OLAP + DISTRIBUTED BY HASH(city_id) BUCKETS 1; + + INSERT INTO cities VALUES + (1, 'Beijing', 'BJ'), + (2, 'Shanghai', 'SH'), + (3, 'Guangzhou', 'GZ'); + + -- Create dictionary table + CREATE DICTIONARY city_code_dict USING cities + ( + city_name KEY, + region_code VALUE + ) + LAYOUT(HASH_MAP) + PROPERTIES('data_lifetime' = '600'); + + -- Query using a dictionary table + SELECT dict_get("test_refresh_dict.city_code_dict", "region_code", "Beijing"); + ``` + + ```text + +------------------------------------------------------------------------+ + | dict_get('test_refresh_dict.city_code_dict', 'region_code', 'Beijing') | + +------------------------------------------------------------------------+ + | BJ | + +------------------------------------------------------------------------+ + ``` + +2. IP_TRIE + + ```sql + CREATE TABLE ip_locations ( + ip_range VARCHAR(30) NOT NULL, + country VARCHAR(64) NOT NULL, + region VARCHAR(64) NOT NULL, + city VARCHAR(64) NOT NULL + ) ENGINE=OLAP + DISTRIBUTED BY HASH(ip_range) BUCKETS 1; + + INSERT INTO ip_locations VALUES + ('1.0.0.0/24', 'United States', 'California', 'Los Angeles'), + ('1.0.1.0/24', 'China', 'Beijing', 'Beijing'), + ('1.0.4.0/24', 'Japan', 'Tokyo', 'Tokyo'); + + -- Create an IP address dictionary table + CREATE DICTIONARY ip_location_dict USING ip_locations + ( + ip_range KEY, + country VALUE, + region VALUE, + city VALUE + ) + LAYOUT(IP_TRIE) + PROPERTIES('data_lifetime' = '600'); + + -- Query the location information corresponding to the IP address, based on CIDR matching. + SELECT + dict_get("test_refresh_dict.ip_location_dict", "country", cast('1.0.0.1' as ipv4)) AS country, + dict_get("test_refresh_dict.ip_location_dict", "region", cast('1.0.0.2' as ipv4)) AS region, + dict_get("test_refresh_dict.ip_location_dict", "city", cast('1.0.0.3' as ipv4)) AS city; + ``` + + ```text + +---------------+------------+-------------+ + | country | region | city | + +---------------+------------+-------------+ + | United States | California | Los Angeles | + +---------------+------------+-------------+ + ``` + +3. 
HASH_MAP with multi-key / multi-value + + ```sql + -- Product SKU Dimension Table: Includes basic product attributes + CREATE TABLE product_sku_info ( + product_id INT NOT NULL COMMENT "商品ID", + color_code VARCHAR(32) NOT NULL COMMENT "颜色编码", + size_code VARCHAR(32) NOT NULL COMMENT "尺码编码", + product_name VARCHAR(128) NOT NULL COMMENT "商品名称", + color_name VARCHAR(32) NOT NULL COMMENT "颜色名称", + size_name VARCHAR(32) NOT NULL COMMENT "尺码名称", + stock INT NOT NULL COMMENT "库存", + price DECIMAL(10,2) NOT NULL COMMENT "价格", + update_time DATETIME NOT NULL COMMENT "更新时间" + ) + DISTRIBUTED BY HASH(`product_id`) BUCKETS 10; + + -- Order Details Table: Records actual sales data + CREATE TABLE order_details ( + order_id BIGINT NOT NULL COMMENT "订单ID", + product_id INT NOT NULL COMMENT "商品ID", + color_code VARCHAR(32) NOT NULL COMMENT "颜色编码", + size_code VARCHAR(32) NOT NULL COMMENT "尺码编码", + quantity INT NOT NULL COMMENT "购买数量", + order_time DATETIME NOT NULL COMMENT "下单时间" + ) + DISTRIBUTED BY HASH(`order_id`) BUCKETS 10; + + -- Insert product SKU data + INSERT INTO product_sku_info VALUES + (1001, 'BLK', 'M', 'Nike运动T恤', '黑色', 'M码', 100, 199.00, '2024-02-23 10:00:00'), + (1001, 'BLK', 'L', 'Nike运动T恤', '黑色', 'L码', 80, 199.00, '2024-02-23 10:00:00'), + (1001, 'WHT', 'M', 'Nike运动T恤', '白色', 'M码', 90, 199.00, '2024-02-23 10:00:00'), + (1001, 'WHT', 'L', 'Nike运动T恤', '白色', 'L码', 70, 199.00, '2024-02-23 10:00:00'), + (1002, 'RED', 'S', 'Adidas运动裤', '红色', 'S码', 50, 299.00, '2024-02-23 10:00:00'), + (1002, 'RED', 'M', 'Adidas运动裤', '红色', 'M码', 60, 299.00, '2024-02-23 10:00:00'), + (1002, 'BLU', 'S', 'Adidas运动裤', '蓝色', 'S码', 55, 299.00, '2024-02-23 10:00:00'), + (1002, 'BLU', 'M', 'Adidas运动裤', '蓝色', 'M码', 65, 299.00, '2024-02-23 10:00:00'); + + -- Insert order data + INSERT INTO order_details VALUES + (10001, 1001, 'BLK', 'M', 2, '2024-02-23 12:01:00'), + (10002, 1001, 'WHT', 'L', 1, '2024-02-23 12:05:00'), + (10003, 1002, 'RED', 'S', 1, '2024-02-23 12:10:00'), + (10004, 1001, 'BLK', 'L', 3, '2024-02-23 12:15:00'), + (10005, 1002, 'BLU', 'M', 2, '2024-02-23 12:20:00'); + + -- Create a multi-key multi-value dictionary + CREATE DICTIONARY sku_dict USING product_sku_info + ( + product_id KEY, + color_code KEY, + size_code KEY, + product_name VALUE, + color_name VALUE, + size_name VALUE, + price VALUE, + stock VALUE + ) + LAYOUT(HASH_MAP) + PROPERTIES('data_lifetime'='300'); + + -- Query example using dict_get_many: Retrieve order details and SKU information + WITH order_sku_info AS ( + SELECT + o.order_id, + o.quantity, + o.order_time, + dict_get_many("test.sku_dict", + ["product_name", "color_name", "size_name", "price", "stock"], + struct(o.product_id, o.color_code, o.size_code) + ) as sku_info + FROM order_details o + WHERE o.order_time >= '2024-02-23 12:00:00' + AND o.order_time < '2024-02-23 13:00:00' + ) + SELECT + order_id, + order_time, + struct_element(sku_info, 'product_name') as product_name, + struct_element(sku_info, 'color_name') as color_name, + struct_element(sku_info, 'size_name') as size_name, + quantity, + struct_element(sku_info, 'price') as unit_price, + quantity * struct_element(sku_info, 'price') as total_amount, + struct_element(sku_info, 'stock') as current_stock + FROM order_sku_info + ORDER BY order_time; + ``` + + ```text + +----------+---------------------+-----------------+------------+-----------+----------+------------+--------------+---------------+ + | order_id | order_time | product_name | color_name | size_name | quantity | unit_price | total_amount | current_stock | + 
+----------+---------------------+-----------------+------------+-----------+----------+------------+--------------+---------------+ + | 10001 | 2024-02-23 12:01:00 | Nike运动T恤 | 黑色 | M码 | 2 | 199.00 | 398.00 | 100 | + | 10002 | 2024-02-23 12:05:00 | Nike运动T恤 | 白色 | L码 | 1 | 199.00 | 199.00 | 70 | + | 10003 | 2024-02-23 12:10:00 | Adidas运动裤 | 红色 | S码 | 1 | 299.00 | 299.00 | 50 | + | 10004 | 2024-02-23 12:15:00 | Nike运动T恤 | 黑色 | L码 | 3 | 199.00 | 597.00 | 80 | + | 10005 | 2024-02-23 12:20:00 | Adidas运动裤 | 蓝色 | M码 | 2 | 299.00 | 598.00 | 65 | + +----------+---------------------+-----------------+------------+-----------+----------+------------+--------------+---------------+ + ``` + +## Troubleshooting + +1. The query reports an error of "can not find dict name" + + Firstly, confirm the existence of the dictionary by using `SHOW DICTIONARIES`. If it exists, refresh the corresponding dictionary data. + +2. The query reports an error of "dict_get() only support IP type for IP_TRIE" + + Confirm whether the Key column of the IP_TRIE type dictionary strictly meets to the CIDR format. + +3. The importing reports an error of "Version ID is not greater than the existing version ID for the dictionary." + + Delete the corresponding dictionary using the `DROP DICTIONARY` command, recreate it, and then import the data. diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md index 55fa14f4f0a5b..6ee70f1dba38e 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md @@ -239,7 +239,7 @@ PROPERTIES('data_lifetime' = '600'); ### 导入(刷新)数据 -字典支持自动与手动导入。 +字典支持自动与手动导入。字典的导入也称为”刷新“操作。 #### 自动导入 @@ -376,6 +376,9 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); ```sql > SHOW DICTIONARIES; +``` + +```text +--------------+----------------+----------------------------------------------+---------+--------+------------------------------------+------------------------------+ | DictionaryId | DictionaryName | BaseTableName | Version | Status | DataDistribution | LastUpdateResult | +--------------+----------------+----------------------------------------------+---------+--------+------------------------------------+------------------------------+ @@ -393,11 +396,13 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); 2. `Status` 代表字典状态,含义如下: - |状态名|含义|可以进行的操作| - |-|-|-| - |NORMAL|字典当前正常|查询、导入、删除| - |LOADING|字典当前正在进行导入|查询| - |OUT_OF_DATE|字典当前数据已过期|查询、导入、删除| + |状态名|含义| + |-|-| + |NORMAL|字典当前正常| + |LOADING|字典当前正在进行导入| + |OUT_OF_DATE|字典当前数据已过期| + + 字典正在导入时,不能再次对其进行导入。 3. `DataDistribution` 表示在各个 BE 的当前状态,包括版本号及内存占用大小(KB)。 @@ -420,9 +425,8 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); 1. 数据一致性 - 字典每次刷新都将产生新的版本,查询时如 BE 记录的版本与 FE 版本不一致,查询将会失败。 - - Doris 不会监控字典表与基表的数据一致性。用户需要通过业务逻辑适时更新。 - - 当源表被删除时,对应的字典表也会被自动删除。 - - 删除数据库时,其中的字典表也会被删除。 + - Doris 不会保持字典表与基表的数据强一致性。用户需要妥善设置字典的 `data_lifetime` 以期自动更新,并在必要时根据业务逻辑辅以手动更新。 + - 当源表以任何方式被删除时,对应的字典表也会被自动删除。 2. 性能考虑 @@ -443,7 +447,7 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); 3. 
状态管理: - 定期监控字典表的内存使用情况 - - 适时更新字典表数据以保持数据的最新状态 + - 选取合适的数据更新间隔,并在业务侧明确数据过期时手动刷新字典。 ## 完整示例 @@ -473,9 +477,6 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); LAYOUT(HASH_MAP) PROPERTIES('data_lifetime' = '600'); - -- 刷新字典 - REFERSH DICTIONARY city_code_dict; - -- 使用字典表查询 SELECT dict_get("test_refresh_dict.city_code_dict", "region_code", "Beijing"); ``` @@ -517,9 +518,6 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); LAYOUT(IP_TRIE) PROPERTIES('data_lifetime' = '600'); - -- 刷新字典表数据 - REFRESH DICTIONARY ip_location_dict; - -- 查询 IP 地址对应的位置信息,依 CIDR 匹配。 SELECT dict_get("test_refresh_dict.ip_location_dict", "country", cast('1.0.0.1' as ipv4)) AS country, From 20d64a0eb888722e8dcf51dc0631cea920d8ddb9 Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Fri, 28 Feb 2025 20:31:34 +0800 Subject: [PATCH 4/5] fix --- docs/query-acceleration/dictionary.md | 15 +++++++++++++++ .../current/query-acceleration/dictionary.md | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/docs/query-acceleration/dictionary.md b/docs/query-acceleration/dictionary.md index aa70f383f0a86..3248ee4ffe81e 100644 --- a/docs/query-acceleration/dictionary.md +++ b/docs/query-acceleration/dictionary.md @@ -198,6 +198,7 @@ Currently, two layout types are supported: |-|-|-|-| |`date_lifetime`|Integer, unit in seconds|Data validity period. When the time since the last update of this dictionary exceeds this value, it will automatically initiate a import. The import logic is detailed in [Automatic Import](#automatic-import)|Yes| |`skip_null_key`|Boolean|If the Key column contains null values when load to a dictionary, skip the row if the value is `true`, otherwise raise an error. The default value is `false`|No| +|`memory_limit`|Integer, unit in bytes|The upper limit of memory occupied by this dictionary on a single BE. The deafult value is `2147483648`, which equals to 2GB.|No| ### Example @@ -264,6 +265,7 @@ Among them, `` is the name of the dictionary to be imported. 1. Only dictionaries that have imported data can be queried. 2. If the Key column has duplicate values during import, the import transaction will fail. 3. If there is already an ongoing import transaction at the moment (dictionary Status is `LOADING`), the manual import will fail. Please wait until the ongoing import is completed before proceeding. +4. If the size of the imported dictionary exceeds the set `memory_limit`, the import transaction will fail. ### Query Dictionary @@ -449,6 +451,7 @@ To view the column definitions of the dictionary table, you can use `DESC DICTIO - Regularly monitor the memory usage of dictionary tables - Select an appropriate data update interval and manually refresh the dictionary when data expires on the business side + - When using dictionary tables, pay attention to the BE memory monitoring to prevent the dictionary tables from being too numerous or too large, occupying excessive memory and causing abnormal BE status. ## Complete Example @@ -646,3 +649,15 @@ To view the column definitions of the dictionary table, you can use `DESC DICTIO 3. The importing reports an error of "Version ID is not greater than the existing version ID for the dictionary." Delete the corresponding dictionary using the `DROP DICTIONARY` command, recreate it, and then import the data. + +4. `SHOW DICTIONARIES` result shows that the dictionary is in a BE version greater than the FE version. 
+ + Delete the corresponding dictionary using the `DROP DICTIONARY` command, recreate it, and then import the data. + +5. The importing reports an error of "Dictionary `X` commit version `Y` failed" + + Re-refresh the dictionary. + +6. Contingency Strategy + + For the vast majority of error messages, if normal operation fails, rebuilding the dictionary after `DROP` can resolve the issue. diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md index 6ee70f1dba38e..40f027d37d034 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md @@ -198,6 +198,7 @@ PROPERTIES( |-|-|-|-| |`date_lifetime`|整数,单位为秒|数据有效期。当该字典上次更新距今时间超过该值时,将会自动发起重新导入,导入逻辑详见[自动导入](#自动导入)|是| |`skip_null_key`|布尔值|向字典导入时如果 Key 列中出现 null 值,如果该值为 `true`,跳过该行数据,否则报错。缺省值为 `false`|否| +|`memory_limit`|整数,单位为 byte|该字典在单一 BE 上所占内存的上限,缺省值为 `2147483648` 即 2GB|否| ### 示例 @@ -264,6 +265,7 @@ REFRESH DICTIONARY ; 1. 只有导入数据后的字典才可以查询。 2. 如果导入时 Key 列具有重复值,导入事务会失败。 3. 如果当前已经有导入事务正在进行(字典 Status 为 `LOADING` ),则手动进行的导入会失败。请等待正在进行的导入完成后操作。 +4. 如果导入的字典大小超过设定的 `memory_limit`,导入事务会失败。 ### 查询字典 @@ -448,6 +450,7 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); - 定期监控字典表的内存使用情况 - 选取合适的数据更新间隔,并在业务侧明确数据过期时手动刷新字典。 + - 使用字典表时应关注 BE 内存监控,防止字典表过多、过大占据过多内存,导致 BE 状态异常。 ## 完整示例 @@ -648,3 +651,15 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); 3. 导入报错 "Version ID is not greater than the existing version ID for the dictionary." 通过 `DROP DICTIONARY` 命令删除对应字典后重新建立并导入数据。 + +4. `SHOW DICTIONARIES` 发现字典在某个 BE 的 Version 大于 FE Version + + 通过 `DROP DICTIONARY` 命令删除对应字典后重新建立并导入数据。 + +5. 导入报错 "Dictionary `X` commit version `Y` failed" + + 重新对该字典进行导入。 + +6. 兜底策略 + + 对于绝大多数报错,如果正常操作失败, `DROP` 之后重建字典可以解决。 From db158278f1b5e8ff89761625b43228cb842cc81d Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Mon, 3 Mar 2025 16:32:29 +0800 Subject: [PATCH 5/5] config items --- docs/query-acceleration/dictionary.md | 9 +++++++++ .../current/query-acceleration/dictionary.md | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/docs/query-acceleration/dictionary.md b/docs/query-acceleration/dictionary.md index 3248ee4ffe81e..cd79b71625e47 100644 --- a/docs/query-acceleration/dictionary.md +++ b/docs/query-acceleration/dictionary.md @@ -372,6 +372,15 @@ The dictionary table supports the following management and viewing statements: After deleting the dictionary table, the deleted dictionary may not be removed from BE immediately. +#### Config Item + +The dictionary table supports the following configuration items, all of which are FE CONFIG: + +1. `dictionary_task_queue_size` —— The queue length of the thread pool for all tasks in the dictionary is not dynamically adjustable. The default value is 1024, and it is generally not necessary to adjust it. +2. `job_dictionary_task_consumer_thread_num` —— The number of threads in the thread pool for all tasks in the dictionary is not dynamically adjustable. Default value is 3. +3. `dictionary_rpc_timeout_ms` —— The timeout duration for all related RPCs in the dictionary can be dynamically adjusted. The default is 5000 (i.e., 5 seconds), and it generally does not need to be adjusted. +4. 
`dictionary_auto_refresh_interval_seconds` —— The interval for automatically checking if all dictionary data is up to date is default 60 (seconds), and it can be dynamically adjusted. + ### Status Display By using the `SHOW DICTIONARIES` statement, you can view the base table corresponding to the dictionary, the current data version number, and the corresponding status in FE and BE: diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md index 40f027d37d034..9f2a91704b5bd 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/dictionary.md @@ -372,6 +372,15 @@ SELECT dict_get_many("test_db.multi_key_dict", ["k2", "k3"], struct(2, 'ABC')); 删除字典表后,被删除的字典可能不会立即从 BE 中移除。 +#### 配置项 + +字典表支持以下配置项,均为 FE CONFIG: + +1. `dictionary_task_queue_size` —— 字典所有任务的线程池的队列长度,不可动态调整。默认值 1024,一般不需要调整。 +2. `job_dictionary_task_consumer_thread_num` —— 字典所有任务的线程池的线程数量,不可动态调整。默认值 3。 +3. `dictionary_rpc_timeout_ms` —— 字典所有相关 rpc 的超时时间,可以动态调整。默认 5000(即 5s),一般不需要调整。 +4. `dictionary_auto_refresh_interval_seconds` —— 自动检查所有字典数据是否过期的间隔,默认 60(秒),可以动态调整。 + ### 状态显示 通过 `SHOW DICTIONARIES` 语句,可以查看字典对应的基表,当前数据版本号,以及对应在 FE 和 BE 的状态。
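对于上述配置项中可以动态调整的参数(如 `dictionary_rpc_timeout_ms`、`dictionary_auto_refresh_interval_seconds`),可以参考如下草图在运行时查看和调整。此处假设使用 Doris 通用的 `ADMIN SHOW FRONTEND CONFIG` / `ADMIN SET FRONTEND CONFIG` 语法,取值仅作演示:

```sql
-- 查看字典相关的 FE 配置项当前取值
ADMIN SHOW FRONTEND CONFIG LIKE "dictionary_%";

-- 将自动过期检查间隔临时调整为 30 秒(仅对可动态调整的配置项生效)
ADMIN SET FRONTEND CONFIG ("dictionary_auto_refresh_interval_seconds" = "30");
```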