HUSTAI
diff --git a/‎README.md
+161-63 b/‎README.md
+161-63
diff --git a/‎convert.py
+51-1 b/‎convert.py
+51-1
diff --git a/‎doccano.md
+92-7 b/‎doccano.md
+92-7
diff --git a/‎doccano.py
+54-19 b/‎doccano.py
+54-19
@@ -33,10 +33,11 @@
 from utils import logger
 
 MODEL_MAP = {
+    # vocab.txt/special_tokens_map.json/tokenizer_config.json are common to the default model.
     "uie-base": {
         "resource_file_urls": {
             "model_state.pdparams":
-            "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_v0.1/model_state.pdparams",
+            "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_v1.0/model_state.pdparams",
             "model_config.json":
             "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json",
             "vocab_file":
@@ -117,6 +118,55 @@
             "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json",
         }
     },
+    "uie-base-en": {
+        "resource_file_urls": {
+            "model_state.pdparams":
+            "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_en_v1.1/model_state.pdparams",
+            "model_config.json":
+            "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_en/model_config.json",
+            "vocab_file":
+            "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_en/vocab.txt",
+            "special_tokens_map":
+            "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_en/special_tokens_map.json",
+            "tokenizer_config":
+            "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_en/tokenizer_config.json",
+        }
+    },
+    # The uie-m models depend on the Ernie-M model; their entries are commented out below.
+    # "uie-m-base": {
+    #     "resource_file_urls": {
+    #         "model_state.pdparams":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base_v1.0/model_state.pdparams",
+    #         "model_config.json":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/model_config.json",
+    #         "vocab_file":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/vocab.txt",
+    #         "special_tokens_map":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/special_tokens_map.json",
+    #         "tokenizer_config":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/tokenizer_config.json",
+    #         "sentencepiece_model_file":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/sentencepiece.bpe.model"
+
+    #     }
+    # },
+    # "uie-m-large": {
+    #     "resource_file_urls": {
+    #         "model_state.pdparams":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large_v1.0/model_state.pdparams",
+    #         "model_config.json":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large/model_config.json",
+    #         "vocab_file":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large/vocab.txt",
+    #         "special_tokens_map":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large/special_tokens_map.json",
+    #         "tokenizer_config":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large/tokenizer_config.json",
+    #         "sentencepiece_model_file":
+    #         "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/sentencepiece.bpe.model"
+    #     }
+    # },
+    # `uie-tiny` is being renamed to `uie-medium`; the name `uie-tiny` will be deprecated in the future.
     "uie-tiny": {
         "resource_file_urls": {
             "model_state.pdparams":
 
@@ -105,6 +105,15 @@ Relation类型标签构建示例：
 
 示例中定义了`时间`、`选手`、`赛事名称`和`得分`四种Span类型标签。
 
+```text
+schema = [
+    '时间',
+    '选手',
+    '赛事名称',
+    '得分'
+]
+```
+
 #### 5.2 关系抽取
 
 关系抽取（Relation Extraction，简称RE），是指从文本中识别实体并抽取实体之间的语义关系，即抽取三元组（实体一，关系类型，实体二）。
@@ -117,6 +126,18 @@ Relation类型标签构建示例：
 
 示例中定义了`作品名`、`人物名`和`时间`三种Span类型标签，以及`歌手`、`发行时间`和`所属专辑`三种Relation标签。Relation标签**由Subject对应实体指向Object对应实体**。
 
+该标注示例对应的schema为：
+
+```text
+schema = {
+    '作品名': [
+        '歌手',
+        '发行时间',
+        '所属专辑'
+    ]
+}
+```
+
 #### 5.3 事件抽取
 
 事件抽取 (Event Extraction, 简称EE)，是指从自然语言文本中抽取事件并识别事件类型和事件论元的技术。UIE所包含的事件抽取任务，是指根据已知事件类型，抽取该事件所包含的事件论元。
@@ -129,6 +150,17 @@ Relation类型标签构建示例：
 
 示例中定义了`地震触发词`（触发词）、`等级`（事件论元）和`时间`（事件论元）三种Span标签，以及`时间`和`震级`两种Relation标签。触发词标签**统一格式为`XX触发词`**，`XX`表示具体事件类型，上例中的事件类型是`地震`，则对应触发词为`地震触发词`。Relation标签**由触发词指向对应的事件论元**。
 
+该标注示例对应的schema为：
+
+```text
+schema = {
+    '地震触发词': [
+        '时间',
+        '震级'
+    ]
+}
+```
+
 #### 5.4 评价观点抽取
 
 评论观点抽取，是指抽取文本中包含的评价维度、观点词。
@@ -141,7 +173,15 @@ Relation类型标签构建示例：
 
 示例中定义了`评价维度`和`观点词`两种Span标签，以及`观点词`一种Relation标签。Relation标签**由评价维度指向观点词**。
 
-#### 5.5 分类任务
+该标注示例对应的schema为：
+
+```text
+schema = {
+    '评价维度': '观点词'
+}
+```
+
+#### 5.5 句子级分类任务
 
 标注示例：
 
@@ -151,11 +191,38 @@ Relation类型标签构建示例：
 
 示例中定义了`正向`和`负向`两种类别标签对文本的情感倾向进行分类。
 
+该标注示例对应的schema为：
+
+```text
+schema = '情感倾向[正向，负向]'
+```
+
+#### 5.6 实体/评价维度级分类任务
+
+<div align="center">
+    <img src=https://user-images.githubusercontent.com/40840292/172628328-878923d7-8c5d-4667-a0e2-b92bce89b47c.png height=200 hspace='20'/>
+</div>
+
+标注示例：
+
+示例中定义了`评价维度##正向`，`评价维度##负向`和`观点词`三种Span标签以及`观点词`一种Relation标签。其中，`##`是实体类别/评价维度与分类标签的分隔符（可通过doccano.py中的separator参数自定义）。
+
+该标注示例对应的schema为：
+
+```text
+schema = {
+    '评价维度': [
+        '观点词',
+        '情感倾向[正向，负向]'
+    ]
+}
+```
+
 <a name="数据导出"></a>
 
 ## 6. 数据导出
 
-#### 6.1 导出抽取式任务数据
+#### 6.1 导出抽取式和实体/评价维度级分类任务数据
 
 选择导出的文件类型为``JSONL(relation)``，导出数据示例：
 
@@ -226,7 +293,7 @@ Relation类型标签构建示例：
     - ``to_id``: Span2对应的标识ID。
     - ``type``: Relation类型。
 
-#### 6.2 导出分类式任务数据
+#### 6.2 导出句子级分类任务数据
 
 选择导出的文件类型为``JSONL``，导出数据示例：
 
@@ -264,10 +331,10 @@ python doccano.py \
     --negative_ratio 5
 ```
 
-#### 7.2 分类式任务数据转换
+#### 7.2 句子级分类任务数据转换
 
 - 当标注完成后，在 doccano 平台上导出 `JSONL` 形式的文件，并将其重命名为 `doccano_cls.json` 后，放入 `./data` 目录下。
-- 在数据转换阶段，我们会自动构造用于模型训练需要的prompt信息。例如句子级情感分类中，prompt为``情感倾向[正向,负向]``，可以通过`prompt_prefix`和`options`参数进行声明。
+- 在数据转换阶段，我们会自动构造用于模型训练的prompt信息。例如句子级情感分类中，prompt为``情感倾向[正向,负向]``，可以通过`prompt_prefix`和`options`参数进行声明。
 - 通过 [doccano.py](./doccano.py) 脚本进行数据形式转换，然后便可以开始进行相应模型训练。
 
 ```shell
@@ -280,17 +347,35 @@ python doccano.py \
     --options "正向" "负向"
 ```
 
+#### 7.3 实体/评价维度级分类任务数据转换
+
+- 当标注完成后，在 doccano 平台上导出 `JSONL(relation)` 形式的文件，并将其重命名为 `doccano_ext.json` 后，放入 `./data` 目录下。
+- 在数据转换阶段，我们会自动构造用于模型训练的prompt信息。例如评价维度级情感分类中，prompt为``XXX的情感倾向[正向,负向]``，可以通过`prompt_prefix`和`options`参数进行声明。
+- 通过 [doccano.py](./doccano.py) 脚本进行数据形式转换，然后便可以开始进行相应模型训练。
+
+```shell
+python doccano.py \
+    --doccano_file ./data/doccano_ext.json \
+    --task_type "ext" \
+    --save_dir ./data \
+    --splits 0.8 0.1 0.1 \
+    --prompt_prefix "情感倾向" \
+    --options "正向" "负向" \
+    --separator "##"
+```
+
 可配置参数说明：
 
 - ``doccano_file``: 从doccano导出的数据标注文件。
 - ``save_dir``: 训练数据的保存目录，默认存储在``data``目录下。
 - ``negative_ratio``: 最大负例比例，该参数只对抽取类型任务有效，适当构造负例可提升模型效果。负例数量和实际的标签数量有关，最大负例数量 = negative_ratio * 正例数量。该参数只对训练集有效，默认为5。为了保证评估指标的准确性，验证集和测试集默认构造全负例。
 - ``splits``: 划分数据集时训练集、验证集和测试集所占的比例。默认为[0.8, 0.1, 0.1]，表示按照``8:1:1``的比例将数据划分为训练集、验证集和测试集。
 - ``task_type``: 选择任务类型，可选有抽取和分类两种类型的任务。
-- ``options``: 指定分类任务的类别标签，该参数只对分类类型任务有效。
-- ``prompt_prefix``: 声明分类任务的prompt前缀信息，该参数只对分类类型任务有效。
+- ``options``: 指定分类任务的类别标签，该参数只对分类任务（句子级分类和实体/评价维度级分类）有效。默认为["正向", "负向"]。
+- ``prompt_prefix``: 声明分类任务的prompt前缀信息，该参数只对分类任务（句子级分类和实体/评价维度级分类）有效。默认为"情感倾向"。
 - ``is_shuffle``: 是否对数据集进行随机打散，默认为True。
 - ``seed``: 随机种子，默认为1000。
+- ``separator``: 实体类别/评价维度与分类标签的分隔符，该参数只对实体/评价维度级分类任务有效。默认为"##"。
 
 备注：
 - 默认情况下 [doccano.py](./doccano.py) 脚本会按照比例将数据划分为 train/dev/test 数据集
 
@@ -1,4 +1,5 @@
-# Copyright (c) 2022 Heiheiyoyo. All Rights Reserved.
+# coding=utf-8
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -48,12 +49,16 @@ def _check_sum(splits):
         raw_examples = f.readlines()
 
     def _create_ext_examples(examples,
-                             negative_ratio=0,
+                             negative_ratio,
+                             prompt_prefix="情感倾向",
+                             options=["正向", "负向"],
+                             separator="##",
                              shuffle=False,
                              is_train=True):
-        entities, relations = convert_ext_examples(
-            examples, negative_ratio, is_train=is_train)
-        examples = entities + relations
+        entities, relations, aspects = convert_ext_examples(
+            examples, negative_ratio, prompt_prefix, options, separator,
+            is_train)
+        examples = entities + relations + aspects
         if shuffle:
             indexes = np.random.permutation(len(examples))
             examples = [examples[i] for i in indexes]
@@ -81,34 +86,62 @@ def _save_examples(save_dir, file_name, examples):
     if len(args.splits) == 0:
         if args.task_type == "ext":
             examples = _create_ext_examples(raw_examples, args.negative_ratio,
-                                            args.is_shuffle)
+                                            args.prompt_prefix, args.options,
+                                            args.separator, args.is_shuffle)
         else:
             examples = _create_cls_examples(raw_examples, args.prompt_prefix,
                                             args.options, args.is_shuffle)
         _save_examples(args.save_dir, "train.txt", examples)
     else:
         if args.is_shuffle:
             indexes = np.random.permutation(len(raw_examples))
+            index_list = indexes.tolist()
             raw_examples = [raw_examples[i] for i in indexes]
 
         i1, i2, _ = args.splits
         p1 = int(len(raw_examples) * i1)
         p2 = int(len(raw_examples) * (i1 + i2))
 
+        train_ids = index_list[:p1]
+        dev_ids = index_list[p1:p2]
+        test_ids = index_list[p2:]
+
+        with open(os.path.join(args.save_dir, "sample_index.json"), "w") as fp:
+            maps = {
+                "train_ids": train_ids,
+                "dev_ids": dev_ids,
+                "test_ids": test_ids
+            }
+            fp.write(json.dumps(maps))
+
         if args.task_type == "ext":
-            train_examples = _create_ext_examples(
-                raw_examples[:p1], args.negative_ratio, args.is_shuffle)
-            dev_examples = _create_ext_examples(
-                raw_examples[p1:p2], -1, is_train=False)
-            test_examples = _create_ext_examples(
-                raw_examples[p2:], -1, is_train=False)
+            train_examples = _create_ext_examples(raw_examples[:p1],
+                                                  args.negative_ratio,
+                                                  args.prompt_prefix,
+                                                  args.options, args.separator,
+                                                  args.is_shuffle)
+            dev_examples = _create_ext_examples(raw_examples[p1:p2],
+                                                -1,
+                                                args.prompt_prefix,
+                                                args.options,
+                                                args.separator,
+                                                is_train=False)
+            test_examples = _create_ext_examples(raw_examples[p2:],
+                                                 -1,
+                                                 args.prompt_prefix,
+                                                 args.options,
+                                                 args.separator,
+                                                 is_train=False)
         else:
-            train_examples = _create_cls_examples(
-                raw_examples[:p1], args.prompt_prefix, args.options)
-            dev_examples = _create_cls_examples(
-                raw_examples[p1:p2], args.prompt_prefix, args.options)
-            test_examples = _create_cls_examples(
-                raw_examples[p2:], args.prompt_prefix, args.options)
+            train_examples = _create_cls_examples(raw_examples[:p1],
+                                                  args.prompt_prefix,
+                                                  args.options)
+            dev_examples = _create_cls_examples(raw_examples[p1:p2],
+                                                args.prompt_prefix,
+                                                args.options)
+            test_examples = _create_cls_examples(raw_examples[p2:],
+                                                 args.prompt_prefix,
+                                                 args.options)
 
         _save_examples(args.save_dir, "train.txt", train_examples)
         _save_examples(args.save_dir, "dev.txt", dev_examples)
@@ -138,7 +171,9 @@ def _save_examples(save_dir, file_name, examples):
     parser.add_argument("--is_shuffle", default=True, type=bool,
                         help="Whether to shuffle the labeled dataset, defaults to True.")
     parser.add_argument("--seed", type=int, default=1000,
-                        help="random seed for initialization")
+                        help="Random seed for initialization")
+    parser.add_argument("--separator", type=str, default='##',
+                        help="Used only for entity/aspect-level classification task, separator for entity label and classification label")
 
     args = parser.parse_args()
     # yapf: enable