Release 1.6.0

IgnatovFedor · web-flow · commit 6e1036dbfcde · 2024-03-13T12:36:57.000+03:00
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -19,7 +19,7 @@ node('cuda-module') {
                     docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1
                     docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py38 py39
                     docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1
-                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py310
+                    docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py310 py311
                     docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 || exit 0
                 """
                 currentBuild.result = 'SUCCESS'
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 [![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE)
-![Python 3.6, 3.7, 3.8, 3.9, 3.10](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-green.svg)
+![Python 3.6, 3.7, 3.8, 3.9, 3.10, 3.11](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-green.svg)
 [![Downloads](https://pepy.tech/badge/deeppavlov)](https://pepy.tech/project/deeppavlov)
 <img align="right" height="27%" width="27%" src="docs/_static/deeppavlov_logo.png"/>
 
diff --git a/deeppavlov/_meta.py b/deeppavlov/_meta.py
@@ -1,4 +1,4 @@
-__version__ = '1.5.0'
+__version__ = '1.6.0'
 __author__ = 'Neural Networks and Deep Learning lab, MIPT'
 __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.'
 __keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot']
diff --git a/deeppavlov/configs/ner/ner_bert_base.json b/deeppavlov/configs/ner/ner_bert_base.json
@@ -0,0 +1,55 @@
+{
+  "chainer": {
+    "in": ["x"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_ner_preprocessor",
+        "vocab_file": "{BASE_MODEL}",
+        "in": ["x"],
+        "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
+      },
+      {
+        "id": "tag_vocab",
+        "class_name": "simple_vocab",
+        "unk_token": ["O"],
+        "save_path": "{MODEL_PATH}/tag.dict",
+        "load_path": "{MODEL_PATH}/tag.dict",
+        "fit_on": ["y"],
+        "in": ["y"],
+        "out": ["y_ind"]
+      },
+      {
+        "class_name": "torch_transformers_sequence_tagger",
+        "n_tags": "#tag_vocab.len",
+        "pretrained_bert": "{BASE_MODEL}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
+        "in_y": ["y_ind"],
+        "out": ["y_pred_ind", "probas"]
+      },
+      {
+        "ref": "tag_vocab",
+        "in": ["y_pred_ind"],
+        "out": ["y_pred"]
+      }
+    ],
+    "out": ["x_tokens", "y_pred"]
+  },
+  "metadata": {
+    "variables": {
+      "BASE_MODEL": "bert-base-multilingual-cased",
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "MODEL_PATH": "{MODELS_PATH}/ner/{BASE_MODEL}"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/v1/ner/ner_bert_base.tar.gz",
+        "subdir": "{MODEL_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/core/data/utils.py b/deeppavlov/core/data/utils.py
@@ -78,7 +78,7 @@ def s3_download(url: str, destination: str) -> None:
         file_object.download_file(destination, Callback=pbar.update)
 
 
-def simple_download(url: str, destination: Union[Path, str], headers: Optional[dict] = None) -> None:
+def simple_download(url: str, destination: Union[Path, str], headers: Optional[dict] = None, n_tries: int = 3) -> None:
     """Download a file from URL to target location.
 
     Displays a progress bar to the terminal during the download process.
@@ -87,58 +87,66 @@ def simple_download(url: str, destination: Union[Path, str], headers: Optional[d
         url: The source URL.
         destination: Path to the file destination (including file name).
         headers: Headers for file server.
+        n_tries: Number of retries if download fails.
 
     """
-    destination = Path(destination)
-    destination.parent.mkdir(parents=True, exist_ok=True)
-
-    log.info('Downloading from {} to {}'.format(url, destination))
-
-    if url.startswith('s3://'):
-        return s3_download(url, str(destination))
-
-    chunk_size = 32 * 1024
-    temporary = destination.with_suffix(destination.suffix + '.part')
-
-    r = requests.get(url, stream=True, headers=headers)
-    if r.status_code != 200:
-        raise RuntimeError(f'Got status code {r.status_code} when trying to download {url}')
-    total_length = int(r.headers.get('content-length', 0))
-
-    if temporary.exists() and temporary.stat().st_size > total_length:
-        temporary.write_bytes(b'')  # clearing temporary file when total_length is inconsistent
-
-    with temporary.open('ab') as f:
-        downloaded = f.tell()
-        if downloaded != 0:
-            log.warning(f'Found a partial download {temporary}')
-        with tqdm(initial=downloaded, total=total_length, unit='B', unit_scale=True) as pbar:
-            while True:
-                if downloaded != 0:
-                    log.warning(f'Download stopped abruptly, trying to resume from {downloaded} '
-                                f'to reach {total_length}')
-                    headers['Range'] = f'bytes={downloaded}-'
-                    r = requests.get(url, headers=headers, stream=True)
-                    if 'content-length' not in r.headers or \
-                            total_length - downloaded != int(r.headers['content-length']):
-                        raise RuntimeError('It looks like the server does not support resuming downloads.')
-
-                try:
-                    for chunk in r.iter_content(chunk_size=chunk_size):
-                        if chunk:  # filter out keep-alive new chunks
-                            downloaded += len(chunk)
-                            pbar.update(len(chunk))
-                            f.write(chunk)
-                except requests.exceptions.ChunkedEncodingError:
-                    if downloaded == 0:
-                        r = requests.get(url, stream=True, headers=headers)
-
-                if downloaded >= total_length:
-                    # Note that total_length is 0 if the server didn't return the content length,
-                    # in this case we perform just one iteration and assume that we are done.
-                    break
-
-    temporary.rename(destination)
+    try:
+        destination = Path(destination)
+        destination.parent.mkdir(parents=True, exist_ok=True)
+
+        log.info('Downloading from {} to {}'.format(url, destination))
+
+        if url.startswith('s3://'):
+            return s3_download(url, str(destination))
+
+        chunk_size = 32 * 1024
+        temporary = destination.with_suffix(destination.suffix + '.part')
+
+        r = requests.get(url, stream=True, headers=headers)
+        if r.status_code != 200:
+            raise RuntimeError(f'Got status code {r.status_code} when trying to download {url}')
+        total_length = int(r.headers.get('content-length', 0))
+
+        if temporary.exists() and temporary.stat().st_size > total_length:
+            temporary.write_bytes(b'')  # clearing temporary file when total_length is inconsistent
+
+        with temporary.open('ab') as f:
+            downloaded = f.tell()
+            if downloaded != 0:
+                log.warning(f'Found a partial download {temporary}')
+            with tqdm(initial=downloaded, total=total_length, unit='B', unit_scale=True) as pbar:
+                while True:
+                    if downloaded != 0:
+                        log.warning(f'Download stopped abruptly, trying to resume from {downloaded} '
+                                    f'to reach {total_length}')
+                        headers['Range'] = f'bytes={downloaded}-'
+                        r = requests.get(url, headers=headers, stream=True)
+                        if 'content-length' not in r.headers or \
+                                total_length - downloaded != int(r.headers['content-length']):
+                            raise RuntimeError('It looks like the server does not support resuming downloads.')
+
+                    try:
+                        for chunk in r.iter_content(chunk_size=chunk_size):
+                            if chunk:  # filter out keep-alive new chunks
+                                downloaded += len(chunk)
+                                pbar.update(len(chunk))
+                                f.write(chunk)
+                    except requests.exceptions.ChunkedEncodingError:
+                        if downloaded == 0:
+                            r = requests.get(url, stream=True, headers=headers)
+
+                    if downloaded >= total_length:
+                        # Note that total_length is 0 if the server didn't return the content length,
+                        # in this case we perform just one iteration and assume that we are done.
+                        break
+
+        temporary.rename(destination)
+    except Exception as e:
+        if n_tries > 0:
+            log.warning(f'Download failed: {e}, retrying')
+            simple_download(url, destination, headers, n_tries - 1)
+        else:
+            raise e
 
 
 def download(dest_file_path: [List[Union[str, Path]]], source_url: str, force_download: bool = True,
diff --git a/deeppavlov/requirements/datasets.txt b/deeppavlov/requirements/datasets.txt
@@ -1 +1,2 @@
-datasets>=1.16.0,<2.5.0
+datasets>=1.16.0,<2.5.0;python_version<="3.10"
+datasets==2.2.*;python_version=="3.11.*"
diff --git a/deeppavlov/requirements/faiss.txt b/deeppavlov/requirements/faiss.txt
@@ -1 +1,2 @@
-faiss-cpu==1.7.2
+faiss-cpu==1.7.2;python_version<="3.10"
+faiss-cpu==1.7.4;python_version=="3.11.*"
diff --git a/deeppavlov/requirements/kenlm.txt b/deeppavlov/requirements/kenlm.txt
@@ -1 +1,2 @@
-pypi-kenlm==0.1.20220713
+pypi-kenlm==0.1.20220713;python_version<="3.10"
+kenlm==0.2.*;python_version=="3.11.*"
diff --git a/docs/features/models/KBQA.ipynb b/docs/features/models/KBQA.ipynb
@@ -22,13 +22,13 @@
     "    \n",
     "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
     "\n",
-    "    4.3. [Using entity linking and Wiki parser as standalone services for KBQA](#4.3-Using-entity-linking-and-Wiki-parser-as-standalone-services-for-KBQA)\n",
+    "    4.3. [Using entity linking and Wiki parser as standalone services for KBQA](#4.3-Using-entity-linking-and-Wiki-parser-as-standalone-tools-for-KBQA)\n",
     "     \n",
     "5. [Customize the model](#5.-Customize-the-model)\n",
     "    \n",
-    "    5.1. [Train your model from Python](#5.1-Train-your-model-from-Python)\n",
+    "    5.1. [Description of config parameters](#5.1-Description-of-config-parameters)\n",
     "    \n",
-    "    5.2. [Train your model from CLI](#5.2-Train-your-model-from-CLI)\n",
+    "    5.2. [Train KBQA components](#5.2-Train-KBQA-components)\n",
     "\n",
     "# 1. Introduction to the task\n",
     "\n",
diff --git a/docs/features/models/NER.ipynb b/docs/features/models/NER.ipynb
@@ -22,7 +22,7 @@
     "    \n",
     "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
     "    \n",
-    "5. [Evaluate](#6.-Evaluate)\n",
+    "5. [Evaluate](#5.-Evaluate)\n",
     "    \n",
     "    5.1. [Evaluate from Python](#5.1-Evaluate-from-Python)\n",
     "    \n",
diff --git a/docs/features/models/SQuAD.ipynb b/docs/features/models/SQuAD.ipynb
@@ -105,7 +105,7 @@
     "`squad_bert` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
     "\n",
     "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n",
-    "The full list of the models with their config names can be found in the [table](#6.-Models-list).\n",
+    "The full list of the models with their config names can be found in the [table](#3.-Models-list).\n",
     "\n",
     "# 3. Models list\n",
     "\n",
diff --git a/docs/features/models/classification.ipynb b/docs/features/models/classification.ipynb
@@ -162,7 +162,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 3.2 Predict using CLI\n",
+    "## 4.2 Predict using CLI\n",
     "\n",
     "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
    ]
@@ -198,9 +198,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# 4. Evaluation\n",
+    "# 5. Evaluation\n",
     "\n",
-    "## 4.1 Evaluate from Python"
+    "## 5.1 Evaluate from Python"
    ]
   },
   {
@@ -218,7 +218,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 4.2 Evaluate from CLI"
+    "## 5.2 Evaluate from CLI"
    ]
   },
   {
@@ -234,9 +234,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# 5. Customize the model\n",
+    "# 6. Train the model on your data\n",
     "\n",
-    "## 5.1 Train your model from Python\n",
+    "## 6.1 Train your model from Python\n",
     "\n",
     "### Provide your data path\n",
     "\n",
@@ -346,7 +346,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 5.2 Train your model from CLI\n",
+    "## 6.2 Train your model from CLI\n",
     "\n",
     "To train the model on your data, create a copy of a config file and change the *data_path* variable in it. After that, train the model using your new *config_file*. You can also change any of the hyperparameters of the model."
    ]
diff --git a/docs/features/models/few_shot_classification.ipynb b/docs/features/models/few_shot_classification.ipynb
@@ -119,7 +119,7 @@
     "\n",
     "## 4.2 Predict using Python\n",
     "\n",
-    "After [installing](#4.-Get-started-with-the-model) the model, build it from the config and predict."
+    "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict."
    ]
   },
   {
@@ -192,7 +192,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 4.2 Predict using CLI\n",
+    "## 4.3 Predict using CLI\n",
     "\n",
     "You can also get predictions in an interactive mode through CLI (Сommand Line Interface)."
    ]
diff --git a/docs/features/models/morpho_tagger.ipynb b/docs/features/models/morpho_tagger.ipynb
@@ -22,7 +22,7 @@
     "\n",
     "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
     "\n",
-    "5. [Customize the model](#4.-Customize-the-model)\n",
+    "5. [Customize the model](#5.-Customize-the-model)\n",
     "\n",
     "# 1. Introduction to the task\n",
     "\n",
diff --git a/docs/features/models/relation_extraction.ipynb b/docs/features/models/relation_extraction.ipynb
@@ -198,7 +198,7 @@
     "|NUM    | Percents, money, quantities                                                                    |\n",
     "|MISC   | Products, including vehicles, weapons, etc. <br> Events, including elections, battles, sporting MISC events, etc. Laws, cases, languages, etc.   |\n",
     "\n",
-    "**Model Output**: one or several of the [97 relations](#5.1-Relations-used-in-English-model) found between the given entities; relation id in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) (e.g. 'P26') and relation name ('spouse').\n",
+    "**Model Output**: one or several of the [97 relations](#6.1-Relations-used-in-English-model) found between the given entities; relation id in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) (e.g. 'P26') and relation name ('spouse').\n",
     "\n",
     "### Russian"
    ]
@@ -244,8 +244,34 @@
     "- list of entities positions (i.e. all start and end positions of both entities' mentions)\n",
     "- list of NER tags of both entities.\n",
     "\n",
-    "**Model Output**: one or several of the [30 relations](#5.2-Relations-used-in-Russian-model) found between the given entities; a Russian relation name (e.g. \"участник\") or an English one, if Russian one is unavailable, and, if applicable, its id in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) (e.g. 'P710').\n",
+    "**Model Output**: one or several of the [30 relations](#6.2-Relations-used-in-Russian-model) found between the given entities; a Russian relation name (e.g. \"участник\") or an English one, if Russian one is unavailable, and, if applicable, its id in [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) (e.g. 'P710').\n",
     "\n",
+    "## 4.2 Predict using CLI\n",
+    "\n",
+    "You can also get predictions in an interactive mode through CLI."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! python -m deeppavlov interact re_docred [-d]\n",
+    "! python -m deeppavlov interact re_rured [-d]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`-d` is an optional download key (alternative to `download=True` in Python code). It is used to download the pre-trained model along with embeddings and all other files needed to run the model."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "# 5. Customize the model\n",
     "\n",
     "## 5.1 Description of config parameters\n",
diff --git a/docs/features/models/spelling_correction.ipynb b/docs/features/models/spelling_correction.ipynb
@@ -22,7 +22,7 @@
     "\n",
     "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
     "\n",
-    "5. [Customize the model](#4.-Customize-the-model)\n",
+    "5. [Customize the model](#5.-Customize-the-model)\n",
     "\n",
     "    5.1. [Training configuration](#5.1-Training-configuration)\n",
     "\n",
diff --git a/docs/features/models/syntax_parser.ipynb b/docs/features/models/syntax_parser.ipynb
@@ -22,7 +22,7 @@
     "\n",
     "    4.2. [Predict using CLI](#4.2-Predict-using-CLI)\n",
     "\n",
-    "5. [Customize the model](#4.-Customize-the-model)\n",
+    "5. [Customize the model](#5.-Customize-the-model)\n",
     "\n",
     "# 1. Introduction to the task\n",
     "\n",
diff --git a/docs/intro/installation.rst b/docs/intro/installation.rst
diff --git a/docs/intro/quick_start.rst b/docs/intro/quick_start.rst
diff --git a/requirements.txt b/requirements.txt
diff --git a/setup.py b/setup.py
diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py
diff --git a/utils/Docker/docker-compose.yml b/utils/Docker/docker-compose.yml

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = '1.5.0'`
	`1`	`+__version__ = '1.6.0'`
`2`	`2`	`__author__ = 'Neural Networks and Deep Learning lab, MIPT'`
`3`	`3`	`__description__ = 'An open source library for building end-to-end dialog systems and training chatbots.'`
`4`	`4`	`__keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot']`
Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-datasets>=1.16.0,<2.5.0`
	`1`	`+datasets>=1.16.0,<2.5.0;python_version<="3.10"`
	`2`	`+datasets==2.2.*;python_version=="3.11.*"`
Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-faiss-cpu==1.7.2`
	`1`	`+faiss-cpu==1.7.2;python_version<="3.10"`
	`2`	`+faiss-cpu==1.7.4;python_version=="3.11.*"`
Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-pypi-kenlm==0.1.20220713`
	`1`	`+pypi-kenlm==0.1.20220713;python_version<="3.10"`
	`2`	`+kenlm==0.2.*;python_version=="3.11.*"`
Original file line number	Diff line number	Diff line change
`@@ -119,7 +119,7 @@`
`119`	`119`	`"\n",`
`120`	`120`	`"## 4.2 Predict using Python\n",`
`121`	`121`	`"\n",`
`122`		`- "After [installing](#4.-Get-started-with-the-model) the model, build it from the config and predict."`
	`122`	`+ "After [installing](#2.-Get-started-with-the-model) the model, build it from the config and predict."`
`123`	`123`	`]`
`124`	`124`	`},`
`125`	`125`	`{`
`@@ -192,7 +192,7 @@`
`192`	`192`	`"cell_type": "markdown",`
`193`	`193`	`"metadata": {},`
`194`	`194`	`"source": [`
`195`		`- "## 4.2 Predict using CLI\n",`
	`195`	`+ "## 4.3 Predict using CLI\n",`
`196`	`196`	`"\n",`
`197`	`197`	`"You can also get predictions in an interactive mode through CLI (Сommand Line Interface)."`
`198`	`198`	`]`