diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
deleted file mode 100644
index 25f5076860..0000000000
--- a/.pre-commit-config.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-
-repos:
-- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v1.2.3
- hooks:
- - id: trailing-whitespace
- exclude: "Megatron-LM/"
- - id: check-yaml
- exclude: "Megatron-LM/"
- - id: end-of-file-fixer
- exclude: "Megatron-LM/"
-
-
-- repo: https://github.com/pre-commit/mirrors-yapf
- rev: v0.29.0
- hooks:
- - id: yapf
- exclude: "Megatron-LM/"
diff --git a/README.md b/README.md
index 4802433bdc..135a54c3a2 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,17 @@
-
# DeepSpeed Examples
-This repository contains various example models that use [DeepSpeed](https://github.com/microsoft/DeepSpeed) for training and inference.
+This repository contains various examples including training, inference, compression, benchmarks, and applications that use [DeepSpeed](https://github.com/microsoft/DeepSpeed).
+
+## 1. Training
+There are several training and finetuning examples, so please see the individual folders for specific instructions.
-# Inference Examples
+## 2. Inference
The DeepSpeed Huggingface inference [README](./inference/huggingface/README.md) explains how to get started with running DeepSpeed Huggingface inference examples.
-# Training Examples
-There are several trianing examples in this repository. Please see the individual folders.
+## 3. Compression
+Model compression examples for BERT, GPT-2, and CIFAR models, covering quantization (ZeroQuant), XTC, pruning, and layer reduction. See the individual folders under `compression/` for instructions.
-## Note on Megatron examples
-Please use the latest [Megatron-DeepSpeed fork](https://github.com/microsoft/Megatron-DeepSpeed) instead of the deprecated/old megatron forks in the megatron folder.
+## 4. Benchmarks
+TODO: Move the DeepSpeed benchmarks folder here.
# Contributing
@@ -23,4 +25,4 @@ provided by the bot. You will only need to do this once across all repos using o
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
-contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
+contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
\ No newline at end of file
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000000..6c42dcc636
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1 @@
+The new home for DeepSpeed benchmarks. TODO: Move DS benchmarks to this repo.
diff --git a/model_compression/bert/README.md b/compression/bert/README.md
similarity index 100%
rename from model_compression/bert/README.md
rename to compression/bert/README.md
diff --git a/model_compression/bert/bash_script/XTC/layer_reduction.sh b/compression/bert/bash_script/XTC/layer_reduction.sh
similarity index 100%
rename from model_compression/bert/bash_script/XTC/layer_reduction.sh
rename to compression/bert/bash_script/XTC/layer_reduction.sh
diff --git a/model_compression/bert/bash_script/XTC/layer_reduction_1bit.sh b/compression/bert/bash_script/XTC/layer_reduction_1bit.sh
similarity index 100%
rename from model_compression/bert/bash_script/XTC/layer_reduction_1bit.sh
rename to compression/bert/bash_script/XTC/layer_reduction_1bit.sh
diff --git a/model_compression/bert/bash_script/XTC/quant_1bit.sh b/compression/bert/bash_script/XTC/quant_1bit.sh
similarity index 100%
rename from model_compression/bert/bash_script/XTC/quant_1bit.sh
rename to compression/bert/bash_script/XTC/quant_1bit.sh
diff --git a/model_compression/bert/bash_script/ZeroQuant/zero_quant.sh b/compression/bert/bash_script/ZeroQuant/zero_quant.sh
similarity index 100%
rename from model_compression/bert/bash_script/ZeroQuant/zero_quant.sh
rename to compression/bert/bash_script/ZeroQuant/zero_quant.sh
diff --git a/model_compression/bert/bash_script/ZeroQuant/zero_quant_lkd.sh b/compression/bert/bash_script/ZeroQuant/zero_quant_lkd.sh
similarity index 100%
rename from model_compression/bert/bash_script/ZeroQuant/zero_quant_lkd.sh
rename to compression/bert/bash_script/ZeroQuant/zero_quant_lkd.sh
diff --git a/model_compression/bert/bash_script/layer_reduction.sh b/compression/bert/bash_script/layer_reduction.sh
similarity index 100%
rename from model_compression/bert/bash_script/layer_reduction.sh
rename to compression/bert/bash_script/layer_reduction.sh
diff --git a/model_compression/bert/bash_script/pruning_head.sh b/compression/bert/bash_script/pruning_head.sh
similarity index 100%
rename from model_compression/bert/bash_script/pruning_head.sh
rename to compression/bert/bash_script/pruning_head.sh
diff --git a/model_compression/bert/bash_script/pruning_row.sh b/compression/bert/bash_script/pruning_row.sh
similarity index 100%
rename from model_compression/bert/bash_script/pruning_row.sh
rename to compression/bert/bash_script/pruning_row.sh
diff --git a/model_compression/bert/bash_script/pruning_sparse.sh b/compression/bert/bash_script/pruning_sparse.sh
similarity index 100%
rename from model_compression/bert/bash_script/pruning_sparse.sh
rename to compression/bert/bash_script/pruning_sparse.sh
diff --git a/model_compression/bert/bash_script/quant_activation.sh b/compression/bert/bash_script/quant_activation.sh
similarity index 100%
rename from model_compression/bert/bash_script/quant_activation.sh
rename to compression/bert/bash_script/quant_activation.sh
diff --git a/model_compression/bert/bash_script/quant_weight.sh b/compression/bert/bash_script/quant_weight.sh
similarity index 100%
rename from model_compression/bert/bash_script/quant_weight.sh
rename to compression/bert/bash_script/quant_weight.sh
diff --git a/model_compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json b/compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json
similarity index 100%
rename from model_compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json
rename to compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json
diff --git a/model_compression/bert/config/XTC/ds_config_layer_reduction_W1Q8_fp32.json b/compression/bert/config/XTC/ds_config_layer_reduction_W1Q8_fp32.json
similarity index 100%
rename from model_compression/bert/config/XTC/ds_config_layer_reduction_W1Q8_fp32.json
rename to compression/bert/config/XTC/ds_config_layer_reduction_W1Q8_fp32.json
diff --git a/model_compression/bert/config/XTC/ds_config_layer_reduction_fp16.json b/compression/bert/config/XTC/ds_config_layer_reduction_fp16.json
similarity index 100%
rename from model_compression/bert/config/XTC/ds_config_layer_reduction_fp16.json
rename to compression/bert/config/XTC/ds_config_layer_reduction_fp16.json
diff --git a/model_compression/bert/config/ZeroQuant/ds_config_W48A8_Qgroup48_lkd_fp32.json b/compression/bert/config/ZeroQuant/ds_config_W48A8_Qgroup48_lkd_fp32.json
similarity index 100%
rename from model_compression/bert/config/ZeroQuant/ds_config_W48A8_Qgroup48_lkd_fp32.json
rename to compression/bert/config/ZeroQuant/ds_config_W48A8_Qgroup48_lkd_fp32.json
diff --git a/model_compression/bert/config/ZeroQuant/ds_config_W8A8_Qgroup48_fp32.json b/compression/bert/config/ZeroQuant/ds_config_W8A8_Qgroup48_fp32.json
similarity index 100%
rename from model_compression/bert/config/ZeroQuant/ds_config_W8A8_Qgroup48_fp32.json
rename to compression/bert/config/ZeroQuant/ds_config_W8A8_Qgroup48_fp32.json
diff --git a/model_compression/bert/config/ds_config.json b/compression/bert/config/ds_config.json
similarity index 100%
rename from model_compression/bert/config/ds_config.json
rename to compression/bert/config/ds_config.json
diff --git a/model_compression/bert/config/ds_config_TEMPLATE.json b/compression/bert/config/ds_config_TEMPLATE.json
similarity index 100%
rename from model_compression/bert/config/ds_config_TEMPLATE.json
rename to compression/bert/config/ds_config_TEMPLATE.json
diff --git a/model_compression/bert/config/ds_config_W1A8_Qgroup64_fp16.json b/compression/bert/config/ds_config_W1A8_Qgroup64_fp16.json
similarity index 100%
rename from model_compression/bert/config/ds_config_W1A8_Qgroup64_fp16.json
rename to compression/bert/config/ds_config_W1A8_Qgroup64_fp16.json
diff --git a/model_compression/bert/config/ds_config_W1A8_Qgroup64_fp32.json b/compression/bert/config/ds_config_W1A8_Qgroup64_fp32.json
similarity index 100%
rename from model_compression/bert/config/ds_config_W1A8_Qgroup64_fp32.json
rename to compression/bert/config/ds_config_W1A8_Qgroup64_fp32.json
diff --git a/model_compression/bert/config/ds_config_W1or2A8_Qgroup64_fp16.json b/compression/bert/config/ds_config_W1or2A8_Qgroup64_fp16.json
similarity index 100%
rename from model_compression/bert/config/ds_config_W1or2A8_Qgroup64_fp16.json
rename to compression/bert/config/ds_config_W1or2A8_Qgroup64_fp16.json
diff --git a/model_compression/bert/huggingface_transformer/modeling_bert.py b/compression/bert/huggingface_transformer/modeling_bert.py
similarity index 100%
rename from model_compression/bert/huggingface_transformer/modeling_bert.py
rename to compression/bert/huggingface_transformer/modeling_bert.py
diff --git a/model_compression/bert/requirements.txt b/compression/bert/requirements.txt
similarity index 100%
rename from model_compression/bert/requirements.txt
rename to compression/bert/requirements.txt
diff --git a/model_compression/bert/run_glue_lkd.py b/compression/bert/run_glue_lkd.py
similarity index 100%
rename from model_compression/bert/run_glue_lkd.py
rename to compression/bert/run_glue_lkd.py
diff --git a/model_compression/bert/run_glue_no_trainer.py b/compression/bert/run_glue_no_trainer.py
similarity index 100%
rename from model_compression/bert/run_glue_no_trainer.py
rename to compression/bert/run_glue_no_trainer.py
diff --git a/model_compression/bert/util.py b/compression/bert/util.py
similarity index 100%
rename from model_compression/bert/util.py
rename to compression/bert/util.py
diff --git a/model_compression/cifar/README.md b/compression/cifar/README.md
similarity index 100%
rename from model_compression/cifar/README.md
rename to compression/cifar/README.md
diff --git a/model_compression/cifar/config/ds_config.json b/compression/cifar/config/ds_config.json
similarity index 100%
rename from model_compression/cifar/config/ds_config.json
rename to compression/cifar/config/ds_config.json
diff --git a/model_compression/cifar/config/ds_config_channel_prune.json b/compression/cifar/config/ds_config_channel_prune.json
similarity index 100%
rename from model_compression/cifar/config/ds_config_channel_prune.json
rename to compression/cifar/config/ds_config_channel_prune.json
diff --git a/model_compression/cifar/resnet.py b/compression/cifar/resnet.py
similarity index 100%
rename from model_compression/cifar/resnet.py
rename to compression/cifar/resnet.py
diff --git a/model_compression/cifar/run_compress.sh b/compression/cifar/run_compress.sh
similarity index 100%
rename from model_compression/cifar/run_compress.sh
rename to compression/cifar/run_compress.sh
diff --git a/model_compression/cifar/train.py b/compression/cifar/train.py
similarity index 100%
rename from model_compression/cifar/train.py
rename to compression/cifar/train.py
diff --git a/model_compression/cifar/utils.py b/compression/cifar/utils.py
similarity index 100%
rename from model_compression/cifar/utils.py
rename to compression/cifar/utils.py
diff --git a/model_compression/gpt2/README.md b/compression/gpt2/README.md
similarity index 100%
rename from model_compression/gpt2/README.md
rename to compression/gpt2/README.md
diff --git a/model_compression/gpt2/bash_script/run_zero_quant.sh b/compression/gpt2/bash_script/run_zero_quant.sh
similarity index 100%
rename from model_compression/gpt2/bash_script/run_zero_quant.sh
rename to compression/gpt2/bash_script/run_zero_quant.sh
diff --git a/model_compression/gpt2/config/ds_config.json b/compression/gpt2/config/ds_config.json
similarity index 100%
rename from model_compression/gpt2/config/ds_config.json
rename to compression/gpt2/config/ds_config.json
diff --git a/model_compression/gpt2/config/ds_config_W4or8A8_Qgroup64_fp16.json b/compression/gpt2/config/ds_config_W4or8A8_Qgroup64_fp16.json
similarity index 100%
rename from model_compression/gpt2/config/ds_config_W4or8A8_Qgroup64_fp16.json
rename to compression/gpt2/config/ds_config_W4or8A8_Qgroup64_fp16.json
diff --git a/model_compression/gpt2/config/ds_config_W4or8A8_Qgroup64_fp32.json b/compression/gpt2/config/ds_config_W4or8A8_Qgroup64_fp32.json
similarity index 100%
rename from model_compression/gpt2/config/ds_config_W4or8A8_Qgroup64_fp32.json
rename to compression/gpt2/config/ds_config_W4or8A8_Qgroup64_fp32.json
diff --git a/model_compression/gpt2/config/ds_config_W8A8_Qgroup64_fp16.json b/compression/gpt2/config/ds_config_W8A8_Qgroup64_fp16.json
similarity index 100%
rename from model_compression/gpt2/config/ds_config_W8A8_Qgroup64_fp16.json
rename to compression/gpt2/config/ds_config_W8A8_Qgroup64_fp16.json
diff --git a/model_compression/gpt2/config/ds_config_W8A8_Qgroup64_fp32.json b/compression/gpt2/config/ds_config_W8A8_Qgroup64_fp32.json
similarity index 100%
rename from model_compression/gpt2/config/ds_config_W8A8_Qgroup64_fp32.json
rename to compression/gpt2/config/ds_config_W8A8_Qgroup64_fp32.json
diff --git a/model_compression/gpt2/requirements.txt b/compression/gpt2/requirements.txt
similarity index 100%
rename from model_compression/gpt2/requirements.txt
rename to compression/gpt2/requirements.txt
diff --git a/model_compression/gpt2/run_clm_no_trainer.py b/compression/gpt2/run_clm_no_trainer.py
similarity index 100%
rename from model_compression/gpt2/run_clm_no_trainer.py
rename to compression/gpt2/run_clm_no_trainer.py
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/LICENSE b/megatron/Megatron-LM-v1.1.5-3D_parallelism/LICENSE
deleted file mode 100644
index c4bad15939..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/LICENSE
+++ /dev/null
@@ -1,264 +0,0 @@
-The following applies to all files unless otherwise noted:
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of NVIDIA CORPORATION nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---
-
-This repository also contains code from Hugging Face Inc., Google Research,
-and Facebook (from their Fairseq project). Files from these
-organizations have notices at the top of each file. Below are licenses
-used in those files, as indicated.
-
-
-------------- LICENSE FOR huggingface and Google Research code --------------
-
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-------------- LICENSE FOR Facebook Fairseq code --------------
-
-MIT License
-
-Copyright (c) Facebook, Inc. and its affiliates.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/MANIFEST.in b/megatron/Megatron-LM-v1.1.5-3D_parallelism/MANIFEST.in
deleted file mode 100644
index f44791183e..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/MANIFEST.in
+++ /dev/null
@@ -1,2 +0,0 @@
-include megatron/data/Makefile
-include megatron/data/helpers.cpp
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/README.md b/megatron/Megatron-LM-v1.1.5-3D_parallelism/README.md
deleted file mode 100644
index df2f9d2f09..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/README.md
+++ /dev/null
@@ -1,576 +0,0 @@
-This is a snapshot of Megatron v1.1.5 integrated with DeepSpeed's pipeline- and data-parallel training. This 3D parallelism integration
-can train a model with [one trillion parameters](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) using as few as 800 NVIDIA V100 GPUs.
-
-See `examples/ds_pretrain_gpt2_pipe.sh` for an entry point to training with 3D parallelism.
-
-See our [pull request](https://github.com/jeffra/DSE/pull/10) for a more detailed view of the integration.
-
-
-[Megatron](https://arxiv.org/pdf/1909.08053.pdf) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. We developed efficient, model-parallel, and multinode training of [GPT-2](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) and [BERT](https://arxiv.org/pdf/1810.04805.pdf) using mixed precision.
-
-Using our GPT-2 model we achieve a perplexity of 10.8 on the WikiText-103 dataset (improving SOTA from 15.8) and an accuracy of 66.5% on the LAMBADA dataset. For BERT training, we swapped the position of the layer normalization and the residual connection in the model architecture (similar to the GPT-2 architecture), which allowed the models to continue to improve as they were scaled up. Our BERT model with 3.9 billion parameters reaches a loss of 1.16, SQuAD 2.0 F1-score of 91.7, and RACE accuracy of 90.9%.
-
-Our codebase is capable of efficiently training very large (several billion parameter) language models with both model and data parallelism. To demonstrate how the code scales with multiple GPUs we consider the following GPT-2 model sizes. All models use a vocabulary size of 51,200 and a sequence length of 1024.
-
-![Cases](images/cases.png)
-
-The table below details the weak scaling from 1 to 8 GPUs of our model parallelism code in both a DGX-2 and a DGX-A100. Notice that we double the batch size on the DGX-A100 but the iteration time decreases compared to the DGX-2 resulting in a **2.1x** speedup for the end-to-end application.
-
-![Model Parallel Scaling](images/scaling-mp.png)
-
-The following table details how Megatron scales using data parallelism in conjunction with model parallelism in a cluster of DGX-A100s. All of these cases use 128-way data parallelism and the scaling numbers are relative to a single A100 (Case 1B with a 1076ms iteration time).
-
-![Data Parallel Scaling](images/scaling-dp.png)
-
-
-# Contents
-
-
-- [Setup](#setup)
- - [Downloading Checkpoints](#downloading-checkpoints)
-- [Usage](#usage)
-- [Training](#training)
- - [Data Preprocessing](#data-preprocessing)
- - [BERT Pretraining](#bert-pretraining)
- - [GPT-2 Pretraining](#gpt-2-pretraining)
- - [Distributed BERT or GPT-2 Pretraining](#distributed-bert-or-gpt-2-pretraining)
-- [REALM Pipeline](#realm)
-- [Evaluation and Tasks](#evaluation-and-tasks)
- - [GPT-2 Text Generation](#gpt-2-text-generation)
- - [GPT-2 Evaluation](#gpt-2-evaluation)
- - [WikiText Perplexity Evaluation](#wikitext-perplexity-evaluation)
- - [LAMBADA Cloze Accuracy](#lambada-cloze-accuracy)
- - [BERT Task Evaluation](#bert-task-evaluation)
- - [RACE Evaluation](#race-evaluation)
- - [MNLI Evaluation](#mnli-evaluation)
-- [Datasets](#datasets)
- - [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data)
- - [Collecting GPT-2 Webtext Data](#collecting-gpt-2-webtext-data)
-
-
-
-
-# Setup
-We officially support only python 3.6, pytorch 1.5, cuda 10, and nccl 2.6 versions and above.
-
-To use this repo please install the latest supported versions of PyTorch with GPU support and NVIDIA [APEX](https://github.com/NVIDIA/apex#quick-start). We strongly recommend using one of [NGC's recent PyTorch containers](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) (the latest compatible version at time of publication can be pulled with `docker pull nvcr.io/nvidia/pytorch:20.03-py3`). Data preprocessing requires [NLTK](https://www.nltk.org/install.html), though this is not required for training, evaluation or downstream tasks.
-
-To use megatron you can either clone the repo or install it via pip (make sure python3-dev is installed):
-
-pip install megatron-lm
-
-
-
-## Downloading Checkpoints
-We've provided two pretrained checkpoints for use in evaluating or finetuning downstream tasks. To access these checkpoints, first please [sign up](https://ngc.nvidia.com/signup) for and [set up](https://ngc.nvidia.com/setup/installers/cli) the NVIDIA GPU Cloud (NGC) Registry CLI.
-
-The checkpoints can be downloaded with:
-
-ngc registry model download-version --dest <output_base_directory> nvidia/<model_name>:<version>
-
-
-The available models along with `<model_name>:<version>` are below:
-* [BERT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m): megatron\_bert\_345m:v0.0
-* [GPT-2-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m): megatron\_lm\_345m:v0.0
-
-The models require vocabulary files to run. The BERT uncased WordPiece vocab file can be extracted from Google's [pretrained BERT models](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt). The GPT-2 [vocab file](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json) and [merge table](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt) can be downloaded directly.
-
-Further documentation for downloading models can be found in the [NGC documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1)
-
-
-# Usage
-
-After installation, there are several possible workflows. The most comprehensive is:
-1. Data preprocessing
-2. Pretraining
-3. Finetuning (Optional for zero-shot tasks)
-4. Downstream task evaluation or text generation
-
-However, steps 1 and 2 can be replaced by using one of the pretrained models mentioned above.
-
-We've provided several scripts for pretraining both BERT and GPT-2 in [`examples`](./examples) directory, as well as scripts for both zero-shot and fine-tuned downstream tasks including MNLI, RACE, WikiText103, and LAMBADA evaluation. There is also a script for GPT-2 interactive text generation.
-
-
-# Training
-
-## Data Preprocessing
-We support three file formats for training, but all require preprocessing. First, place your training data in a loose json format, with one json containing a text sample per line. For example:
-
-{"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"}
-{"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"}
-
-
-The name of the `text` field of the json can be changed by using the `--json-key` flag in [`preprocess_data.py`](./tools/preprocess_data.py). The other metadata are optional and are not used in training.
-
-The loose json is then processed into a binary format for training. To convert the json into mmap, cached index file, or the lazy loader format use `preprocess_data.py`. Set the `--dataset-impl` flag to `mmap`, `cached`, or `lazy`, respectively (default is `mmap`). An example script to prepare data for BERT training is:
-
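For illustration, an invocation along the following lines produces the indexed dataset described below; the input path and output prefix are placeholders, and the flags are the ones referenced in this section and in the ICT example further down:

```bash
# Sketch of BERT data preprocessing; adjust the corpus and vocab paths to your setup.
python tools/preprocess_data.py \
       --input my-corpus.json \
       --output-prefix my-bert \
       --vocab-file bert-vocab.txt \
       --dataset-impl mmap \
       --tokenizer-type BertWordPieceLowerCase \
       --split-sentences
```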
-
-The output will be two files named, in this case, `my-bert_text_sentence.bin` and `my-bert_text_sentence.idx`. The `--data-path` specified in later BERT training is the full path and new filename, but without the file extension.
-
-Some minor modifications are required for GPT-2 data preprocessing, namely, the addition of a merge table, an end-of-document token, removal of sentence splitting, and a change to the tokenizer type:
-
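A corresponding sketch for GPT-2, again with placeholder paths; the end-of-document flag name here is an assumption, while the merge table, BPE tokenizer, and removal of sentence splitting follow the description above:

```bash
# Sketch of GPT-2 data preprocessing; note the merge table, BPE tokenizer,
# end-of-document token, and absence of --split-sentences.
python tools/preprocess_data.py \
       --input my-corpus.json \
       --output-prefix my-gpt2 \
       --vocab-file gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --dataset-impl mmap \
       --tokenizer-type GPT2BPETokenizer \
       --append-eod
```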
-
-Here the output files are named `my-gpt2_text_document.bin` and `my-gpt2_text_document.idx`. As before, in GPT-2 training, use the longer name without the extension as `--data-path`.
-
-Further command line arguments are described in the source file [`preprocess_data.py`](./tools/preprocess_data.py).
-
-
-## BERT Pretraining
-`bash examples/pretrain_bert.sh`
-
-This script runs single GPU 345M parameter BERT pretraining. Debugging is the primary use for single GPU training, as the code base and command line arguments are optimized for highly distributed training. Most of the arguments are fairly self-explanatory. By default, the learning rate decays linearly over the training iterations starting at `--lr` to a minimum set by `--min-lr` over `--lr-decay-iters` iterations. The fraction of training iterations used for warmup is set by `--warmup`. While this is single GPU training, the batch size specified by `--batch-size` is per GPU used for data parallelism. The data is partitioned into a 949:50:1 ratio for training/validation/test sets (default is 969:30:1). This partitioning happens on the fly, but is consistent across runs with the same random seed (1234 by default, or specified manually with `--seed`).
-
-The logging, checkpoint-saving, and evaluation intervals are specified. Checkpointing the activations facilitates the training of larger models and/or batches. Note that the `--data-path` now includes the additional `_text_sentence` suffix added in preprocessing, but does not include the file extensions.
-
-
-
-Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py).
-
-
-## GPT-2 Pretraining
-`bash examples/pretrain_gpt2.sh`
-
-This script runs single GPU 345M parameter GPT-2 pretraining. As mentioned above, single GPU training is primarily intended for debugging purposes, as the code is optimized for distributed training.
-
-It follows largely the same format as the previous BERT script with a few notable differences: the tokenization scheme used is BPE (which requires a merge table and a `json` vocabulary file) instead of WordPiece, the model architecture allows for longer sequences (note that the max position embedding must be greater than or equal to the maximum sequence length), and the `--lr-decay-style` has been set to cosine decay. Note that the `--data-path` now includes the additional `_text_document` suffix added in preprocessing, but does not include the file extensions.
-
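The argument set largely mirrors the options used by `examples/ds_pretrain_gpt2.sh` elsewhere in this patch, minus the DeepSpeed-specific flags; the values below are illustrative for a 345M-scale configuration and the paths are placeholders:

```bash
# Sketch of single-GPU GPT-2 pretraining arguments; sizes and paths are placeholders.
python pretrain_gpt2.py \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --batch-size 8 \
       --train-iters 320000 \
       --lr-decay-iters 320000 \
       --save checkpoints/gpt2_345m \
       --load checkpoints/gpt2_345m \
       --data-path my-gpt2_text_document \
       --vocab-file gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 1.5e-4 \
       --lr-decay-style cosine \
       --min-lr 1.0e-5 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup 0.01 \
       --checkpoint-activations \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
```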
-
-
-Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py).
-
-
-## Distributed BERT or GPT-2 Pretraining
-`bash examples/pretrain_bert_distributed.sh`
-
-`bash examples/pretrain_gpt2_distributed.sh`
-
-These scripts use the PyTorch distributed launcher for distributed training. As such, multinode training can be achieved by properly setting environment variables and using `init_method='env://'` in the launcher. See the official PyTorch [documentation](https://pytorch.org/docs/stable/distributed.html#launch-utility) for further description of these [environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization). By default, multinode training uses the [nccl](https://developer.nvidia.com/nccl) distributed backend. A simple set of additional arguments and the use of the PyTorch distributed module with the Python flag `-m torch.distributed.launch`, detailed below, are the only additional requirements to adopt distributed training.
-
-The two tiers of parallelism are data and model parallelism. First, we facilitate two distributed data parallel implementations: a simple one of our own that performs gradient all-reduce at the end of back propagation step, and Torch's distributed data parallel wrapper that overlaps gradient reduction with back propagation computation. To switch between these two options use `--DDP-impl local` or `--DDP-impl torch`, respectively. As expected, Torch distributed data parallelism is more efficient at larger model parallel sizes. For example, for the 8.3 billion parameters model running on 512 GPUs, the scaling increases from 60% to 76% when Torch's distributed data parallel is used. However, the overlapping method requires more memory and for some configurations (e.g., 2.5 billion parameters using 2-way model parallel and 1.2 billion parameters with no model parallel) can make the overall training slower as a result. We empirically found that using a smaller model in those cases improves the training time.
-
-Second, we developed a simple and efficient intra-layer model parallel approach. To use model parallelism, add the `--model-parallel-size` flag to specify the number of GPUs among which to split the model, along with the arguments passed to the distributed launcher as mentioned above. With `WORLD_SIZE` GPUs and `MP_SIZE` model parallel size, `WORLD_SIZE`/`MP_SIZE` GPUs will be used for data parallelism. The default value for `--model-parallel-size` is 1, which will not implement model parallelism.
-
-Other than these minor changes, the distributed training is identical to the training on a single GPU.
-
-Distributed GPT-2 training (distributed BERT training follows the same pattern using `pretrain_bert.py` and the corresponding BERT arguments):
-
-WORLD_SIZE=8
-MP_SIZE=2
-
-DISTRIBUTED_ARGS=<same as those directly above>
-
-CHECKPOINT_PATH=checkpoints/gpt2_345m
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-DATA_PATH=my-gpt2_text_document
-GPT2_ARGS=<same as those in GPT-2 pretraining above>
-OUTPUT_ARGS=<same as those in BERT pretraining above>
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./pretrain_gpt2.py \
- $GPT2_ARGS \
- $OUTPUT_ARGS \
- --save $CHECKPOINT_PATH \
- --load $CHECKPOINT_PATH \
- --data-path $DATA_PATH \
- --model-parallel-size $MP_SIZE \
- --DDP-impl torch
-
-
-
-
-## REALM Pipeline
-We are working on implementing the [REALM](https://arxiv.org/pdf/2002.08909.pdf) system. The following sections (will) reflect the three stages of training it. For now it's just the ICT code.
-Loosely, they are pretraining the retriever modules, then jointly training the language model and the retriever, and then finetuning a question answering head on the language model with fixed retriever.
-
-### Inverse Cloze Task (ICT) Pretraining
-1. Have a corpus in loose JSON format with the intention of creating a collection of fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block but also multiple blocks per document.
-Run `tools/preprocess_data.py` to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. For the original REALM system, we construct two datasets, one with the title of every document, and another with the body.
-Refer to the following script
-
-python preprocess_data.py \
- --input /path/to/corpus.json \
- --json-keys text title \
- --split-sentences \
- --tokenizer-type BertWordPieceLowerCase \
- --vocab-file /path/to/vocab.txt \
- --output-prefix corpus_indexed \
- --workers 5 # works well for 10 CPU cores. Scale up accordingly.
-
-
-2. Use a custom samples mapping function in place of `megatron/data/realm_dataset_utils.get_block_samples_mapping` if required. To do this, you will need to implement a new function in C++ inside of `megatron/data/helpers.cpp`. The samples mapping data structure is used to select the data that will constitute every training sample in advance of the training loop.
- The samples mapping is responsible for holding all of the required metadata needed to construct the sample from one or more indexed datasets. In REALM, the samples mapping contains the start and end sentence indices, as well as the document index (to find the correct title for a body) and a unique ID for every block.
-3. Pretrain a BERT language model using `pretrain_bert.py`, with the sequence length equal to the block size in token ids. This model should be trained on the same indexed dataset that is used to supply the blocks for the information retrieval task.
-In REALM, this is an uncased bert base model trained with the standard hyperparameters.
-4. Use `pretrain_ict.py` to train an `ICTBertModel` which uses two BERT-based encoders to encode queries and blocks to perform retrieval with.
-The script below trains the ICT model from REALM. It references a pretrained BERT model (step 3) in the `--bert-load` argument. The batch size used in the paper is 4096, so this would need to be run with data parallel world size 32.
-
-
-### Building an Index of Block Embeddings
-After having trained an ICT model, you can now embed an entire dataset of blocks by creating a `BlockData` structure. After that has been saved, you can load it
-and wrap it with a `FaissMIPSIndex` to do fast similarity search which is key in the learned information retrieval pipeline. The initial index can be built with the following script, meant to be run in an interactive session. It can leverage multiple GPUs on multiple nodes to index large datasets much more quickly.
-
-
-
-
-# Evaluation and Tasks
-
-We provide several command line arguments, detailed in the scripts listed below, to handle various zero-shot and fine-tuned downstream tasks. However, you can also finetune your model from a pretrained checkpoint on other corpora as desired. To do so, simply add the `--finetune` flag and adjust the input files and training parameters within the original training script. The iteration count will be reset to zero, and the optimizer and internal state will be reinitialized. If the fine-tuning is interrupted for any reason, be sure to remove the `--finetune` flag before continuing, otherwise the training will start again from the beginning.
-
-Because evaluation requires substantially less memory than training, it may be advantageous to merge a model trained in parallel for use on a single GPU in downstream tasks. The following script accomplishes this.
-
-
-
-Several downstream tasks are described for both GPT-2 and BERT models below. They can be run in distributed and model parallel modes with the same changes used in the training scripts.
-
-
-## GPT-2 Text Generation
-`bash examples/generate_text.sh`
-
-We generate text samples using largely the GPT-2 pretraining script. A few changes need to be made, such as providing the path to the pretrained checkpoint, the length of the output samples, and whether to generate text unconditionally (`--num-samples` denotes how many samples to generate) or conditionally (pass `--sample-input-file <filename>`, where each line of the file is used as the conditional text). There are a few optional parameters to play with, e.g. `top-k`, `top-p`, or `greedy` (set top-k and top-p to 0) sampling.
-
-
-
-
-## GPT-2 Evaluation
-We include example scripts for GPT-2 evaluation on WikiText perplexity evaluation and LAMBADA Cloze accuracy.
-
-
-### WikiText Perplexity Evaluation
-For even comparison with prior works, we evaluate perplexity on the word-level [WikiText-103 test dataset](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip), and appropriately compute perplexity given the change in tokens when using our subword tokenizer.
-
-We use the following command to run WikiText-103 evaluation on a 345M parameter model.
-
-
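The invocation follows the same `tasks/main.py` pattern as `examples/evaluate_zeroshot_gpt2.sh`; in the sketch below the task name and data path are assumptions, and the model arguments should match the checkpoint being evaluated:

```bash
# Sketch of WikiText-103 perplexity evaluation; paths are placeholders and the
# model arguments must match the evaluated checkpoint.
TASK="WIKITEXT103"
VALID_DATA=wikitext-103/wiki.test.tokens
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
CHECKPOINT_PATH=checkpoints/gpt2_345m

python tasks/main.py \
       --task $TASK \
       --valid-data $VALID_DATA \
       --tokenizer-type GPT2BPETokenizer \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --load $CHECKPOINT_PATH \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --batch-size 8 \
       --log-interval 10 \
       --fp16
```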
-
-
-### LAMBADA Cloze Accuracy
-To compute LAMBADA cloze accuracy (the accuracy of predicting the last token given the preceding tokens) we utilize a detokenized, processed version of the [LAMBADA dataset](https://github.com/cybertronai/bflm/blob/master/lambada_test.jsonl).
-
-We use the following command to run LAMBADA evaluation on a 345M parameter model. Note that the `--strict-lambada` flag should be used to require whole word matching. Make sure that `lambada` is part of the file path.
-
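A sketch of the corresponding command, mirroring `examples/evaluate_zeroshot_gpt2.sh`; the data path is a placeholder and the remaining variables are the same as in the WikiText-103 run above:

```bash
# Sketch of LAMBADA cloze evaluation; same model arguments as the WikiText-103
# run, with the task switched and whole-word matching required.
TASK="LAMBADA"
VALID_DATA=lambada/lambada_test.jsonl

python tasks/main.py \
       --task $TASK \
       --valid-data $VALID_DATA \
       --tokenizer-type GPT2BPETokenizer \
       --strict-lambada \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --load $CHECKPOINT_PATH \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --batch-size 8 \
       --fp16
```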
-
-
-Further command line arguments are described in the source file [`main.py`](./tasks/main.py)
-
-
-## BERT Task Evaluation
-
-### RACE Evaluation
-The following script finetunes the BERT model for evaluation on the [RACE dataset](http://www.cs.cmu.edu/~glai1/data/race/). The `TRAIN_DATA` and `VALID_DATA` directories contain the RACE dataset as separate `.txt` files.
-
-
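A sketch of the finetuning command; the data paths and hyperparameters are illustrative, the flag names inside the `COMMON_TASK_ARGS*` variables are assumptions, and these are the variables the MNLI example below refers back to:

```bash
# Sketch of RACE finetuning/evaluation; adjust paths and hyperparameters to your setup.
TRAIN_DATA="data/RACE/train/middle"
VALID_DATA="data/RACE/dev/middle \
            data/RACE/dev/high"
VOCAB_FILE=bert-vocab.txt
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
CHECKPOINT_PATH=checkpoints/bert_345m_race

COMMON_TASK_ARGS="--num-layers 24 \
                  --hidden-size 1024 \
                  --num-attention-heads 16 \
                  --seq-length 512 \
                  --max-position-embeddings 512 \
                  --fp16 \
                  --vocab-file $VOCAB_FILE"

COMMON_TASK_ARGS_EXT="--train-data $TRAIN_DATA \
                      --valid-data $VALID_DATA \
                      --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
                      --checkpoint-activations \
                      --save-interval 10000 \
                      --save $CHECKPOINT_PATH \
                      --log-interval 100 \
                      --eval-interval 1000 \
                      --eval-iters 10 \
                      --weight-decay 1.0e-1"

python tasks/main.py \
       --task RACE \
       $COMMON_TASK_ARGS \
       $COMMON_TASK_ARGS_EXT \
       --tokenizer-type BertWordPieceLowerCase \
       --epochs 3 \
       --batch-size 4 \
       --lr 1.0e-5 \
       --warmup 0.06
```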
-
-
-### MNLI Evaluation
-The following script finetunes the BERT model for evaluation with the [MultiNLI sentence pair corpus](https://www.nyu.edu/projects/bowman/multinli/). Because the matching tasks are quite similar, the script can be quickly tweaked to work with the [Quora Question Pairs](https://www.kaggle.com/quora/question-pairs-dataset) (QQP) dataset as well.
-
-
-
-TRAIN_DATA="data/glue_data/MNLI/train.tsv"
-VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
- data/glue_data/MNLI/dev_mismatched.tsv"
-PRETRAINED_CHECKPOINT=checkpoints/bert_345m
-VOCAB_FILE=bert-vocab.txt
-CHECKPOINT_PATH=checkpoints/bert_345m_mnli
-COMMON_TASK_ARGS=<same as those in RACE Evaluation above>
-COMMON_TASK_ARGS_EXT=<same as those in RACE Evaluation above>
-
-python tasks/main.py \
- --task MNLI \
- $COMMON_TASK_ARGS \
- $COMMON_TASK_ARGS_EXT \
- --tokenizer-type BertWordPieceLowerCase \
- --epochs 5 \
- --batch-size 8 \
- --lr 5.0e-5 \
- --warmup 0.065
-
-
-
-# Datasets
-We do not host any datasets for GPT-2 or BERT training; however, we detail their collection so that our results may be reproduced.
-
-
-## Collecting Wikipedia Training Data
-We recommend following the Wikipedia data extraction process specified by Google research: "the recommended pre-processing is to download [the latest dump](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2), extract the text with [WikiExtractor.py](https://github.com/attardi/wikiextractor), and then apply any necessary cleanup to convert it into plain text."
-
-We recommend using the `--json` argument when using WikiExtractor, which will dump the Wikipedia data into loose json format (one json per line), making it more manageable on the file system and also readily consumable by our codebase. We recommend further preprocessing this json dataset by nltk punctuation standardization. For BERT training, add newlines between sentences during data preprocessing. This is done with the `--split-sentences` flag in `preprocess_data.py` as described [above](#data-preprocessing). (Note that if you'd like to use Wikipedia data for GPT-2 training you should still clean it with nltk/spacy/ftfy, but do not split it into newline separated sentences.)
-
-
-## Collecting GPT-2 Webtext Data
-We utilize the publicly available [OpenWebText](https://github.com/eukaryote31/openwebtext) library from [jcpeterson](https://github.com/jcpeterson/openwebtext) and [eukaryote31's](https://github.com/eukaryote31/openwebtext) work to download urls. We then filtered, cleaned, and deduplicated all downloaded content according to the procedure described in our [openwebtext](./tools/openwebtext) directory. For reddit URLs corresponding to content up to October 2018 we arrived at approximately 37GB of content.
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/changes.md b/megatron/Megatron-LM-v1.1.5-3D_parallelism/changes.md
deleted file mode 100644
index 5ffcb873e9..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/changes.md
+++ /dev/null
@@ -1 +0,0 @@
-PRETEND THESE ARE CODE CHANGES
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_config.json b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_config.json
deleted file mode 100644
index ea62c602d8..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_config.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
- "train_batch_size": 256,
- "train_micro_batch_size_per_gpu": 4,
- "steps_per_print": 10,
- "gradient_clipping": 1.0,
- "fp16": {
- "enabled": true,
-
- "loss_scale": 0,
- "loss_scale_window": 1000,
- "hysteresis": 2,
- "min_loss_scale": 1
- },
- "wall_clock_breakdown": true,
- "zero_allow_untested_optimizer": false
-}
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2.sh
deleted file mode 100644
index 6087c7250d..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2.sh
+++ /dev/null
@@ -1,133 +0,0 @@
-#! /bin/bash
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-export DLWS_NUM_WORKER=${NNODES}
-export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE}
-
-DATA_PATH=data/webtext/webtext_text_document
-VOCAB_PATH=data/gpt2-vocab.json
-MERGE_PATH=data/gpt2-merges.txt
-CHECKPOINT_PATH=checkpoints/gpt2_345m_ds
-
-script_path=$(realpath $0)
-script_dir=$(dirname $script_path)
-config_json="$script_dir/ds_zero_stage_2_config.json"
-
-# Megatron Model Parallelism
-mp_size=4
-
-NLAYERS=24
-NHIDDEN=1024
-BATCHSIZE=9
-LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${mp_size}mp_${BATCHSIZE}b_ds4"
-
-#ZeRO Configs
-stage=0
-reduce_scatter=true
-contigious_gradients=true
-rbs=50000000
-agbs=5000000000
-
-#Activation Checkpointing and Contiguous Memory
-chkp_layers=1
-PA=true
-PA_CPU=false
-CC=true
-SYNCHRONIZE=true
-PROFILE=false
-
-
-gpt_options=" \
- --model-parallel-size ${mp_size} \
- --num-layers $NLAYERS \
- --hidden-size $NHIDDEN \
- --num-attention-heads 16 \
- --seq-length 1024 \
- --max-position-embeddings 1024 \
- --batch-size $BATCHSIZE \
- --train-iters 320000 \
- --lr-decay-iters 320000 \
- --save $CHECKPOINT_PATH \
- --load $CHECKPOINT_PATH \
- --data-path $DATA_PATH \
- --vocab-file $VOCAB_PATH \
- --merge-file $MERGE_PATH \
- --data-impl mmap \
- --split 949,50,1 \
- --distributed-backend nccl \
- --lr 1.5e-4 \
- --lr-decay-style cosine \
- --min-lr 1.0e-5 \
- --weight-decay 1e-2 \
- --clip-grad 1.0 \
- --warmup 0.01 \
- --checkpoint-activations \
- --log-interval 100 \
- --save-interval 10000 \
- --eval-interval 1000 \
- --eval-iters 10 \
- --fp16 \
- --tensorboard-dir ${LOGDIR}
-"
-
- deepspeed_options=" \
- --deepspeed \
- --deepspeed_config ${config_json} \
- --zero-stage ${stage} \
- --zero-reduce-bucket-size ${rbs} \
- --zero-allgather-bucket-size ${agbs}
- "
-
-if [ "${contigious_gradients}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
- --zero-contigious-gradients"
-fi
-
-if [ "${reduce_scatter}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
- --zero-reduce-scatter"
-fi
-
-chkp_opt=" \
---checkpoint-activations \
---checkpoint-num-layers ${chkp_layers}"
-
-if [ "${PA}" = "true" ]; then
-chkp_opt="${chkp_opt} \
- --partition-activations"
-fi
-
-if [ "${PA_CPU}" = "true" ]; then
-chkp_opt="${chkp_opt} \
- --checkpoint-in-cpu"
-fi
-
-if [ "${SYNCHRONIZE}" = "true" ]; then
-chkp_opt="${chkp_opt} \
- --synchronize-each-layer"
-fi
-
-if [ "${CC}" = "true" ]; then
-chkp_opt="${chkp_opt} \
- --contigious-checkpointing"
-fi
-
-if [ "${PROFILE}" = "true" ]; then
-chkp_opt="${chkp_opt} \
- --profile-backward"
-fi
-
-full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}"
-
-run_cmd="deepspeed --num_nodes ${DLWS_NUM_WORKER} --num_gpus ${DLWS_NUM_GPU_PER_WORKER} pretrain_gpt2.py $@ ${full_options}"
-echo ${run_cmd}
-eval ${run_cmd}
-
-set +x
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2_pipe.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2_pipe.sh
deleted file mode 100644
index ed60d0181c..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_pretrain_gpt2_pipe.sh
+++ /dev/null
@@ -1,140 +0,0 @@
-#! /bin/bash
-
-GPUS_PER_NODE=16
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-export DLWS_NUM_WORKER=${NNODES}
-export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE}
-
-DATA_PATH=data/webtext/webtext_text_document
-VOCAB_PATH=data/gpt2-vocab.json
-MERGE_PATH=data/gpt2-merges.txt
-CHECKPOINT_PATH=checkpoints/gpt2_345m_ds
-
-script_path=$(realpath $0)
-script_dir=$(dirname $script_path)
-#config_json="$script_dir/ds_zero_stage_2_config.json"
-config_json="$script_dir/ds_config.json"
-
-# Megatron Model Parallelism
-mp_size=2
-# DeepSpeed Pipeline parallelism
-pp_size=2
-
-NLAYERS=24
-NHIDDEN=1024
-BATCHSIZE=4
-LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4"
-
-GAS=16
-
-#ZeRO Configs
-stage=0
-reduce_scatter=true
-contigious_gradients=true
-rbs=50000000
-agbs=5000000000
-
-#Activation Checkpointing and Contiguous Memory
-chkp_layers=1
-PA=true
-PA_CPU=false
-CC=true
-SYNCHRONIZE=true
-PROFILE=false
-
-
-gpt_options=" \
- --model-parallel-size ${mp_size} \
- --pipe-parallel-size ${pp_size} \
- --num-layers $NLAYERS \
- --hidden-size $NHIDDEN \
- --num-attention-heads 16 \
- --seq-length 1024 \
- --max-position-embeddings 1024 \
- --batch-size $BATCHSIZE \
- --gas $GAS \
- --train-iters 320000 \
- --lr-decay-iters 320000 \
- --save $CHECKPOINT_PATH \
- --load $CHECKPOINT_PATH \
- --data-path $DATA_PATH \
- --vocab-file $VOCAB_PATH \
- --merge-file $MERGE_PATH \
- --data-impl mmap \
- --split 949,50,1 \
- --distributed-backend nccl \
- --lr 1.5e-4 \
- --lr-decay-style cosine \
- --min-lr 1.0e-5 \
- --weight-decay 1e-2 \
- --clip-grad 1.0 \
- --warmup 0.01 \
- --checkpoint-activations \
- --log-interval 1 \
- --save-interval 500 \
- --eval-interval 100 \
- --eval-iters 10 \
- --fp16 \
- --tensorboard-dir ${LOGDIR}
-"
-
- deepspeed_options=" \
- --deepspeed \
- --deepspeed_config ${config_json} \
- --zero-stage ${stage} \
- --zero-reduce-bucket-size ${rbs} \
- --zero-allgather-bucket-size ${agbs}
- "
-
-if [ "${contigious_gradients}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
- --zero-contigious-gradients"
-fi
-
-if [ "${reduce_scatter}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
- --zero-reduce-scatter"
-fi
-
-chkp_opt=" \
---checkpoint-activations \
---checkpoint-num-layers ${chkp_layers}"
-
-if [ "${PA}" = "true" ]; then
-chkp_opt="${chkp_opt} \
- --partition-activations"
-fi
-
-if [ "${PA_CPU}" = "true" ]; then
-chkp_opt="${chkp_opt} \
- --checkpoint-in-cpu"
-fi
-
-if [ "${SYNCHRONIZE}" = "true" ]; then
-chkp_opt="${chkp_opt} \
- --synchronize-each-layer"
-fi
-
-if [ "${CC}" = "true" ]; then
-chkp_opt="${chkp_opt} \
- --contigious-checkpointing"
-fi
-
-if [ "${PROFILE}" = "true" ]; then
-chkp_opt="${chkp_opt} \
- --profile-backward"
-fi
-
-full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}"
-
-run_cmd="deepspeed --num_nodes ${DLWS_NUM_WORKER} --num_gpus ${DLWS_NUM_GPU_PER_WORKER} pretrain_gpt2.py $@ ${full_options}"
-echo ${run_cmd}
-eval ${run_cmd}
-
-set +x
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_zero_stage_2_config.json b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_zero_stage_2_config.json
deleted file mode 100644
index 2ab86c2431..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/ds_zero_stage_2_config.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
- "train_batch_size": 2048,
- "gradient_accumulation_steps": 1,
- "steps_per_print": 1,
- "zero_optimization": {
- "stage": 2,
- "allgather_partitions": true,
- "reduce_scatter": true,
- "allgather_bucket_size": 50000000,
- "reduce_bucket_size": 50000000,
- "overlap_comm": true
- },
- "optimizer": {
- "type": "Adam",
- "params": {
- "lr": 0.00015,
- "max_grad_norm": 1.0,
- "betas": [0.9, 0.95]
- }
- },
- "gradient_clipping": 1.0,
- "fp16": {
- "enabled": true,
-
- "loss_scale": 0,
- "loss_scale_window": 1000,
- "hysteresis": 2,
- "min_loss_scale": 1
- },
- "wall_clock_breakdown": true,
- "zero_allow_untested_optimizer": false
-}
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/evaluate_zeroshot_gpt2.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/evaluate_zeroshot_gpt2.sh
deleted file mode 100644
index f4f9f22ff8..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/evaluate_zeroshot_gpt2.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-WORLD_SIZE=8
-
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
- --nnodes 1 \
- --node_rank 0 \
- --master_addr localhost \
- --master_port 6000"
-
-TASK="LAMBADA"
-
-VALID_DATA=
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-CHECKPOINT=checkpoints/gpt2_345m
-
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
- --task $TASK \
- --valid-data $VALID_DATA \
- --tokenizer-type GPT2BPETokenizer \
- --strict-lambada \
- --vocab-file $VOCAB_FILE \
- --merge-file $MERGE_FILE \
- --load $CHECKPOINT \
- --model-parallel-size 1 \
- --num-layers 24 \
- --hidden-size 1024 \
- --num-attention-heads 16 \
- --batch-size 8 \
- --checkpoint-activations \
- --seq-length 1024 \
- --max-position-embeddings 1024 \
- --log-interval 10 \
- --fp16 \
- --no-load-optim \
- --no-load-rng
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/finetune_mnli_distributed.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/finetune_mnli_distributed.sh
deleted file mode 100644
index 65f3a9f375..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/finetune_mnli_distributed.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-WORLD_SIZE=8
-
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
- --nnodes 1 \
- --node_rank 0 \
- --master_addr localhost \
- --master_port 6000"
-
-TRAIN_DATA="data/glue_data/MNLI/train.tsv"
-VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
- data/glue_data/MNLI/dev_mismatched.tsv"
-PRETRAINED_CHECKPOINT=checkpoints/bert_345m
-VOCAB_FILE=bert-vocab.txt
-CHECKPOINT_PATH=checkpoints/bert_345m_mnli
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
- --task MNLI \
- --seed 1234 \
- --train-data $TRAIN_DATA \
- --valid-data $VALID_DATA \
- --tokenizer-type BertWordPieceLowerCase \
- --vocab-file $VOCAB_FILE \
- --epochs 5 \
- --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
- --model-parallel-size 1 \
- --num-layers 24 \
- --hidden-size 1024 \
- --num-attention-heads 16 \
- --batch-size 8 \
- --checkpoint-activations \
- --lr 5.0e-5 \
- --lr-decay-style linear \
- --warmup 0.065 \
- --seq-length 512 \
- --max-position-embeddings 512 \
- --save-interval 500000 \
- --save $CHECKPOINT_PATH \
- --log-interval 10 \
- --eval-interval 100 \
- --eval-iters 50 \
- --weight-decay 1.0e-1 \
- --fp16
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/finetune_race_distributed.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/finetune_race_distributed.sh
deleted file mode 100644
index 0212ecbace..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/finetune_race_distributed.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-
-WORLD_SIZE=8
-
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
- --nnodes 1 \
- --node_rank 0 \
- --master_addr localhost \
- --master_port 6000"
-
-TRAIN_DATA="data/RACE/train/middle"
-VALID_DATA="data/RACE/dev/middle \
- data/RACE/dev/high"
-VOCAB_FILE=bert-vocab.txt
-PRETRAINED_CHECKPOINT=checkpoints/bert_345m
-CHECKPOINT_PATH=checkpoints/bert_345m_race
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
- --task RACE \
- --seed 1234 \
- --train-data $TRAIN_DATA \
- --valid-data $VALID_DATA \
- --tokenizer-type BertWordPieceLowerCase \
- --vocab-file $VOCAB_FILE \
- --epochs 3 \
- --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
- --model-parallel-size 1 \
- --num-layers 24 \
- --hidden-size 1024 \
- --num-attention-heads 16 \
- --batch-size 4 \
- --checkpoint-activations \
- --lr 1.0e-5 \
- --lr-decay-style linear \
- --warmup 0.06 \
- --seq-length 512 \
- --max-position-embeddings 512 \
- --save-interval 100000 \
- --save $CHECKPOINT_PATH \
- --log-interval 10 \
- --eval-interval 100 \
- --eval-iters 50 \
- --weight-decay 1.0e-1 \
- --clip-grad 1.0 \
- --hidden-dropout 0.1 \
- --attention-dropout 0.1 \
- --fp16
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/generate_text.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/generate_text.sh
deleted file mode 100644
index 6a04c4927a..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/generate_text.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-CHECKPOINT_PATH=checkpoints/gpt2_345m
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-
-python tools/generate_samples_gpt2.py \
- --model-parallel-size 1 \
- --num-layers 24 \
- --hidden-size 1024 \
- --load $CHECKPOINT_PATH \
- --num-attention-heads 16 \
- --max-position-embeddings 1024 \
- --tokenizer-type GPT2BPETokenizer \
- --fp16 \
- --batch-size 2 \
- --seq-length 1024 \
- --out-seq-length 1024 \
- --temperature 1.0 \
- --vocab-file $VOCAB_FILE \
- --merge-file $MERGE_FILE \
- --genfile unconditional_samples.json \
- --num-samples 2 \
- --top_p 0.9 \
- --recompute
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/merge_mp_bert.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/merge_mp_bert.sh
deleted file mode 100644
index 01e08b128b..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/merge_mp_bert.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-MODEL_PARALLEL_SIZE=2
-
-VOCAB_FILE=bert-vocab.txt
-CHECKPOINT_PATH=checkpoints/bert_345m
-
-WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
- --model-type BERT \
- --model-parallel-size $MODEL_PARALLEL_SIZE \
- --tokenizer-type BertWordPieceLowerCase \
- --vocab-file $VOCAB_FILE \
- --num-layers 24 \
- --hidden-size 1024 \
- --num-attention-heads 16 \
- --seq-length 512 \
- --max-position-embeddings 512 \
- --load $CHECKPOINT_PATH
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_bert.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_bert.sh
deleted file mode 100644
index ecf59477da..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_bert.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-RANK=0
-WORLD_SIZE=1
-DATA_PATH=_text_sentence
-CHECKPOINT_PATH=
-
-python pretrain_bert.py \
- --num-layers 24 \
- --hidden-size 1024 \
- --num-attention-heads 16 \
- --batch-size 4 \
- --seq-length 512 \
- --max-position-embeddings 512 \
- --train-iters 2000000 \
- --save $CHECKPOINT_PATH \
- --load $CHECKPOINT_PATH \
- --data-path $DATA_PATH \
- --vocab-file bert-vocab.txt \
- --data-impl mmap \
- --split 949,50,1 \
- --distributed-backend nccl \
- --lr 0.0001 \
- --min-lr 0.00001 \
- --lr-decay-style linear \
- --lr-decay-iters 990000 \
- --weight-decay 1e-2 \
- --clip-grad 1.0 \
- --warmup .01 \
- --log-interval 100 \
- --save-interval 10000 \
- --eval-interval 1000 \
- --eval-iters 10 \
- --fp16
-
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_bert_distributed.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_bert_distributed.sh
deleted file mode 100644
index 17ebae1fa0..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_bert_distributed.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-DATA_PATH=_text_sentence
-CHECKPOINT_PATH=
-
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
- pretrain_bert.py \
- --model-parallel-size 1 \
- --num-layers 24 \
- --hidden-size 1024 \
- --num-attention-heads 16 \
- --batch-size 4 \
- --seq-length 512 \
- --max-position-embeddings 512 \
- --train-iters 1000000 \
- --save $CHECKPOINT_PATH \
- --load $CHECKPOINT_PATH \
- --data-path $DATA_PATH \
- --vocab-file bert-vocab.txt \
- --data-impl mmap \
- --split 949,50,1 \
- --distributed-backend nccl \
- --lr 0.0001 \
- --lr-decay-style linear \
- --min-lr 1.0e-5 \
- --lr-decay-iters 990000 \
- --weight-decay 1e-2 \
- --clip-grad 1.0 \
- --warmup .01 \
- --log-interval 100 \
- --save-interval 10000 \
- --eval-interval 1000 \
- --eval-iters 10 \
- --fp16
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_gpt2.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_gpt2.sh
deleted file mode 100644
index 66232bf5ca..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_gpt2.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#! /bin/bash
-
-# Runs the "345M" parameter model
-
-RANK=0
-WORLD_SIZE=1
-
-DATA_PATH=_text_document
-CHECKPOINT_PATH=
-
-
-python pretrain_gpt2.py \
- --num-layers 24 \
- --hidden-size 1024 \
- --num-attention-heads 16 \
- --batch-size 8 \
- --seq-length 1024 \
- --max-position-embeddings 1024 \
- --train-iters 500000 \
- --lr-decay-iters 320000 \
- --save $CHECKPOINT_PATH \
- --load $CHECKPOINT_PATH \
- --data-path $DATA_PATH \
- --vocab-file gpt2-vocab.json \
- --merge-file gpt2-merges.txt \
- --data-impl mmap \
- --split 949,50,1 \
- --distributed-backend nccl \
- --lr 0.00015 \
- --min-lr 1.0e-5 \
- --lr-decay-style cosine \
- --weight-decay 1e-2 \
- --clip-grad 1.0 \
- --warmup .01 \
- --checkpoint-activations \
- --log-interval 100 \
- --save-interval 10000 \
- --eval-interval 1000 \
- --eval-iters 10 \
- --fp16
-
-
-set +x
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_gpt2_distributed.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_gpt2_distributed.sh
deleted file mode 100644
index 1d7462504c..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/examples/pretrain_gpt2_distributed.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#! /bin/bash
-
-# Runs the "345M" parameter model
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-DATA_PATH=_text_document
-CHECKPOINT_PATH=
-
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
- pretrain_gpt2.py \
- --model-parallel-size 1 \
- --num-layers 24 \
- --hidden-size 1024 \
- --num-attention-heads 16 \
- --batch-size 8 \
- --seq-length 1024 \
- --max-position-embeddings 1024 \
- --train-iters 500000 \
- --lr-decay-iters 320000 \
- --save $CHECKPOINT_PATH \
- --load $CHECKPOINT_PATH \
- --data-path $DATA_PATH \
- --vocab-file gpt2-vocab.json \
- --merge-file gpt2-merges.txt \
- --data-impl mmap \
- --split 949,50,1 \
- --distributed-backend nccl \
- --lr 0.00015 \
- --lr-decay-style cosine \
- --min-lr 1.0e-5 \
- --weight-decay 1e-2 \
- --clip-grad 1.0 \
- --warmup .01 \
- --checkpoint-activations \
- --log-interval 100 \
- --save-interval 10000 \
- --eval-interval 1000 \
- --eval-iters 10 \
- --fp16
-
-
-
-set +x
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/Makefile b/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/Makefile
deleted file mode 100644
index 5efde02983..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-default: cases.png scaling-mp.png scaling-dp.png
-
-# For some reason the size option to convert in scaling.tex doesn't work, so resize manually afterwards
-cases.png scaling-mp.png scaling-dp.png: tables.tex
- latex --shell-escape $<
- convert tables-1.png -resize 650 cases.png
- convert tables-2.png -resize 600 scaling-mp.png
- convert tables-3.png -resize 350 scaling-dp.png
-
-clean:
- rm -rf *.aux *.log *.dvi *.ps
- rm -rf tables-*.png
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/cases.png b/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/cases.png
deleted file mode 100644
index 8f52c38f54..0000000000
Binary files a/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/cases.png and /dev/null differ
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-dp.png b/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-dp.png
deleted file mode 100644
index ce3ae95806..0000000000
Binary files a/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-dp.png and /dev/null differ
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-mp.png b/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-mp.png
deleted file mode 100644
index 75f34985cc..0000000000
Binary files a/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/scaling-mp.png and /dev/null differ
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/tables.tex b/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/tables.tex
deleted file mode 100644
index 86d744ef01..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/images/tables.tex
+++ /dev/null
@@ -1,40 +0,0 @@
-\documentclass[multi,convert]{standalone}
-\usepackage{multirow}
-\standaloneenv{tabular}
-
-\begin{document}
-
-\begin{tabular}{cccccc}
- Case & Hidden Size & Attention Heads & Layers & Parameters (billions) & Model Parallel Partitions \\
- \hline
- 1B & 1920 & 15 & 24 & 1.16 & 1 \\
- 2B & 2304 & 18 & 30 & 2.03 & 2 \\
- 4B & 3072 & 24 & 36 & 4.24 & 4 \\
- 8B & 4096 & 32 & 42 & 8.67 & 8 \\
-\end{tabular}
-
-\begin{tabular}{cc|ccc|ccc}
- & & \multicolumn{3}{c|}{\textbf{DGX-2 (V100) batch size 8}} & \multicolumn{3}{c}{\textbf{DGX-A100 batch size 16}} \\
- \hline
- \multirow{2}{*}{Case} & Number of & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs \\
- & GPUs & Time (ms) & & per GPU & Time (ms) & & per GPU \\
- \hline
- 1B & 1 & 1121 & 100.0\% & 71.9 & 1076 & 100\% & 149.8 \\
- 2B & 2 & 1093 & 89.6\% & 64.2 & 1026 & 91.7\% & 136.8 \\
- 4B & 4 & 1238 & 82.5\% & 58.5 & 1162 & 84.5\% & 124.7 \\
- 8B & 8 & 1407 & 74.3\% & 52.2 & 1343 & 74.7\% & 109.3 \\
-\end{tabular}
-
-\begin{tabular}{cc|ccc}
- & & \multicolumn{3}{c}{\textbf{DGX-A100 batch size 2048}} \\
- \hline
- \multirow{2}{*}{Case} & Number of & Iteration & \multirow{2}{*}{Scaling} & TeraFLOPs \\
- & GPUs & Time (ms) & & per GPU \\
- \hline
- 1B & 128 & 1153 & 93.3\% & 139.8 \\
- 2B & 256 & 1101 & 85.5\% & 127.5 \\
- 4B & 512 & 1242 & 79.0\% & 116.7 \\
- 8B & 1024 & 1380 & 72.7\% & 106.5 \\
-\end{tabular}
-
-\end{document}
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/__init__.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/__init__.py
deleted file mode 100644
index 3f4fbdabc7..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/__init__.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-
-from .package_info import (
- __description__,
- __contact_names__,
- __url__,
- __download_url__,
- __keywords__,
- __license__,
- __package_name__,
- __version__,
-)
-
-from .global_vars import get_args
-from .global_vars import get_tokenizer
-from .global_vars import get_tensorboard_writer
-from .global_vars import get_adlr_autoresume
-from .global_vars import get_timers
-from .initialize import initialize_megatron
-
-def print_rank_0(message):
- """If distributed is initialized print only on rank 0."""
- if torch.distributed.is_initialized():
- if torch.distributed.get_rank() == 0:
- print(message, flush=True)
- else:
- print(message, flush=True)
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/arguments.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/arguments.py
deleted file mode 100644
index 83a78256a6..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/arguments.py
+++ /dev/null
@@ -1,538 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Megatron arguments."""
-
-import argparse
-import os
-
-import torch
-from megatron import fused_kernels
-
-import deepspeed
-
-def parse_args(extra_args_provider=None, defaults={},
- ignore_unknown_args=False):
- """Parse all arguments."""
- parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
- allow_abbrev=False)
-
- # Standard arguments.
- parser = _add_network_size_args(parser)
- parser = _add_regularization_args(parser)
- parser = _add_training_args(parser)
- parser = _add_initialization_args(parser)
- parser = _add_learning_rate_args(parser)
- parser = _add_checkpointing_args(parser)
- parser = _add_mixed_precision_args(parser)
- parser = _add_distributed_args(parser)
- parser = _add_validation_args(parser)
- parser = _add_data_args(parser)
- parser = _add_autoresume_args(parser)
- parser = _add_realm_args(parser)
- parser = _add_zero_args(parser)
- parser = _add_activation_checkpoint_args(parser)
-
- # Custom arguments.
- if extra_args_provider is not None:
- parser = extra_args_provider(parser)
-
- # Include DeepSpeed configuration arguments
- parser = deepspeed.add_config_arguments(parser)
-
- # Parse.
- if ignore_unknown_args:
- args, _ = parser.parse_known_args()
- else:
- args = parser.parse_args()
-
- # Distributed args.
- args.rank = int(os.getenv('RANK', '0'))
- args.world_size = int(os.getenv("WORLD_SIZE", '1'))
- args.model_parallel_size = min(args.model_parallel_size, args.world_size)
- if args.rank == 0:
- print('using world size: {} and model-parallel size: {} '.format(
- args.world_size, args.model_parallel_size))
-
- # Fp16 loss scaling.
- args.dynamic_loss_scale = False
- if args.loss_scale is None:
- args.dynamic_loss_scale = True
-
- # Parameters dtype.
- args.params_dtype = torch.float
- if args.fp16:
- args.params_dtype = torch.half
- if args.rank == 0:
- print('using {} for parameters ...'.format(args.params_dtype),
- flush=True)
-
-
- # Set input defaults.
- for key in defaults:
- # For default to be valid, it should not be provided in the
- # arguments that are passed to the program. We check this by
- # ensuring the arg is set to None.
- if getattr(args, key) is not None:
- if args.rank == 0:
- print('WARNING: overriding default arguments for {key}:{v} \
- with {key}:{v2}'.format(key=key, v=defaults[key],
- v2=getattr(args, key)),
- flush=True)
- else:
- setattr(args, key, defaults[key])
-
- # Check required arguments.
- required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
- 'max_position_embeddings']
- for req_arg in required_args:
- _check_arg_is_not_none(args, req_arg)
-
- # Checks.
- assert args.hidden_size % args.num_attention_heads == 0
- if args.seq_length is not None:
- assert args.max_position_embeddings >= args.seq_length
- if args.lr is not None:
- assert args.min_lr <= args.lr
- if args.save is not None:
- assert args.save_interval is not None
- # Parameters sharing does not work with torch DDP.
- if (args.num_unique_layers is not None) and (args.num_layers is not None):
- assert args.num_unique_layers <= args.num_layers
- assert args.num_layers % args.num_unique_layers == 0, \
- 'num-layers should be divisible by num-unique-layers.'
- if args.num_unique_layers < args.num_layers:
- assert args.DDP_impl == 'local', \
- 'torch-DDP does not work with parameters sharing.'
- # Mixed precision checks.
- if args.fp16_lm_cross_entropy:
-        assert args.fp16, 'lm cross entropy in fp16 is only supported in fp16 mode.'
- # Activation checkpointing.
- if args.distribute_checkpointed_activations:
- assert args.checkpoint_activations, \
- 'for distribute-checkpointed-activations to work you '\
- 'need to enable checkpoint-activations'
-
- # load scaled_upper_triang_masked_softmax_fusion kernel
- if args.scaled_upper_triang_masked_softmax_fusion:
- fused_kernels.load_scaled_upper_triang_masked_softmax_fusion_kernel()
-
- # load scaled_masked_softmax_fusion kernel
- if args.scaled_masked_softmax_fusion:
- fused_kernels.load_scaled_masked_softmax_fusion_kernel()
-
- _print_args(args)
- return args
-
-
-def _print_args(args):
- """Print arguments."""
- if args.rank == 0:
- print('-------------------- arguments --------------------', flush=True)
- str_list = []
- for arg in vars(args):
- dots = '.' * (32 - len(arg))
- str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
- for arg in sorted(str_list, key=lambda x: x.lower()):
- print(arg, flush=True)
- print('---------------- end of arguments ----------------', flush=True)
-
-
-def _check_arg_is_not_none(args, arg):
- assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
-
-
-def _add_network_size_args(parser):
- group = parser.add_argument_group(title='network size')
-
- group.add_argument('--num-layers', type=int, default=None,
- help='Number of transformer layers.')
- group.add_argument('--num-unique-layers', type=int, default=None,
- help='Number of unique transformer layers. '
- '`num-layers` should be divisible by this value.')
- group.add_argument('--param-sharing-style', default='grouped',
- choices=['grouped', 'spaced'],
- help='Ordering of the shared parameters. For example, '
- 'for a `num-layers`=4 and `--num-unique-layers`=2, '
- 'we will have the following ordering for two unique '
- 'layers 1 and 2: '
- ' grouped: [1, 2, 1, 2] and spaced: [1, 1, 2, 2].')
- group.add_argument('--hidden-size', type=int, default=None,
-                       help='Transformer hidden size.')
- group.add_argument('--num-attention-heads', type=int, default=None,
- help='Number of transformer attention heads.')
- group.add_argument('--max-position-embeddings', type=int, default=None,
- help='Maximum number of position embeddings to use. '
- 'This is the size of position embedding.')
- group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
-                       help='Pad the vocab size to be divisible by this value. '
-                       'This is added for computational efficiency reasons.')
- group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
- help='Layer norm epsilon.')
- group.add_argument('--apply-residual-connection-post-layernorm',
- action='store_true',
-                       help='If set, use original BERT residual connection '
- 'ordering.')
- group.add_argument('--openai-gelu', action='store_true',
-                       help='Use the OpenAI GeLU implementation. This option '
-                       'should not be used except for backward compatibility '
-                       'reasons.')
- group.add_argument('--onnx-safe', type=bool, required=False,
- help='Use workarounds for known problems with Torch ONNX exporter')
-
- return parser
-
-
-def _add_regularization_args(parser):
- group = parser.add_argument_group(title='regularization')
-
- group.add_argument('--attention-dropout', type=float, default=0.1,
-                       help='Post attention dropout probability.')
- group.add_argument('--hidden-dropout', type=float, default=0.1,
- help='Dropout probability for hidden state transformer.')
- group.add_argument('--weight-decay', type=float, default=0.01,
- help='Weight decay coefficient for L2 regularization.')
- group.add_argument('--clip-grad', type=float, default=1.0,
- help='Gradient clipping based on global L2 norm.')
- group.add_argument('--adam-beta1', type=float, default=0.9,
-                       help='First coefficient for computing running averages '
-                       'of gradient and its square.')
- group.add_argument('--adam-beta2', type=float, default=0.999,
-                       help='Second coefficient for computing running averages '
-                       'of gradient and its square.')
- group.add_argument('--adam-eps', type=float, default=1e-08,
-                       help='Term added to the denominator to improve '
-                       'numerical stability.')
-
- return parser
-
-
-def _add_training_args(parser):
- group = parser.add_argument_group(title='training')
-
- group.add_argument('--batch-size', type=int, default=None,
- help='Batch size per model instance (local batch size). '
- 'Global batch size is local batch size times data '
- 'parallel size.')
- group.add_argument('--gas', type=int, default=1,
- help='Gradient accumulation steps (pipeline parallelism only). '
- 'Global batch size is local batch size times data '
- 'parallel size times gas.')
- group.add_argument('--checkpoint-activations', action='store_true',
- help='Checkpoint activation to allow for training '
- 'with larger models, sequences, and batch sizes.')
- group.add_argument('--distribute-checkpointed-activations',
- action='store_true',
- help='If set, distribute checkpointed activations '
- 'across model parallel group.')
- group.add_argument('--checkpoint-num-layers', type=int, default=1,
- help='chunk size (number of layers) for checkpointing.')
- group.add_argument('--train-iters', type=int, default=None,
- help='Total number of iterations to train over all '
- 'training runs.')
- group.add_argument('--log-interval', type=int, default=100,
- help='Report loss and timing interval.')
- group.add_argument('--exit-interval', type=int, default=None,
- help='Exit the program after the iteration is divisible '
- 'by this value.')
- group.add_argument('--tensorboard-dir', type=str, default=None,
- help='Write TensorBoard logs to this directory.')
- group.add_argument('--scaled-upper-triang-masked-softmax-fusion',
- action='store_true',
- help='Enable fusion of query_key_value_scaling '
- 'time (upper diagonal) masking and softmax.')
- group.add_argument('--scaled-masked-softmax-fusion',
- action='store_true',
- help='Enable fusion of query_key_value_scaling '
- 'general masking and softmax.')
- group.add_argument('--bias-gelu-fusion', action='store_true',
- help='Enable bias and gelu fusion.')
- group.add_argument('--bias-dropout-fusion', action='store_true',
- help='Enable bias and dropout fusion.')
-
- group.add_argument('--cpu-optimizer', action='store_true',
- help='Run optimizer on CPU')
- group.add_argument('--cpu_torch_adam', action='store_true',
- help='Use Torch Adam as optimizer on CPU.')
- return parser
-
-
-def _add_initialization_args(parser):
- group = parser.add_argument_group(title='initialization')
-
- group.add_argument('--seed', type=int, default=1234,
- help='Random seed used for python, numpy, '
- 'pytorch, and cuda.')
- group.add_argument('--init-method-std', type=float, default=0.02,
- help='Standard deviation of the zero mean normal '
- 'distribution used for weight initialization.')
-
- return parser
-
-
-def _add_learning_rate_args(parser):
- group = parser.add_argument_group(title='learning rate')
-
- group.add_argument('--lr', type=float, default=None,
- help='Initial learning rate. Depending on decay style '
-                       'and initial warmup, the learning rate at each '
- 'iteration would be different.')
- group.add_argument('--lr-decay-style', type=str, default='linear',
- choices=['constant', 'linear', 'cosine', 'exponential'],
- help='Learning rate decay function.')
- group.add_argument('--lr-decay-iters', type=int, default=None,
-                       help='Number of iterations to decay learning rate over. '
-                       'If None, defaults to `--train-iters`.')
- group.add_argument('--min-lr', type=float, default=0.0,
-                       help='Minimum value for learning rate. The scheduler '
-                       'clips values below this threshold.')
- group.add_argument('--warmup', type=float, default=0.01,
- help='Percentage of total iterations to warmup on '
- '(.01 = 1 percent of all training iters).')
- group.add_argument('--override-lr-scheduler', action='store_true',
-                       help='Reset the values of the scheduler (learning rate, '
-                       'warmup iterations, minimum learning rate, maximum '
-                       'number of iterations, and decay style) from input '
-                       'arguments and ignore values from checkpoints. Note '
-                       'that all the above values will be reset.')
- group.add_argument('--use-checkpoint-lr-scheduler', action='store_true',
- help='Use checkpoint to set the values of the scheduler '
- '(learning rate, warmup iterations, minimum learning '
-                       'rate, maximum number of iterations, and decay style) '
- 'from checkpoint and ignore input arguments.')
-
- return parser
-
-
-def _add_checkpointing_args(parser):
- group = parser.add_argument_group(title='checkpointing')
-
- group.add_argument('--save', type=str, default=None,
- help='Output directory to save checkpoints to.')
- group.add_argument('--save-interval', type=int, default=None,
- help='Number of iterations between checkpoint saves.')
- group.add_argument('--no-save-optim', action='store_true',
- help='Do not save current optimizer.')
- group.add_argument('--no-save-rng', action='store_true',
- help='Do not save current rng state.')
- group.add_argument('--load', type=str, default=None,
- help='Directory containing a model checkpoint.')
- group.add_argument('--no-load-optim', action='store_true',
- help='Do not load optimizer when loading checkpoint.')
- group.add_argument('--no-load-rng', action='store_true',
- help='Do not load rng state when loading checkpoint.')
- group.add_argument('--finetune', action='store_true',
- help='Load model for finetuning. Do not load optimizer '
- 'or rng state from checkpoint and set iteration to 0. '
- 'Assumed when loading a release checkpoint.')
-
- return parser
-
-
-def _add_mixed_precision_args(parser):
- group = parser.add_argument_group(title='mixed precision')
-
- group.add_argument('--fp16', action='store_true',
- help='Run model in fp16 mode.')
- group.add_argument('--apply-query-key-layer-scaling', action='store_true',
- help='Scale Q * K^T by 1 / layer-number. If this flag '
- 'is set, then it will automatically set '
- 'attention-softmax-in-fp32 to true')
- group.add_argument('--attention-softmax-in-fp32', action='store_true',
- help='Run attention masking and softmax in fp32.')
- group.add_argument('--fp32-allreduce', action='store_true',
- help='All-reduce in fp32')
- group.add_argument('--hysteresis', type=int, default=2,
- help='hysteresis for dynamic loss scaling')
- group.add_argument('--loss-scale', type=float, default=None,
- help='Static loss scaling, positive power of 2 '
-                       'values can improve fp16 convergence. If None, dynamic '
- 'loss scaling is used.')
- group.add_argument('--loss-scale-window', type=float, default=1000,
- help='Window over which to raise/lower dynamic scale.')
- group.add_argument('--min-scale', type=float, default=1,
- help='Minimum loss scale for dynamic loss scale.')
- group.add_argument('--fp16-lm-cross-entropy', action='store_true',
-                       help='Move the cross entropy unreduced loss calculation '
- 'for lm head to fp16.')
-
-
- return parser
-
-
-def _add_distributed_args(parser):
-    group = parser.add_argument_group(title='distributed')
-
- group.add_argument('--model-parallel-size', type=int, default=1,
-                       help='Size of the model-parallel group.')
- group.add_argument('--pipe-parallel-size', type=int, default=0,
-                       help='Size of the pipeline-parallel group. Disable with 0.')
- group.add_argument('--distributed-backend', default='nccl',
- choices=['nccl', 'gloo'],
- help='Which backend to use for distributed training.')
- group.add_argument('--DDP-impl', default='local',
- choices=['local', 'torch'],
- help='which DistributedDataParallel implementation '
- 'to use.')
- group.add_argument('--local_rank', type=int, default=None,
- help='local rank passed from distributed launcher.')
- group.add_argument('--lazy-mpu-init', type=bool, required=False,
-                       help='If set to True, initialize_megatron() skips DDP '
-                       'initialization and returns a function to complete it '
-                       'instead. Also turns on the --use-cpu-initialization '
-                       'flag. This is for an external DDP manager.')
- group.add_argument('--use-cpu-initialization', action='store_true',
- help='If set, affine parallel weights initialization uses CPU' )
- return parser
-
-
-def _add_validation_args(parser):
- group = parser.add_argument_group(title='validation')
-
- group.add_argument('--eval-iters', type=int, default=100,
-                       help='Number of iterations to run for evaluation on '
-                       'the validation/test sets.')
- group.add_argument('--eval-interval', type=int, default=1000,
- help='Interval between running evaluation on '
- 'validation set.')
-
- return parser
-
-
-def _add_data_args(parser):
- group = parser.add_argument_group(title='data and dataloader')
-
- group.add_argument('--data-path', type=str, default=None,
- help='Path to combined dataset to split.')
- group.add_argument('--split', type=str, default='969, 30, 1',
- help='Comma-separated list of proportions for training,'
- ' validation, and test split. For example the split '
- '`90,5,5` will use 90% of data for training, 5% for '
- 'validation and 5% for test.')
- group.add_argument('--vocab-file', type=str, default=None,
- help='Path to the vocab file.')
- group.add_argument('--merge-file', type=str, default=None,
- help='Path to the BPE merge file.')
- group.add_argument('--seq-length', type=int, default=None,
- help="Maximum sequence length to process.")
- group.add_argument('--mask-prob', type=float, default=0.15,
- help='Probability of replacing a token with mask.')
- group.add_argument('--short-seq-prob', type=float, default=0.1,
- help='Probability of producing a short sequence.')
- group.add_argument('--mmap-warmup', action='store_true',
- help='Warm up mmap files.')
- group.add_argument('--num-workers', type=int, default=2,
- help="Dataloader number of workers.")
- group.add_argument('--tokenizer-type', type=str,
- default=None,
- choices=['BertWordPieceLowerCase',
- 'BertWordPieceCase',
- 'GPT2BPETokenizer'],
- help='What type of tokenizer to use.')
- group.add_argument('--data-impl', type=str, default='infer',
- choices=['lazy', 'cached', 'mmap', 'infer'],
- help='Implementation of indexed datasets.')
- group.add_argument('--reset-position-ids', action='store_true',
-                       help='Reset position ids after end-of-document token.')
- group.add_argument('--reset-attention-mask', action='store_true',
-                       help='Reset self attention mask after '
- 'end-of-document token.')
- group.add_argument('--eod-mask-loss', action='store_true',
- help='Mask loss for the end of document tokens.')
-
- return parser
-
-
-def _add_autoresume_args(parser):
- group = parser.add_argument_group(title='autoresume')
-
- group.add_argument('--adlr-autoresume', action='store_true',
- help='Enable autoresume on adlr cluster.')
- group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
-                       help='Interval over which to check for the autoresume '
-                       'termination signal.')
-
- return parser
-
-
-def _add_realm_args(parser):
- group = parser.add_argument_group(title='realm')
-
- # network size
- group.add_argument('--ict-head-size', type=int, default=None,
- help='Size of block embeddings to be used in ICT and REALM (paper default: 128)')
-
- # checkpointing
- group.add_argument('--ict-load', type=str, default=None,
- help='Directory containing an ICTBertModel checkpoint')
- group.add_argument('--bert-load', type=str, default=None,
-                       help='Directory containing a BertModel checkpoint (needed to start ICT and REALM)')
-
- # data
- group.add_argument('--titles-data-path', type=str, default=None,
- help='Path to titles dataset used for ICT')
- group.add_argument('--query-in-block-prob', type=float, default=0.1,
- help='Probability of keeping query in block for ICT dataset')
- group.add_argument('--use-one-sent-docs', action='store_true',
- help='Whether to use one sentence documents in ICT')
-
- # training
- group.add_argument('--report-topk-accuracies', nargs='+', default=[],
- help="Which top-k accuracies to report (e.g. '1 5 20')")
-
- # faiss index
- group.add_argument('--faiss-use-gpu', action='store_true',
-                       help='Whether to create the FaissMIPSIndex on GPU')
- group.add_argument('--block-data-path', type=str, default=None,
- help='Where to save/load BlockData to/from')
-
- # indexer
- group.add_argument('--indexer-batch-size', type=int, default=128,
- help='How large of batches to use when doing indexing jobs')
- group.add_argument('--indexer-log-interval', type=int, default=1000,
- help='After how many batches should the indexer report progress')
- return parser
-
-
-def _add_zero_args(parser):
-    """ZeRO optimization arguments."""
-
-    group = parser.add_argument_group('ZeRO', 'ZeRO configurations')
-    group.add_argument("--zero-stage", type=int, default=1)
- group.add_argument('--zero-reduce-scatter', action='store_true',
- help='Use reduce scatter if specified')
- group.add_argument('--zero-contigious-gradients', action='store_true',
-                       help='Use contiguous memory optimization if specified')
-    group.add_argument("--zero-reduce-bucket-size", type=int, default=0)
-    group.add_argument("--zero-allgather-bucket-size", type=int, default=0)
- return parser
-
-
-def _add_activation_checkpoint_args(parser):
- group = parser.add_argument_group('Activation Checkpointing',
- 'Checkpointing Configurations')
- group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
-                       help='Use activation checkpointing from DeepSpeed.')
- group.add_argument('--partition-activations', action='store_true',
-                       help='Partition activations across GPUs before checkpointing.')
- group.add_argument('--contigious-checkpointing', action='store_true',
-                       help='Contiguous memory checkpointing for activations.')
- group.add_argument('--checkpoint-in-cpu', action='store_true',
- help='Move the activation checkpoints to CPU.')
- group.add_argument('--synchronize-each-layer', action='store_true',
-                       help='Synchronize at the beginning and end of each checkpointed layer.')
- group.add_argument('--profile-backward', action='store_true',
- help='Enables backward pass profiling for checkpointed layers.')
- return parser
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/checkpointing.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/checkpointing.py
deleted file mode 100644
index 80ee265cc3..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/checkpointing.py
+++ /dev/null
@@ -1,340 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Input/output checkpointing."""
-
-import os
-import random
-import sys
-import numpy as np
-
-import torch
-from torch.nn.parallel import DistributedDataParallel as torchDDP
-
-from megatron import mpu, get_args
-from megatron import get_args
-from megatron import print_rank_0
-
-_CHECKPOINT_VERSION = None
-
-def set_checkpoint_version(value):
- global _CHECKPOINT_VERSION
- assert _CHECKPOINT_VERSION is None, \
- "checkpoint version already set"
- _CHECKPOINT_VERSION = value
-
-def get_checkpoint_version():
- global _CHECKPOINT_VERSION
- return _CHECKPOINT_VERSION
-
-def check_checkpoint_args(checkpoint_args):
-    """Ensure fixed arguments for a model are the same for the input
-    arguments and the ones retrieved from the checkpoint."""
- args = get_args()
-
- def _compare(arg_name):
- checkpoint_value = getattr(checkpoint_args, arg_name)
- args_value = getattr(args, arg_name)
- error_message = '{} value from checkpoint ({}) is not equal to the ' \
- 'input argument value ({}).'.format(
- arg_name, checkpoint_value, args_value)
- assert checkpoint_value == args_value, error_message
-
- _compare('num_layers')
- _compare('hidden_size')
- _compare('num_attention_heads')
- _compare('max_position_embeddings')
- _compare('make_vocab_size_divisible_by')
- _compare('padded_vocab_size')
- _compare('tokenizer_type')
- _compare('model_parallel_size')
-
-
-def ensure_directory_exists(filename):
-    """Build filename's path if it does not already exist."""
- dirname = os.path.dirname(filename)
- if not os.path.exists(dirname):
- os.makedirs(dirname)
-
-
-def get_checkpoint_name(checkpoints_path, iteration,
- release=False, mp_rank=None):
- """A unified checkpoint name."""
- if release:
- directory = 'release'
- else:
- directory = 'iter_{:07d}'.format(iteration)
- return os.path.join(checkpoints_path, directory,
- 'mp_rank_{:02d}'.format(
- mpu.get_model_parallel_rank() if mp_rank is None
- else mp_rank),
- 'model_optim_rng.pt')
-
-
-def get_checkpoint_tracker_filename(checkpoints_path):
-    """Tracker file records the latest checkpoint during
-    training to restart from."""
- return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt')
-
-
-def save_ds_checkpoint(iteration, model, args):
- """Save a model checkpoint."""
-
- sd = {}
- sd['iteration'] = iteration
- # rng states.
- if not args.no_save_rng:
- sd['random_rng_state'] = random.getstate()
- sd['np_rng_state'] = np.random.get_state()
- sd['torch_rng_state'] = torch.get_rng_state()
- sd['cuda_rng_state'] = torch.cuda.get_rng_state()
- sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()
-
- if args.pipe_parallel_size == 0:
-        # The Megatron model uses state_dict_for_save_checkpoint instead of the standard state_dict.
-        # state_dict is used by DeepSpeed for module saving, so it needs to point to the right function.
- model.module.state_dict = model.module.state_dict_for_save_checkpoint
- else:
- # Pipeline parallelism manages its own state_dict.
- pass
-
- model.save_checkpoint(args.save, client_state=sd)
-
-
-def save_checkpoint(iteration, model, optimizer, lr_scheduler):
- """Save a model checkpoint."""
- args = get_args()
-
- if args.deepspeed:
- save_ds_checkpoint(iteration, model, args)
- else:
- # Only rank zero of the data parallel writes to the disk.
- if isinstance(model, torchDDP):
- model = model.module
- if mpu.get_data_parallel_rank() == 0:
-
- # Arguments, iteration, and model.
- state_dict = {}
- state_dict['args'] = args
- state_dict['checkpoint_version'] = 2.0
- state_dict['iteration'] = iteration
- state_dict['model'] = model.state_dict_for_save_checkpoint()
-
- # Optimizer stuff.
- if not args.no_save_optim:
- if optimizer is not None:
- state_dict['optimizer'] = optimizer.state_dict()
- if lr_scheduler is not None:
- state_dict['lr_scheduler'] = lr_scheduler.state_dict()
-
- # RNG states.
- if not args.no_save_rng:
- state_dict['random_rng_state'] = random.getstate()
- state_dict['np_rng_state'] = np.random.get_state()
- state_dict['torch_rng_state'] = torch.get_rng_state()
- state_dict['cuda_rng_state'] = torch.cuda.get_rng_state()
- state_dict['rng_tracker_states'] \
- = mpu.get_cuda_rng_tracker().get_states()
-
- # Save.
- checkpoint_name = get_checkpoint_name(args.save, iteration)
- print('global rank {} is saving checkpoint at iteration {:7d} to {}'.
- format(torch.distributed.get_rank(), iteration,
- checkpoint_name))
- ensure_directory_exists(checkpoint_name)
- torch.save(state_dict, checkpoint_name)
- print(' successfully saved {}'.format(checkpoint_name))
-
- # Wait so everyone is done (necessary)
- torch.distributed.barrier()
- # And update the latest iteration
- if torch.distributed.get_rank() == 0:
- tracker_filename = get_checkpoint_tracker_filename(args.save)
- with open(tracker_filename, 'w') as f:
- f.write(str(iteration))
- # Wait so everyone is done (not necessary)
- torch.distributed.barrier()
-
-
-def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
- """Load a model checkpoint and return the iteration."""
- args = get_args()
- load_dir = getattr(args, load_arg)
-
- if isinstance(model, torchDDP):
- model = model.module
- # Read the tracker file and set the iteration.
- tracker_filename = get_checkpoint_tracker_filename(load_dir)
-
-    # If no tracker file, return iteration zero.
- if not os.path.isfile(tracker_filename):
- print_rank_0('WARNING: could not find the metadata file {} '.format(
- tracker_filename))
- print_rank_0(' will not load any checkpoints and will start from '
- 'random')
- return 0
-
- # Otherwise, read the tracker file and either set the iteration or
- # mark it as a release checkpoint.
- iteration = 0
- release = False
- with open(tracker_filename, 'r') as f:
- metastring = f.read().strip()
- try:
- iteration = int(metastring)
- except ValueError:
- release = metastring == 'release'
- if not release:
- print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
- tracker_filename))
- sys.exit()
-
- assert iteration > 0 or release, 'error parsing metadata file {}'.format(
- tracker_filename)
-
- if args.deepspeed:
- checkpoint_name, state_dict = model.load_checkpoint(load_dir)
-
- if checkpoint_name is None:
- if mpu.get_data_parallel_rank() == 0:
- print("Unable to load checkpoint.")
- return iteration
-
- else:
- # Checkpoint.
- checkpoint_name = get_checkpoint_name(load_dir, iteration, release)
- if mpu.get_data_parallel_rank() == 0:
- print('global rank {} is loading checkpoint {}'.format(
- torch.distributed.get_rank(), checkpoint_name))
-
- # Load the checkpoint.
- try:
- state_dict = torch.load(checkpoint_name, map_location='cpu')
- except ModuleNotFoundError:
- # For backward compatibility.
- print_rank_0(' > deserializing using the old code structure ...')
- sys.modules['fp16.loss_scaler'] = sys.modules[
- 'megatron.fp16.loss_scaler']
- state_dict = torch.load(checkpoint_name, map_location='cpu')
- sys.modules.pop('fp16.loss_scaler', None)
- except BaseException:
- print_rank_0('could not load the checkpoint')
- sys.exit()
- # Model.
-
- model.load_state_dict(state_dict['model'])
-
- # Optimizer.
- if not release and not args.finetune and not args.no_load_optim:
- try:
- if optimizer is not None:
- optimizer.load_state_dict(state_dict['optimizer'])
- if lr_scheduler is not None:
- lr_scheduler.load_state_dict(state_dict['lr_scheduler'])
- except KeyError:
- print_rank_0(
- 'Unable to load optimizer from checkpoint {}. '
- 'Specify --no-load-optim or --finetune to prevent '
- 'attempting to load the optimizer state, '
- 'exiting ...'.format(checkpoint_name))
- sys.exit()
-
- # set checkpoint version
- set_checkpoint_version(state_dict.get('checkpoint_version', 0))
-
- # Set iteration.
- if args.finetune or release:
- iteration = 0
- else:
- try:
- iteration = state_dict['iteration']
- except KeyError:
- try: # Backward compatible with older checkpoints
- iteration = state_dict['total_iters']
- except KeyError:
- print_rank_0('A metadata file exists but unable to load '
- 'iteration from checkpoint {}, exiting'.format(
- checkpoint_name))
- sys.exit()
-
-
- # Check arguments.
- if 'args' in state_dict:
- checkpoint_args = state_dict['args']
- check_checkpoint_args(checkpoint_args)
- else:
- print_rank_0('could not find arguments in the checkpoint ...')
-
- # rng states.
- if not release and not args.finetune and not args.no_load_rng:
- try:
- random.setstate(state_dict['random_rng_state'])
- np.random.set_state(state_dict['np_rng_state'])
- torch.set_rng_state(state_dict['torch_rng_state'])
- torch.cuda.set_rng_state(state_dict['cuda_rng_state'])
- mpu.get_cuda_rng_tracker().set_states(
- state_dict['rng_tracker_states'])
- except KeyError:
-            print_rank_0('Unable to load rng state from checkpoint {}. '
-                         'Specify --no-load-rng or --finetune to prevent '
-                         'attempting to load the rng state, '
- 'exiting ...'.format(checkpoint_name))
- sys.exit()
-
- torch.distributed.barrier()
- if mpu.get_data_parallel_rank() == 0:
- print(' successfully loaded {}'.format(checkpoint_name))
-
- return iteration
-
-
-def load_ict_checkpoint(model, only_query_model=False, only_block_model=False, from_realm_chkpt=False):
- """selectively load ICT models for indexing/retrieving from ICT or REALM checkpoints"""
-
- args = get_args()
-
- if isinstance(model, torchDDP):
- model = model.module
-
- load_path = args.load if from_realm_chkpt else args.ict_load
-
- tracker_filename = get_checkpoint_tracker_filename(load_path)
- with open(tracker_filename, 'r') as f:
- iteration = int(f.read().strip())
-
- # assert iteration > 0
- checkpoint_name = get_checkpoint_name(load_path, iteration, False)
- if mpu.get_data_parallel_rank() == 0:
- print('global rank {} is loading checkpoint {}'.format(
- torch.distributed.get_rank(), checkpoint_name))
-
- state_dict = torch.load(checkpoint_name, map_location='cpu')
- ict_state_dict = state_dict['model']
- if from_realm_chkpt and mpu.get_data_parallel_rank() == 0:
- print(" loading ICT state dict from REALM", flush=True)
- ict_state_dict = ict_state_dict['retriever']['ict_model']
-
- if only_query_model:
- ict_state_dict.pop('context_model')
- if only_block_model:
- ict_state_dict.pop('question_model')
-
- model.load_state_dict(ict_state_dict)
- torch.distributed.barrier()
-
- if mpu.get_data_parallel_rank() == 0:
- print(' successfully loaded {}'.format(checkpoint_name))
-
- return model
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/Makefile b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/Makefile
deleted file mode 100644
index 8f9db76866..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
-CPPFLAGS += $(shell python3 -m pybind11 --includes)
-LIBNAME = helpers
-LIBEXT = $(shell python3-config --extension-suffix)
-
-default: $(LIBNAME)$(LIBEXT)
-
-%$(LIBEXT): %.cpp
- $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/__init__.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/__init__.py
deleted file mode 100644
index cd5f898c6b..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from . import indexed_dataset
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/bert_dataset.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/bert_dataset.py
deleted file mode 100644
index 5203666f35..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/bert_dataset.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""BERT Style dataset."""
-
-import os
-import time
-
-import numpy as np
-import torch
-from torch.utils.data import Dataset
-
-from megatron import get_tokenizer, get_args
-from megatron import print_rank_0
-from megatron import mpu
-from megatron.data.dataset_utils import get_a_and_b_segments
-from megatron.data.dataset_utils import truncate_segments
-from megatron.data.dataset_utils import create_tokens_and_tokentypes
-from megatron.data.dataset_utils import pad_and_convert_to_numpy
-from megatron.data.dataset_utils import create_masked_lm_predictions
-
-
-class BertDataset(Dataset):
-
- def __init__(self, name, indexed_dataset, data_prefix,
- num_epochs, max_num_samples, masked_lm_prob,
- max_seq_length, short_seq_prob, seed):
-
- # Params to store.
- self.name = name
- self.seed = seed
- self.masked_lm_prob = masked_lm_prob
- self.max_seq_length = max_seq_length
-
- # Dataset.
- self.indexed_dataset = indexed_dataset
-
- # Build the samples mapping.
- self.samples_mapping = get_samples_mapping_(self.indexed_dataset,
- data_prefix,
- num_epochs,
- max_num_samples,
- self.max_seq_length,
- short_seq_prob,
- self.seed,
- self.name)
-
- # Vocab stuff.
- tokenizer = get_tokenizer()
- self.vocab_id_list = list(tokenizer.inv_vocab.keys())
- self.vocab_id_to_token_dict = tokenizer.inv_vocab
- self.cls_id = tokenizer.cls
- self.sep_id = tokenizer.sep
- self.mask_id = tokenizer.mask
- self.pad_id = tokenizer.pad
-
- def __len__(self):
- return self.samples_mapping.shape[0]
-
- def __getitem__(self, idx):
- start_idx, end_idx, seq_length = self.samples_mapping[idx]
- sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]
- # Note that this rng state should be numpy and not python since
- # python randint is inclusive whereas the numpy one is exclusive.
- np_rng = np.random.RandomState(seed=(self.seed + idx))
- return build_training_sample(sample, seq_length,
- self.max_seq_length, # needed for padding
- self.vocab_id_list,
- self.vocab_id_to_token_dict,
- self.cls_id, self.sep_id,
- self.mask_id, self.pad_id,
- self.masked_lm_prob, np_rng)
-
-
-def get_samples_mapping_(indexed_dataset,
- data_prefix,
- num_epochs,
- max_num_samples,
- max_seq_length,
- short_seq_prob,
- seed,
- name):
- if not num_epochs:
- if not max_num_samples:
- raise ValueError("Need to specify either max_num_samples "
- "or num_epochs")
- num_epochs = np.iinfo(np.int32).max - 1
- if not max_num_samples:
- max_num_samples = np.iinfo(np.int64).max - 1
-
- # Filename of the index mapping
- indexmap_filename = data_prefix
- indexmap_filename += '_{}_indexmap'.format(name)
- if num_epochs != (np.iinfo(np.int32).max - 1):
- indexmap_filename += '_{}ep'.format(num_epochs)
- if max_num_samples != (np.iinfo(np.int64).max - 1):
- indexmap_filename += '_{}mns'.format(max_num_samples)
- indexmap_filename += '_{}msl'.format(max_seq_length)
- indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
- indexmap_filename += '_{}s'.format(seed)
- indexmap_filename += '.npy'
-
- # Build the indexed mapping if not exist.
- if torch.distributed.get_rank() == 0 and \
- not os.path.isfile(indexmap_filename):
- print(' > WARNING: could not find index map file {}, building '
- 'the indices on rank 0 ...'.format(indexmap_filename))
-
- # Make sure the types match the helpers input types.
- assert indexed_dataset.doc_idx.dtype == np.int64
- assert indexed_dataset.sizes.dtype == np.int32
-
- # Build samples mapping
- verbose = torch.distributed.get_rank() == 0
- start_time = time.time()
- print_rank_0(' > building samples index mapping for {} ...'.format(
- name))
- # First compile and then import.
- from megatron.data.dataset_utils import compile_helper
- compile_helper()
- from megatron.data import helpers
- samples_mapping = helpers.build_mapping(
- indexed_dataset.doc_idx,
- indexed_dataset.sizes,
- num_epochs,
- max_num_samples,
- max_seq_length - 3, # account for added tokens
- short_seq_prob,
- seed,
- verbose)
- print_rank_0(' > done building samples index mapping')
- np.save(indexmap_filename, samples_mapping, allow_pickle=True)
- print_rank_0(' > saved the index mapping in {}'.format(
- indexmap_filename))
- # Make sure all the ranks have built the mapping
- print_rank_0(' > elapsed time to build and save samples mapping '
- '(seconds): {:4f}'.format(
- time.time() - start_time))
- # This should be a barrier but nccl barrier assumes
- # device_index=rank which is not the case for model
- # parallel case
- counts = torch.cuda.LongTensor([1])
- torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
- assert counts[0].item() == torch.distributed.get_world_size(
- group=mpu.get_data_parallel_group())
-
- # Load indexed dataset.
- print_rank_0(' > loading indexed mapping from {}'.format(
- indexmap_filename))
- start_time = time.time()
- samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
- print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
- time.time() - start_time))
- print_rank_0(' total number of samples: {}'.format(
- samples_mapping.shape[0]))
-
- return samples_mapping
-
-
-def build_training_sample(sample,
- target_seq_length, max_seq_length,
- vocab_id_list, vocab_id_to_token_dict,
- cls_id, sep_id, mask_id, pad_id,
- masked_lm_prob, np_rng):
- """Biuld training sample.
-
- Arguments:
- sample: A list of sentences in which each sentence is a list of token ids.
- target_seq_length: Desired sequence length.
- max_seq_length: Maximum length of the sequence. All values are padded to
- this length.
- vocab_id_list: List of vocabulary ids. Used to pick a random id.
- vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
- cls_id: Start of example id.
- sep_id: Separator id.
- mask_id: Mask token id.
- pad_id: Padding token id.
- masked_lm_prob: Probability to mask tokens.
- np_rng: Random number generator. Note that this rng state should be
- numpy and not python since python randint is inclusive for
- the upper bound whereas the numpy one is exclusive.
- """
-
- # We assume that we have at least two sentences in the sample
- assert len(sample) > 1
- assert target_seq_length <= max_seq_length
-
- # Divide sample into two segments (A and B).
- tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, np_rng)
-
- # Truncate to `target_sequence_length`.
- max_num_tokens = target_seq_length
- truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a),
- len(tokens_b), max_num_tokens, np_rng)
-
- # Build tokens and tokentypes.
- tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b,
- cls_id, sep_id)
-
- # Masking.
- max_predictions_per_seq = masked_lm_prob * max_num_tokens
- (tokens, masked_positions, masked_labels, _) = create_masked_lm_predictions(
- tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
- cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng)
-
- # Padding.
- tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \
- = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
- masked_labels, pad_id, max_seq_length)
-
- train_sample = {
- 'text': tokens_np,
- 'types': tokentypes_np,
- 'labels': labels_np,
- 'is_random': int(is_next_random),
- 'loss_mask': loss_mask_np,
- 'padding_mask': padding_mask_np,
- 'truncated': int(truncated)}
- return train_sample
-
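For context on the deleted `BertDataset.__getitem__` above: each example seeds its own NumPy RNG with `seed + idx`, so the masked-LM sampling for a given sample index is reproducible across workers and restarts. A minimal sketch of that pattern (illustrative only, not part of the original file):

```python
import numpy as np

def per_sample_rng_demo(seed: int, idx: int, draws: int = 3):
    # The (seed, idx) pair fully determines the draws, so the same sample
    # index always gets the same masking decisions.
    np_rng = np.random.RandomState(seed=(seed + idx))
    return np_rng.randint(0, 100, size=draws)

assert (per_sample_rng_demo(1234, 7) == per_sample_rng_demo(1234, 7)).all()
```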
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/dataset_utils.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/dataset_utils.py
deleted file mode 100644
index d51b1ceb56..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/dataset_utils.py
+++ /dev/null
@@ -1,503 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Most of the code here has been copied from:
-# https://github.com/google-research/albert/blob/master/create_pretraining_data.py
-# with some modifications.
-
-import time
-import collections
-
-import numpy as np
-from megatron import get_args, print_rank_0
-from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
-
-DSET_TYPE_STD = 'standard_bert'
-DSET_TYPE_ICT = 'ict'
-
-DSET_TYPES = [DSET_TYPE_ICT, DSET_TYPE_STD]
-
-
-def compile_helper():
- """Compile helper function ar runtime. Make sure this
- is invoked on a single process."""
- import os
- import subprocess
- path = os.path.abspath(os.path.dirname(__file__))
- ret = subprocess.run(['make', '-C', path])
- if ret.returncode != 0:
- print("Making C++ dataset helpers module failed, exiting.")
- import sys
- sys.exit(1)
-
-
-def get_a_and_b_segments(sample, np_rng):
- """Divide sample into a and b segments."""
-
- # Number of sentences in the sample.
- n_sentences = len(sample)
- # Make sure we always have two sentences.
- assert n_sentences > 1, 'make sure each sample has at least two sentences.'
-
- # First part:
- # `a_end` is how many sentences go into the `A`.
- a_end = 1
- if n_sentences >= 3:
- # Note that randint in numpy is exclusive.
- a_end = np_rng.randint(1, n_sentences)
- tokens_a = []
- for j in range(a_end):
- tokens_a.extend(sample[j])
-
- # Second part:
- tokens_b = []
- for j in range(a_end, n_sentences):
- tokens_b.extend(sample[j])
-
- # Random next:
- is_next_random = False
- if np_rng.random() < 0.5:
- is_next_random = True
- tokens_a, tokens_b = tokens_b, tokens_a
-
- return tokens_a, tokens_b, is_next_random
-
-
-def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng):
- """Truncates a pair of sequences to a maximum sequence length."""
- #print(len_a, len_b, max_num_tokens)
- assert len_a > 0
- assert len_b > 0
- if len_a + len_b <= max_num_tokens:
- return False
- while len_a + len_b > max_num_tokens:
- if len_a > len_b:
- len_a -= 1
- tokens = tokens_a
- else:
- len_b -= 1
- tokens = tokens_b
- if np_rng.random() < 0.5:
- del tokens[0]
- else:
- tokens.pop()
- return True
-
-
-def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
- """Merge segments A and B, add [CLS] and [SEP] and build tokentypes."""
-
- tokens = []
- tokentypes = []
- # [CLS].
- tokens.append(cls_id)
- tokentypes.append(0)
- # Segment A.
- for token in tokens_a:
- tokens.append(token)
- tokentypes.append(0)
- # [SEP].
- tokens.append(sep_id)
- tokentypes.append(0)
- # Segment B.
- for token in tokens_b:
- tokens.append(token)
- tokentypes.append(1)
- # [SEP].
- tokens.append(sep_id)
- tokentypes.append(1)
-
- return tokens, tokentypes
-
-
-MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
- ["index", "label"])
-
-
-def is_start_piece(piece):
- """Check if the current word piece is the starting piece (BERT)."""
- # When a word has been split into
- # WordPieces, the first token does not have any marker and any subsequent
- # tokens are prefixed with ##. So whenever we see the ## token, we
- # append it to the previous set of word indexes.
- return not piece.startswith("##")
-
-
-def create_masked_lm_predictions(tokens,
- vocab_id_list, vocab_id_to_token_dict,
- masked_lm_prob,
- cls_id, sep_id, mask_id,
- max_predictions_per_seq,
- np_rng,
- max_ngrams=3,
- do_whole_word_mask=True,
- favor_longer_ngram=False,
- do_permutation=False):
- """Creates the predictions for the masked LM objective.
- Note: Tokens here are vocab ids and not text tokens."""
-
- cand_indexes = []
- # Note(mingdachen): We create a list for recording if the piece is
- # the starting piece of current token, where 1 means true, so that
- # on-the-fly whole word masking is possible.
- token_boundary = [0] * len(tokens)
-
- for (i, token) in enumerate(tokens):
- if token == cls_id or token == sep_id:
- token_boundary[i] = 1
- continue
- # Whole Word Masking means that we mask all of the wordpieces
- # corresponding to an original word.
- #
- # Note that Whole Word Masking does *not* change the training code
- # at all -- we still predict each WordPiece independently, softmaxed
- # over the entire vocabulary.
- if (do_whole_word_mask and len(cand_indexes) >= 1 and
- not is_start_piece(vocab_id_to_token_dict[token])):
- cand_indexes[-1].append(i)
- else:
- cand_indexes.append([i])
- if is_start_piece(vocab_id_to_token_dict[token]):
- token_boundary[i] = 1
-
- output_tokens = list(tokens)
-
- masked_lm_positions = []
- masked_lm_labels = []
-
- if masked_lm_prob == 0:
- return (output_tokens, masked_lm_positions,
- masked_lm_labels, token_boundary)
-
- num_to_predict = min(max_predictions_per_seq,
- max(1, int(round(len(tokens) * masked_lm_prob))))
-
- # Note(mingdachen):
- # By default, we set the probabilities to favor shorter ngram sequences.
- ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)
- pvals = 1. / np.arange(1, max_ngrams + 1)
- pvals /= pvals.sum(keepdims=True)
-
- if favor_longer_ngram:
- pvals = pvals[::-1]
-
- ngram_indexes = []
- for idx in range(len(cand_indexes)):
- ngram_index = []
- for n in ngrams:
- ngram_index.append(cand_indexes[idx:idx + n])
- ngram_indexes.append(ngram_index)
-
- np_rng.shuffle(ngram_indexes)
-
- masked_lms = []
- covered_indexes = set()
- for cand_index_set in ngram_indexes:
- if len(masked_lms) >= num_to_predict:
- break
- if not cand_index_set:
- continue
- # Note(mingdachen):
- # Skip the current piece if it is covered by lm masking or previous ngrams.
- for index_set in cand_index_set[0]:
- for index in index_set:
- if index in covered_indexes:
- continue
-
- n = np_rng.choice(ngrams[:len(cand_index_set)],
- p=pvals[:len(cand_index_set)] /
- pvals[:len(cand_index_set)].sum(keepdims=True))
- index_set = sum(cand_index_set[n - 1], [])
- n -= 1
- # Note(mingdachen):
- # Repeatedly looking for a candidate that does not exceed the
- # maximum number of predictions by trying shorter ngrams.
- while len(masked_lms) + len(index_set) > num_to_predict:
- if n == 0:
- break
- index_set = sum(cand_index_set[n - 1], [])
- n -= 1
- # If adding a whole-word mask would exceed the maximum number of
- # predictions, then just skip this candidate.
- if len(masked_lms) + len(index_set) > num_to_predict:
- continue
- is_any_index_covered = False
- for index in index_set:
- if index in covered_indexes:
- is_any_index_covered = True
- break
- if is_any_index_covered:
- continue
- for index in index_set:
- covered_indexes.add(index)
-
- masked_token = None
- # 80% of the time, replace with [MASK]
- if np_rng.random() < 0.8:
- masked_token = mask_id
- else:
- # 10% of the time, keep original
- if np_rng.random() < 0.5:
- masked_token = tokens[index]
- # 10% of the time, replace with random word
- else:
- masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))]
-
- output_tokens[index] = masked_token
-
- masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
- assert len(masked_lms) <= num_to_predict
-
- np_rng.shuffle(ngram_indexes)
-
- select_indexes = set()
- if do_permutation:
- for cand_index_set in ngram_indexes:
- if len(select_indexes) >= num_to_predict:
- break
- if not cand_index_set:
- continue
- # Note(mingdachen):
- # Skip the current piece if it is covered by lm masking or previous ngrams.
- for index_set in cand_index_set[0]:
- for index in index_set:
- if index in covered_indexes or index in select_indexes:
- continue
-
- n = np.random.choice(ngrams[:len(cand_index_set)],
- p=pvals[:len(cand_index_set)] /
- pvals[:len(cand_index_set)].sum(keepdims=True))
- index_set = sum(cand_index_set[n - 1], [])
- n -= 1
-
- while len(select_indexes) + len(index_set) > num_to_predict:
- if n == 0:
- break
- index_set = sum(cand_index_set[n - 1], [])
- n -= 1
- # If adding a whole-word mask would exceed the maximum number of
- # predictions, then just skip this candidate.
- if len(select_indexes) + len(index_set) > num_to_predict:
- continue
- is_any_index_covered = False
- for index in index_set:
- if index in covered_indexes or index in select_indexes:
- is_any_index_covered = True
- break
- if is_any_index_covered:
- continue
- for index in index_set:
- select_indexes.add(index)
- assert len(select_indexes) <= num_to_predict
-
- select_indexes = sorted(select_indexes)
- permute_indexes = list(select_indexes)
- np_rng.shuffle(permute_indexes)
- orig_token = list(output_tokens)
-
- for src_i, tgt_i in zip(select_indexes, permute_indexes):
- output_tokens[src_i] = orig_token[tgt_i]
- masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i]))
-
- masked_lms = sorted(masked_lms, key=lambda x: x.index)
-
- for p in masked_lms:
- masked_lm_positions.append(p.index)
- masked_lm_labels.append(p.label)
-
- return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary)
-
-
-def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
- masked_labels, pad_id, max_seq_length):
- """Pad sequences and convert them to numpy."""
-
- # Some checks.
- num_tokens = len(tokens)
- padding_length = max_seq_length - num_tokens
- assert padding_length >= 0
- assert len(tokentypes) == num_tokens
- assert len(masked_positions) == len(masked_labels)
-
- # Tokens and token types.
- filler = [pad_id] * padding_length
- tokens_np = np.array(tokens + filler, dtype=np.int64)
- tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)
-
- # Padding mask.
- padding_mask_np = np.array([1] * num_tokens + [0] * padding_length,
- dtype=np.int64)
-
- # Labels and loss mask.
- labels = [-1] * max_seq_length
- loss_mask = [0] * max_seq_length
- for i in range(len(masked_positions)):
- assert masked_positions[i] < num_tokens
- labels[masked_positions[i]] = masked_labels[i]
- loss_mask[masked_positions[i]] = 1
- labels_np = np.array(labels, dtype=np.int64)
- loss_mask_np = np.array(loss_mask, dtype=np.int64)
-
- return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np
-
-
-def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
- train_valid_test_num_samples,
- max_seq_length, masked_lm_prob,
- short_seq_prob, seed, skip_warmup,
- dataset_type='standard_bert'):
-
- if dataset_type not in DSET_TYPES:
- raise ValueError("Invalid dataset_type: ", dataset_type)
-
- # Indexed dataset.
- indexed_dataset = get_indexed_dataset_(data_prefix,
- data_impl,
- skip_warmup)
-
- if dataset_type == DSET_TYPE_ICT:
- args = get_args()
- title_dataset = get_indexed_dataset_(args.titles_data_path,
- data_impl,
- skip_warmup)
-
- # Get start and end indices of train/valid/test into doc-idx
- # Note that doc-idx is designed to be num-docs + 1 so we can
- # easily iterate over it.
- total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1
- splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
-
- # Print stats about the splits.
- print_rank_0(' > dataset split:')
-
- def print_split_stats(name, index):
- print_rank_0(' {}:'.format(name))
- print_rank_0(' document indices in [{}, {}) total of {} '
- 'documents'.format(splits[index], splits[index + 1],
- splits[index + 1] - splits[index]))
- start_index = indexed_dataset.doc_idx[splits[index]]
- end_index = indexed_dataset.doc_idx[splits[index + 1]]
- print_rank_0(' sentence indices in [{}, {}) total of {} '
- 'sentences'.format(start_index, end_index,
- end_index - start_index))
- print_split_stats('train', 0)
- print_split_stats('validation', 1)
- print_split_stats('test', 2)
-
- def build_dataset(index, name):
- from megatron.data.bert_dataset import BertDataset
- from megatron.data.ict_dataset import ICTDataset
- dataset = None
- if splits[index + 1] > splits[index]:
- # Get the pointer to the original doc-idx so we can set it later.
- doc_idx_ptr = indexed_dataset.get_doc_idx()
- # Slice the doc-idx
- start_index = splits[index]
- # Add +1 so we can index into the dataset to get the upper bound.
- end_index = splits[index + 1] + 1
- # New doc_idx view.
- indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index])
- # Build the dataset accordingly.
- kwargs = dict(
- name=name,
- data_prefix=data_prefix,
- num_epochs=None,
- max_num_samples=train_valid_test_num_samples[index],
- max_seq_length=max_seq_length,
- seed=seed
- )
-
- if dataset_type == DSET_TYPE_ICT:
- args = get_args()
- dataset = ICTDataset(
- block_dataset=indexed_dataset,
- title_dataset=title_dataset,
- query_in_block_prob=args.query_in_block_prob,
- use_one_sent_docs=args.use_one_sent_docs,
- **kwargs
- )
- else:
- dataset = BertDataset(
- indexed_dataset=indexed_dataset,
- masked_lm_prob=masked_lm_prob,
- short_seq_prob=short_seq_prob,
- **kwargs
- )
-
- # Set the original pointer so dataset remains the main dataset.
- indexed_dataset.set_doc_idx(doc_idx_ptr)
- # Checks.
- assert indexed_dataset.doc_idx[0] == 0
- assert indexed_dataset.doc_idx.shape[0] == \
- (total_num_of_documents + 1)
- return dataset
-
- train_dataset = build_dataset(0, 'train')
- valid_dataset = build_dataset(1, 'valid')
- test_dataset = build_dataset(2, 'test')
-
- return (train_dataset, valid_dataset, test_dataset)
-
-
-def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
-
- print_rank_0(' > building dataset index ...')
-
- start_time = time.time()
- indexed_dataset = make_indexed_dataset(data_prefix,
- data_impl,
- skip_warmup)
- assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1]
- print_rank_0(' > finished creating indexed dataset in {:4f} '
- 'seconds'.format(time.time() - start_time))
-
- print_rank_0(' > indexed dataset stats:')
- print_rank_0(' number of documents: {}'.format(
- indexed_dataset.doc_idx.shape[0] - 1))
- print_rank_0(' number of sentences: {}'.format(
- indexed_dataset.sizes.shape[0]))
-
- return indexed_dataset
-
-
-def get_train_valid_test_split_(splits_string, size):
- """ Get dataset splits from comma or '/' separated string list."""
-
- splits = []
- if splits_string.find(',') != -1:
- splits = [float(s) for s in splits_string.split(',')]
- elif splits_string.find('/') != -1:
- splits = [float(s) for s in splits_string.split('/')]
- else:
- splits = [float(splits_string)]
- while len(splits) < 3:
- splits.append(0.)
- splits = splits[:3]
- splits_sum = sum(splits)
- assert splits_sum > 0.0
- splits = [split / splits_sum for split in splits]
- splits_index = [0]
- for index, split in enumerate(splits):
- splits_index.append(splits_index[index] +
- int(round(split * float(size))))
- diff = splits_index[-1] - size
- for index in range(1, len(splits_index)):
- splits_index[index] -= diff
- assert len(splits_index) == 4
- assert splits_index[-1] == size
- return splits_index
-
-
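The deleted `get_train_valid_test_split_` above turns a weight string such as `"969,30,1"` (or `"98/2/0"`, or a single number) into cumulative document boundaries whose last entry lands exactly on the dataset size. A standalone sketch of the same arithmetic, using a hypothetical helper name:

```python
def split_boundaries(splits_string: str, size: int):
    # Parse the weights; commas take precedence, then slashes, then a single value.
    if ',' in splits_string:
        weights = [float(s) for s in splits_string.split(',')]
    elif '/' in splits_string:
        weights = [float(s) for s in splits_string.split('/')]
    else:
        weights = [float(splits_string)]
    weights = (weights + [0.0, 0.0])[:3]   # pad or trim to train, valid, test
    total = sum(weights)
    # Cumulative boundaries from normalized weights, then absorb rounding drift
    # so the final boundary equals `size`.
    bounds = [0]
    for w in weights:
        bounds.append(bounds[-1] + int(round(w / total * size)))
    drift = bounds[-1] - size
    bounds[1:] = [b - drift for b in bounds[1:]]
    return bounds

print(split_boundaries("969,30,1", 10000))   # [0, 9690, 9990, 10000]
```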
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/gpt2_dataset.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/gpt2_dataset.py
deleted file mode 100644
index 3aa7b705a6..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/gpt2_dataset.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""GPT2 style dataset."""
-
-import os
-import time
-
-import numpy as np
-import torch
-
-from megatron import mpu, print_rank_0
-from megatron.data.dataset_utils import get_train_valid_test_split_
-from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
-
-
-def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
- train_valid_test_num_samples,
- seq_length, seed, skip_warmup):
- """Build train, valid, and test datasets."""
-
- # Indexed dataset.
- indexed_dataset = get_indexed_dataset_(data_prefix,
- data_impl,
- skip_warmup)
-
- total_num_of_documents = indexed_dataset.sizes.shape[0]
- splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
-
- # Print stats about the splits.
- print_rank_0(' > dataset split:')
-
- def print_split_stats(name, index):
- print_rank_0(' {}:'.format(name))
- print_rank_0(' document indices in [{}, {}) total of {} '
- 'documents'.format(splits[index], splits[index + 1],
- splits[index + 1] - splits[index]))
- print_split_stats('train', 0)
- print_split_stats('validation', 1)
- print_split_stats('test', 2)
-
- def build_dataset(index, name):
- dataset = None
- if splits[index + 1] > splits[index]:
- documents = np.arange(start=splits[index], stop=splits[index + 1],
- step=1, dtype=np.int32)
- dataset = GPT2Dataset(name, data_prefix,
- documents, indexed_dataset,
- train_valid_test_num_samples[index],
- seq_length, seed)
- return dataset
-
- train_dataset = build_dataset(0, 'train')
- valid_dataset = build_dataset(1, 'valid')
- test_dataset = build_dataset(2, 'test')
-
- return (train_dataset, valid_dataset, test_dataset)
-
-
-def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
- """Build indexed dataset."""
- print_rank_0(' > building dataset index ...')
-
- start_time = time.time()
- indexed_dataset = make_indexed_dataset(data_prefix,
- data_impl,
- skip_warmup)
- print_rank_0(' > finished creating indexed dataset in {:4f} '
- 'seconds'.format(time.time() - start_time))
- print_rank_0(' number of documents: {}'.format(
- indexed_dataset.sizes.shape[0]))
-
- return indexed_dataset
-
-
-class GPT2Dataset(torch.utils.data.Dataset):
-
- def __init__(self, name, data_prefix, documents, indexed_dataset,
- num_samples, seq_length, seed):
-
- self.name = name
- self.indexed_dataset = indexed_dataset
-
- # Checks
- assert np.min(documents) >= 0
- assert np.max(documents) < indexed_dataset.sizes.shape[0]
-
- # Build index mappings.
- self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings(
- self.name, data_prefix, documents, self.indexed_dataset.sizes,
- num_samples, seq_length, seed)
-
- def __len__(self):
- # -1 is due to data structure used to retrieve the index:
- # sample i --> [sample_idx[i], sample_idx[i+1])
- return self.sample_idx.shape[0] - 1
-
- def __getitem__(self, idx):
- # Get the shuffled index.
- idx = self.shuffle_idx[idx]
- # Start and end documents and offsets.
- doc_index_f = self.sample_idx[idx][0]
- doc_index_l = self.sample_idx[idx + 1][0]
- offset_f = self.sample_idx[idx][1]
- offset_l = self.sample_idx[idx + 1][1]
- # If we are within the same document, just extract the chunk.
- if doc_index_f == doc_index_l:
- sample = self.indexed_dataset.get(self.doc_idx[doc_index_f],
- offset=offset_f,
- length=offset_l - offset_f + 1)
- else:
- # Otherwise, get the rest of the initial document.
- sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f],
- offset=offset_f)]
- # Loop over all in between documents and add the entire document.
- for i in range(doc_index_f + 1, doc_index_l):
- sample_list.append(self.indexed_dataset.get(self.doc_idx[i]))
- # And finally add the relevant portion of last document.
- sample_list.append(self.indexed_dataset.get(
- self.doc_idx[doc_index_l],
- length=offset_l + 1))
- sample = np.concatenate(sample_list)
-
- return {'text': np.array(sample, dtype=np.int64)}
-
-
-def _build_index_mappings(name, data_prefix, documents, sizes,
- num_samples, seq_length, seed):
- """Build doc-idx, sample-idx, and shuffle-idx.
- doc-idx: is an array (ordered) of documents to be used in training.
- sample-idx: is the start document index and document offset for each
- training sample.
- shuffle-idx: maps the sample index into a random index into sample-idx.
- """
- # Number of tokens in each epoch and number of required epochs.
- tokens_per_epoch = _num_tokens(documents, sizes)
- num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
- # rng state
- np_rng = np.random.RandomState(seed=seed)
-
- # Filename of the index mappings.
- _filename = data_prefix
- _filename += '_{}_indexmap'.format(name)
- _filename += '_{}ns'.format(num_samples)
- _filename += '_{}sl'.format(seq_length)
- _filename += '_{}s'.format(seed)
- doc_idx_filename = _filename + '_doc_idx.npy'
- sample_idx_filename = _filename + '_sample_idx.npy'
- shuffle_idx_filename = _filename + '_shuffle_idx.npy'
-
- # Build the indexed mapping if not exist.
- if torch.distributed.get_rank() == 0:
- if (not os.path.isfile(doc_idx_filename)) or \
- (not os.path.isfile(sample_idx_filename)) or \
- (not os.path.isfile(shuffle_idx_filename)):
-
- print_rank_0(' > WARNING: could not find index map files, building '
- 'the indices on rank 0 ...')
- # doc-idx.
- start_time = time.time()
- doc_idx = _build_doc_idx(documents, num_epochs, np_rng)
- np.save(doc_idx_filename, doc_idx, allow_pickle=True)
- print_rank_0(' > elapsed time to build and save doc-idx mapping '
- '(seconds): {:4f}'.format(time.time() - start_time))
- # sample-idx.
- start_time = time.time()
- # Use C++ implementation for speed.
- # First compile and then import.
- from megatron.data.dataset_utils import compile_helper
- compile_helper()
- from megatron.data import helpers
- assert doc_idx.dtype == np.int32
- assert sizes.dtype == np.int32
- sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
- num_epochs, tokens_per_epoch)
- # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
- # num_epochs, tokens_per_epoch)
- np.save(sample_idx_filename, sample_idx, allow_pickle=True)
- print_rank_0(' > elapsed time to build and save sample-idx mapping '
- '(seconds): {:4f}'.format(time.time() - start_time))
- # shuffle-idx.
- start_time = time.time()
- # -1 is due to data structure used to retrieve the index:
- # sample i --> [sample_idx[i], sample_idx[i+1])
- shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng)
- np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
- print_rank_0(' > elapsed time to build and save shuffle-idx mapping'
- ' (seconds): {:4f}'.format(time.time() - start_time))
-
- # This should be a barrier but nccl barrier assumes
- # device_index=rank which is not the case for model
- # parallel case
- counts = torch.cuda.LongTensor([1])
- torch.distributed.all_reduce(counts, group=mpu.get_io_parallel_group())
- assert counts[0].item() == torch.distributed.get_world_size(
- group=mpu.get_io_parallel_group())
-
- # Load mappings.
- start_time = time.time()
- print_rank_0(' > loading doc-idx mapping from {}'.format(
- doc_idx_filename))
- doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
- print_rank_0(' > loading sample-idx mapping from {}'.format(
- sample_idx_filename))
- sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r')
- print_rank_0(' > loading shuffle-idx mapping from {}'.format(
- shuffle_idx_filename))
- shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
- print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
- time.time() - start_time))
- print_rank_0(' total number of samples: {}'.format(
- sample_idx.shape[0]))
- print_rank_0(' total number of epochs: {}'.format(num_epochs))
-
- return doc_idx, sample_idx, shuffle_idx
-
-
-def _num_tokens(documents, sizes):
- """Total number of tokens in the dataset."""
- return np.sum(sizes[documents])
-
-
-def _num_epochs(tokens_per_epoch, seq_length, num_samples):
- """Based on number of samples and sequence lenght, calculate how many
- epochs will be needed."""
- num_epochs = 0
- total_tokens = 0
- while True:
- num_epochs += 1
- total_tokens += tokens_per_epoch
- # -1 is because we need to retrieve seq_length + 1 token each time
- # but the last token will overlap with the first token of the next
- # sample except for the last sample.
- if ((total_tokens - 1) // seq_length) >= num_samples:
- return num_epochs
-
-
-def _build_doc_idx(documents, num_epochs, np_rng):
- """Build an array with length = number-of-epochs * number-of-dcuments.
- Each index is mapped to a corresponding document."""
- doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1]
- doc_idx[:] = documents
- doc_idx = doc_idx.reshape(-1)
- doc_idx = doc_idx.astype(np.int32)
- np_rng.shuffle(doc_idx)
- return doc_idx
-
-
-def _build_sample_idx(sizes, doc_idx, seq_length,
- num_epochs, tokens_per_epoch):
- """Sample index mapping is a 2D array with sizes
- [number-of-samples + 1, 2] where [..., 0] contains
- the index into `doc_idx` and [..., 1] is the
- starting offset in that document."""
-
- # Total number of samples. For -1 see comments in `_num_epochs`.
- num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
- sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32)
-
- # Index into sample_idx.
- sample_index = 0
- # Index into doc_idx.
- doc_idx_index = 0
- # Beginning offset for each document.
- doc_offset = 0
- # Start with first document and no offset.
- sample_idx[sample_index][0] = doc_idx_index
- sample_idx[sample_index][1] = doc_offset
- sample_index += 1
- while sample_index <= num_samples:
- # Start with a fresh sequence.
- remaining_seq_length = seq_length + 1
- while remaining_seq_length != 0:
- # Get the document length.
- doc_id = doc_idx[doc_idx_index]
- doc_length = sizes[doc_id] - doc_offset
- # And add it to the current sequence.
- remaining_seq_length -= doc_length
- # If we have more than a full sequence, adjust offset and set
- # remaining length to zero so we return from the while loop.
- # Note that -1 here is for the same reason we have -1 in
- # `_num_epochs` calculations.
- if remaining_seq_length <= 0:
- doc_offset += (remaining_seq_length + doc_length - 1)
- remaining_seq_length = 0
- else:
- # Otherwise, start from the beginning of the next document.
- doc_idx_index += 1
- doc_offset = 0
- # Record the sequence.
- sample_idx[sample_index][0] = doc_idx_index
- sample_idx[sample_index][1] = doc_offset
- sample_index += 1
-
- return sample_idx
-
-
-def _build_shuffle_idx(size, np_rng):
- """Build the range [0, size) and shuffle."""
- dtype_ = np.uint32
- if size >= (np.iinfo(np.uint32).max - 1):
- dtype_ = np.int64
- shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_)
- np_rng.shuffle(shuffle_idx)
- return shuffle_idx
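In the deleted gpt2_dataset.py above, `_num_epochs` loops until `(total_tokens - 1) // seq_length` reaches the requested sample count; the `- 1` reflects that each sample reads `seq_length + 1` tokens, with consecutive samples overlapping by one token. A closed-form sketch that gives the same answer under those assumptions:

```python
import math

def num_epochs_needed(tokens_per_epoch: int, seq_length: int, num_samples: int) -> int:
    # Smallest e with (e * tokens_per_epoch - 1) // seq_length >= num_samples,
    # i.e. e * tokens_per_epoch >= num_samples * seq_length + 1.
    # The iterative version always runs at least one epoch, hence the max().
    return max(1, math.ceil((num_samples * seq_length + 1) / tokens_per_epoch))
```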
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/helpers.cpp b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/helpers.cpp
deleted file mode 100644
index ca90329686..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/helpers.cpp
+++ /dev/null
@@ -1,643 +0,0 @@
-/*
- coding=utf-8
- Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
-
-/* Helper methods for fast index mapping builds */
-
-#include <algorithm>
-#include <iostream>
-#include <limits>
-#include <math.h>
-#include <stdexcept>
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-#include <random>
-
-namespace py = pybind11;
-using namespace std;
-
-const int32_t LONG_SENTENCE_LEN = 512;
-
-
-py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
- const py::array_t<int32_t>& doc_idx_,
- const int32_t seq_length,
- const int32_t num_epochs,
- const int64_t tokens_per_epoch) {
- /* Sample index (sample_idx) is used for gpt2 like dataset for which
- the documents are flattened and the samples are built based on this
- 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2]
- where [..., 0] contains the index into `doc_idx` and [..., 1] is the
- starting offset in that document.*/
-
- // Consistency checks.
- assert(seq_length > 1);
- assert(num_epochs > 0);
- assert(tokens_per_epoch > 1);
-
- // Remove bound checks.
- auto sizes = sizes_.unchecked<1>();
- auto doc_idx = doc_idx_.unchecked<1>();
-
- // Mapping and its length (1D).
- int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length;
- int32_t* sample_idx = new int32_t[2*(num_samples+1)];
-
- cout << " using:" << endl << std::flush;
- cout << " number of documents: " <<
- doc_idx_.shape(0) / num_epochs << endl << std::flush;
- cout << " number of epochs: " << num_epochs <<
- endl << std::flush;
- cout << " sequence length: " << seq_length <<
- endl << std::flush;
- cout << " total number of samples: " << num_samples <<
- endl << std::flush;
-
- // Index into sample_idx.
- int64_t sample_index = 0;
- // Index into doc_idx.
- int64_t doc_idx_index = 0;
- // Beginning offset for each document.
- int32_t doc_offset = 0;
- // Start with first document and no offset.
- sample_idx[2 * sample_index] = doc_idx_index;
- sample_idx[2 * sample_index + 1] = doc_offset;
- ++sample_index;
-
- while (sample_index <= num_samples) {
- // Start with a fresh sequence.
- int32_t remaining_seq_length = seq_length + 1;
- while (remaining_seq_length != 0) {
- // Get the document length.
- auto doc_id = doc_idx[doc_idx_index];
- auto doc_length = sizes[doc_id] - doc_offset;
- // And add it to the current sequence.
- remaining_seq_length -= doc_length;
- // If we have more than a full sequence, adjust offset and set
- // remaining length to zero so we return from the while loop.
- // Note that -1 here is for the same reason we have -1 in
- // `_num_epochs` calculations.
- if (remaining_seq_length <= 0) {
- doc_offset += (remaining_seq_length + doc_length - 1);
- remaining_seq_length = 0;
- } else {
- // Otherwise, start from the beginning of the next document.
- ++doc_idx_index;
- doc_offset = 0;
- }
- }
- // Record the sequence.
- sample_idx[2 * sample_index] = doc_idx_index;
- sample_idx[2 * sample_index + 1] = doc_offset;
- ++sample_index;
- }
-
- // Method to deallocate memory.
- py::capsule free_when_done(sample_idx, [](void *mem_) {
- int32_t *mem = reinterpret_cast<int32_t*>(mem_);
- delete[] mem;
- });
-
- // Return the numpy array.
- const auto byte_size = sizeof(int32_t);
- return py::array(std::vector<int64_t>{num_samples+1, 2}, // shape
- {2*byte_size, byte_size}, // C-style contiguous strides
- sample_idx, // the data pointer
- free_when_done); // numpy array references
-
-}
-
-
-inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
- const int32_t max_length,
- std::mt19937& rand32_gen) {
- /* Training sample length. */
- const auto random_number = rand32_gen();
- if ((random_number % short_seq_ratio) == 0) {
- return 2 + random_number % (max_length - 1);
- }
- return max_length;
-}
-
-
-template<typename DocIdx>
-py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
- const py::array_t<int32_t>& sizes_,
- const int32_t num_epochs,
- const uint64_t max_num_samples,
- const int32_t max_seq_length,
- const double short_seq_prob,
- const int32_t seed,
- const bool verbose) {
- /* Build a mapping of (start-index, end-index, sequence-length) where
- start and end index are the indices of the sentences in the sample
- and sequence-length is the target sequence length.
- */
-
- // Consistency checks.
- assert(num_epochs > 0);
- assert(max_seq_length > 1);
- assert(short_seq_prob > 0.0);
- assert(short_seq_prob <= 1.0);
- assert(seed > 0);
-
- // Remove bound checks.
- auto docs = docs_.unchecked<1>();
- auto sizes = sizes_.unchecked<1>();
-
- // For efficiency, convert probability to ratio. Note: rand() generates int.
- const auto short_seq_ratio = static_cast<int32_t>(round(1.0 / short_seq_prob));
-
- if (verbose) {
- const auto sent_start_index = docs[0];
- const auto sent_end_index = docs[docs_.shape(0) - 1];
- const auto num_sentences = sent_end_index - sent_start_index;
- cout << " using:" << endl << std::flush;
- cout << " number of documents: " << docs_.shape(0) - 1 <<
- endl << std::flush;
- cout << " sentences range: [" << sent_start_index <<
- ", " << sent_end_index << ")" << endl << std::flush;
- cout << " total number of sentences: " << num_sentences <<
- endl << std::flush;
- cout << " number of epochs: " << num_epochs <<
- endl << std::flush;
- cout << " maximum number of samples: " << max_num_samples <<
- endl << std::flush;
- cout << " maximum sequence length: " << max_seq_length <<
- endl << std::flush;
- cout << " short sequence probability: " << short_seq_prob <<
- endl << std::flush;
- cout << " short sequence ration (1/prob): " << short_seq_ratio <<
- endl << std::flush;
- cout << " seed: " << seed << endl <<
- std::flush;
- }
-
- // Mapping and its length (1D).
- int64_t num_samples = -1;
- DocIdx* maps = NULL;
-
- // Perform two iterations, in the first iteration get the size
- // and allocate memory and in the second iteration populate the map.
- bool second = false;
- for (int32_t iteration=0; iteration<2; ++iteration) {
-
- // Set the seed so both iterations produce the same results.
- std::mt19937 rand32_gen(seed);
-
- // Set the flag on second iteration.
- second = (iteration == 1);
-
- // Counters:
- uint64_t empty_docs = 0;
- uint64_t one_sent_docs = 0;
- uint64_t long_sent_docs = 0;
-
- // Current map index.
- uint64_t map_index = 0;
-
- // For each epoch:
- for (int32_t epoch=0; epoch<num_epochs; ++epoch) {
- if (map_index >= max_num_samples) {
- if (verbose && (!second)) {
- cout << " reached " << max_num_samples << " samples after "
- << epoch << " epochs ..." << endl << std::flush;
- }
- break;
- }
- // For each document:
- for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) {
-
- // Document sentences are in [sent_index_first, sent_index_last)
- const auto sent_index_first = docs[doc];
- const auto sent_index_last = docs[doc + 1];
-
- // At the beginning of the document the previous index is the
- // start index.
- auto prev_start_index = sent_index_first;
-
- // Remaining sentences in the document.
- auto num_remain_sent = sent_index_last - sent_index_first;
-
- // Some bookkeeping
- if ((epoch == 0) && (!second)) {
- if (num_remain_sent == 0) {
- ++empty_docs;
- }
- if (num_remain_sent == 1) {
- ++one_sent_docs;
- }
- }
-
- // Detect documents with long sentences.
- bool contains_long_sentence = false;
- if (num_remain_sent > 1) {
- for (auto sent_index=sent_index_first;
- sent_index < sent_index_last; ++sent_index) {
- if (sizes[sent_index] > LONG_SENTENCE_LEN){
- if ((epoch == 0) && (!second)) {
- ++long_sent_docs;
- }
- contains_long_sentence = true;
- break;
- }
- }
- }
-
- // If we have more than two sentences.
- if ((num_remain_sent > 1) && (!contains_long_sentence)) {
-
- // Set values.
- auto seq_len = int32_t{0};
- auto num_sent = int32_t{0};
- auto target_seq_len = get_target_sample_len(short_seq_ratio,
- max_seq_length,
- rand32_gen);
-
- // Loop through sentences.
- for (auto sent_index=sent_index_first;
- sent_index < sent_index_last; ++sent_index) {
-
- // Add the size and number of sentences.
- seq_len += sizes[sent_index];
- ++num_sent;
- --num_remain_sent;
-
- // If we have reached the target length.
- // and if not only one sentence is left in the document.
- // and if we have at least two sentences.
- // and if we have reached end of the document.
- if (((seq_len >= target_seq_len) &&
- (num_remain_sent > 1) &&
- (num_sent > 1) ) || (num_remain_sent == 0)) {
-
- // Check for overflow.
- if ((3 * map_index + 2) >
- std::numeric_limits<int64_t>::max()) {
- cout << "number of samples exceeded maximum "
- << "allowed by type int64: "
- << std::numeric_limits::max()
- << endl;
- throw std::overflow_error("Number of samples");
- }
-
- // Populate the map.
- if (second) {
- const auto map_index_0 = 3 * map_index;
- maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
- maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
- maps[map_index_0 + 2] = static_cast<DocIdx>(target_seq_len);
- }
-
- // Update indices / counters.
- ++map_index;
- prev_start_index = sent_index + 1;
- target_seq_len = get_target_sample_len(short_seq_ratio,
- max_seq_length,
- rand32_gen);
- seq_len = 0;
- num_sent = 0;
- }
-
- } // for (auto sent_index=sent_index_first; ...
- } // if (num_remain_sent > 1) {
- } // for (int doc=0; doc < num_docs; ++doc) {
- } // for (int epoch=0; epoch < num_epochs; ++epoch) {
-
- if (!second) {
- if (verbose) {
- cout << " number of empty documents: " << empty_docs <<
- endl << std::flush;
- cout << " number of documents with one sentence: " <<
- one_sent_docs << endl << std::flush;
- cout << " number of documents with long sentences: " <<
- long_sent_docs << endl << std::flush;
- cout << " will create mapping for " << map_index <<
- " samples" << endl << std::flush;
- }
- assert(maps == NULL);
- assert(num_samples < 0);
- maps = new DocIdx[3*map_index];
- num_samples = static_cast<int64_t>(map_index);
- }
-
- } // for (int iteration=0; iteration < 2; ++iteration) {
-
- // Shuffle.
- // We need a 64 bit random number generator as we might have more
- // than 2 billion samples.
- std::mt19937_64 rand64_gen(seed + 1);
- for (auto i=(num_samples - 1); i > 0; --i) {
- const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
- const auto i0 = 3 * i;
- const auto j0 = 3 * j;
- // Swap values.
- swap(maps[i0], maps[j0]);
- swap(maps[i0 + 1], maps[j0 + 1]);
- swap(maps[i0 + 2], maps[j0 + 2]);
- }
-
- // Method to deallocate memory.
- py::capsule free_when_done(maps, [](void *mem_) {
- DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
- delete[] mem;
- });
-
- // Return the numpy array.
- const auto byte_size = sizeof(DocIdx);
- return py::array(std::vector<int64_t>{num_samples, 3}, // shape
- {3*byte_size, byte_size}, // C-style contiguous strides
- maps, // the data pointer
- free_when_done); // numpy array references
-
-}
-
-
-py::array build_mapping(const py::array_t<int64_t>& docs_,
- const py::array_t<int32_t>& sizes_,
- const int num_epochs,
- const uint64_t max_num_samples,
- const int max_seq_length,
- const double short_seq_prob,
- const int seed,
- const bool verbose) {
-
- if (sizes_.size() > std::numeric_limits::max()) {
- if (verbose) {
- cout << " using uint64 for data mapping..." << endl << std::flush;
- }
- return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs,
- max_num_samples, max_seq_length,
- short_seq_prob, seed, verbose);
- } else {
- if (verbose) {
- cout << " using uint32 for data mapping..." << endl << std::flush;
- }
- return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs,
- max_num_samples, max_seq_length,
- short_seq_prob, seed, verbose);
- }
-}
-
-template<typename DocIdx>
-py::array build_blocks_mapping_impl(const py::array_t<int64_t>& docs_,
- const py::array_t<int32_t>& sizes_,
- const py::array_t<int32_t>& titles_sizes_,
- const int32_t num_epochs,
- const uint64_t max_num_samples,
- const int32_t max_seq_length,
- const int32_t seed,
- const bool verbose,
- const bool use_one_sent_blocks) {
- /* Build a mapping of (start-index, end-index, sequence-length) where
- start and end index are the indices of the sentences in the sample
- and sequence-length is the target sequence length.
- */
-
- // Consistency checks.
- assert(num_epochs > 0);
- assert(max_seq_length > 1);
- assert(seed > 0);
-
- // Remove bound checks.
- auto docs = docs_.unchecked<1>();
- auto sizes = sizes_.unchecked<1>();
- auto titles_sizes = titles_sizes_.unchecked<1>();
-
- if (verbose) {
- const auto sent_start_index = docs[0];
- const auto sent_end_index = docs[docs_.shape(0) - 1];
- const auto num_sentences = sent_end_index - sent_start_index;
- cout << " using:" << endl << std::flush;
- cout << " number of documents: " << docs_.shape(0) - 1 <<
- endl << std::flush;
- cout << " sentences range: [" << sent_start_index <<
- ", " << sent_end_index << ")" << endl << std::flush;
- cout << " total number of sentences: " << num_sentences <<
- endl << std::flush;
- cout << " number of epochs: " << num_epochs <<
- endl << std::flush;
- cout << " maximum number of samples: " << max_num_samples <<
- endl << std::flush;
- cout << " maximum sequence length: " << max_seq_length <<
- endl << std::flush;
- cout << " seed: " << seed << endl <<
- std::flush;
- }
-
- // Mapping and its length (1D).
- int64_t num_samples = -1;
- DocIdx* maps = NULL;
-
- // Acceptable number of sentences per block.
- int min_num_sent = 2;
- if (use_one_sent_blocks) {
- min_num_sent = 1;
- }
-
- // Perform two iterations, in the first iteration get the size
- // and allocate memory and in the second iteration populate the map.
- bool second = false;
- for (int32_t iteration=0; iteration<2; ++iteration) {
-
- // Set the flag on second iteration.
- second = (iteration == 1);
-
- // Current map index.
- uint64_t map_index = 0;
-
- uint64_t empty_docs = 0;
- uint64_t one_sent_docs = 0;
- uint64_t long_sent_docs = 0;
- // For each epoch:
- for (int32_t epoch=0; epoch<num_epochs; ++epoch) {
- // assign every block a unique id
- int32_t block_id = 0;
- if (map_index >= max_num_samples) {
- if (verbose && (!second)) {
- cout << " reached " << max_num_samples << " samples after "
- << epoch << " epochs ..." << endl << std::flush;
- }
- break;
- }
- // For each document:
- for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) {
-
- // Document sentences are in [sent_index_first, sent_index_last)
- const auto sent_index_first = docs[doc];
- const auto sent_index_last = docs[doc + 1];
- const auto target_seq_len = max_seq_length - titles_sizes[doc];
-
- // At the beginning of the document the previous index is the
- // start index.
- auto prev_start_index = sent_index_first;
-
- // Remaining sentences in the document.
- auto num_remain_sent = sent_index_last - sent_index_first;
-
- // Some bookkeeping
- if ((epoch == 0) && (!second)) {
- if (num_remain_sent == 0) {
- ++empty_docs;
- }
- if (num_remain_sent == 1) {
- ++one_sent_docs;
- }
- }
- // Detect documents with long sentences.
- bool contains_long_sentence = false;
- if (num_remain_sent >= min_num_sent) {
- for (auto sent_index=sent_index_first;
- sent_index < sent_index_last; ++sent_index) {
- if (sizes[sent_index] > LONG_SENTENCE_LEN){
- if ((epoch == 0) && (!second)) {
- ++long_sent_docs;
- }
- contains_long_sentence = true;
- break;
- }
- }
- }
- // If we have enough sentences and no long sentences.
- if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) {
-
- // Set values.
- auto seq_len = int32_t{0};
- auto num_sent = int32_t{0};
-
- // Loop through sentences.
- for (auto sent_index=sent_index_first;
- sent_index < sent_index_last; ++sent_index) {
-
- // Add the size and number of sentences.
- seq_len += sizes[sent_index];
- ++num_sent;
- --num_remain_sent;
-
- // If we have reached the target length.
- // and there are an acceptable number of sentences left
- // and if we have at least the minimum number of sentences.
- // or if we have reached end of the document.
- if (((seq_len >= target_seq_len) &&
- (num_remain_sent >= min_num_sent) &&
- (num_sent >= min_num_sent) ) || (num_remain_sent == 0)) {
-
- // Populate the map.
- if (second) {
- const auto map_index_0 = 4 * map_index;
- // Each sample has 4 items: the starting sentence index, ending sentence index,
- // the index of the document from which the block comes (used for fetching titles)
- // and the unique id of the block (used for creating block indexes)
-
- maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
- maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
- maps[map_index_0 + 2] = static_cast<DocIdx>(doc);
- maps[map_index_0 + 3] = static_cast<DocIdx>(block_id);
- }
-
- // Update indices / counters.
- ++map_index;
- ++block_id;
- prev_start_index = sent_index + 1;
- seq_len = 0;
- num_sent = 0;
- }
- } // for (auto sent_index=sent_index_first; ...
- } // if (num_remain_sent > 1) {
- } // for (int doc=0; doc < num_docs; ++doc) {
- } // for (int epoch=0; epoch < num_epochs; ++epoch) {
-
- if (!second) {
- if (verbose) {
- cout << " number of empty documents: " << empty_docs <<
- endl << std::flush;
- cout << " number of documents with one sentence: " <<
- one_sent_docs << endl << std::flush;
- cout << " number of documents with long sentences: " <<
- long_sent_docs << endl << std::flush;
- cout << " will create mapping for " << map_index <<
- " samples" << endl << std::flush;
- }
- assert(maps == NULL);
- assert(num_samples < 0);
- maps = new DocIdx[4*map_index];
- num_samples = static_cast<int64_t>(map_index);
- }
-
- } // for (int iteration=0; iteration < 2; ++iteration) {
-
- // Shuffle.
- // We need a 64 bit random number generator as we might have more
- // than 2 billion samples.
- std::mt19937_64 rand64_gen(seed + 1);
- for (auto i=(num_samples - 1); i > 0; --i) {
- const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
- const auto i0 = 4 * i;
- const auto j0 = 4 * j;
- // Swap values.
- swap(maps[i0], maps[j0]);
- swap(maps[i0 + 1], maps[j0 + 1]);
- swap(maps[i0 + 2], maps[j0 + 2]);
- swap(maps[i0 + 3], maps[j0 + 3]);
- }
-
- // Method to deallocate memory.
- py::capsule free_when_done(maps, [](void *mem_) {
- DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
- delete[] mem;
- });
-
- // Return the numpy array.
- const auto byte_size = sizeof(DocIdx);
- return py::array(std::vector<int64_t>{num_samples, 4}, // shape
- {4*byte_size, byte_size}, // C-style contiguous strides
- maps, // the data pointer
- free_when_done); // numpy array references
-
-}
-
-py::array build_blocks_mapping(const py::array_t<int64_t>& docs_,
- const py::array_t<int32_t>& sizes_,
- const py::array_t<int32_t>& titles_sizes_,
- const int num_epochs,
- const uint64_t max_num_samples,
- const int max_seq_length,
- const int seed,
- const bool verbose,
- const bool use_one_sent_blocks) {
-
- if (sizes_.size() > std::numeric_limits::max()) {
- if (verbose) {
- cout << " using uint64 for data mapping..." << endl << std::flush;
- }
- return build_blocks_mapping_impl<uint64_t>(docs_, sizes_, titles_sizes_,
- num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks);
- } else {
- if (verbose) {
- cout << " using uint32 for data mapping..." << endl << std::flush;
- }
- return build_blocks_mapping_impl<uint32_t>(docs_, sizes_, titles_sizes_,
- num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks);
- }
-}
-
-PYBIND11_MODULE(helpers, m) {
- m.def("build_mapping", &build_mapping);
- m.def("build_blocks_mapping", &build_blocks_mapping);
- m.def("build_sample_idx", &build_sample_idx);
-}
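For reference, the deleted Python files above compile this pybind11 extension with `make` (via `compile_helper()`) and then pass NumPy arrays to it. A hedged usage sketch under those assumptions, with tiny made-up sizes rather than real data:

```python
import numpy as np
from megatron.data.dataset_utils import compile_helper

compile_helper()                      # runs `make` in megatron/data (rank 0 only)
from megatron.data import helpers

sizes = np.array([5, 7, 3], dtype=np.int32)    # document lengths in tokens
doc_idx = np.array([0, 1, 2], dtype=np.int32)  # shuffled document order
tokens_per_epoch = int(sizes.sum())

# Positional arguments: sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch.
sample_idx = helpers.build_sample_idx(sizes, doc_idx, 4, 1, tokens_per_epoch)
print(sample_idx.shape)   # (num_samples + 1, 2): document index and offset per boundary
```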
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/ict_dataset.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/ict_dataset.py
deleted file mode 100644
index 71916d642e..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/ict_dataset.py
+++ /dev/null
@@ -1,140 +0,0 @@
-import itertools
-import random
-
-import numpy as np
-from torch.utils.data import Dataset
-
-from megatron import get_tokenizer
-from megatron import get_args
-from megatron.data.dataset_utils import get_indexed_dataset_
-from megatron.data.realm_dataset_utils import get_block_samples_mapping
-
-
-def get_ict_dataset(use_titles=True, query_in_block_prob=1):
- """Get a dataset which uses block samples mappings to get ICT/block indexing data (via get_block())
- rather than for training, since it is only built with a single epoch sample mapping.
- """
- args = get_args()
- block_dataset = get_indexed_dataset_(args.data_path, 'mmap', True)
- titles_dataset = get_indexed_dataset_(args.titles_data_path, 'mmap', True)
-
- kwargs = dict(
- name='full',
- block_dataset=block_dataset,
- title_dataset=titles_dataset,
- data_prefix=args.data_path,
- num_epochs=1,
- max_num_samples=None,
- max_seq_length=args.seq_length,
- seed=1,
- query_in_block_prob=query_in_block_prob,
- use_titles=use_titles,
- use_one_sent_docs=args.use_one_sent_docs
- )
- dataset = ICTDataset(**kwargs)
- return dataset
-
-
-class ICTDataset(Dataset):
- """Dataset containing sentences and their blocks for an inverse cloze task."""
- def __init__(self, name, block_dataset, title_dataset, data_prefix,
- num_epochs, max_num_samples, max_seq_length, query_in_block_prob,
- seed, use_titles=True, use_one_sent_docs=False):
- self.name = name
- self.seed = seed
- self.max_seq_length = max_seq_length
- self.query_in_block_prob = query_in_block_prob
- self.block_dataset = block_dataset
- self.title_dataset = title_dataset
- self.rng = random.Random(self.seed)
- self.use_titles = use_titles
- self.use_one_sent_docs = use_one_sent_docs
-
- self.samples_mapping = get_block_samples_mapping(
- block_dataset, title_dataset, data_prefix, num_epochs,
- max_num_samples, max_seq_length, seed, name, use_one_sent_docs)
- self.tokenizer = get_tokenizer()
- self.vocab_id_list = list(self.tokenizer.inv_vocab.keys())
- self.vocab_id_to_token_list = self.tokenizer.inv_vocab
- self.cls_id = self.tokenizer.cls
- self.sep_id = self.tokenizer.sep
- self.mask_id = self.tokenizer.mask
- self.pad_id = self.tokenizer.pad
-
- def __len__(self):
- return len(self.samples_mapping)
-
- def __getitem__(self, idx):
- """Get an ICT example of a pseudo-query and the block of text from which it was extracted"""
- sample_data = self.samples_mapping[idx]
- start_idx, end_idx, doc_idx, block_idx = sample_data.as_tuple()
-
- if self.use_titles:
- title = self.title_dataset[int(doc_idx)]
- title_pad_offset = 3 + len(title)
- else:
- title = None
- title_pad_offset = 2
- block = [self.block_dataset[i] for i in range(start_idx, end_idx)]
- assert len(block) > 1 or self.use_one_sent_docs or self.query_in_block_prob == 1
-
- # randint() is inclusive for Python rng
- rand_sent_idx = self.rng.randint(0, len(block) - 1)
-
- # keep the query in the context query_in_block_prob fraction of the time.
- if self.rng.random() < self.query_in_block_prob:
- query = block[rand_sent_idx].copy()
- else:
- query = block.pop(rand_sent_idx)
-
- # still need to truncate because blocks are concluded when
- # the sentence lengths have exceeded max_seq_length.
- query = query[:self.max_seq_length - 2]
- block = list(itertools.chain(*block))[:self.max_seq_length - title_pad_offset]
-
- query_tokens, query_pad_mask = self.concat_and_pad_tokens(query)
- block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)
- block_data = sample_data.as_array()
-
- sample = {
- 'query_tokens': query_tokens,
- 'query_pad_mask': query_pad_mask,
- 'block_tokens': block_tokens,
- 'block_pad_mask': block_pad_mask,
- 'block_data': block_data,
- }
-
- return sample
-
- def get_block(self, start_idx, end_idx, doc_idx):
- """Get the IDs for an evidence block plus the title of the corresponding document"""
- block = [self.block_dataset[i] for i in range(start_idx, end_idx)]
- title = self.title_dataset[int(doc_idx)]
-
- block = list(itertools.chain(*block))[:self.max_seq_length - (3 + len(title))]
- block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)
-
- return block_tokens, block_pad_mask
-
- def get_null_block(self):
- """Get empty block and title - used in REALM pretraining"""
- block, title = [], []
- block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)
-
- return block_tokens, block_pad_mask
-
- def concat_and_pad_tokens(self, tokens, title=None):
- """Concat with special tokens and pad sequence to self.max_seq_length"""
- tokens = list(tokens)
- if title is None:
- tokens = [self.cls_id] + tokens + [self.sep_id]
- else:
- title = list(title)
- tokens = [self.cls_id] + title + [self.sep_id] + tokens + [self.sep_id]
- assert len(tokens) <= self.max_seq_length
-
- num_pad = self.max_seq_length - len(tokens)
- pad_mask = [1] * len(tokens) + [0] * num_pad
- tokens += [self.pad_id] * num_pad
-
- return np.array(tokens), np.array(pad_mask)
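The deleted `concat_and_pad_tokens` above lays sequences out as `[CLS] title [SEP] block [SEP]` (or `[CLS] tokens [SEP]` when there is no title) and right-pads to `max_seq_length`. A tiny standalone sketch of that layout with made-up token ids:

```python
import numpy as np

CLS, SEP, PAD = 101, 102, 0   # illustrative ids, not the real vocabulary

def concat_and_pad(tokens, max_len, title=None):
    prefix = [CLS] + (list(title) + [SEP] if title is not None else [])
    seq = prefix + list(tokens) + [SEP]
    assert len(seq) <= max_len
    pad_mask = [1] * len(seq) + [0] * (max_len - len(seq))
    seq = seq + [PAD] * (max_len - len(seq))
    return np.array(seq), np.array(pad_mask)

toks, mask = concat_and_pad([11, 12, 13], max_len=8, title=[7])
# toks -> [101, 7, 102, 11, 12, 13, 102, 0];  mask -> [1, 1, 1, 1, 1, 1, 1, 0]
```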
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/indexed_dataset.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/indexed_dataset.py
deleted file mode 100644
index 1251066232..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/indexed_dataset.py
+++ /dev/null
@@ -1,570 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-
-# copied from fairseq/fairseq/data/indexed_dataset.py
-# Removed IndexedRawTextDataset since it relied on Fairseq dictionary
-# other slight modifications to remove fairseq dependencies
-# Added document index to index file and made it accessible.
-# An empty sentence no longer separates documents.
-
-from functools import lru_cache
-import os
-import shutil
-import struct
-from itertools import accumulate
-
-import numpy as np
-import torch
-from megatron import print_rank_0
-
-
-def __best_fitting_dtype(vocab_size=None):
- if vocab_size is not None and vocab_size < 65500:
- return np.uint16
- else:
- return np.int32
-
-
-def get_available_dataset_impl():
- return ['lazy', 'cached', 'mmap']
-
-
-def infer_dataset_impl(path):
- if IndexedDataset.exists(path):
- with open(index_file_path(path), 'rb') as f:
- magic = f.read(8)
- if magic == IndexedDataset._HDR_MAGIC:
- return 'cached'
- elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]:
- return 'mmap'
- else:
- return None
- else:
- print(f"Dataset does not exist: {path}")
- print("Path should be a basename that both .idx and .bin can be appended to get full filenames.")
- return None
-
-
-def make_builder(out_file, impl, vocab_size=None):
- if impl == 'mmap':
- return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size))
- else:
- return IndexedDatasetBuilder(out_file)
-
-
-def make_dataset(path, impl, skip_warmup=False):
- if not IndexedDataset.exists(path):
- print(f"Dataset does not exist: {path}")
- print("Path should be a basename that both .idx and .bin can be appended to get full filenames.")
- return None
- if impl == 'infer':
- impl = infer_dataset_impl(path)
- if impl == 'lazy' and IndexedDataset.exists(path):
- return IndexedDataset(path)
- elif impl == 'cached' and IndexedDataset.exists(path):
- return IndexedCachedDataset(path)
- elif impl == 'mmap' and MMapIndexedDataset.exists(path):
- return MMapIndexedDataset(path, skip_warmup)
- print(f"Unknown dataset implementation: {impl}")
- return None
-
-
-def dataset_exists(path, impl):
- if impl == 'mmap':
- return MMapIndexedDataset.exists(path)
- else:
- return IndexedDataset.exists(path)
-
-
-def read_longs(f, n):
- a = np.empty(n, dtype=np.int64)
- f.readinto(a)
- return a
-
-
-def write_longs(f, a):
- f.write(np.array(a, dtype=np.int64))
-
-
-dtypes = {
- 1: np.uint8,
- 2: np.int8,
- 3: np.int16,
- 4: np.int32,
- 5: np.int64,
- 6: np.float,
- 7: np.double,
- 8: np.uint16
-}
-
-
-def code(dtype):
- for k in dtypes.keys():
- if dtypes[k] == dtype:
- return k
- raise ValueError(dtype)
-
-
-def index_file_path(prefix_path):
- return prefix_path + '.idx'
-
-
-def data_file_path(prefix_path):
- return prefix_path + '.bin'
-
-
-def create_doc_idx(sizes):
- doc_idx = [0]
- for i, s in enumerate(sizes):
- if s == 0:
- doc_idx.append(i + 1)
- return doc_idx
-
-
-class IndexedDataset(torch.utils.data.Dataset):
- """Loader for IndexedDataset"""
- _HDR_MAGIC = b'TNTIDX\x00\x00'
-
- def __init__(self, path):
- super().__init__()
- self.path = path
- self.data_file = None
- self.read_index(path)
-
- def read_index(self, path):
- with open(index_file_path(path), 'rb') as f:
- magic = f.read(8)
- assert magic == self._HDR_MAGIC, (
- 'Index file doesn\'t match expected format. '
- 'Make sure that --dataset-impl is configured properly.'
- )
- version = f.read(8)
- assert struct.unpack('<Q', version) == (1,)
- code, self.element_size = struct.unpack('<QQ', f.read(16))
- self.dtype = dtypes[code]
- self._len, self.s = struct.unpack('<QQ', f.read(16))
- self.doc_count = struct.unpack('<Q', f.read(8))
- self.dim_offsets = read_longs(f, self._len + 1)
- self.data_offsets = read_longs(f, self._len + 1)
- self.sizes = read_longs(f, self.s)
- self.doc_idx = read_longs(f, self.doc_count)
-
- def read_data(self, path):
- self.data_file = open(data_file_path(path), 'rb', buffering=0)
-
- def check_index(self, i):
- if i < 0 or i >= self._len:
- raise IndexError('index out of range')
-
- def __del__(self):
- if self.data_file:
- self.data_file.close()
-
- # @lru_cache(maxsize=8)
- def __getitem__(self, idx):
- if not self.data_file:
- self.read_data(self.path)
- if isinstance(idx, int):
- i = idx
- self.check_index(i)
- tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]]
- a = np.empty(tensor_size, dtype=self.dtype)
- self.data_file.seek(self.data_offsets[i] * self.element_size)
- self.data_file.readinto(a)
- return a
- elif isinstance(idx, slice):
- start, stop, step = idx.indices(len(self))
- if step != 1:
- raise ValueError("Slices into indexed_dataset must be contiguous")
- sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]]
- size = sum(sizes)
- a = np.empty(size, dtype=self.dtype)
- self.data_file.seek(self.data_offsets[start] * self.element_size)
- self.data_file.readinto(a)
- offsets = list(accumulate(sizes))
- sents = np.split(a, offsets[:-1])
- return sents
-
- def __len__(self):
- return self._len
-
- def num_tokens(self, index):
- return self.sizes[index]
-
- def size(self, index):
- return self.sizes[index]
-
- @staticmethod
- def exists(path):
- return (
- os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))
- )
-
- @property
- def supports_prefetch(self):
- return False # avoid prefetching to save memory
-
-
-class IndexedCachedDataset(IndexedDataset):
-
- def __init__(self, path):
- super().__init__(path)
- self.cache = None
- self.cache_index = {}
-
- @property
- def supports_prefetch(self):
- return True
-
- def prefetch(self, indices):
- if all(i in self.cache_index for i in indices):
- return
- if not self.data_file:
- self.read_data(self.path)
- indices = sorted(set(indices))
- total_size = 0
- for i in indices:
- total_size += self.data_offsets[i + 1] - self.data_offsets[i]
- self.cache = np.empty(total_size, dtype=self.dtype)
- ptx = 0
- self.cache_index.clear()
- for i in indices:
- self.cache_index[i] = ptx
- size = self.data_offsets[i + 1] - self.data_offsets[i]
- a = self.cache[ptx: ptx + size]
- self.data_file.seek(self.data_offsets[i] * self.element_size)
- self.data_file.readinto(a)
- ptx += size
- if self.data_file:
- # close and delete data file after prefetch so we can pickle
- self.data_file.close()
- self.data_file = None
-
- # @lru_cache(maxsize=8)
- def __getitem__(self, idx):
- if isinstance(idx, int):
- i = idx
- self.check_index(i)
- tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]]
- a = np.empty(tensor_size, dtype=self.dtype)
- ptx = self.cache_index[i]
- np.copyto(a, self.cache[ptx: ptx + a.size])
- return a
- elif isinstance(idx, slice):
- # Hack just to make this work, can optimize later if necessary
- sents = []
- for i in range(*idx.indices(len(self))):
- sents.append(self[i])
- return sents
-
-
-class IndexedDatasetBuilder(object):
- element_sizes = {
- np.uint8: 1,
- np.int8: 1,
- np.int16: 2,
- np.int32: 4,
- np.int64: 8,
- np.float: 4,
- np.double: 8
- }
-
- def __init__(self, out_file, dtype=np.int32):
- self.out_file = open(out_file, 'wb')
- self.dtype = dtype
- self.data_offsets = [0]
- self.dim_offsets = [0]
- self.sizes = []
- self.element_size = self.element_sizes[self.dtype]
- self.doc_idx = [0]
-
- def add_item(self, tensor):
- bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype))
- self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size)
- for s in tensor.size():
- self.sizes.append(s)
- self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size()))
-
- def end_document(self):
- self.doc_idx.append(len(self.sizes))
-
- def merge_file_(self, another_file):
- index = IndexedDataset(another_file)
- assert index.dtype == self.dtype
-
- begin = self.data_offsets[-1]
- for offset in index.data_offsets[1:]:
- self.data_offsets.append(begin + offset)
- self.sizes.extend(index.sizes)
- begin = self.dim_offsets[-1]
- for dim_offset in index.dim_offsets[1:]:
- self.dim_offsets.append(begin + dim_offset)
-
- with open(data_file_path(another_file), 'rb') as f:
- while True:
- data = f.read(1024)
- if data:
- self.out_file.write(data)
- else:
- break
-
- def finalize(self, index_file):
- self.out_file.close()
- index = open(index_file, 'wb')
- index.write(b'TNTIDX\x00\x00')
- index.write(struct.pack('<Q', 1))
- print_rank_0(' > WARNING: could not find index map file {}, building '
- 'the indices on rank 0 ...'.format(indexmap_filename))
-
- # Make sure the types match the helpers input types.
- assert block_dataset.doc_idx.dtype == np.int64
- assert block_dataset.sizes.dtype == np.int32
-
- # Build samples mapping
- verbose = torch.distributed.get_rank() == 0
- start_time = time.time()
- print_rank_0(' > building samples index mapping for {} ...'.format(
- name))
-
- # compile/bind the C++ helper code
- from megatron.data.dataset_utils import compile_helper
- compile_helper()
-
- from megatron.data import helpers
- mapping_array = helpers.build_blocks_mapping(
- block_dataset.doc_idx,
- block_dataset.sizes,
- title_dataset.sizes,
- num_epochs,
- max_num_samples,
- max_seq_length - 3, # account for added tokens
- seed,
- verbose,
- use_one_sent_docs)
-
-
- print_rank_0(' > done building samples index mapping')
- np.save(indexmap_filename, mapping_array, allow_pickle=True)
- print_rank_0(' > saved the index mapping in {}'.format(
- indexmap_filename))
- # Make sure all the ranks have built the mapping
- print_rank_0(' > elapsed time to build and save samples mapping '
- '(seconds): {:4f}'.format(
- time.time() - start_time))
-
- # This should be a barrier but nccl barrier assumes
- # device_index=rank which is not the case for model
- # parallel case
- counts = torch.cuda.LongTensor([1])
- torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
- assert counts[0].item() == torch.distributed.get_world_size(
- group=mpu.get_data_parallel_group())
-
- # Load indexed dataset.
- print_rank_0(' > loading indexed mapping from {}'.format(
- indexmap_filename))
- start_time = time.time()
-
- mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
- samples_mapping = BlockSamplesMapping(mapping_array)
-
- print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
- time.time() - start_time))
- print_rank_0(' total number of samples: {}'.format(
- mapping_array.shape[0]))
-
- return samples_mapping
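
The rank-0-builds-then-everyone-loads pattern above uses an all_reduce in place of a barrier, because an NCCL barrier assumes device_index == rank, which does not hold in the model-parallel case. A minimal sketch of that trick; the process group is assumed to be supplied by the caller (e.g. mpu.get_data_parallel_group()), and the helper name is hypothetical.

import torch
import torch.distributed as dist

def allreduce_barrier(group=None):
    # every rank contributes 1; the reduced sum must equal the group size,
    # so returning from all_reduce implies all ranks reached this point
    counts = torch.cuda.LongTensor([1])
    dist.all_reduce(counts, group=group)
    assert counts[0].item() == dist.get_world_size(group=group)
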
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/realm_index.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/realm_index.py
deleted file mode 100644
index 54344e0c0f..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/realm_index.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import itertools
-import os
-import pickle
-import shutil
-
-import numpy as np
-import torch
-
-from megatron import get_args
-from megatron import mpu
-
-
-def detach(tensor):
- return tensor.detach().cpu().numpy()
-
-
-class BlockData(object):
- """Serializable data structure for holding data for blocks -- embeddings and necessary metadata for REALM"""
- def __init__(self, block_data_path=None, load_from_path=True, rank=None):
- self.embed_data = dict()
- self.meta_data = dict()
- if block_data_path is None:
- args = get_args()
- block_data_path = args.block_data_path
- rank = args.rank
- self.block_data_path = block_data_path
- self.rank = rank
-
- if load_from_path:
- self.load_from_file()
-
- block_data_name = os.path.splitext(self.block_data_path)[0]
- self.temp_dir_name = block_data_name + '_tmp'
-
- def state(self):
- return {
- 'embed_data': self.embed_data,
- 'meta_data': self.meta_data,
- }
-
- def clear(self):
- """Clear the embedding data structures to save memory.
- The metadata ends up getting used, and is also much smaller in dimensionality
- so it isn't really worth clearing.
- """
- self.embed_data = dict()
-
- def load_from_file(self):
- """Populate members from instance saved to file"""
-
- if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
- print("\n> Unpickling BlockData", flush=True)
- state_dict = pickle.load(open(self.block_data_path, 'rb'))
- if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
- print(">> Finished unpickling BlockData\n", flush=True)
-
- self.embed_data = state_dict['embed_data']
- self.meta_data = state_dict['meta_data']
-
- def add_block_data(self, block_indices, block_embeds, block_metas, allow_overwrite=False):
- """Add data for set of blocks
- :param block_indices: 1D array of unique int ids for the blocks
- :param block_embeds: 2D array of embeddings of the blocks
- :param block_metas: 2D array of metadata for the blocks.
- In the case of REALM this will be [start_idx, end_idx, doc_idx]
- """
- for idx, embed, meta in zip(block_indices, block_embeds, block_metas):
- if not allow_overwrite and idx in self.embed_data:
- raise ValueError("Unexpectedly tried to overwrite block data")
-
- self.embed_data[idx] = np.float16(embed)
- self.meta_data[idx] = meta
-
- def save_shard(self):
- """Save the block data that was created this in this process"""
- if not os.path.isdir(self.temp_dir_name):
- os.makedirs(self.temp_dir_name, exist_ok=True)
-
- # save the data for each shard
- with open('{}/{}.pkl'.format(self.temp_dir_name, self.rank), 'wb') as data_file:
- pickle.dump(self.state(), data_file)
-
- def merge_shards_and_save(self):
- """Combine all the shards made using self.save_shard()"""
- shard_names = os.listdir(self.temp_dir_name)
- seen_own_shard = False
-
- for fname in os.listdir(self.temp_dir_name):
- shard_rank = int(os.path.splitext(fname)[0])
- if shard_rank == self.rank:
- seen_own_shard = True
- continue
-
- with open('{}/{}'.format(self.temp_dir_name, fname), 'rb') as f:
- data = pickle.load(f)
- old_size = len(self.embed_data)
- shard_size = len(data['embed_data'])
-
- # add the shard's data and check to make sure there is no overlap
- self.embed_data.update(data['embed_data'])
- self.meta_data.update(data['meta_data'])
- assert len(self.embed_data) == old_size + shard_size
-
- assert seen_own_shard
-
- # save the consolidated shards and remove temporary directory
- with open(self.block_data_path, 'wb') as final_file:
- pickle.dump(self.state(), final_file)
- shutil.rmtree(self.temp_dir_name, ignore_errors=True)
-
- print("Finished merging {} shards for a total of {} embeds".format(
- len(shard_names), len(self.embed_data)), flush=True)
-
-
-class FaissMIPSIndex(object):
- """Wrapper object for a BlockData which similarity search via FAISS under the hood"""
- def __init__(self, embed_size, block_data=None, use_gpu=False):
- self.embed_size = embed_size
- self.block_data = block_data
- self.use_gpu = use_gpu
- self.id_map = dict()
-
- self.block_mips_index = None
- self._set_block_index()
-
- def _set_block_index(self):
- """Create a Faiss Flat index with inner product as the metric to search against"""
- try:
- import faiss
- except ImportError:
- raise Exception("Error: Please install faiss to use FaissMIPSIndex")
-
- if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
- print("\n> Building index", flush=True)
- self.block_mips_index = faiss.index_factory(self.embed_size, 'Flat', faiss.METRIC_INNER_PRODUCT)
-
- if self.use_gpu:
- # create resources and config for GpuIndex
- res = faiss.StandardGpuResources()
- config = faiss.GpuIndexFlatConfig()
- config.device = torch.cuda.current_device()
- config.useFloat16 = True
-
- self.block_mips_index = faiss.GpuIndexFlat(res, self.block_mips_index, config)
- if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
- print(">> Initialized index on GPU {}".format(self.block_mips_index.getDevice()), flush=True)
- else:
- # CPU index supports IDs so wrap with IDMap
- self.block_mips_index = faiss.IndexIDMap(self.block_mips_index)
- if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
- print(">> Initialized index on CPU", flush=True)
-
- # if we were constructed with a BlockData, then automatically load it when the FAISS structure is built
- if self.block_data is not None:
- self.add_block_embed_data(self.block_data)
-
- def reset_index(self):
- """Delete existing index and create anew"""
- del self.block_mips_index
-
- # reset the block data so that _set_block_index will reload it as well
- if self.block_data is not None:
- block_data_path = self.block_data.block_data_path
- del self.block_data
- self.block_data = BlockData(block_data_path)
-
- self._set_block_index()
-
- def add_block_embed_data(self, all_block_data):
- """Add the embedding of each block to the underlying FAISS index"""
-
- # this assumes the embed_data is a dict : {int: np.array}
- block_indices, block_embeds = zip(*all_block_data.embed_data.items())
-
- # the embeddings have to be entered in as float32 even though the math internally is done with float16.
- block_embeds_arr = np.float32(np.array(block_embeds))
- block_indices_arr = np.array(block_indices)
-
- # faiss GpuIndex doesn't work with IDMap wrapper so store ids to map back with
- if self.use_gpu:
- for i, idx in enumerate(block_indices):
- self.id_map[i] = idx
-
- # we no longer need the embedding data since it's in the index now
- all_block_data.clear()
-
- if self.use_gpu:
- self.block_mips_index.add(block_embeds_arr)
- else:
- self.block_mips_index.add_with_ids(block_embeds_arr, block_indices_arr)
-
- if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
- print(">>> Finished adding block data to index", flush=True)
-
- def search_mips_index(self, query_embeds, top_k, reconstruct=True):
- """Get the top-k blocks by the index distance metric.
-
- :param reconstruct: if True: return a [num_queries x k x embed_dim] array of blocks
- if False: return [num_queries x k] array of distances, and another for indices
- """
- query_embeds = np.float32(detach(query_embeds))
-
- if reconstruct:
- # get the vectors themselves
- top_k_block_embeds = self.block_mips_index.search_and_reconstruct(query_embeds, top_k)
- return top_k_block_embeds
-
- else:
- # get distances and indices of closest vectors
- distances, block_indices = self.block_mips_index.search(query_embeds, top_k)
- if self.use_gpu:
- fresh_indices = np.zeros(block_indices.shape)
- for i, j in itertools.product(range(block_indices.shape[0]), range(block_indices.shape[1])):
- fresh_indices[i, j] = self.id_map[block_indices[i, j]]
- block_indices = fresh_indices
- return distances, block_indices
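
FaissMIPSIndex above is a thin wrapper around a flat inner-product FAISS index. Here is a minimal CPU-only sketch of the same search flow, with made-up embedding sizes and random data; faiss is assumed to be installed, as in the deleted module.

import numpy as np
import faiss

embed_size = 128
index = faiss.index_factory(embed_size, 'Flat', faiss.METRIC_INNER_PRODUCT)
index = faiss.IndexIDMap(index)  # CPU path: attach explicit block ids

block_ids = np.arange(10, dtype=np.int64)
block_embeds = np.random.rand(10, embed_size).astype(np.float32)
index.add_with_ids(block_embeds, block_ids)

queries = np.random.rand(2, embed_size).astype(np.float32)
distances, ids = index.search(queries, 4)  # top-4 blocks per query, by inner product
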
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/samplers.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/samplers.py
deleted file mode 100644
index 2fbd070184..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/samplers.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Batch samplers that work with either random or sequential data samplers."""
-
-import torch
-from torch.utils import data
-
-
-class RandomSampler(data.sampler.Sampler):
- """Based off of pytorch RandomSampler and DistributedSampler. Essentially
- a RandomSampler, but this class lets the user set an epoch like
- DistributedSampler Samples elements randomly. If without replacement, then
- sample from a shuffled dataset. If with replacement, then user can
- specify ``num_samples`` to draw.
- Arguments:
- data_source (Dataset): dataset to sample from
- num_samples (int): number of samples to draw, default=len(dataset)
- replacement (bool): samples are drawn with replacement if ``True``,
- default=False
- """
-
- def __init__(self, data_source, replacement=False, num_samples=None):
- self.data_source = data_source
- self.replacement = replacement
- self._num_samples = num_samples
- self.epoch = -1
-
- if self._num_samples is not None and replacement is False:
- raise ValueError("With replacement=False, num_samples should not "
- "be specified, since a random permute will be "
- "performed.")
-
- if not isinstance(self.num_samples, int) or self.num_samples <= 0:
- raise ValueError("num_samples should be a positive integer "
- "value, but got num_samples={}".format(
- self.num_samples))
- if not isinstance(self.replacement, bool):
- raise ValueError("replacement should be a boolean value, but got "
- "replacement={}".format(self.replacement))
-
- @property
- def num_samples(self):
- # dataset size might change at runtime
- if self._num_samples is None:
- return len(self.data_source)
- return self._num_samples
-
- def __iter__(self):
- n = len(self.data_source)
- g = torch.Generator()
- if self.epoch >= 0:
- g.manual_seed(self.epoch)
- if self.replacement:
- return iter(torch.randint(high=n, size=(self.num_samples,),
- dtype=torch.int64, generator=g).tolist())
- return iter(torch.randperm(n, generator=g).tolist())
-
- def __len__(self):
- return self.num_samples
-
- def set_epoch(self, epoch):
- self.epoch = epoch
-
-
-class DistributedBatchSampler(data.sampler.BatchSampler):
- """Similar to normal implementation of distributed sampler, except
- implementation is at the batch sampler level, instead of just the
- sampler level. This allows wrapping of arbitrary data samplers
- (sequential, random, WeightedRandomSampler, etc.) with this batch
- sampler.
-
- The `interleave` argument specifies how to distribute a batch. A value
- of True combined with the above random sampler is equivalent to pytorch's
- torch.utils.data.distributed.DistributedSampler.
-
- For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2
- specifying True will result in the following samples for each gpu:
- GPU0: [0,2,4,6] GPU1: [1,3,5,7]
- specifying False will result in the following samples:
- GPU0: [0,1,2,3] GPU1: [4,5,6,7]"""
-
- def __init__(self, sampler, batch_size, drop_last, rank=-1,
- world_size=2, wrap_last=False, interleave=False):
- super(DistributedBatchSampler, self).__init__(sampler, batch_size,
- drop_last)
- if rank == -1:
- assert False, 'should not be here'
- rank = torch.distributed.get_rank()
- self.rank = rank
- self.world_size = world_size
- self.sampler.wrap_around = 0
- self.wrap_around = 0
- self.wrap_last = wrap_last
- self.start_iter = 0
- self.interleave = interleave
-
- def __iter__(self):
- batch = []
- i = 0
- for idx in self.data_iterator(self.sampler, wrap_around=False):
- batch.append(idx)
- if len(batch) == self.batch_size:
- tbatch = self._batch(batch)
- if i >= self.start_iter:
- yield tbatch
- self.start_iter = 0
- i += 1
- batch = []
- batch_len = len(batch)
- if batch_len > 0 and not self.drop_last:
- if self.wrap_last:
- self.sampler.wrap_around -= (self.batch_size)
- self.wrap_around += (len(batch))
- self.wrap_around %= self.batch_size
- yield self._batch(batch)
- if self.wrap_last:
- self.sampler.wrap_around += self.batch_size
-
- def data_iterator(self, _iter, wrap_around=False):
- """iterates through data and handles wrap around"""
- for i, idx in enumerate(_iter):
- if i < self.wrap_around % self.batch_size:
- continue
- if wrap_around:
- self.wrap_around += 1
- self.wrap_around %= self.batch_size
- yield idx
-
- def _batch(self, batch):
- """extracts samples only pertaining to this worker's batch"""
- if self.interleave:
- return batch[self.rank:self.batch_size:self.world_size]
- start = self.rank * self.batch_size // self.world_size
- end = (self.rank + 1) * self.batch_size // self.world_size
- return batch[start:end]
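
The interleave flag in _batch above chooses between strided and contiguous sharding of a global batch. A tiny self-contained sketch reproducing the docstring's [0..7], world_size=2 example:

batch, batch_size, world_size = list(range(8)), 8, 2

def shard(batch, rank, interleave):
    if interleave:
        return batch[rank:batch_size:world_size]        # strided, like DistributedSampler
    start = rank * batch_size // world_size
    end = (rank + 1) * batch_size // world_size
    return batch[start:end]                             # contiguous slice

print(shard(batch, 0, True), shard(batch, 1, True))     # [0, 2, 4, 6] [1, 3, 5, 7]
print(shard(batch, 0, False), shard(batch, 1, False))   # [0, 1, 2, 3] [4, 5, 6, 7]
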
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/test/test_indexed_dataset.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/test/test_indexed_dataset.py
deleted file mode 100644
index 9103c6d9ee..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/test/test_indexed_dataset.py
+++ /dev/null
@@ -1,125 +0,0 @@
- # This file isn't really a formal automated test; it's just a place to
-# put some code used during development and manual testing of
-# indexed_dataset.
-
-from megatron.data import indexed_dataset
-from megatron.tokenizer import build_tokenizer
-import argparse
-import os
-import sys
-
-import torch
-
-script_dir = os.path.dirname(os.path.realpath(__file__))
-sys.path.append(os.path.join(script_dir, "../../../"))
-
-
-def test_indexed_dataset(args):
- ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
- tokenizer = build_tokenizer(args)
- print(len(ds.doc_idx))
- print(len(ds))
- print(ds.doc_idx[-1])
- if ds.supports_prefetch:
- # just prefetch the whole thing in test (so assume it is small)
- ds.prefetch(range(len(ds)))
- if args.count > len(ds.doc_idx) - 1:
- args.count = len(ds.doc_idx) - 1
-
- for i in range(args.count):
- start = ds.doc_idx[i]
- end = ds.doc_idx[i + 1]
- ids = ds[start:end]
- print(f"Document {i}:")
- print("--------------")
- for s in ids:
- assert len(s) > 0
- l = s.data.tolist()
- text = tokenizer.detokenize(l)
- print(text)
- print("---")
-
-
-def test_indexed_dataset_get(args):
- ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
- tokenizer = build_tokenizer(args)
- size = ds.sizes[0]
- print(f"size: {size}")
- full = ds.get(0)
- print(full)
- # print(tokenizer.detokenize(full.data.tolist()))
- print("---")
- end = ds.get(0, offset=size - 10)
- print(end)
- # print(tokenizer.detokenize(end.data.tolist()))
-
- start = ds.get(0, length=10)
- print(start)
- # print(tokenizer.detokenize(start.data.tolist()))
-
- part = ds.get(0, offset=2, length=8)
- print(part)
- # print(tokenizer.detokenize(part.data.tolist()))
-
-# def test_albert_dataset(args):
-# # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True)
-# # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl)
-# # ds = AlbertDataset(idataset, tokenizer)
-# ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl,
-# args.epochs, args.max_num_samples,
-# args.masked_lm_prob, args.seq_length,
-# args.short_seq_prob, args.seed)
-# truncated = 0
-# total = 0
-# for i, s in enumerate(ds):
-# ids = s['text']
-# tokens = ds.tokenizer.convert_ids_to_tokens(ids)
-# print(tokens)
-# if i >= args.count-1:
-# exit()
-
-
-def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--data', type=str, help='prefix to data files')
- parser.add_argument('--dataset-impl', type=str, default='infer',
- choices=['lazy', 'cached', 'mmap', 'infer'])
- parser.add_argument('--count', type=int, default=10,
- help='Number of samples/documents to print')
-
- group = parser.add_argument_group(title='tokenizer')
- group.add_argument('--tokenizer-type', type=str, required=True,
- choices=['BertWordPieceLowerCase',
- 'GPT2BPETokenizer'],
- help='What type of tokenizer to use.')
- group.add_argument('--vocab-file', type=str, default=None,
- help='Path to the vocab file')
- group.add_argument('--merge-file', type=str, default=None,
- help='Path to the BPE merge file (if necessary).')
-
- parser.add_argument('--epochs', type=int, default=5,
- help='Number of epochs to plan for')
- parser.add_argument('--max-num-samples', type=int, default=None,
- help='Maximum number of samples to plan for')
- parser.add_argument('--masked-lm-prob', type=float, default=0.15,
- help='probability of masking tokens')
- parser.add_argument('--seq-length', type=int, default=512,
- help='maximum sequence length')
- parser.add_argument('--short-seq-prob', type=float, default=0.1,
- help='probability of creating a short sequence')
- parser.add_argument('--seed', type=int, default=1234,
- help='random seed')
- args = parser.parse_args()
- args.rank = 0
- args.make_vocab_size_divisible_by = 128
- args.model_parallel_size = 1
-
- if args.dataset_impl == "infer":
- args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data)
-
-# test_albert_dataset(args)
- test_indexed_dataset_get(args)
-
-
-if __name__ == "__main__":
- main()
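
The test above leans on the doc_idx convention of the indexed dataset: doc_idx[i]:doc_idx[i+1] is the range of sentence indices belonging to document i. A toy illustration with made-up sizes:

doc_idx = [0, 3, 5]        # two documents: sentences 0-2 and 3-4
sizes = [4, 2, 3, 5, 1]    # token count of each sentence
for d in range(len(doc_idx) - 1):
    start, end = doc_idx[d], doc_idx[d + 1]
    print(f"doc {d}: sentences {list(range(start, end))}, {sum(sizes[start:end])} tokens")
# doc 0: sentences [0, 1, 2], 9 tokens
# doc 1: sentences [3, 4], 6 tokens
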
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/test/test_preprocess_data.sh b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/test/test_preprocess_data.sh
deleted file mode 100644
index d121c85958..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/data/test/test_preprocess_data.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-IMPL=cached
-python ../preprocess_data.py \
- --input test_samples.json \
- --vocab vocab.txt \
- --dataset-impl ${IMPL} \
- --output-prefix test_samples_${IMPL} \
- --workers 1 \
- --log-interval 2
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/__init__.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/__init__.py
deleted file mode 100644
index abefedcef2..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/__init__.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""utils for creating datasets"""
-import os
-import math
-
-import torch
-
-from .samplers import DistributedBatchSampler
-from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset
-from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader
-from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, GPT2BPETokenizer, make_tokenizer
-from . import corpora
-
-TRAIN_DATA = 0
-VAL_DATA = 1
-TEST_DATA = 2
-
-
-def should_split(split):
- """
- given split proportions, checks whether the dataset should be split
- Examples:
- >>> should_split([10,0,0])
- False
- >>> should_split([1,.1,.2])
- True
- """
- return max(split) / sum(split) != 1.
-
-
-def get_ext(path):
- """gets path extension"""
- return os.path.splitext(path)[1]
-
-
-def get_dataset(path, **kwargs):
- """gets dataset object based on keyword args and file at `path`"""
- if supported_corpus(path):
- return corpora.NAMED_CORPORA[path](**kwargs)
- ext = get_ext(path)
- if '.json' in ext:
- text = json_dataset(path, **kwargs)
- elif ext in ['.csv', '.tsv']:
- text = csv_dataset(path, **kwargs)
- else:
- raise NotImplementedError('data file type %s is not supported' % (ext))
- return text
-
-
-def supported_corpus(corpus_name):
- """checks if corpus name is defined in `corpora.py`"""
- return corpus_name in corpora.NAMED_CORPORA
-
-
-def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
- delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
- tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
- model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None,
- parallel_group=None, **kwargs):
- """function to create datasets+tokenizers for common options"""
- if isinstance(process_fn, str):
- process_fn = eval(process_fn)
- if non_binary_cols is not None:
- # multilabel dataset support (only for csvs)
- label_key = non_binary_cols
-
- def get_dataset_from_path(path_):
- if lazy:
- # get lazily loaded dataset
- named_corpora = False
- if supported_corpus(path_):
- named_corpora = True
- name = path_
- path_ = corpora.NAMED_CORPORA[path_].PATH
- if torch.distributed.get_rank() == 0 and not exists_lazy(path_, data_type='data'):
- # create cached version of dataset for lazy loading if it doesn't exist
- text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
- delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose)
- make_lazy(path_, text.X, data_type='data')
- # This should be a barrier but nccl barrier assumes
- # device_index=rank which is not the case for model
- # parallel case
- counts = torch.cuda.LongTensor([1])
- torch.distributed.all_reduce(counts, group=parallel_group)
- assert counts[0].item() == torch.distributed.get_world_size(
- group=parallel_group)
-
- text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
- else:
- # get dataset
- text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent,
- delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn)
- return text
- # get one or multiple datasets and concatenate
- if isinstance(path, str):
- path = [path]
- datasets = [get_dataset_from_path(p) for p in path]
- if len(datasets) == 1:
- ds = datasets[0]
- else:
- ds = ConcatDataset(datasets)
- # make tokenizer for dataset
- if tokenizer is None:
- tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type,
- pad_token, character_converage, **kwargs)
-
- ds_type = ''
- if 'ds_type' in kwargs:
- ds_type = kwargs['ds_type']
- ds.SetTokenizer(tokenizer)
- # Split dataset into train/val/test (and wrap bert dataset)
- if should_split(split):
- ds = split_ds(ds, split)
- if 'bert' in ds_type.lower():
- presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
- dstype = bert_sentencepair_dataset
- ds = [dstype(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
- if d is not None else None for d in ds]
- elif ds_type.lower() == 'gpt2':
- ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
- else:
- if 'bert' in ds_type.lower():
- presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False
- dstype = bert_sentencepair_dataset
- ds = dstype(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences)
- elif ds_type.lower() == 'gpt2':
- ds = GPT2Dataset(ds, max_seq_len=seq_length)
- return ds, tokenizer
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/configure_data.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/configure_data.py
deleted file mode 100644
index 357c2380c6..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/configure_data.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""parses arguments and preps data loader"""
-
-import copy
-import torch
-
-from megatron import data_utils
-from megatron import mpu
-
-
-class DataConfig:
-
- def __init__(self, defaults={}):
- super(DataConfig, self).__init__()
- self.defaults = defaults
-
- def apply(self, args):
- if torch.distributed.get_rank() == 0:
- print('configuring data')
- self.apply_defaults(args)
- return make_loaders(args)
-
- def set_defaults(self, **kwargs):
- for k, v in kwargs.items():
- self.defaults[k] = v
-
- def apply_defaults(self, args):
- for k, v in self.defaults.items():
- k = k.replace('-', '_')
- if not hasattr(args, k):
- setattr(args, k, v)
-
-
-def make_data_loader(dataset, batch_size, args):
-
- shuffle = args.shuffle
- if shuffle:
- sampler = data_utils.samplers.RandomSampler(
- dataset, replacement=True, num_samples=batch_size * args.train_iters)
- else:
- sampler = torch.utils.data.SequentialSampler(dataset)
- world_size = torch.distributed.get_world_size(
- group=mpu.get_data_parallel_group())
- rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
- distributed = world_size > 1
- drop_last = distributed
-
- if distributed:
- batch_sampler = data_utils.samplers.DistributedBatchSampler(sampler,
- batch_size,
- drop_last,
- rank,
- world_size)
- else:
- batch_sampler = torch.utils.data.BatchSampler(sampler,
- batch_size,
- drop_last)
-
- data_loader = torch.utils.data.DataLoader(dataset,
- batch_sampler=batch_sampler,
- num_workers=args.num_workers,
- pin_memory=True)
-
- return data_loader
-
-
-def make_tfrecord_loaders(args):
- """Load train/val/test dataset from shuffled TFRecords"""
-
- import data_utils.tf_dl
- data_set_args = {'batch_size': args.batch_size,
- 'max_seq_len': args.seq_length,
- 'max_preds_per_seq': args.max_preds_per_seq,
- 'train': True,
- 'num_workers': max(args.num_workers, 1),
- 'seed': args.seed + args.rank + 1,
- 'threaded_dl': args.num_workers > 0
- }
- train = data_utils.tf_dl.TFRecordDataLoader(args.train_data,
- **data_set_args)
- data_set_args['train'] = False
- if args.eval_seq_length is not None:
- data_set_args['max_seq_len'] = args.eval_seq_length
- if args.eval_max_preds_per_seq is not None:
- data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
- valid = None
- if args.valid_data is not None:
- valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data,
- **data_set_args)
- test = None
- if args.test_data is not None:
- test = data_utils.tf_dl.TFRecordDataLoader(args.test_data,
- **data_set_args)
- tokenizer = data_utils.make_tokenizer(args.tokenizer_type,
- train,
- args.tokenizer_path,
- args.vocab_size,
- args.tokenizer_model_type,
- cache_dir=args.cache_dir)
-
- return (train, valid, test), tokenizer
-
-
-def make_loaders(args):
- """makes training/val/test"""
-
- if args.data_loader == 'tfrecords':
- return make_tfrecord_loaders(args)
- world_size = torch.distributed.get_world_size(
- group=mpu.get_data_parallel_group())
- batch_size = args.batch_size * world_size
- eval_batch_size = batch_size
- if args.eval_batch_size is not None:
- eval_batch_size = args.eval_batch_size * world_size
- seq_length = args.seq_length
- if seq_length < 0:
- seq_length = seq_length * world_size
- eval_seq_length = args.eval_seq_length
- if eval_seq_length is not None and eval_seq_length < 0:
- eval_seq_length = eval_seq_length * world_size
- split = get_split(args)
- if args.data_path is not None:
- args.train_data = args.data_path
- data_set_args = {
- 'path': args.train_data,
- 'seq_length': seq_length,
- 'lazy': args.data_loader == 'lazy',
- 'delim': args.delim,
- 'text_key': args.text_key,
- 'label_key': 'label',
- 'non_binary_cols': None,
- 'ds_type': args.data_set_type,
- 'split': split,
- 'loose': args.loose_json,
- 'tokenizer_type': args.tokenizer_type,
- 'tokenizer_model_path': args.tokenizer_path,
- 'vocab_size': args.vocab_size,
- 'model_type': args.tokenizer_model_type,
- 'cache_dir': args.cache_dir,
- 'max_preds_per_seq': args.max_preds_per_seq,
- 'presplit_sentences': args.presplit_sentences,
- 'parallel_group': mpu.get_data_parallel_group()}
-
- eval_set_args = copy.copy(data_set_args)
- eval_set_args['split'] = [1.]
- # if optional eval args were set then replace their
- # equivalent values in the arg dict
- if eval_seq_length:
- eval_set_args['seq_length'] = eval_seq_length
- if args.eval_max_preds_per_seq:
- eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
- if args.eval_text_key is not None:
- eval_set_args['text_key'] = args.eval_text_key
-
- # make datasets splits and tokenizer
- train = None
- valid = None
- test = None
-
- if args.train_data is not None:
- train, tokenizer = data_utils.make_dataset(**data_set_args)
- if data_utils.should_split(split):
- train, valid, test = train
- eval_set_args['tokenizer'] = tokenizer
-
- # make training and val dataset if necessary
- if valid is None and args.valid_data is not None:
- eval_set_args['path'] = args.valid_data
- valid, tokenizer = data_utils.make_dataset(**eval_set_args)
- eval_set_args['tokenizer'] = tokenizer
- if test is None and args.test_data is not None:
- eval_set_args['path'] = args.test_data
- test, tokenizer = data_utils.make_dataset(**eval_set_args)
-
- # wrap datasets with data loader
- if train is not None and args.batch_size > 0:
- train = make_data_loader(train, batch_size, args)
- args.do_train = True
- else:
- args.do_train = False
- eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
- if valid is not None:
- valid = make_data_loader(valid, eval_batch_size, args)
- args.do_valid = True
- else:
- args.do_valid = False
- if test is not None:
- test = make_data_loader(test, eval_batch_size, args)
- args.do_test = True
- else:
- args.do_test = False
-
- return (train, valid, test), tokenizer
-
-
-def get_split(args):
- """
- Get dataset splits from comma separated string list
- """
- splits = []
- if args.split.find(',') != -1:
- splits = [float(s) for s in args.split.split(',')]
- elif args.split.find('/') != -1:
- splits = [float(s) for s in args.split.split('/')]
- else:
- splits = [float(args.split)]
- split_total = sum(splits)
- if split_total < 1.:
- splits.append(1 - split_total)
- while len(splits) < 3:
- splits.append(0.)
- splits = splits[:3]
- if args.valid_data is not None:
- splits[1] = 0.
- if args.test_data is not None:
- splits[2] = 0.
- final_sum = sum(splits)
- return [s / final_sum for s in splits]
-
-
-def configure_data():
- """add cmdline flags for configuring datasets"""
- # These are options that are used by data_utils, but are either
- # deprecated or not meant to be exposed to the command line user.
- # These options are intended to be set in code by specific scripts.
- defaults = {
- 'world_size': 1,
- 'rank': -1,
- 'persist_state': 0,
- 'lazy': False,
- 'transpose': False,
- 'data_set_type': 'supervised',
- 'seq_length': 256,
- 'eval_seq_length': 256,
- 'samples_per_shard': 100
- }
-
- return DataConfig(defaults=defaults)
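
get_split above turns a comma- or slash-separated split string into three normalized fractions, zeroing the val/test shares when explicit --valid-data/--test-data paths are given. A condensed sketch of that normalization; the function name and boolean flags are illustrative stand-ins for the args fields.

def parse_split(split_str, have_valid=False, have_test=False):
    sep = ',' if ',' in split_str else '/'
    splits = [float(s) for s in split_str.split(sep)]
    if sum(splits) < 1.:
        splits.append(1. - sum(splits))      # remainder becomes the next share
    splits = (splits + [0.] * 3)[:3]
    if have_valid:
        splits[1] = 0.
    if have_test:
        splits[2] = 0.
    total = sum(splits)
    return [s / total for s in splits]

print(parse_split('949,50,1'))   # [0.949, 0.05, 0.001]
print(parse_split('0.75'))       # [0.75, 0.25, 0.0]
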
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/corpora.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/corpora.py
deleted file mode 100644
index 73749d9ce1..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/corpora.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""several datasets with preset arguments"""
-from .datasets import json_dataset, csv_dataset
-import os
-
-
-class wikipedia(json_dataset):
- """
- dataset for wikipedia with arguments configured for convenience
-
- command line usage: `--train-data wikipedia`
- """
- PATH = 'data/wikipedia/wikidump_lines.json'
- assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py"
-
- def __init__(self, **kwargs):
- assert os.path.exists(wikipedia.PATH), \
- wikipedia.assert_str
- if not kwargs:
- kwargs = {}
- kwargs['text_key'] = 'text'
- kwargs['loose_json'] = True
- super(wikipedia, self).__init__(wikipedia.PATH, **kwargs)
-
-
-class webtext(json_dataset):
- """
- dataset for webtext with arguments configured for convenience
-
- command line usage: `--train-data webtext`
- """
- PATH = 'data/webtext/data.json'
- assert_str = "make sure to set PATH for webtext data_utils/corpora.py"
-
- def __init__(self, **kwargs):
- assert os.path.exists(webtext.PATH), \
- webtext.assert_str
- if not kwargs:
- kwargs = {}
- kwargs['text_key'] = 'text'
- kwargs['loose_json'] = True
- super(webtext, self).__init__(webtext.PATH, **kwargs)
-
-
-NAMED_CORPORA = {
- 'wikipedia': wikipedia,
- 'webtext': webtext,
-}
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/datasets.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/datasets.py
deleted file mode 100644
index bf8ef8a5e9..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/datasets.py
+++ /dev/null
@@ -1,883 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""dataset objects for jsons, csvs, and BERT datasets"""
-
-import os
-import time
-from operator import itemgetter
-from bisect import bisect_right
-import json
-import csv
-import math
-import random
-from itertools import accumulate
-
-from torch.utils import data
-import pandas as pd
-import numpy as np
-
-import nltk
-from nltk import tokenize
-
-from .lazy_loader import lazy_array_loader, exists_lazy, make_lazy
-from .tokenization import Tokenization
-
-
-class ConcatDataset(data.Dataset):
- """
- Dataset to concatenate multiple datasets.
- Purpose: useful to assemble different existing datasets, possibly
- large-scale datasets as the concatenation operation is done in an
- on-the-fly manner.
- Arguments:
- datasets (sequence): List of datasets to be concatenated.
- """
-
- @staticmethod
- def cumsum(sequence):
- r, s = [], 0
- for e in sequence:
- l = len(e)
- r.append(l + s)
- s += l
- return r
-
- def __init__(self, datasets, **kwargs):
- super(ConcatDataset, self).__init__()
- assert len(datasets) > 0, 'datasets should not be an empty iterable'
- self.datasets = list(datasets)
- self.is_lazy = sum([isinstance(ds, lazy_array_loader)
- for ds in self.datasets]) == len(self.datasets)
- self.cumulative_sizes = self.cumsum(self.datasets)
- self._X = None
- self._Y = None
- self._lens = None
-
- def SetTokenizer(self, tokenizer):
- for ds in self.datasets:
- ds.SetTokenizer(tokenizer)
-
- def GetTokenizer(self):
- return self.datasets[0].GetTokenizer()
-
- def __len__(self):
- return self.cumulative_sizes[-1]
-
- def __getitem__(self, idx):
- dataset_idx = bisect_right(self.cumulative_sizes, idx)
- if dataset_idx == 0:
- sample_idx = idx
- else:
- sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
- return self.datasets[dataset_idx][sample_idx]
-
- @property
- def lens(self):
- if self._lens is None:
- self._lens = []
- if self.is_lazy:
- for data in self.datasets:
- self._lens.extend(data.lens)
- else:
- for data in self.datasets:
- self._lens.extend([len(d['text']) if isinstance(
- d, dict) else len(d) for d in data])
- return self._lens
-
- @property
- def X(self):
- if self._X is None:
- self._X = []
- for data in self.datasets:
- self._X.extend(data.X)
- return self._X
-
- @property
- def Y(self):
- if self._Y is None:
- self._Y = []
- for data in self.datasets:
- self._Y.extend(list(data.Y))
- self._Y = np.array(self._Y)
- return self._Y
-
- @property
- def cummulative_sizes(self):
- warnings.warn("cummulative_sizes attribute is renamed to "
- "cumulative_sizes", DeprecationWarning, stacklevel=2)
- return self.cumulative_sizes
-
-
-class SplitDataset(data.Dataset):
- """
- Dataset wrapper to access a subset of another dataset.
- Purpose: useful to index into existing datasets, possibly
- large-scale datasets as the subindexing operation is done in an
- on-the-fly manner.
- Arguments:
- ds (Dataset or array-like): List of datasets to be subindexed
- split_inds (1D array-like): List of indices part of subset
- """
-
- def __init__(self, ds, split_inds, **kwargs):
- self.split_inds = list(split_inds)
- self.wrapped_data = ds
- self.is_lazy = isinstance(ds, lazy_array_loader) or (hasattr(ds, 'is_lazy') and ds.is_lazy)
- if self.is_lazy:
- self.lens = itemgetter(*self.split_inds)(list(self.wrapped_data.lens))
- self._X = None
- self._Y = None
-
- def __len__(self):
- return len(self.split_inds)
-
- def __getitem__(self, index):
- return self.wrapped_data[self.split_inds[index]]
-
- def SetTokenizer(self, tokenizer):
- self.wrapped_data.SetTokenizer(tokenizer)
-
- def GetTokenizer(self):
- return self.wrapped_data.GetTokenizer()
-
- @property
- def X(self):
- if self._X is None:
- self._X = itemgetter(*self.split_inds)(self.wrapped_data.X)
- return self._X
-
- @property
- def Y(self):
- if self._Y is None:
- self._Y = np.array(itemgetter(*self.split_inds)(self.wrapped_data.Y))
- return self._Y
-
- def __iter__(self):
- for idx in self.split_inds:
- yield self.wrapped_data[idx]
-
-
-def split_ds(ds, split=[.8, .2, .0], shuffle=True):
- """
- Split a dataset into subsets given proportions of how
- much to allocate per split. If a split is 0% returns None for that split.
- Purpose: Useful for creating train/val/test splits
- Arguments:
- ds (Dataset or array-like): Data to be split.
- split (1D array-like): proportions to split `ds`. `sum(splits) != 0`
- shuffle (boolean): Randomly split dataset. Default: True
- """
- split_sum = sum(split)
- if split_sum == 0:
- raise Exception('Split cannot sum to 0.')
- split = np.array(split)
- split /= split_sum
- ds_len = len(ds)
- inds = np.arange(ds_len)
- if shuffle:
- np.random.shuffle(inds)
- start_idx = 0
- residual_idx = 0
- rtn_ds = [None] * len(split)
- for i, f in enumerate(split):
- if f != 0:
- proportion = ds_len * split[i]
- residual_idx += proportion % 1
- split_ = int(int(proportion) + residual_idx)
- split_inds = inds[start_idx:start_idx + max(split_, 1)]
- rtn_ds[i] = SplitDataset(ds, split_inds)
- start_idx += split_
- residual_idx %= 1
- return rtn_ds
-
-
-class csv_dataset(data.Dataset):
- """
- Class for loading datasets from csv files.
- Purpose: Useful for loading data for unsupervised modeling or transfer tasks
- Arguments:
- path (str): Path to csv file with dataset.
- tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None
- preprocess_fn (callable): Callable that process a string into desired format.
- delim (str): delimiter for csv. Default: ','
- binarize_sent (bool): binarize label values to 0 or 1 if they\'re on a different scale. Default: False
- drop_unlabeled (bool): drop rows with unlabelled values. Always fills remaining empty
- columns with -1 (regardless of whether rows are dropped based on value). Default: False
- text_key (str): key to get text from csv. Default: 'sentence'
- label_key (str): key to get label from json dictionary. Default: 'label'
- Attributes:
- X (list): all strings from the csv file
- Y (np.ndarray): labels to train with
- """
-
- def __init__(self, path, tokenizer=None, preprocess_fn=None, delim=',',
- binarize_sent=False, drop_unlabeled=False, text_key='sentence', label_key='label',
- **kwargs):
- self.is_lazy = False
- self.preprocess_fn = preprocess_fn
- self.SetTokenizer(tokenizer)
- self.path = path
- self.delim = delim
- self.text_key = text_key
- self.label_key = label_key
- self.drop_unlabeled = drop_unlabeled
-
- if '.tsv' in self.path:
- self.delim = '\t'
-
- self.X = []
- self.Y = []
- try:
- cols = [text_key]
- if isinstance(label_key, list):
- cols += label_key
- else:
- cols += [label_key]
- data = pd.read_csv(self.path, sep=self.delim, usecols=cols, encoding='latin-1')
- except BaseException:
- data = pd.read_csv(self.path, sep=self.delim, usecols=[text_key], encoding='latin-1')
-
- data = data.dropna(axis=0)
-
- self.X = data[text_key].values.tolist()
- try:
- self.Y = data[label_key].values
- except Exception as e:
- self.Y = np.ones(len(self.X)) * -1
-
- if binarize_sent:
- self.Y = binarize_labels(self.Y, hard=binarize_sent)
-
- def SetTokenizer(self, tokenizer):
- if tokenizer is None:
- self.using_tokenizer = False
- if not hasattr(self, '_tokenizer'):
- self._tokenizer = tokenizer
- else:
- self.using_tokenizer = True
- self._tokenizer = tokenizer
-
- def GetTokenizer(self):
- return self._tokenizer
-
- @property
- def tokenizer(self):
- if self.using_tokenizer:
- return self._tokenizer
- return None
-
- def __len__(self):
- return len(self.X)
-
- def __getitem__(self, index):
- """process+tokenize string and return string,label,and stringlen"""
- x = self.X[index]
- if self.tokenizer is not None:
- x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn)
- elif self.preprocess_fn is not None:
- x = self.preprocess_fn(x)
- y = self.Y[index]
- if isinstance(y, str):
- if self.tokenizer is not None:
- y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn)
- elif self.preprocess_fn is not None:
- y = self.preprocess_fn(y)
- return {'text': x, 'length': len(x), 'label': y}
-
- def write(self, writer_gen=None, path=None, skip_header=False):
- """
- given a generator of metrics for each of the data points X_i,
- write the metrics, text, and labels to a csv file
- """
- if path is None:
- path = self.path + '.results'
- print('generating csv at ' + path)
- with open(path, 'w') as csvfile:
- c = csv.writer(csvfile, delimiter=self.delim)
- if writer_gen is not None:
- # if first item of generator is a header of what the metrics mean then
- # write header to csv file
- if not skip_header:
- header = (self.label_key,) + tuple(next(writer_gen)) + (self.text_key,)
- c.writerow(header)
- for i, row in enumerate(writer_gen):
- row = (self.Y[i],) + tuple(row) + (self.X[i],)
- c.writerow(row)
- else:
- c.writerow([self.label_key, self.text_key])
- for row in zip(self.Y, self.X):
- c.writerow(row)
-
-
-class json_dataset(data.Dataset):
- """
- Class for loading datasets from a json dump.
- Purpose: Useful for loading data for unsupervised modeling or transfer tasks
- Arguments:
- path (str): path to json file with dataset.
- tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None
- preprocess_fn (callable): callable function that process a string into desired format.
- Takes string, maxlen=None, encode=None as arguments. Default: process_str
- text_key (str): key to get text from json dictionary. Default: 'sentence'
- label_key (str): key to get label from json dictionary. Default: 'label'
- Attributes:
- all_strs (list): list of all strings from the dataset
- all_labels (list): list of all labels from the dataset (if they have it)
- """
-
- def __init__(self, path, tokenizer=None, preprocess_fn=None, binarize_sent=False,
- text_key='sentence', label_key='label', loose_json=False, **kwargs):
- self.is_lazy = False
- self.preprocess_fn = preprocess_fn
- self.path = path
- self.SetTokenizer(tokenizer)
- self.X = []
- self.Y = []
- self.text_key = text_key
- self.label_key = label_key
- self.loose_json = loose_json
-
- for j in self.load_json_stream(self.path):
- s = j[text_key]
- self.X.append(s)
- self.Y.append(j[label_key])
-
- if binarize_sent:
- self.Y = binarize_labels(self.Y, hard=binarize_sent)
-
- def SetTokenizer(self, tokenizer):
- if tokenizer is None:
- self.using_tokenizer = False
- if not hasattr(self, '_tokenizer'):
- self._tokenizer = tokenizer
- else:
- self.using_tokenizer = True
- self._tokenizer = tokenizer
-
- def GetTokenizer(self):
- return self._tokenizer
-
- @property
- def tokenizer(self):
- if self.using_tokenizer:
- return self._tokenizer
- return None
-
- def __getitem__(self, index):
- """gets the index'th string from the dataset"""
- x = self.X[index]
- if self.tokenizer is not None:
- x = self.tokenizer.EncodeAsIds(x, self.preprocess_fn)
- elif self.preprocess_fn is not None:
- x = self.preprocess_fn(x)
- y = self.Y[index]
- if isinstance(y, str):
- if self.tokenizer is not None:
- y = self.tokenizer.EncodeAsIds(y, self.preprocess_fn)
- elif self.preprocess_fn is not None:
- y = self.preprocess_fn(y)
- return {'text': x, 'length': len(x), 'label': y}
-
- def __len__(self):
- return len(self.X)
-
- def write(self, writer_gen=None, path=None, skip_header=False):
- """
- given a generator of metrics for each of the data points X_i,
- write the metrics, text, and labels to a json file
- """
- if path is None:
- path = self.path + '.results'
-
- jsons = []
-
- if writer_gen is not None:
- # if the first item of the generator is a header naming the metrics,
- # use those names as the keys for the metric fields in the output
- def gen_helper():
- keys = {}
- keys[0] = self.label_key
- if not skip_header:
- for idx, k in enumerate(tuple(next(writer_gen))):
- keys[idx + 1] = k
- for i, row in enumerate(writer_gen):
- if i == 0 and skip_header:
- for idx, _ in enumerate(row):
- keys[idx + 1] = 'metric_%d' % (idx,)
- j = {}
- for idx, v in enumerate((self.Y[i],) + tuple(row)):
- k = keys[idx]
- j[k] = v
- yield j
- else:
- def gen_helper():
- for y in self.Y:
- j = {}
- j[self.label_key] = y
- yield j
-
- def out_stream():
- for i, j in enumerate(gen_helper()):
- j[self.text_key] = self.X[i]
- yield j
-
- self.save_json_stream(path, out_stream())
-
- def save_json_stream(self, save_path, json_stream):
- if self.loose_json:
- with open(save_path, 'w') as f:
- for i, j in enumerate(json_stream):
- write_string = ''
- if i != 0:
- write_string = '\n'
- write_string += json.dumps(j)
- f.write(write_string)
- else:
- jsons = [j for j in json_stream]
- json.dump(jsons, open(save_path, 'w'), separators=(',', ':'))
-
- def load_json_stream(self, load_path):
- if not self.loose_json:
- jsons = json.load(open(load_path, 'r'))
- generator = iter(jsons)
- else:
- def gen_helper():
- with open(load_path, 'r') as f:
- for row in f:
- yield json.loads(row)
- generator = gen_helper()
-
- for j in generator:
- if self.label_key not in j:
- j[self.label_key] = -1
- yield j
-
-
-class GPT2Dataset(data.Dataset):
-
- def __init__(self, ds,
- max_seq_len=1024,
- num_samples=None,
- weighted=True,
- sample_across_doc=True,
- random_across_doc_sampling=True,
- bias_for_single_doc=False,
- sentence_start=False, **kwargs):
- self.ds = ds
- self.ds_len = len(self.ds)
- self.num_samples = num_samples
- if num_samples is None:
- self.num_samples = 1000 * self.ds_len
- self.max_seq_len = max_seq_len
- self.tokenizer = self.ds.GetTokenizer()
- self.ds.SetTokenizer(None)
- self.weighted = weighted
- self.sample_across_doc = sample_across_doc
- self.random_across_doc_sampling = random_across_doc_sampling
- self.bias_for_single_doc = bias_for_single_doc
- self.sentence_start = sentence_start
- self.init_weighting()
-
- def init_weighting(self):
- if self.weighted:
- if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy:
- lens = np.array(self.ds.lens)
- else:
- lens = np.array([len(d['text']) if isinstance(d, dict)
- else len(d) for d in self.ds])
- self.total_len = np.sum(lens)
- self.weighting = list(accumulate(lens))
- else:
- self.weighting = None
-
- def get_weighted_samples(self, np_rng):
- if self.weighting is not None:
- idx = np_rng.randint(self.total_len)
- return bisect_right(self.weighting, idx)
- else:
- return np_rng.randint(self.ds_len)
-
- def __len__(self):
- return self.num_samples
-
- def __getitem__(self, idx):
- # init rng
- rng = random.Random(idx)
- rng = np.random.RandomState(seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])
-
- # get possibly weighted random index from dataset
- data_idx = self.get_weighted_samples(rng)
-# data_idx = rng.choice(self.ds_len, p=self.weighting)
- tokens = self.getidx(data_idx)
-
- # truncate or pad tokens
- num_tokens = len(tokens)
- if self.bias_for_single_doc:
- tokens_to_strip = num_tokens - self.max_seq_len - 1
- else:
- tokens_to_strip = num_tokens - 1
- if tokens_to_strip > 0:
- strip_left_tokens = rng.randint(tokens_to_strip + 1)
- tokens = tokens[strip_left_tokens:]
- if self.sentence_start:
- token_copy = list(tokens)
- not_done = True
- while (len(token_copy) > 0) and not_done:
- tok = token_copy.pop(0)
- if self.contains_sentence_end(tok):
- tokens = token_copy
- not_done = False
-                strip_right_tokens = len(tokens) - self.max_seq_len - 1
-                if strip_right_tokens > 0:
-                    tokens = tokens[:-strip_right_tokens]
-
- if self.sample_across_doc:
- while (len(tokens) < (self.max_seq_len + 1)):
- if self.random_across_doc_sampling:
- data_idx = self.get_weighted_samples(rng)
- else:
- data_idx = (data_idx + 1) % self.ds_len
- tokens += self.getidx(data_idx)
- tokens = tokens[:(self.max_seq_len + 1)]
-
- tokens = self.pad_seq(tokens)
- return {'text': np.array(tokens), }
-
- def getidx(self, data_idx):
- data = self.ds[data_idx]
- if isinstance(data, dict):
- data = data['text']
- # tokenize
- tokenization = self.tokenizer.EncodeAsIds(data)
- tokenization.append(self.tokenizer.get_command('eos'))
- tokens = tokenization.tokenization
- return tokens
-
- def pad_seq(self, seq):
- total_tokens = self.max_seq_len + 1
- num_pad_tokens = max(0, total_tokens - len(seq))
- seq += [self.tokenizer.get_command('pad').Id] * (num_pad_tokens)
- return seq
-
- def contains_sentence_end(self, tok):
- tok = self.tokenizer.IdToToken(tok)
- if '.' in tok:
- return True
- if '?' in tok:
- return True
- if '!' in tok:
- return True
- return False
-
-
-class bert_sentencepair_dataset(data.Dataset):
- """
-    Dataset containing sentence pairs for BERT training. Each index corresponds to a randomly generated sentence pair.
- Arguments:
- ds (Dataset or array-like): data corpus to use for training
- max_seq_len (int): maximum sequence length to use for a sentence pair
- mask_lm_prob (float): proportion of tokens to mask for masked LM
- max_preds_per_seq (int): Maximum number of masked tokens per sentence pair. Default: math.ceil(max_seq_len*mask_lm_prob/10)*10
- short_seq_prob (float): Proportion of sentence pairs purposefully shorter than max_seq_len
-        dataset_size (int): number of random sentence pairs in the dataset. Default: len(ds)*(len(ds)-1)
-
- """
-
- def __init__(self, ds, max_seq_len=512, mask_lm_prob=.15, max_preds_per_seq=None,
- short_seq_prob=.01, dataset_size=None, presplit_sentences=False, weighted=True, **kwargs):
- self.ds = ds
- self.ds_len = len(self.ds)
- self.tokenizer = self.ds.GetTokenizer()
- self.vocab_words = list(self.tokenizer.text_token_vocab.values())
- self.ds.SetTokenizer(None)
- self.max_seq_len = max_seq_len
- self.mask_lm_prob = mask_lm_prob
- if max_preds_per_seq is None:
- max_preds_per_seq = math.ceil(max_seq_len * mask_lm_prob / 10) * 10
- self.max_preds_per_seq = max_preds_per_seq
- self.short_seq_prob = short_seq_prob
- self.dataset_size = dataset_size
- if self.dataset_size is None:
- self.dataset_size = self.ds_len * (self.ds_len - 1)
- self.presplit_sentences = presplit_sentences
- if not self.presplit_sentences:
- nltk.download('punkt', download_dir="./nltk")
- self.weighted = weighted
- self.get_weighting()
-
- def get_weighting(self):
- if self.weighted:
- if hasattr(self.ds, 'is_lazy') and self.ds.is_lazy:
- lens = np.array(self.ds.lens)
- else:
- lens = np.array([len(d['text']) if isinstance(d, dict) else len(d)
- for d in self.ds])
- self.total_len = np.sum(lens)
- self.weighting = list(accumulate(lens))
- else:
- self.weighting = None
-
- def get_weighted_samples(self, np_rng):
- if self.weighting is not None:
- idx = np_rng.randint(self.total_len)
- return bisect_right(self.weighting, idx)
- else:
- return np_rng.randint(self.ds_len)
-
- def __len__(self):
- return self.dataset_size
-
- def __getitem__(self, idx):
- # get rng state corresponding to index (allows deterministic random pair)
- rng = random.Random(idx)
- np_rng = np.random.RandomState(seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])
- # get seq length
- target_seq_length = self.max_seq_len
- short_seq = False
- if rng.random() < self.short_seq_prob:
- target_seq_length = rng.randint(2, target_seq_length)
- short_seq = True
-
- # get sentence pair and label
- is_random_next = None
- lena = 0
- lenb = 0
- while (is_random_next is None) or (lena < 1) or (lenb < 1):
- tokensa, tokensb, is_random_next = self.create_random_sentencepair(
- target_seq_length, rng, np_rng)
- lena = len(tokensa[0])
- lenb = len(tokensb[0])
-
- # truncate sentence pair to max_seq_len
- tokensa, tokensb = self.truncate_seq_pair(tokensa, tokensb, self.max_seq_len, rng)
- # join sentence pair, mask, and pad
- tokens, mask, mask_labels, pad_mask = self.create_masked_lm_predictions(
- tokensa, tokensb, self.mask_lm_prob, self.max_preds_per_seq, self.vocab_words, rng)
- sample = {
- 'text': np.array(
- tokens[0]),
- 'types': np.array(
- tokens[1]),
- 'is_random': int(is_random_next),
- 'mask': np.array(mask),
- 'mask_labels': np.array(mask_labels),
- 'pad_mask': np.array(pad_mask)}
- return sample
-
- def sentence_split(self, document):
- """split document into sentences"""
- lines = document.split('\n')
- if self.presplit_sentences:
- return [line for line in lines if line]
- rtn = []
- for line in lines:
- if line != '':
- rtn.extend(tokenize.sent_tokenize(line))
- return rtn
-
- def sentence_tokenize(self, sent, sentence_num=0, beginning=False, ending=False):
- """tokenize sentence and get token types"""
- tokens = self.tokenizer.EncodeAsIds(sent).tokenization
- str_type = 'str' + str(sentence_num)
- token_types = [self.tokenizer.get_type(str_type).Id] * len(tokens)
- return tokens, token_types
-
- def get_doc(self, idx):
- """gets text of document corresponding to idx"""
- rtn = self.ds[idx]
- if isinstance(rtn, dict):
- rtn = rtn['text']
- return rtn
-
- def create_random_sentencepair(self, target_seq_length, rng, np_rng):
- """
-        fetches a random sentence pair corresponding to the rng state, similar to
-        https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L248-L294
- """
- is_random_next = None
-
- curr_strs = []
- curr_str_types = []
- curr_len = 0
-
- while curr_len < 1:
- curr_len = 0
- doc_a = None
- while doc_a is None:
- if self.weighted:
- # doc_a_idx = np_rng.choice(self.ds_len, p=self.weighting)
- doc_a_idx = self.get_weighted_samples(np_rng)
- else:
- doc_a_idx = rng.randint(0, self.ds_len - 1)
- doc_a = self.sentence_split(self.get_doc(doc_a_idx))
- if not doc_a:
- doc_a = None
-
- random_start_a = rng.randint(0, len(doc_a) - 1)
- while random_start_a < len(doc_a):
- sentence = doc_a[random_start_a]
- sentence, sentence_types = self.sentence_tokenize(
- sentence, 0, random_start_a == 0, random_start_a == len(doc_a))
- curr_strs.append(sentence)
- curr_str_types.append(sentence_types)
- curr_len += len(sentence)
- if random_start_a == len(doc_a) - 1 or curr_len >= target_seq_length:
- break
- random_start_a = (random_start_a + 1)
-
- if curr_strs:
- num_a = 1
- if len(curr_strs) >= 2:
- num_a = rng.randint(0, len(curr_strs))
-
- tokens_a = []
- token_types_a = []
- for j in range(num_a):
- tokens_a.extend(curr_strs[j])
- token_types_a.extend(curr_str_types[j])
-
- tokens_b = []
- token_types_b = []
- is_random_next = False
- if len(curr_strs) == 1 or rng.random() < 0.5:
- is_random_next = True
- target_b_length = target_seq_length - len(tokens_a)
- b_len = 0
- while b_len < 1:
- doc_b = None
- while doc_b is None:
- doc_b_idx = rng.randint(0, self.ds_len - 2)
- doc_b_idx += int(doc_b_idx >= doc_a_idx)
-
- doc_b = self.sentence_split(self.get_doc(doc_b_idx))
- if not doc_b:
- doc_b = None
-
- random_start_b = rng.randint(0, len(doc_b) - 1)
- while random_start_b < len(doc_b):
- sentence_b = doc_b[random_start_b]
- new_b_tokens, new_b_types = self.sentence_tokenize(
- sentence_b, 1, random_start_b == 0, random_start_b == len(doc_b))
- b_len += len(new_b_tokens)
- tokens_b.extend(new_b_tokens)
- token_types_b.extend(new_b_types)
- if len(tokens_b) >= target_b_length:
- break
- random_start_b = (random_start_b + 1)
- else:
- is_random_next = False
- for j in range(num_a, len(curr_strs)):
- tokens_b.extend(curr_strs[j])
- token_types_b.extend(curr_str_types[j])
-
- return (tokens_a, token_types_a), (tokens_b, token_types_b), is_random_next
-
- def truncate_seq_pair(self, a, b, max_seq_len, rng):
- """
- Truncate sequence pair according to original BERT implementation:
- https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391
- """
- tokens_a, token_types_a = a
- tokens_b, token_types_b = b
- max_num_tokens = self.calc_seq_len(max_seq_len)
- # max_num_tokens = max_seq_len - 3
- while True:
- len_a = len(tokens_a)
- len_b = len(tokens_b)
- total_length = len_a + len_b
- if total_length <= max_num_tokens:
- break
- if len(tokens_a) > len(tokens_b):
- trunc_tokens = tokens_a
- trunc_types = token_types_a
- else:
- trunc_tokens = tokens_b
- trunc_types = token_types_b
-
- assert len(trunc_tokens) >= 1
-
- if rng.random() < 0.5:
- trunc_tokens.pop(0)
- trunc_types.pop(0)
- else:
- trunc_tokens.pop()
- trunc_types.pop()
- return (tokens_a, token_types_a), (tokens_b, token_types_b)
-
- def calc_seq_len(self, max_seq_len):
- return max_seq_len - 3
-
- def mask_token(self, idx, tokens, types, vocab_words, rng):
- """
- helper function to mask `idx` token from `tokens` according to
- section 3.3.1 of https://arxiv.org/pdf/1810.04805.pdf
- """
- label = tokens[idx]
- if rng.random() < 0.8:
- new_label = self.tokenizer.get_command('MASK').Id
- else:
- if rng.random() < 0.5:
- new_label = label
- else:
- new_label = rng.choice(vocab_words)
-
- tokens[idx] = new_label
-
- return label
-
- def pad_seq(self, seq):
- """helper function to pad sequence pair"""
- num_pad = max(0, self.max_seq_len - len(seq))
- pad_mask = [0] * len(seq) + [1] * num_pad
- seq += [self.tokenizer.get_command('pad').Id] * num_pad
- return seq, pad_mask
-
- def concat_tokens(self, tokens_a, token_types_a, tokens_b, token_types_b):
- tokens = [self.tokenizer.get_command('ENC').Id] + tokens_a + [self.tokenizer.get_command(
- 'sep').Id] + tokens_b + [self.tokenizer.get_command('sep').Id]
- token_types = [token_types_a[0]] + token_types_a + \
- [token_types_a[0]] + token_types_b + [token_types_b[0]]
- return tokens, token_types
-
- def create_masked_lm_predictions(self, a, b, mask_lm_prob, max_preds_per_seq, vocab_words, rng):
- """
- Mask sequence pair for BERT training according to:
- https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L338
- """
- tokens_a, token_types_a = a
- tokens_b, token_types_b = b
- tokens, token_types = self.concat_tokens(tokens_a, token_types_a, tokens_b, token_types_b)
-
- len_a = len(tokens_a)
- len_b = len(tokens_b)
-
- cand_indices = [idx + 1 for idx in range(len_a)] + [idx + 2 + len_a for idx in range(len_b)]
-
- rng.shuffle(cand_indices)
-
- output_tokens, pad_mask = self.pad_seq(list(tokens))
- output_types, _ = self.pad_seq(list(token_types))
-
- num_to_predict = min(max_preds_per_seq, max(1, int(round(len(tokens) * mask_lm_prob))))
-
- mask = [0] * len(output_tokens)
- mask_labels = [-1] * len(output_tokens)
-
- for idx in sorted(cand_indices[:num_to_predict]):
- mask[idx] = 1
- label = self.mask_token(idx, output_tokens, output_types, vocab_words, rng)
- mask_labels[idx] = label
-
- return (output_tokens, output_types), mask, mask_labels, pad_mask
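For context on what is being removed here, the following is a minimal, illustrative sketch of how the deleted dataset classes composed together for GPT-2 style pretraining. The import path mirrors the deleted module, but the file name, `json_path`, and `tokenizer` are placeholders, not values from this repository; any tokenizer exposing `EncodeAsIds()` and `get_command()` as used above would fit.

```python
# Illustrative sketch only -- not part of the deleted file.
from megatron.deprecated_data_utils.datasets import json_dataset, GPT2Dataset


def build_gpt2_dataset(json_path, tokenizer):
    """Placeholder wiring: compose the deleted classes into a training dataset."""
    # json_dataset reads one record per line when loose_json=True and keeps
    # raw strings in .X and labels in .Y.
    raw = json_dataset(json_path, tokenizer=tokenizer,
                       text_key='sentence', label_key='label', loose_json=True)
    # GPT2Dataset takes over the tokenizer (it calls SetTokenizer(None) on the
    # wrapped dataset) and emits windows of max_seq_len + 1 token ids,
    # padded with the tokenizer's 'pad' command id.
    return GPT2Dataset(raw, max_seq_len=1024, num_samples=100_000)
```

Each item of the resulting dataset is a dict of the form `{'text': np.ndarray}` with `max_seq_len + 1` token ids, as produced by `GPT2Dataset.__getitem__` above.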
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/file_utils.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/file_utils.py
deleted file mode 100644
index 4dc7fdcc41..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/file_utils.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# This file is provided as is from:
-# https://github.com/huggingface/pytorch-pretrained-BERT
-# Please refer to their repository for copyright.
-
-"""
-Utilities for working with the local dataset cache.
-This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
-Copyright by the AllenNLP authors.
-"""
-from __future__ import (absolute_import, division, print_function, unicode_literals)
-
-import json
-import logging
-import os
-import shutil
-import tempfile
-from functools import wraps
-from hashlib import sha256
-import sys
-from io import open
-
-import boto3
-import requests
-from botocore.exceptions import ClientError
-from tqdm import tqdm
-
-try:
- from urllib.parse import urlparse
-except ImportError:
- from urlparse import urlparse
-
-try:
- from pathlib import Path
- PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
- Path.home() / '.pytorch_pretrained_bert'))
-except (AttributeError, ImportError):
- PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
- os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
-
-logger = logging.getLogger(__name__) # pylint: disable=invalid-name
-
-
-def url_to_filename(url, etag=None):
- """
- Convert `url` into a hashed filename in a repeatable way.
- If `etag` is specified, append its hash to the url's, delimited
- by a period.
- """
- url_bytes = url.encode('utf-8')
- url_hash = sha256(url_bytes)
- filename = url_hash.hexdigest()
-
- if etag:
- etag_bytes = etag.encode('utf-8')
- etag_hash = sha256(etag_bytes)
- filename += '.' + etag_hash.hexdigest()
-
- return filename
-
-
-def filename_to_url(filename, cache_dir=None):
- """
- Return the url and etag (which may be ``None``) stored for `filename`.
- Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
- """
- if cache_dir is None:
- cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
- if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
- cache_dir = str(cache_dir)
-
- cache_path = os.path.join(cache_dir, filename)
- if not os.path.exists(cache_path):
- raise EnvironmentError("file {} not found".format(cache_path))
-
- meta_path = cache_path + '.json'
- if not os.path.exists(meta_path):
- raise EnvironmentError("file {} not found".format(meta_path))
-
- with open(meta_path, encoding="utf-8") as meta_file:
- metadata = json.load(meta_file)
- url = metadata['url']
- etag = metadata['etag']
-
- return url, etag
-
-
-def cached_path(url_or_filename, cache_dir=None):
- """
- Given something that might be a URL (or might be a local path),
- determine which. If it's a URL, download the file and cache it, and
- return the path to the cached file. If it's already a local path,
- make sure the file exists and then return the path.
- """
- if cache_dir is None:
- cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
- if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
- url_or_filename = str(url_or_filename)
- if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
- cache_dir = str(cache_dir)
-
- parsed = urlparse(url_or_filename)
-
- if parsed.scheme in ('http', 'https', 's3'):
- # URL, so get it from the cache (downloading if necessary)
- return get_from_cache(url_or_filename, cache_dir)
- elif os.path.exists(url_or_filename):
- # File, and it exists.
- return url_or_filename
- elif parsed.scheme == '':
- # File, but it doesn't exist.
- raise EnvironmentError("file {} not found".format(url_or_filename))
- else:
- # Something unknown
- raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
-
-
-def split_s3_path(url):
- """Split a full s3 path into the bucket name and path."""
- parsed = urlparse(url)
- if not parsed.netloc or not parsed.path:
- raise ValueError("bad s3 path {}".format(url))
- bucket_name = parsed.netloc
- s3_path = parsed.path
- # Remove '/' at beginning of path.
- if s3_path.startswith("/"):
- s3_path = s3_path[1:]
- return bucket_name, s3_path
-
-
-def s3_request(func):
- """
- Wrapper function for s3 requests in order to create more helpful error
- messages.
- """
-
- @wraps(func)
- def wrapper(url, *args, **kwargs):
- try:
- return func(url, *args, **kwargs)
- except ClientError as exc:
- if int(exc.response["Error"]["Code"]) == 404:
- raise EnvironmentError("file {} not found".format(url))
- else:
- raise
-
- return wrapper
-
-
-@s3_request
-def s3_etag(url):
- """Check ETag on S3 object."""
- s3_resource = boto3.resource("s3")
- bucket_name, s3_path = split_s3_path(url)
- s3_object = s3_resource.Object(bucket_name, s3_path)
- return s3_object.e_tag
-
-
-@s3_request
-def s3_get(url, temp_file):
- """Pull a file directly from S3."""
- s3_resource = boto3.resource("s3")
- bucket_name, s3_path = split_s3_path(url)
- s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
-
-
-def http_get(url, temp_file):
- req = requests.get(url, stream=True)
- content_length = req.headers.get('Content-Length')
- total = int(content_length) if content_length is not None else None
- progress = tqdm(unit="B", total=total)
- for chunk in req.iter_content(chunk_size=1024):
- if chunk: # filter out keep-alive new chunks
- progress.update(len(chunk))
- temp_file.write(chunk)
- progress.close()
-
-
-def get_from_cache(url, cache_dir=None):
- """
- Given a URL, look for the corresponding dataset in the local cache.
- If it's not there, download it. Then return the path to the cached file.
- """
- if cache_dir is None:
- cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
- if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
- cache_dir = str(cache_dir)
-
- if not os.path.exists(cache_dir):
- os.makedirs(cache_dir)
-
- # Get eTag to add to filename, if it exists.
- if url.startswith("s3://"):
- etag = s3_etag(url)
- else:
- response = requests.head(url, allow_redirects=True)
- if response.status_code != 200:
- raise IOError("HEAD request failed for url {} with status code {}"
- .format(url, response.status_code))
- etag = response.headers.get("ETag")
-
- filename = url_to_filename(url, etag)
-
- # get cache path to put the file
- cache_path = os.path.join(cache_dir, filename)
-
- if not os.path.exists(cache_path):
- # Download to temporary file, then copy to cache dir once finished.
- # Otherwise you get corrupt cache entries if the download gets interrupted.
- with tempfile.NamedTemporaryFile() as temp_file:
- logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
-
- # GET file object
- if url.startswith("s3://"):
- s3_get(url, temp_file)
- else:
- http_get(url, temp_file)
-
- # we are copying the file before closing it, so flush to avoid truncation
- temp_file.flush()
- # shutil.copyfileobj() starts at the current position, so go to the start
- temp_file.seek(0)
-
- logger.info("copying %s to cache at %s", temp_file.name, cache_path)
- with open(cache_path, 'wb') as cache_file:
- shutil.copyfileobj(temp_file, cache_file)
-
- logger.info("creating metadata file for %s", cache_path)
- meta = {'url': url, 'etag': etag}
- meta_path = cache_path + '.json'
- with open(meta_path, 'w', encoding="utf-8") as meta_file:
- json.dump(meta, meta_file)
-
- logger.info("removing temp file %s", temp_file.name)
-
- return cache_path
-
-
-def read_set_from_file(filename):
- '''
- Extract a de-duped collection (set) of text from a file.
- Expected file format is one item per line.
- '''
- collection = set()
- with open(filename, 'r', encoding='utf-8') as file_:
- for line in file_:
- collection.add(line.rstrip())
- return collection
-
-
-def get_file_extension(path, dot=True, lower=True):
- ext = os.path.splitext(path)[1]
- ext = ext if dot else ext[1:]
- return ext.lower() if lower else ext
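For reference, the deleted caching helpers were typically used as in the short sketch below. The URL and local path are placeholders; the behaviour described in the comments follows the `cached_path`/`get_from_cache` code above.

```python
# Illustrative sketch only -- not part of the deleted file.
from megatron.deprecated_data_utils.file_utils import cached_path

# Remote http(s)/s3 files are downloaded once into PYTORCH_PRETRAINED_BERT_CACHE
# and the local cache path is returned; existing local paths come back unchanged.
vocab_path = cached_path("https://example.com/bert-vocab.txt")
local_path = cached_path("/tmp/already_local.txt")
```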
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/lazy_loader.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/lazy_loader.py
deleted file mode 100644
index 506f5294d5..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/lazy_loader.py
+++ /dev/null
@@ -1,202 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""utils for loading text from disk"""
-import os
-import mmap
-import pickle as pkl
-import time
-from itertools import accumulate
-
-import torch
-from torch.multiprocessing import Lock
-
-
-def get_lazy_path(path):
- """
- Gets directory path where lazy files are stored.
- """
- return os.path.splitext(path)[0] + '.lazy'
-
-
-def exists_lazy(path, data_type='data'):
- """
- Check if we've already made a lazy version of this file for the `data_type` field.
- """
- if not os.path.exists(get_lazy_path(path)):
- return False
- contents = os.listdir(get_lazy_path(path))
- if data_type not in contents:
- return False
- if data_type + '.len.pkl' not in contents:
- return False
- return True
-
-
-def make_lazy(path, strs, data_type='data'):
- """
-    Make a lazy version of the `data_type` field of the file. The byte length
-    of each entry is stored in a `.len.pkl` data file.
- """
- lazypath = get_lazy_path(path)
- if not os.path.exists(lazypath):
- os.makedirs(lazypath)
- datapath = os.path.join(lazypath, data_type)
- lenpath = os.path.join(lazypath, data_type + '.len.pkl')
- if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
- with open(datapath, 'wb') as f:
- str_lens = []
- str_cnt = 0
- for s in strs:
- if isinstance(s, dict):
- s = s['text']
- encoded = s.encode('utf-8')
- f.write(encoded)
- str_cnt = len(encoded)
- str_lens.append(str_cnt)
- pkl.dump(str_lens, open(lenpath, 'wb'))
- else:
- while not os.path.exists(lenpath):
- time.sleep(1)
-
-
-def split_strings(strings, start, chr_lens):
- """
- Split strings based on string lengths and given start.
- """
- return [strings[i - start:j - start] for i, j in zip([start] + chr_lens[:-1], chr_lens)]
-
-
-class ProcessorTokenizer:
- """
-    callable class that runs a preprocessing step, as well as a tokenization
-    step, on input text.
- """
-
- def __init__(self, tokenizer, process_fn=None):
- self.tokenizer = tokenizer
- self.process_fn = process_fn
-
- def __call__(self, string):
- if self.tokenizer is not None:
- string = self.tokenizer(string, process_fn=self.process_fn)
- elif self.process_fn is not None:
- string = self.process_fn(string)
- return string
-
-
-class lazy_array_loader(object):
- """
- Arguments:
-        path: path to the directory where array entries are concatenated into one big string file
-              and where the .len.pkl file is located
-        data_type (str): Some datasets have multiple fields that are stored in different paths.
-                         `data_type` specifies which of these fields to load in this class
- mem_map (boolean): Specifies whether to memory map file `path`
- map_fn (callable): Fetched strings are passed through map_fn before being returned.
-
- Example of lazy loader directory structure:
- file.json
- file.lazy/
- data_type1
- data_type1.len.pkl
- data_type2
- data_type2.len.pkl
- """
-
- def __init__(self, path, data_type='data', mem_map=False, map_fn=None):
- lazypath = get_lazy_path(path)
- datapath = os.path.join(lazypath, data_type)
- # get file where array entries are concatenated into one big string
- self._file = open(datapath, 'rb', buffering=0)
- self.file = self._file
- # memory map file if necessary
- self.mem_map = mem_map
- if self.mem_map:
- self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ)
- lenpath = os.path.join(lazypath, data_type + '.len.pkl')
- self.lens = pkl.load(open(lenpath, 'rb'))
- self.ends = list(accumulate(self.lens))
- self.dumb_ends = list(self.ends)
- self.read_lock = Lock()
- self.process_fn = map_fn
- self.map_fn = map_fn
- self._tokenizer = None
-
- def SetTokenizer(self, tokenizer):
- """
- logic to set and remove (set to None) tokenizer.
- combines preprocessing/tokenization into one callable.
- """
- if tokenizer is None:
- if not hasattr(self, '_tokenizer'):
- self._tokenizer = tokenizer
- else:
- self._tokenizer = tokenizer
- self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn)
-
- def GetTokenizer(self):
- return self._tokenizer
-
- def __getitem__(self, index):
- """
- read file and splice strings based on string ending array `self.ends`
- """
- if not isinstance(index, slice):
- if index == 0:
- start = 0
- else:
- start = self.ends[index - 1]
- end = self.ends[index]
- rtn = self.file_read(start, end)
- if self.map_fn is not None:
- return self.map_fn(rtn)
- else:
- # if slice, fetch strings with 1 diskread and then splice in memory
- chr_lens = self.ends[index]
- if index.start == 0 or index.start is None:
- start = 0
- else:
- start = self.ends[index.start - 1]
- stop = chr_lens[-1]
- strings = self.file_read(start, stop)
- rtn = split_strings(strings, start, chr_lens)
- if self.map_fn is not None:
- return self.map_fn([s for s in rtn])
- return rtn
-
- def __len__(self):
- return len(self.ends)
-
- def file_read(self, start=0, end=None):
- """read specified portion of file"""
-
- # atomic reads to avoid race conditions with multiprocess dataloader
- self.read_lock.acquire()
- # seek to start of file read
- self.file.seek(start)
- # read to end of file if no end point provided
- if end is None:
- rtn = self.file.read()
- # else read amount needed to reach end point
- else:
- rtn = self.file.read(end - start)
- self.read_lock.release()
- # TODO: @raulp figure out mem map byte string bug
- # if mem map'd need to decode byte string to string
- rtn = rtn.decode('utf-8', 'ignore')
- # rtn = str(rtn)
- if self.mem_map:
- rtn = rtn.decode('unicode_escape')
- return rtn
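For reference, the lazy-loading round trip implemented above looks roughly like the sketch below. `'corpus.json'` is a placeholder path; per `make_lazy` above, the helper writes `corpus.lazy/data` (concatenated utf-8 text) and `corpus.lazy/data.len.pkl`, which `lazy_array_loader` then indexes via accumulated byte lengths.

```python
# Illustrative sketch only -- not part of the deleted file.
from megatron.deprecated_data_utils.lazy_loader import (
    exists_lazy, make_lazy, lazy_array_loader)

docs = ["first document", "second document"]
if not exists_lazy('corpus.json', data_type='data'):
    make_lazy('corpus.json', docs, data_type='data')

lazy = lazy_array_loader('corpus.json', data_type='data', mem_map=False)
print(len(lazy), lazy[1])   # -> 2 second document
```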
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/samplers.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/samplers.py
deleted file mode 100644
index baa6b9d088..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/samplers.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""batch samplers that work with either random or sequential data samplers"""
-import math
-import os
-import sys
-
-import torch
-from torch.utils import data
-import numpy as np
-
-
-class RandomSampler(data.sampler.Sampler):
- r"""
-    Based on PyTorch's RandomSampler and DistributedSampler. Essentially a RandomSampler,
-    but this class lets the user set an epoch like DistributedSampler does.
-    Samples elements randomly. If without replacement, samples are drawn from a shuffled dataset.
-    If with replacement, the user can specify ``num_samples`` to draw.
- Arguments:
- data_source (Dataset): dataset to sample from
- num_samples (int): number of samples to draw, default=len(dataset)
- replacement (bool): samples are drawn with replacement if ``True``, default=False
- """
-
- def __init__(self, data_source, replacement=False, num_samples=None):
- self.data_source = data_source
- self.replacement = replacement
- self._num_samples = num_samples
- self.epoch = -1
-
- if self._num_samples is not None and replacement is False:
- raise ValueError("With replacement=False, num_samples should not be specified, "
- "since a random permute will be performed.")
-
- if not isinstance(self.num_samples, int) or self.num_samples <= 0:
- raise ValueError("num_samples should be a positive integer "
- "value, but got num_samples={}".format(self.num_samples))
- if not isinstance(self.replacement, bool):
- raise ValueError("replacement should be a boolean value, but got "
- "replacement={}".format(self.replacement))
-
- @property
- def num_samples(self):
- # dataset size might change at runtime
- if self._num_samples is None:
- return len(self.data_source)
- return self._num_samples
-
- def __iter__(self):
- n = len(self.data_source)
- g = torch.Generator()
- if self.epoch >= 0:
- g.manual_seed(self.epoch)
- if self.replacement:
- return iter(torch.randint(high=n, size=(self.num_samples,),
- dtype=torch.int64, generator=g).tolist())
- return iter(torch.randperm(n, generator=g).tolist())
-
- def __len__(self):
- return self.num_samples
-
- def set_epoch(self, epoch):
- self.epoch = epoch
-
-
-class DistributedBatchSampler(data.sampler.BatchSampler):
- """
-    Similar to the normal implementation of a distributed sampler, except that it operates at the
-    batch sampler level instead of just the sampler level. This allows wrapping of arbitrary
-    data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
- """
-
- def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False):
- super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last)
- if rank == -1:
- assert False, 'should not be here'
- rank = torch.distributed.get_rank()
- self.rank = rank
- self.world_size = world_size
- self.sampler.wrap_around = 0
- self.wrap_around = 0
- self.wrap_last = wrap_last
- self.start_iter = 0
-
- def __iter__(self):
- batch = []
- last_batch = None
- i = 0
- for idx in self.data_iterator(self.sampler, wrap_around=False):
- batch.append(idx)
- if len(batch) == self.batch_size:
- tbatch = self._batch(batch)
- if i >= self.start_iter:
- yield tbatch
- self.start_iter = 0
- i += 1
- last_batch = np.array(list(tbatch))
- batch = []
- batch_len = len(batch)
- if batch_len > 0 and not self.drop_last:
- if self.wrap_last:
- self.sampler.wrap_around -= (self.batch_size)
- self.wrap_around += (len(batch))
- self.wrap_around %= self.batch_size
- if isinstance(self.sampler, TransposedSampler):
- for i, idx in enumerate(self.data_iterator(self.sampler, wrap_around=True)):
- if i == 0:
- continue
- batch.append(idx)
- new_batch_len = len(batch)
- if len(batch) == self.batch_size:
- break
- yield self._batch(batch)
- if self.wrap_last:
- self.sampler.wrap_around += self.batch_size
-
- def data_iterator(self, _iter, wrap_around=False):
- """iterates through data and handles wrap around"""
- for i, idx in enumerate(_iter):
- if i < self.wrap_around % self.batch_size:
- continue
- if wrap_around:
- self.wrap_around += 1
- self.wrap_around %= self.batch_size
- yield idx
-
- def _batch(self, batch):
- """extracts samples only pertaining to this worker's batch"""
- start = self.rank * self.batch_size // self.world_size
- end = (self.rank + 1) * self.batch_size // self.world_size
- return batch[start:end]
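For reference, these deleted samplers were typically wrapped into a DataLoader as sketched below. `train_ds`, `rank`, and `world_size` are placeholders; in the deleted training code the latter two came from torch.distributed.

```python
# Illustrative sketch only -- not part of the deleted file.
from torch.utils import data
from megatron.deprecated_data_utils.samplers import (
    RandomSampler, DistributedBatchSampler)


def build_loader(train_ds, rank, world_size, batch_size=32, epoch=0):
    sampler = RandomSampler(train_ds, replacement=False)
    sampler.set_epoch(epoch)          # deterministic shuffling per epoch
    batch_sampler = DistributedBatchSampler(sampler, batch_size, drop_last=True,
                                            rank=rank, world_size=world_size)
    return data.DataLoader(train_ds, batch_sampler=batch_sampler)
```

Per `_batch` above, each rank keeps only its `batch_size // world_size` slice of every global batch, so the effective per-rank batch in this sketch would be `32 // world_size` samples.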
diff --git a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/scripts/presplit_sentences_json.py b/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/scripts/presplit_sentences_json.py
deleted file mode 100644
index f150f2f694..0000000000
--- a/megatron/Megatron-LM-v1.1.5-3D_parallelism/megatron/deprecated_data_utils/scripts/presplit_sentences_json.py
+++ /dev/null
@@ -1,27 +0,0 @@
-"""
-Usage:
-python scripts/presplit_sentences_json.py