
Commit 4c363ca

Merge branch 'main' into use_logger
Signed-off-by: Mamta Singh <[email protected]>
2 parents: 3d8a53e + db38927

30 files changed: +414 additions, -156 deletions

QEfficient/base/modeling_qeff.py

Lines changed: 31 additions & 34 deletions

@@ -7,7 +7,6 @@
 import hashlib
 import inspect
-import json
 import logging
 import shutil
 import subprocess
@@ -23,7 +22,7 @@
 from QEfficient.base.pytorch_transforms import PytorchTransform
 from QEfficient.compile.qnn_compiler import compile as qnn_compile
 from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.utils import constants, dump_qconfig
+from QEfficient.utils import constants, create_json, dump_qconfig, generate_mdp_partition_config, load_json
 from QEfficient.utils.cache import QEFF_HOME, to_hashable

 logger = logging.getLogger(__name__)
@@ -269,17 +268,17 @@ def _compile(
                 specializations=specializations,
                 custom_io=custom_io,
                 device_group=list(range(mdp_ts_num_devices)),
-                num_cores=compiler_options.get("aic_num_cores", 16),
-                mxfp6=compiler_options.get("mxfp6_matmul", False),
+                num_cores=compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES),
+                mxfp6=compiler_options.get("mxfp6_matmul", constants.DEFAULT_AIC_MXPF6_MATMUL),
                 mxint8=mxint8_kv_cache,
                 qnn_config=qnn_config,
             )

             return self.qpc_path

         command = constants.COMPILER + [f"-m={onnx_path}"]
-        if mdp_ts_json_path := compiler_options.pop("mdp_ts_json_path", None):
-            mdp_ts_num_devices = None
+
+        if mdp_ts_json_path := compiler_options.pop("mdp_load_partition_config", None):
             command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")

         for key, value in compiler_options.items():
@@ -289,6 +288,17 @@ def _compile(
                 command.append(option)
                 continue
             command.append(f"{option}={value}")
+
+        # Create a dummy mdp_ts_json if mdp-load-partition-config not provided and num_devices > 1
+        if mdp_ts_json_path is not None:
+            mdp_ts_json = load_json(str(mdp_ts_json_path))
+        elif mdp_ts_num_devices > 1:
+            mdp_ts_json = generate_mdp_partition_config(
+                mdp_ts_num_devices, compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES)
+            )
+        else:
+            mdp_ts_json = None
+
         compile_hash = hashlib.sha256(to_hashable(command))

         if specializations is not None:
@@ -299,30 +309,37 @@ def _compile(

         if num_speculative_tokens:
             compile_hash.update(to_hashable({"num_speculative_tokens": num_speculative_tokens}))
-        # Hash num_devices too, since default value would always be 1.
-        compile_hash.update(to_hashable(mdp_ts_num_devices))
+
+        # Hash the MDP partition config and the number of devices.
+        compile_hash.update(to_hashable(mdp_ts_json))
+        compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices}))

         # Check if already compiled
         compile_hash = compile_hash.hexdigest()[:16]
         compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
         qpc_path = compile_dir / "qpc"
         qpc_path.mkdir(parents=True, exist_ok=True)
+
         if qpc_path.is_dir():
             if (qpc_path / "programqpc.bin").is_file():
                 self.qpc_path = qpc_path
                 return qpc_path
             # Probably compilation failure last time, delete directory to start over
             shutil.rmtree(qpc_path)

+        # write the MDP partition config file if not provided
+        if mdp_ts_json is not None:
+            mdp_ts_json_path = compile_dir / f"mdp_ts_{mdp_ts_num_devices}.json"
+            create_json(str(mdp_ts_json_path), mdp_ts_json)
+            command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
+
         # Write specializations.json file
         if specializations is not None:
             specializations_json = compile_dir / "specializations.json"
-            with open(specializations_json, "w") as fp:
-                json.dump(
-                    {"specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]},
-                    fp,
-                    indent=4,
-                )
+            specializations_data = {
+                "specializations": [{k: str(v) for k, v in spec.items()} for spec in specializations]
+            }
+            create_json(str(specializations_json), specializations_data)
             command.append(f"-network-specialization-config={specializations_json}")

         # Write custom_io.yaml file
@@ -333,26 +350,6 @@ def _compile(
             fp.write(f" - IOName: {io_name}\n   Precision: {dtype}\n\n")
         command.append(f"-custom-IO-list-file={custom_io_yaml}")

-        # Write mdp_config.json file
-        if not mdp_ts_json_path and mdp_ts_num_devices > 1:
-            num_cores = compiler_options.get("aic_num_cores", 16)
-            mdp_ts_json = compile_dir / f"mdp_ts_{mdp_ts_num_devices}.json"
-            with open(mdp_ts_json, "w") as fp:
-                json.dump(
-                    {
-                        "connections": [{"devices": list(range(mdp_ts_num_devices)), "type": "p2p"}],
-                        "partitions": [
-                            {
-                                "name": "Partition0",
-                                "devices": [{"deviceId": d, "numCores": num_cores} for d in range(mdp_ts_num_devices)],
-                            }
-                        ],
-                    },
-                    fp,
-                    indent=4,
-                )
-            command.append(f"-mdp-load-partition-config={mdp_ts_json}")
-
         command.append(f"-aic-binary-dir={qpc_path}")
         logger.info(f"Running compiler: {' '.join(command)}")
         try:
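Net effect of the modeling_qeff.py changes: the MDP (multi-device partitioning) config is now built as an in-memory dict before hashing, so a change in partition layout correctly invalidates the compile cache, and the same dict is later written out with create_json and passed to the compiler via -mdp-load-partition-config. Judging from the inline writer removed at the end of this file, generate_mdp_partition_config presumably returns a structure like the following (a sketch for 4 devices and 16 cores; the helper's actual output may differ):

# Hypothetical reconstruction of generate_mdp_partition_config(4, 16), based on
# the JSON the removed inline block used to write; not the helper's actual code.
mdp_ts_num_devices, num_cores = 4, 16
mdp_ts_json = {
    "connections": [{"devices": list(range(mdp_ts_num_devices)), "type": "p2p"}],
    "partitions": [
        {
            "name": "Partition0",
            "devices": [{"deviceId": d, "numCores": num_cores} for d in range(mdp_ts_num_devices)],
        }
    ],
}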

QEfficient/exporter/export_hf_to_cloud_ai_100.py

Lines changed: 0 additions & 1 deletion

@@ -129,7 +129,6 @@ def export_bertstyle_model_to_onnx(model_name, model, tokenizer, onnx_dir_path,
     )

     # Generate inputFiles
-    # todo(ochougul):rename to bert_style_input_list.txt
     input_list_file = os.path.join(onnx_dir_path, "input_list.txt")
     generate_input_files(
         input_files_path=os.path.join(onnx_dir_path, "inputFiles"),

QEfficient/exporter/export_utils.py

Lines changed: 0 additions & 4 deletions

@@ -218,8 +218,6 @@ def fix_onnx_fp16(
     :str: Updated base name of exported ONNX model.
     """
     model = onnx.load(os.path.join(gen_models_path, f"{model_base_name}.onnx"))
-    # TODO: Remove this `fix_onnx_fp16` function and replace with this transform
-    # as we're not utilizing the validations done in this function
     model, fp16_fix = FP16ClipTransform.apply(model, onnx_base_dir=gen_models_path)

     if fp16_fix:
@@ -256,8 +254,6 @@ def fix_onnx_fp16(
     if ort_outputs is not None:
         for oname, orto, ortof in zip(output_names, ort_outputs, ort_outputs_fixed):
             fix_diff = np.abs(orto.astype(np.float32) - ortof.astype(np.float32)).max()
-            # TODO: need to the debug this
-            # info(oname, fix_diff)
             close_outputs.append(fix_diff < 1e-5)
     else:
         info("No constants out of FP16 range")
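For intuition on the retained check: it flags any output that moved after out-of-range constants were clipped into FP16 range. A minimal self-contained sketch of the same comparison, with invented values:

import numpy as np

# "orto" stands in for the original ONNX Runtime output, "ortof" for the output
# after FP16 clipping; 65504 is the largest finite FP16 value.
orto = np.array([1.0, 2.0, 70000.0], dtype=np.float32)
ortof = np.array([1.0, 2.0, 65504.0], dtype=np.float32)

fix_diff = np.abs(orto.astype(np.float32) - ortof.astype(np.float32)).max()
print(fix_diff < 1e-5)  # False: clipping changed this output beyond tolerance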

QEfficient/finetune/data/sampler.py

Lines changed: 10 additions & 6 deletions

@@ -4,11 +4,9 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
-
 import random
 from itertools import islice

-import numpy as np
 import torch


@@ -22,14 +20,14 @@ def __init__(self, data_source, batch_size: int, drop_last: bool, shuffle: bool
         self.batch_size = batch_size
         self.drop_last = drop_last
         self.shuffle = shuffle
+        self.data_source = data_source

     def __iter__(self):
-        ids = np.argsort(self.lengths, kind="mergesort")
+        ids = list(range(len(self.data_source)))
         if self.drop_last:
             ids = ids[: len(ids) // self.batch_size * self.batch_size]

         batches = [ids[i : i + self.batch_size] for i in range(0, len(ids), self.batch_size)]
-
         if self.shuffle:
             random.shuffle(batches)

@@ -45,11 +43,17 @@ def __len__(self):

 class DistributedLengthBasedBatchSampler(torch.utils.data.BatchSampler):
     def __init__(
-        self, data_source, batch_size: int, num_replicas: int, rank: int, shuffle: bool = True, seed: int = 0
+        self,
+        data_source,
+        batch_size: int,
+        num_replicas: int,
+        rank: int,
+        shuffle: bool = True,
+        seed: int = 0,
     ) -> None:
         random.seed(seed)
         self.batch_sampler = LengthBasedBatchSampler(
-            data_source, batch_size=batch_size, drop_last=True, shuffle=shuffle
+            data_source, batch_size=batch_size, drop_last=False, shuffle=shuffle
         )
         self.num_replicas = num_replicas
         self.rank = rank
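Two behavioral changes here: batches are now formed in dataset order (list(range(...))) rather than by sorted length (the removed np.argsort on self.lengths), and the inner sampler no longer drops the last partial batch, consistent with the dataset padding added in dataset_utils.py below. The sharding step of the distributed sampler is not part of this hunk; given the islice import kept at the top, a plausible scheme is each rank taking every num_replicas-th batch (a sketch under that assumption, not the file's actual __iter__):

from itertools import islice

batches = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]  # output of LengthBasedBatchSampler
num_replicas, rank = 2, 1

# Rank r consumes batches r, r + num_replicas, r + 2 * num_replicas, ...
for batch in islice(batches, rank, None, num_replicas):
    print(batch)  # rank 1 sees [2, 3] and [6, 7]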

QEfficient/finetune/dataset/samsum_dataset.py

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@


 def get_preprocessed_samsum(dataset_config, tokenizer, split, context_length=None):
-    dataset = datasets.load_dataset("Samsung/samsum", split=split, trust_remote_code=True)
+    dataset = datasets.load_dataset("knkarthick/samsum", split=split, trust_remote_code=True)

     prompt = "Summarize this dialog:\n{dialog}\n---\nSummary:\n"
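The switch to knkarthick/samsum presumably avoids the loading-script dependency of the original Samsung/samsum repo; the prompt template itself is unchanged. For reference, a sketch of how one record is rendered (column names assumed from the SamSum schema, record invented):

prompt = "Summarize this dialog:\n{dialog}\n---\nSummary:\n"
example = {"dialogue": "Amanda: lunch at 12?\nJerry: sure!", "summary": "Amanda and Jerry agree to lunch at 12."}
print(prompt.format(dialog=example["dialogue"]))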

QEfficient/finetune/utils/dataset_utils.py

Lines changed: 35 additions & 3 deletions

@@ -4,14 +4,15 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
-
+import datasets
 import torch
 import torch.distributed as dist
 from transformers.data import DataCollatorForSeq2Seq

 from QEfficient.finetune.data.sampler import DistributedLengthBasedBatchSampler
 from QEfficient.finetune.dataset.dataset_config import DATALOADER_COLLATE_FUNC, DATASET_PREPROC
 from QEfficient.finetune.utils.logging_utils import logger
+from QEfficient.finetune.utils.helper import get_num_ddp_devices


 def get_preprocessed_dataset(
@@ -56,20 +57,51 @@ def get_dataloader_kwargs(train_config, dataset, dataset_processer, split):
             dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank(), shuffle=False
         )
         kwargs["batch_size"] = batch_size
-        kwargs["drop_last"] = True
+        kwargs["drop_last"] = False
     else:
         kwargs["batch_size"] = batch_size
-        kwargs["drop_last"] = True
+        kwargs["drop_last"] = False
         kwargs["collate_fn"] = DataCollatorForSeq2Seq(dataset_processer)
     return kwargs


+def padding_dataset(train_config, dataset, batch_size):
+    if train_config.enable_ddp and train_config.enable_sorting_for_ddp:
+        if isinstance(dataset, datasets.Dataset):
+            # Hugging Face Dataset transformation
+            dataset = dataset.map(lambda x: {"input_length": len(x["input_ids"])})
+            dataset = dataset.sort("input_length")
+
+        else:
+            dataset = sorted(dataset, key=lambda x: len(x["input_ids"]))
+
+    dummy_row = next(iter(dataset))
+    dummy_row["labels"] = torch.tensor([-100] * len(dummy_row["labels"]))
+    padding_size = 0
+    num_replicas = get_num_ddp_devices()
+    remainder = len(dataset) % (num_replicas * batch_size)
+    padding_size = (num_replicas * batch_size) - remainder
+
+    dummy_data = [dummy_row.copy() for _ in range(padding_size)]
+    dummy_dataset = datasets.Dataset.from_list(dummy_data)
+    if isinstance(dataset, datasets.Dataset):
+        combined_dataset = datasets.concatenate_datasets([dataset, dummy_dataset])
+    else:
+        combined_dataset = dataset + list(dummy_dataset)
+    return combined_dataset
+
+
 def get_dataloader(tokenizer, dataset_config, train_config, split: str = "train"):
     dataset = get_preprocessed_dataset(tokenizer, dataset_config, split, context_length=train_config.context_length)
+
+    batch_size = train_config.train_batch_size if split == "train" else train_config.val_batch_size
+    dataset = padding_dataset(train_config, dataset, batch_size)
+
     dl_kwargs = get_dataloader_kwargs(train_config, dataset, tokenizer, split)

     # FIXME (Meet): Add custom data collator registration from the outside by the user.
     custom_data_collator = get_custom_data_collator(tokenizer, dataset_config)
+
     if custom_data_collator:
         print("custom_data_collator is used")
         dl_kwargs["collate_fn"] = custom_data_collator
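A worked example of the padding arithmetic in padding_dataset, which pads the dataset so every rank sees the same number of full batches (values invented). Note one edge case visible in the code: when the length already divides evenly (remainder == 0), the expression still appends a full extra group of num_replicas * batch_size dummy rows.

num_replicas, batch_size = 4, 8          # e.g. WORLD_SIZE=4 under torchrun
dataset_len = 103
group = num_replicas * batch_size        # 32 samples consumed per global step
remainder = dataset_len % group          # 103 % 32 = 7
padding_size = group - remainder         # 25 dummy rows, labels all -100 (ignored by the loss)
assert (dataset_len + padding_size) % group == 0  # padded length 128 divides evenly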

QEfficient/finetune/utils/helper.py

Lines changed: 4 additions & 0 deletions

@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+import os

 import os

@@ -15,3 +16,6 @@

 def is_rank_zero():
     return int(os.getenv("LOCAL_RANK", 0)) == 0
+
+def get_num_ddp_devices():
+    return int(os.getenv("WORLD_SIZE", 1))
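Both helpers read environment variables exported by the distributed launcher. A usage sketch (values set manually here; torchrun --nproc_per_node 4 would export them per worker):

import os

os.environ.setdefault("WORLD_SIZE", "4")
os.environ.setdefault("LOCAL_RANK", "0")

print(int(os.getenv("WORLD_SIZE", 1)))       # 4, what get_num_ddp_devices() returns
print(int(os.getenv("LOCAL_RANK", 0)) == 0)  # True, what is_rank_zero() returns

In a plain single-process run neither variable is set, so the defaults (1 device, rank zero) apply.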
