
Commit 3aa65c6

Arjunbalaaws authored and joshir committed
Merge pull request #24 from aws-neuron/release_cut_2.21
Release 2.21
1 parent 083d3ea commit 3aa65c6

192 files changed: 13756 insertions(+), 4644 deletions(-)


Diff for: .gitignore (+9)

@@ -1,5 +1,8 @@
 # Python .gitignore template
 
+*.deb
+*.pt
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -78,6 +81,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+*.ipynb
 
 # IPython
 profile_default/
@@ -140,3 +144,8 @@ src/neuronx_distributed.egg-info/
 *.whl
 **/.DS_Store
 __pycache__
+.vscode
+/exp*
+/tmp*
+tmp.*
+pyproject.toml

Diff for: .pre-commit-config.yaml (+8, -1)

@@ -14,7 +14,7 @@ repos:
       - id: clang-format
         args: [--style=file, -i]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.5.0
+    rev: v0.6.2
     hooks:
       - id: ruff
         name: ruff
@@ -23,3 +23,10 @@ repos:
         types: [python]
         language: system
         exclude: cases_update
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.11.2
+    hooks:
+      - id: mypy
+        name: mypy
+        language: python
+        files: src/.*\.py
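
With this hook added, the mypy check can be run locally through the standard pre-commit CLI (for example, `pre-commit run mypy --all-files`); the `files: src/.*\.py` pattern restricts it to Python sources under `src/`.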

Diff for: README.md (+1, -1)

@@ -11,7 +11,7 @@ To build from source, run the following command:
 ```
 bash ./build.sh
 ```
-
+
 It should place the wheel at `build/`
 
 ## API Reference Guide

Diff for: build.sh (+2, -2)

@@ -15,9 +15,9 @@ fi
 # Run static code analysis
 python3.8 -m pip install mypy
 # Install type bindings
-python3.8 -m pip install types-requests boto3-stubs[s3]
+python3.8 -m pip install types-requests boto3-stubs[s3] types-PyYAML
 # removing cache fails in ToD
-python3.8 -m mypy --no-incremental || true
+python3.8 -m mypy --no-incremental --cache-dir=/dev/null
 # exit when asked to run `mypy` only
 if [[ "$1" == "mypy" ]]
 then

Diff for: examples/inference/dbrx/dbrx_runner.py (+11, -19)

@@ -5,54 +5,46 @@
     NeuronDbrxModel,
 )
 from runner import InferenceRunner
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, DbrxConfig
 
 from neuronx_distributed.parallel_layers.checkpointing import _invoke_preshard_hook
 
 
 class DbrxRunner(InferenceRunner):
     def load_hf_model(self):
-        config = NeuronDbrxConfig.from_pretrained(self.model_path)
-        return NeuronDbrxForCausalLM.load_hf_model(self.model_path, config)
+        hf_config = DbrxConfig.from_pretrained(self.model_path)
+        return NeuronDbrxForCausalLM.load_hf_model(self.model_path, hf_config)
 
     def load_neuron_model_on_cpu(self, max_prompt_length, sequence_length, batch_size, **kwargs):
         # On CPU we can only run tensor parallelism with degree 1
-        config = self.get_config_for_nxd(
+        hf_config = self.get_hf_config(sequence_length=sequence_length, **kwargs)
+        neuron_config = self.get_config_for_nxd(
+            hf_config,
             batch_size,
             1,
             max_prompt_length=max_prompt_length,
             sequence_length=sequence_length,
             enable_bucketing=False,
             **kwargs)
-        config.torch_dtype = torch.float32
+        hf_config.torch_dtype = torch.float32
 
         self.init_ditributed_env()
-        neuron_model = NeuronDbrxModel(config)
+        neuron_model = NeuronDbrxModel(neuron_config)
 
-        state_dict = NeuronDbrxForCausalLM.get_state_dict(self.model_path, config)
+        state_dict = NeuronDbrxForCausalLM.get_state_dict(self.model_path, neuron_config)
 
         _invoke_preshard_hook(neuron_model, state_dict)
 
         neuron_model.load_state_dict(state_dict, strict=False)
 
-        if config.torch_dtype == torch.bfloat16:
+        if hf_config.torch_dtype == torch.bfloat16:
             neuron_model.bfloat16()
 
-        model = NeuronDbrxForCausalLM(None, config)
+        model = NeuronDbrxForCausalLM(None, neuron_config)
         model.context_encoding_model.model = neuron_model
         model.token_generation_model.model = neuron_model
         return model
 
-    def load_neuron_model(self, traced_model_path):
-        config = NeuronDbrxConfig.from_pretrained(traced_model_path)
-        model = NeuronDbrxForCausalLM.from_pretrained("", config)
-
-        model.load(traced_model_path)
-        if config.torch_dtype == torch.bfloat16:
-            model.bfloat16()
-
-        return model
-
     def load_tokenizer(self, padding_side=None):
         tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)
         tokenizer.pad_token = tokenizer.unk_token
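
For context on the refactor above: the runner now keeps the Hugging Face model configuration (transformers' `DbrxConfig`) separate from the Neuron configuration built by `get_config_for_nxd`. A minimal sketch of that split follows; the checkpoint path is a placeholder and the commented call merely mirrors `load_neuron_model_on_cpu` above, so this is illustrative rather than a runnable end-to-end example.

    # Sketch of the hf_config / neuron_config split (placeholder checkpoint path).
    import torch
    from transformers import DbrxConfig

    model_path = "/path/to/dbrx-checkpoint"  # hypothetical local checkpoint directory

    # The Hugging Face config now comes straight from transformers' DbrxConfig
    # (previously NeuronDbrxConfig served both roles).
    hf_config = DbrxConfig.from_pretrained(model_path)
    hf_config.torch_dtype = torch.float32  # the CPU path forces fp32, as in the diff

    # The Neuron-specific config is then derived from it inside the runner,
    # mirroring load_neuron_model_on_cpu (requires the example repo to run):
    # neuron_config = self.get_config_for_nxd(
    #     hf_config,
    #     batch_size,
    #     1,
    #     max_prompt_length=max_prompt_length,
    #     sequence_length=sequence_length,
    #     enable_bucketing=False,
    #     **kwargs)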
