Skip to content

Commit c48d689

Browse files
committed
adding Compute Context Length (CCL) specialization
Signed-off-by: Vahid Janfaza <[email protected]>
1 parent 6a5f283 commit c48d689

File tree

4 files changed

+36
-15
lines changed

4 files changed

+36
-15
lines changed

QEfficient/generation/text_generation_inference.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -808,7 +808,11 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_i
808808
if (i + 1) * self._prefill_seq_len > self.comp_ctx_lengths[prefill_ccl_id]:
809809
prefill_ccl_id += 1
810810
if prefill_ccl_id >= self.prefill_ccl_len:
811-
prefill_ccl_id = self.prefill_ccl_len - 1
811+
prefill_ccl_id = (
812+
(self.prefill_ccl_len - 1)
813+
if self.prefill_ccl_len != 0
814+
else min(prefill_ccl_id, len(self.comp_ctx_lengths) - 1)
815+
)
812816
inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[prefill_ccl_id]
813817

814818
chunk_inputs = inputs.copy()
@@ -839,7 +843,6 @@ def initialize_ccl(self, decode_inputs):
839843
ccl_id = i
840844
break
841845

842-
print(f"Decode CCL: {self.comp_ctx_lengths[ccl_id]}")
843846
return ccl_id, max_ccl_id
844847

845848
def run_continuous_batching_decode(self, prompt_queue, generation_len):

QEfficient/transformers/models/modeling_auto.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -862,7 +862,11 @@ def kv_offload_generate(
862862
if (i + 1) * prefill_seq_len > self.comp_ctx_lengths[prefill_ccl_id]:
863863
prefill_ccl_id += 1
864864
if prefill_ccl_id >= self.prefill_ccl_len:
865-
prefill_ccl_id = self.prefill_ccl_len - 1
865+
prefill_ccl_id = (
866+
(self.prefill_ccl_len - 1)
867+
if self.prefill_ccl_len != 0
868+
else min(prefill_ccl_id, len(self.comp_ctx_lengths) - 1)
869+
)
866870
chunk_inputs["comp_ctx_lengths"] = list_of_comp_ctx_lengths[prefill_ccl_id]
867871

868872
chunk_inputs["input_ids"] = lang_inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
@@ -1196,7 +1200,11 @@ def cloud_ai_100_generate(
11961200
if (i + 1) * prefill_seq_len > self.comp_ctx_lengths[prefill_ccl_id]:
11971201
prefill_ccl_id += 1
11981202
if prefill_ccl_id >= self.prefill_ccl_len:
1199-
prefill_ccl_id = self.prefill_ccl_len - 1
1203+
prefill_ccl_id = (
1204+
(self.prefill_ccl_len - 1)
1205+
if self.prefill_ccl_len != 0
1206+
else min(prefill_ccl_id, len(self.comp_ctx_lengths) - 1)
1207+
)
12001208
chunk_inputs["comp_ctx_lengths"] = list_of_comp_ctx_lengths[prefill_ccl_id]
12011209

12021210
chunk_inputs["input_ids"] = inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
@@ -1784,8 +1792,8 @@ def build_decode_specialization(
17841792
full_batch_size: Optional[int] = None,
17851793
num_speculative_tokens: Optional[int] = None,
17861794
):
1787-
if prefill_seq_len == 1 and not self.continuous_batching:
1788-
return None # Avoid duplication with prefill
1795+
if prefill_seq_len == 1 and not self.continuous_batching and not self.comp_ctx_lengths:
1796+
return None # Avoid duplication with prefill in non-CCL
17891797
spec = {
17901798
"batch_size": full_batch_size if self.continuous_batching else batch_size,
17911799
"seq_len": (num_speculative_tokens + 1) if self.is_tlm else 1,
@@ -1908,6 +1916,8 @@ def compile(
19081916
specializations = []
19091917
if prefill_only is None or prefill_only or prefill_seq_len == 1:
19101918
if self.comp_ctx_lengths is not None:
1919+
if prefill_seq_len != 1 and self.prefill_ccl_len == 0:
1920+
raise ValueError("When prefill_seq_len > 1, prefill_ccl_len must be greater than 0.")
19111921
# Adding elements from self.comp_ctx_lengths to prefill_specialization
19121922
for i in range(0, self.prefill_ccl_len):
19131923
specializations.append(

examples/granite_example/ccl_granitemoe_inference.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,11 @@
1818

1919
comp_ctx_lengths = [256, 512, 1024, 2048] # None
2020

21-
## Prefill_ccl_len shows how many numbers in the comp_ctx_lengths list is related to prefilling and the rest would be for decoding. The default value is 1.
22-
prefill_ccl_len = 2
21+
"""
22+
# Prefill_ccl_len shows how many values in the comp_ctx_lengths list are used for prefilling; the rest are used for decoding. The default value of 1 means the first value is for prefilling and the rest are for decoding.
23+
# In MoE models with prefill_seq_len=1, we can pass prefill_ccl_len=0 to use all CCL values for both the prefilling and decoding steps.
24+
"""
25+
prefill_ccl_len = 0
2326

2427
model = QEFFAutoModelForCausalLM.from_pretrained(
2528
model_name, comp_ctx_lengths=comp_ctx_lengths, prefill_ccl_len=prefill_ccl_len, continuous_batching=False

examples/qwen3moe_example/ccl_qwen3moe_inference.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,28 @@
1515
# For CB inference, set continuous_batching to True and add full_batch_size,mxfp6,mint8 argument in compile function
1616
# We will use prompt_len=1 for compilation for both cb and non-cb inference
1717
"""
18+
ctx_len = 1024
19+
batch_size = 1
20+
comp_ctx_lengths = [128, 256, 512, 1024]
1821

19-
comp_ctx_lengths = [192, 256, 512, 1024] # None
20-
21-
## Prefill_ccl_len shows how many numbers in the comp_ctx_lengths list is related to prefilling and the rest would be for decoding. The default value is 1.
22-
prefill_ccl_len = 2
22+
"""
23+
# Prefill_ccl_len shows how many values in the comp_ctx_lengths list are used for prefilling; the rest are used for decoding. The default value of 1 means the first value is for prefilling and the rest are for decoding.
24+
# In MoE models with prefill_seq_len=1, we can pass prefill_ccl_len=0 to use all CCL values for both the prefilling and decoding steps.
25+
"""
26+
prefill_ccl_len = 0
2327

2428
model = QEFFAutoModelForCausalLM.from_pretrained(
25-
model_name, comp_ctx_lengths=comp_ctx_lengths, prefill_ccl_len=prefill_ccl_len, continuous_batching=True
29+
model_name, comp_ctx_lengths=comp_ctx_lengths, prefill_ccl_len=prefill_ccl_len
2630
)
2731
model.compile(
2832
prefill_seq_len=1,
29-
ctx_len=1024,
30-
full_batch_size=2,
33+
ctx_len=ctx_len,
34+
batch_size=batch_size,
3135
num_cores=16,
3236
num_devices=4,
3337
mxfp6_matmul=True,
3438
mxint8_kv_cache=True,
39+
mos=1,
3540
)
3641
tokenizer = AutoTokenizer.from_pretrained(model_name)
3742
exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)

0 commit comments

Comments
 (0)