@@ -660,7 +660,7 @@ def handle_embedding_request(
660
660
661
661
self .waiting_queue .append (req )
662
662
663
- def log_prefill_stats (self , adder , can_run_list , running_bs , has_inflight ):
663
+ def log_prefill_stats (self , adder , can_run_list , running_bs , has_being_chunked ):
664
664
if isinstance (self .tree_cache , RadixCache ):
665
665
self .tree_cache_metrics ["total" ] += (
666
666
adder .log_input_tokens + adder .log_hit_tokens
@@ -684,14 +684,14 @@ def log_prefill_stats(self, adder, can_run_list, running_bs, has_inflight):
684
684
f"cache hit rate: { 100.0 * tree_cache_hit_rate :.2f} %, "
685
685
f"token usage: { num_used / self .max_total_num_tokens :.2f} , "
686
686
f"#running-req: { running_bs } , "
687
- f"#queue-req: { len (self .waiting_queue ) + has_inflight } "
687
+ f"#queue-req: { len (self .waiting_queue ) + has_being_chunked } "
688
688
)
689
689
690
690
if self .enable_metrics :
691
691
self .stats .num_running_reqs = running_bs
692
692
self .stats .num_used_tokens = num_used
693
693
self .stats .token_usage = round (num_used / self .max_total_num_tokens , 2 )
694
- self .stats .num_queue_reqs = len (self .waiting_queue ) + has_inflight
694
+ self .stats .num_queue_reqs = len (self .waiting_queue ) + has_being_chunked
695
695
self .stats .cache_hit_rate = tree_cache_hit_rate
696
696
self .metrics_collector .log_stats (self .stats )
697
697
@@ -752,7 +752,7 @@ def get_next_batch_to_run(self):
752
752
# Move the chunked request out of the batch
753
753
self .last_batch .filter_batch (being_chunked_req = self .being_chunked_req )
754
754
self .tree_cache .cache_unfinished_req (self .being_chunked_req )
755
- # Inflight request keeps its rid but will get a new req_pool_idx
755
+ # The being-chunked request keeps its rid but will get a new req_pool_idx
756
756
self .req_to_token_pool .free (self .being_chunked_req .req_pool_idx )
757
757
self .batch_is_full = False
758
758
@@ -803,10 +803,10 @@ def get_new_batch_prefill(self) -> Optional[ScheduleBatch]:
803
803
running_bs if self .is_mixed_chunk else 0 ,
804
804
)
805
805
806
- has_inflight = self .being_chunked_req is not None
807
- if has_inflight :
806
+ has_being_chunked = self .being_chunked_req is not None
807
+ if has_being_chunked :
808
808
self .being_chunked_req .init_next_round_input ()
809
- self .being_chunked_req = adder .add_inflight_req (self .being_chunked_req )
809
+ self .being_chunked_req = adder .add_being_chunked_req (self .being_chunked_req )
810
810
811
811
if self .lora_paths :
812
812
lora_set = (
@@ -848,16 +848,16 @@ def get_new_batch_prefill(self) -> Optional[ScheduleBatch]:
848
848
x for x in self .waiting_queue if x not in set (can_run_list )
849
849
]
850
850
851
- if adder .new_inflight_req is not None :
851
+ if adder .new_being_chunked_req is not None :
852
852
assert self .being_chunked_req is None
853
- self .being_chunked_req = adder .new_inflight_req
853
+ self .being_chunked_req = adder .new_being_chunked_req
854
854
855
855
if self .being_chunked_req :
856
856
self .being_chunked_req .is_being_chunked += 1
857
857
858
858
# Print stats
859
859
if self .tp_rank == 0 :
860
- self .log_prefill_stats (adder , can_run_list , running_bs , has_inflight )
860
+ self .log_prefill_stats (adder , can_run_list , running_bs , has_being_chunked )
861
861
862
862
# Create a new batch
863
863
new_batch = ScheduleBatch .init_new (
@@ -1030,7 +1030,7 @@ def process_batch_result_prefill(self, batch: ScheduleBatch, result):
1030
1030
if req .grammar is not None :
1031
1031
req .grammar .accept_token (next_token_id )
1032
1032
else :
1033
- # Inflight reqs' prefill is not finished
1033
+ # The prefill of a being-chunked req is not finished yet
1034
1034
req .is_being_chunked -= 1
1035
1035
1036
1036
if batch .next_batch_sampling_info :
@@ -1058,7 +1058,7 @@ def process_batch_result_prefill(self, batch: ScheduleBatch, result):
1058
1058
else :
1059
1059
self .tree_cache .cache_unfinished_req (req )
1060
1060
else :
1061
- # Inflight reqs' prefill is not finished
1061
+ # The prefill of a being-chunked req is not finished yet
1062
1062
req .is_being_chunked -= 1
1063
1063
1064
1064
self .stream_output (batch .reqs )
0 commit comments