LISTENAI
diff --git a/doc/tutorial/release.md b/doc/tutorial/release.md
Lines changed: 4 additions & 1 deletion
diff --git a/linger/__version.py b/linger/__version.py
Lines changed: 1 addition & 1 deletion
diff --git a/linger/constrain/clayernorm.py b/linger/constrain/clayernorm.py
Lines changed: 0 additions & 1 deletion
diff --git a/linger/initialize.py b/linger/initialize.py
Lines changed: 15 additions & 5 deletions
diff --git a/linger/kernel/cpu/venusa_qsigmoid_kernel.cpp b/linger/kernel/cpu/venusa_qsigmoid_kernel.cpp
Lines changed: 17 additions & 3 deletions
diff --git a/linger/kernel/cpu/venusa_qsoftmax_kernel.cpp b/linger/kernel/cpu/venusa_qsoftmax_kernel.cpp
Lines changed: 2 additions & 3 deletions
diff --git a/linger/kernel/cpu/venusa_qtanh_kernel.cpp b/linger/kernel/cpu/venusa_qtanh_kernel.cpp
Lines changed: 18 additions & 5 deletions
diff --git a/linger/kernel/gpu/fake_quant_kernel.cu b/linger/kernel/gpu/fake_quant_kernel.cu
Lines changed: 4 additions & 4 deletions
diff --git a/linger/kernel/gpu/venusa_qsigmoid_kernel.cu b/linger/kernel/gpu/venusa_qsigmoid_kernel.cu
Lines changed: 17 additions & 5 deletions
diff --git a/linger/kernel/gpu/venusa_qsoftmax_kernel.cu b/linger/kernel/gpu/venusa_qsoftmax_kernel.cu
Lines changed: 4 additions & 4 deletions
@@ -17,4 +17,7 @@
 # V3.0.6  2026.03.26
 * 解决linger.init之后加载浮点模型dict不匹配问题
 * 解决avgpool算子一致性问题
-* 解决gru算子一致性问题
+* 解决gru算子一致性问题
+# V3.0.7  2026.04.30
+* 解决venusA和arcs平台gru一致性问题；
+* 重构导图后处理代码，增加图优化和常量折叠功能；
@@ -8,5 +8,5 @@ def _to_int(s):
         return s
 
 
-__version__ = "3.0.6"
+__version__ = "3.0.7"
 version_info = tuple(_to_int(s) for s in __version__.split("."))
@@ -20,7 +20,6 @@ def ccreate(
             module.normalized_shape,
             module.eps,
             module.elementwise_affine,
-            None if module.bias is None else True,
             dtype=module.weight.dtype,
             device=device,
             constrain=constrain,
 
@@ -85,29 +85,31 @@ def quant_module(module: nn.Module, c_activation_val: float = 8.0, c_weight_val:
             m.output_quantizer.data_bits = out_bits
             m.output_quantizer.clamp_activation_value = c_activation_val
 
-def constrain(model: nn.Module, config_file: str = None, disable_module=None, disable_submodel=[]):
+def constrain(model: nn.Module, config_file: str = None, disable_module=None, disable_submodel=None):
     c_configs = QUANT_CONFIGS
     if config_file is not None:
         c_configs._load_from_yaml(config_file)
+    disabled_submodels = [] if disable_submodel is None else list(disable_submodel)
 
     if disable_module is not None:
         for name in disable_module:
             if _CMODULE_TABLE.get(name, None) is not None:
                 _CMODULE_TABLE.pop(name)
 
     for name, m in model.named_modules():
-        if any(name.startswith(p) for p in disable_submodel): # disable_submodel 直接按照module的名称进行屏蔽
+        if any(name.startswith(p) for p in disabled_submodels): # disable_submodel 直接按照module的名称进行屏蔽
             continue
         _constrain_submodule(model, name, m, c_configs.clamp_info.to_dict())
 
     model.to(c_configs.device)
     return model
 
-def init(model: nn.Module, config_file: str = None, disable_module=None, disable_submodel=[]):
+def init(model: nn.Module, config_file: str = None, disable_module=None, disable_submodel=None):
 
     q_configs = QUANT_CONFIGS
     if config_file is not None:
         q_configs._load_from_yaml(config_file)
+    disabled_submodels = [] if disable_submodel is None else list(disable_submodel)
 
     if disable_module is not None:
         for name in disable_module:
@@ -118,15 +120,15 @@ def init(model: nn.Module, config_file: str = None, disable_module=None, disable
     # model = _replace_ops(traced_model, q_configs)
 
     for name, m in model.named_modules():
-        if any(name.startswith(p) for p in disable_submodel): # disable_submodel 直接按照module的名称进行量化屏蔽
+        if any(name.startswith(p) for p in disabled_submodels): # disable_submodel 直接按照module的名称进行量化屏蔽
             continue
 
         m.register_forward_pre_hook(hook_pre_forward)
         m.register_forward_hook(hook_forward)
 
         is_replaced = _quantize_submodule(model, name, m, weights_cfg=q_configs.quant_info.to_dict(), activations_cfg=q_configs.quant_info.to_dict(), bias_cfg=q_configs.quant_info.to_dict(), constrain =  q_configs.clamp_info.to_dict())
         if is_replaced:
-            disable_submodel.append(name)
+            disabled_submodels.append(name)
 
     def quant_tensor_pre_hook(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
         quantizer_tags = (
@@ -192,6 +194,14 @@ def quant_tensor_layer(module, prefix=''):
                                 iq_layer.training = model.training
                                 # iq_layer = iq_layer.to(device)
                                 setattr(module, input_name, iq_layer)
+                            elif '_qgelu_' in input_name:
+                                iq_layer = QGelu(activate_config=activate_cfg, num_input=1)
+                                iq_layer.training = model.training
+                                setattr(module, input_name, iq_layer)
+                            elif '_qswish_' in input_name:
+                                iq_layer = QSwish(activate_config=activate_cfg, num_input=1)
+                                iq_layer.training = model.training
+                                setattr(module, input_name, iq_layer)
                             elif '_qsoftmax_' in input_name:
                                 iq_layer = QSoftmax(activate_config=activate_cfg, num_input=1)
                                 iq_layer.training = model.training
 
@@ -11,6 +11,21 @@
   ((x) > MAX_BITS(bits) ? MAX_BITS(bits) \
                         : ((x) < MIN_BITS(bits) ? MIN_BITS(bits) : (x)))
 
+static int64_t shfit_floor_x05_int64(int64_t x, int32_t shift)
+{
+	int64_t val = x;
+
+	if (shift >= 64) {
+		return 0;
+	}
+	if (shift > 0) {
+		val = val >> (shift - 1);
+		val = (val & 0x1) + (val >> 1);
+	}
+
+	return val;
+}
+
 torch::Tensor venusa_qsigmoid_cpu(torch::Tensor a)
 {
 	int32_t N = a.numel();
@@ -29,7 +44,6 @@ torch::Tensor venusa_qsigmoid_cpu(torch::Tensor a)
 	int64_t absx = 0;
 	int64_t slope = 0;
 	int64_t bias = 0;
-	int32_t shift = 0;
 	int64_t tmp = 0;
 	int64_t out = 0;
 
@@ -73,8 +87,8 @@ torch::Tensor venusa_qsigmoid_cpu(torch::Tensor a)
 				bias = 0;
 			}
 		}
-        bias = bias << 3;
-		out = ((slope * tmp) >> 27) + bias;
+		out = slope * tmp + (bias << 30);
+		out = shfit_floor_x05_int64(out, 27);
 		c_ptr[i] = SATURATE(out, 32);
 	}
 
 
@@ -21,7 +21,7 @@ static int32_t shift_pure(int64_t v, int32_t s)
 	} else {
         v = v >> (-s);
     }
-    return SATURATE(v, 32);
+    return v;
 }
 
 static int32_t sub32s(int32_t a, int32_t b)
@@ -150,7 +150,7 @@ torch::Tensor venusa_qsoftmax_cpu(const torch::Tensor& in, int64_t dim)
 		for (int i = 0; i < N; i++)
 		{
 			data = sub32s(x_ptr[i], max_value);
-            X = shfit_floor_x05_int64((int64_t)X * (int64_t)774541002, 31);
+            X = shfit_floor_x05_int64((int64_t)data * (int64_t)774541002, 31);
 			// X = shift_rasyms((int64_t)data * (int64_t)774541002, -31);//exp=>2xp，Q6.25=>Q8.23
 			E = X >> 23;
 			E = E + 1;//与118行对应
@@ -201,4 +201,3 @@ torch::Tensor venusa_qsoftmax_cpu(const torch::Tensor& in, int64_t dim)
 
 	return out;
 }
-
@@ -11,6 +11,21 @@
   ((x) > MAX_BITS(bits) ? MAX_BITS(bits) \
                         : ((x) < MIN_BITS(bits) ? MIN_BITS(bits) : (x)))
 
+static int64_t shfit_floor_x05_int64(int64_t x, int32_t shift)
+{
+	int64_t val = x;
+
+	if (shift >= 64) {
+		return 0;
+	}
+	if (shift > 0) {
+		val = val >> (shift - 1);
+		val = (val & 0x1) + (val >> 1);
+	}
+
+	return val;
+}
+
 torch::Tensor venusa_qtanh_cpu(torch::Tensor a)
 {
 	int32_t N = a.numel();
@@ -28,7 +43,6 @@ torch::Tensor venusa_qtanh_cpu(torch::Tensor a)
 	int64_t absx = 0;
 	int64_t slope = 0;
 	int64_t bias = 0;
-	int32_t shift = 0;
 	int64_t tmp = 0;
 	int64_t out = 0;
 
@@ -68,16 +82,15 @@ torch::Tensor venusa_qtanh_cpu(torch::Tensor a)
 				bias = 0;
 			}
 		}
-
-        bias = bias << 3;
 		if (1 == sign)
 		{
-			out = ((-1 * slope * absx) >> 27) - bias;
+			out = (-slope * absx) - (bias << 30);
 		}
 		else
 		{
-			out = ((slope * absx) >> 27) + bias;
+			out = (slope * absx) + (bias << 30);
 		}
+		out = shfit_floor_x05_int64(out, 27);
 
 		c_ptr[i] = SATURATE(out, 32);
 	}
 
@@ -44,7 +44,7 @@ std::tuple<torch::Tensor, torch::Tensor, float> fake_quant_cuda(
     // printf("到位置 2 了 \n");
     // 计算 scale
     float f = (float)(bit - 1) - factor;
-    float scale = powf(2.0f, roundf(f));
+    float scale = powf(2.0f, fminf(fmaxf(roundf(f), 0.0f), 23.0f));
     // scale = fminf(fmaxf(scale, 1e-6f), powf(2.0f, 32));
     // printf("到位置 1 了,scale:%f \n", scale);
     auto output = torch::empty_like(input);
@@ -84,7 +84,7 @@ std::tuple<torch::Tensor, torch::Tensor, float> fake_quant_cuda(
 std::tuple<torch::Tensor, torch::Tensor> bias_quant_cuda(
     torch::Tensor input,
     int bit,
-    float scale,
+    float scale, // 这里scale是在python代码里进行clamp到0-31的
     float scale_min,
     float quant_min,
     float quant_max) {
@@ -175,7 +175,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, float> fake_quant_cuda_w
     // printf("到位置 2 了 \n");
     // 计算 scale
     float f = (float)(bit - 1) - factor;
-    float scale = powf(2.0f, roundf(f));
+    float scale = powf(2.0f, fminf(fmaxf(roundf(f), 0.0f), 23.0f));
     // scale = fminf(fmaxf(scale, 1e-6f), powf(2.0f, 32));
     // printf("到位置 1 了,scale:%f \n", scale);
     auto output = torch::empty_like(input);
@@ -218,7 +218,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, float> fake_quant_cuda_w
 std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> bias_quant_cuda_with_grad_scale(
     torch::Tensor input,
     int bit,
-    float scale,
+    float scale, // 这里的scale是在python里clamp的
     float scale_min,
     float quant_min,
     float quant_max) {
 
@@ -10,6 +10,21 @@
   ((x) > MAX_BITS(bits) ? MAX_BITS(bits) \
                         : ((x) < MIN_BITS(bits) ? MIN_BITS(bits) : (x)))
 
+static __device__ int64_t shfit_floor_x05_int64(int64_t x, int32_t shift)
+{
+	int64_t val = x;
+
+	if (shift >= 64) {
+		return 0;
+	}
+	if (shift > 0) {
+		val = val >> (shift - 1);
+		val = (val & 0x1) + (val >> 1);
+	}
+
+	return val;
+}
+
 __global__ void venusa_qsigmoid_gpu_kernel(const int* __restrict__ a,
                              int* __restrict__ c, 
                             int32_t len, uint32_t* bands , uint32_t* slopes,
@@ -21,7 +36,6 @@ __global__ void venusa_qsigmoid_gpu_kernel(const int* __restrict__ a,
     int64_t absx = 0;
     int64_t slope = 0;
     int64_t bias = 0;
-    int32_t shift = 0;
     int64_t tmp = 0;
     int64_t out = 0;
 
@@ -65,12 +79,11 @@ __global__ void venusa_qsigmoid_gpu_kernel(const int* __restrict__ a,
 			{
 				slope = 0;
 				bias = 0;
-				shift = 0;
 			}
 		}
 
-        bias = bias << 3;
-		out = ((slope * tmp) >> 27) + bias;
+		out = slope * tmp + (bias << 30);
+		out = shfit_floor_x05_int64(out, 27);
 		c[idx] = SATURATE(out, 32);
     }
 }
@@ -111,4 +124,3 @@ torch::Tensor venusa_qsigmoid_gpu(torch::Tensor a)
 
     return c;
 }
-
 
@@ -22,7 +22,7 @@ static __device__ int32_t shift_pure(int64_t v, int32_t s)
 	} else {
 		v = v >> (-s);
 	}
-	return SATURATE(v, 32);
+	return v;
 }
 
 static __device__ int32_t sub32s(int32_t a, int32_t b)
@@ -98,6 +98,7 @@ static __device__ int64_t shfit_floor_x05_int64(int64_t x, int32_t shift)
 static __device__ void venusa_qsoftmax_c(const int* __restrict__ x_ptr, int* __restrict__ y_ptr, int32_t N, int32_t* p23)
 {
 	int32_t max_value = x_ptr[0];
+	int32_t data = 0;
 	int32_t X = 0;
 	int64_t Y = 0;
 	int32_t E = 0;
@@ -112,8 +113,8 @@ static __device__ void venusa_qsoftmax_c(const int* __restrict__ x_ptr, int* __r
 
 	for (int i = 0; i < N; i++)
 	{
-		X = sub32s(x_ptr[i] ,max_value);
-        X = shfit_floor_x05_int64((int64_t)X * (int64_t)774541002, 31);
+		data = sub32s(x_ptr[i] ,max_value);
+        X = shfit_floor_x05_int64((int64_t)data * (int64_t)774541002, 31);
 		// X = shift_rasyms((int64_t)X * (int64_t)774541002,-31);
 		E = X >> 23;
 		E = E + 1;
@@ -207,4 +208,3 @@ torch::Tensor venusa_qsoftmax_gpu(const torch::Tensor& in, int64_t dim)
 
 	return out;
 }
-
Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@ static int32_t shift_pure(int64_t v, int32_t s)`
`21`	`21`	`} else {`
`22`	`22`	`v = v >> (-s);`
`23`	`23`	`}`
`24`		`- return SATURATE(v, 32);`
	`24`	`+ return v;`
`25`	`25`	`}`
`26`	`26`
`27`	`27`	`static int32_t sub32s(int32_t a, int32_t b)`
`@@ -150,7 +150,7 @@ torch::Tensor venusa_qsoftmax_cpu(const torch::Tensor& in, int64_t dim)`
`150`	`150`	`for (int i = 0; i < N; i++)`
`151`	`151`	`{`
`152`	`152`	`data = sub32s(x_ptr[i], max_value);`
`153`		`- X = shfit_floor_x05_int64((int64_t)X * (int64_t)774541002, 31);`
	`153`	`+ X = shfit_floor_x05_int64((int64_t)data * (int64_t)774541002, 31);`
`154`	`154`	`// X = shift_rasyms((int64_t)data * (int64_t)774541002, -31);//exp=>2xp，Q6.25=>Q8.23`
`155`	`155`	`E = X >> 23;`
`156`	`156`	`E = E + 1;//与118行对应`
`@@ -201,4 +201,3 @@ torch::Tensor venusa_qsoftmax_cpu(const torch::Tensor& in, int64_t dim)`
`201`	`201`
`202`	`202`	`return out;`
`203`	`203`	`}`
`204`		`-`
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ static __device__ int32_t shift_pure(int64_t v, int32_t s)`
`22`	`22`	`} else {`
`23`	`23`	`v = v >> (-s);`
`24`	`24`	`}`
`25`		`- return SATURATE(v, 32);`
	`25`	`+ return v;`
`26`	`26`	`}`
`27`	`27`
`28`	`28`	`static __device__ int32_t sub32s(int32_t a, int32_t b)`
`@@ -98,6 +98,7 @@ static __device__ int64_t shfit_floor_x05_int64(int64_t x, int32_t shift)`
`98`	`98`	`static __device__ void venusa_qsoftmax_c(const int* __restrict__ x_ptr, int* __restrict__ y_ptr, int32_t N, int32_t* p23)`
`99`	`99`	`{`
`100`	`100`	`int32_t max_value = x_ptr[0];`
	`101`	`+ int32_t data = 0;`
`101`	`102`	`int32_t X = 0;`
`102`	`103`	`int64_t Y = 0;`
`103`	`104`	`int32_t E = 0;`
`@@ -112,8 +113,8 @@ static __device__ void venusa_qsoftmax_c(const int* __restrict__ x_ptr, int* __r`
`112`	`113`
`113`	`114`	`for (int i = 0; i < N; i++)`
`114`	`115`	`{`
`115`		`- X = sub32s(x_ptr[i] ,max_value);`
`116`		`- X = shfit_floor_x05_int64((int64_t)X * (int64_t)774541002, 31);`
	`116`	`+ data = sub32s(x_ptr[i] ,max_value);`
	`117`	`+ X = shfit_floor_x05_int64((int64_t)data * (int64_t)774541002, 31);`
`117`	`118`	`// X = shift_rasyms((int64_t)X * (int64_t)774541002,-31);`
`118`	`119`	`E = X >> 23;`
`119`	`120`	`E = E + 1;`
`@@ -207,4 +208,3 @@ torch::Tensor venusa_qsoftmax_gpu(const torch::Tensor& in, int64_t dim)`
`207`	`208`
`208`	`209`	`return out;`
`209`	`210`	`}`
`210`		`-`