# coding: utf-8
__author__ = 'Roman Solovyev (ZFTurbo), IPPM RAS'
import numpy as np
if __name__ == '__main__':
    bit_size = 8
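    # Note: with 8 bits the signed range is [-128, 127]. Symmetric quantization uses the
    # symmetric part [-127, 127], which is why the formulas below take 2 ** (k-1) - 1 = 127
    # as the maximum quantized magnitude.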

    # Initial floating point data. We need to calculate the value of W*A + B
    # W - weights, A - activations, B - bias
    weights = np.array([-23.543, 8.533, 1.012, -12.4756, 34.53])
    activations = np.array([0.32432, 1.123213, 0.234324, 5.234234, 4.234323])
    bias = np.array(33.1)
    res = (weights * activations).sum() + bias
    print('Floating point result: {}'.format(res))
    # Floating point result: 116.19701015660002

    # We can find the minimum and maximum values for the weights directly because they are fixed
    # in the neural net after training.
    # We use symmetric quantization, so we choose the scale by which we multiply the weights.
    # Zero does not move in symmetric quantization.
    # Yq = round(scale * clip(Y, -a, a)) - 'scale' - scaling factor, 'a' - clip threshold
    # s = (2 ** (k-1) - 1) / a - 'k' - number of bits used to represent the value
    k = bit_size
    a = np.abs(weights).max()
    scale_weights = (2 ** (k-1) - 1) / a
    print('Weights data: k = {} a = {:.6f} s = {:.6f}'.format(k, a, scale_weights))
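    # For these weights the largest magnitude is a = 34.53, so scale_weights = 127 / 34.53 ≈ 3.678.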

    # To find the maximum/minimum values for the activations we need to run a set of training data
    # through the neural net and record the possible activation values.
    # Let's assume we have already run some data and found that the minimum is 0 and the maximum is 6.
    k = bit_size
    a = np.abs([0, 6]).max()
    scale_activations = (2 ** (k-1) - 1) / a
    print('Activations data: k = {} a = {:.6f} s = {:.6f}'.format(k, a, scale_activations))
    print('Maximum value in quant: {}'.format((2 ** (k-1)) - 1))
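    # With a = 6 this gives scale_activations = 127 / 6 ≈ 21.167.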

    # For the bias we need to use a scale equal to (scale of weights) * (scale of activations),
    # because we add it to the result of the multiplication.
    scale_bias = scale_weights * scale_activations
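    # Reason: up to rounding, Wq * Aq = (scale_weights * W) * (scale_activations * A)
    # = scale_weights * scale_activations * (W * A), so the bias has to be expressed on the same
    # scale before it can be added to this product.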

    # Now we are ready to get the quantized values. We can't store them as int8 because operations
    # like multiplication would overflow, so we use 32-bit accumulators.
    weights_quant = np.round(weights * scale_weights).astype(np.int32)
    bias_quant = np.round(bias * scale_bias).astype(np.int32)
    activations_quant = np.round(activations * scale_activations).astype(np.int32)
    # In the general case we can't guarantee that the activations do not overflow. Sometimes it can
    # happen, so we need to clip them to the [-127, 127] interval.
    activations_quant = np.clip(activations_quant, -127, 127)
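    # Illustration (not part of the original pipeline): dequantizing the weights back with the same
    # scale should give values close to the originals; the per-element error is at most about
    # 0.5 / scale_weights due to rounding.
    weights_dequant = weights_quant / scale_weights
    print('Dequantized weights (check): {}'.format(weights_dequant))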

    # Let's print them now
    print('Quant weights: {}'.format(weights_quant))
    print('Quant bias: {}'.format(bias_quant))
    print('Quant activations: {}'.format(activations_quant))

    # After performing the multiplication the new scale is: new_scale = scale1 * scale2
    new_scale = scale_weights * scale_activations
    print('New scale after multiplication: {:.6f}'.format(new_scale))

    # Now we can calculate the final result
    res_quant = (weights_quant * activations_quant).sum() + bias_quant
    print('Quantized result: {}'.format(res_quant))
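    # Added check (illustration): the int32 result represents the real value multiplied by new_scale,
    # so dividing it back by new_scale should already approximate the floating point result above.
    print('Int32 result divided by new_scale (check): {:.6f}'.format(res_quant / new_scale))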

    # The next step shows how we can do a requantization. In the current example it's not really
    # needed, but it is needed if you plan to use the result of the previous operation in the next
    # layer. We know the maximum values for the activations of the next operation because we ran a
    # calibration dataset. Let's assume we have the coefficient a = 150. Then we need to requantize
    # the "res_quant" variable using it.
    a = 150.0
    scale_layer2 = (2 ** (k-1) - 1) / a
    print('Result scale data a = {:.6f} scale = {:.6f}'.format(a, scale_layer2))
    requant_coeff = scale_layer2 / new_scale
    print('Requantization coefficient: {:.6f}'.format(requant_coeff))
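    # Note (common practice, not used here): in pure integer inference this floating point
    # requantization coefficient is usually approximated by a fixed point multiplier and a right
    # shift, for example (hypothetical 15-bit fixed point):
    #   shift = 15
    #   multiplier = int(round(requant_coeff * (1 << shift)))
    #   res_rescaled_int = (res_quant * multiplier) >> shift
    # The simpler floating point multiplication is kept below for clarity.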
    res_quant_rescaled = requant_coeff * res_quant.astype(np.float32)
    # Again we can't guarantee that the activations fit in the [-127, 127] interval, so we clip them
    res_quant_rescaled = np.clip(res_quant_rescaled, -127, 127)
    res_quant_rescaled = res_quant_rescaled.astype(np.int8)
    print('Result rescaled for layer2: {}'.format(res_quant_rescaled))

    # Now we have the final quantized result at the end of the calculations.
    # We can dequantize it back to floating point using the current scale.
    res_dequant = res_quant_rescaled.astype(np.float32) / scale_layer2
    print('Dequant result: {:.6f} Real result: {:.6f} Error: {:.6f}%'.format(res_dequant, res, 100 * np.abs(res - res_dequant) / res))