Fix inaccurate APISIX metrics (#12108)

apache · Apr 16, 2024 · bea7ed3 · bea7ed3
1 parent bf19d64
commit bea7ed3
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 33 deletions.
diff --git a/docs/en/api/metrics-query-expression.md b/docs/en/api/metrics-query-expression.md
@@ -251,9 +251,6 @@ The order of the new label values should be the same as the order of the label v
 For example:
 If we want to query the `service_percentile` metric with the label values `50,75,90,95,99`, and rename the label name to `percentile` and the label values to `P50,P75,P90,P95,P99`, we can use the following expression:
 
-```text
-and rename the label values to `P50,P75,P90,P95,P99`, we can use the following expression:
-
 ```text
 relabel(service_percentile{p='50,75,90,95,99'}, p='50,75,90,95,99', percentile='P50,P75,P90,P95,P99')
 ```

diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md
@@ -105,6 +105,7 @@
   - `memory_swap_percentage` -> `memory_virtual_memory_percentage`
 * Fix/Change UI init setting for Windows Swap -> Virtual Memory
 * Fix `Memory Swap Usage`/`Virtual Memory Usage` display with UI init.(Linux/Windows)
+* Fix inaccurate APISIX metrics
 
 #### UI
 

diff --git a/oap-server/server-starter/src/main/resources/otel-rules/apisix.yaml b/oap-server/server-starter/src/main/resources/otel-rules/apisix.yaml
@@ -36,24 +36,24 @@ metricsRules:
   # Service
     # Ignore http_connections metrics with accepted and handled state as the actual type is counter
   - name: sv_http_connections
-    exp: apisix_nginx_http_current_connections.tagNotMatch('state','accepted|handled').sum(['state','service_name']).service(['service_name'] , Layer.APISIX)
+    exp: apisix_nginx_http_current_connections.tagNotMatch('state','accepted|handled').sum(['state','service_name','node']).service(['service_name'] , Layer.APISIX)
   - name: sv_http_requests
-    exp: apisix_http_requests_total.sum(['service_instance_id','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX)
+    exp: apisix_http_requests_total.sum(['service_instance_id','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX)
     # Requests that do not match any route
     # Refer to https://apisix.apache.org/docs/apisix/plugins/prometheus/
   - name: sv_bandwidth_unmatched
-    exp: apisix_bandwidth.tagEqual('route' , '' , 'node' , '').sum(['type','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX)
+    exp: apisix_bandwidth.tagEqual('route' , '' , 'node' , '').sum(['type','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX)
   - name: sv_http_status_unmatched
-    exp: apisix_http_status.tagEqual('route' , '' , 'node' , '').sum(['code','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX)
+    exp: apisix_http_status.tagEqual('route' , '' , 'node' , '').sum(['code','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX)
   - name: sv_http_latency_unmatched
-    exp: apisix_http_latency.tagEqual('route' , '' , 'node' , '').sum(['type','le','service_name']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX)
+    exp: apisix_http_latency.tagEqual('route' , '' , 'node' , '').sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX)
     # Requests that match a route
   - name: sv_bandwidth_matched
-    exp: apisix_bandwidth.tagNotEqual('route' , '' , 'node' , '').sum(['type','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX)
+    exp: apisix_bandwidth.tagNotEqual('route' , '' , 'node' , '').sum(['type','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX)
   - name: sv_http_status_matched
-    exp: apisix_http_status.tagNotEqual('route' , '' , 'node' , '').sum(['code','service_name']).rate('PT1M').service(['service_name'] , Layer.APISIX)
+    exp: apisix_http_status.tagNotEqual('route' , '' , 'node' , '').sum(['code','service_name','node']).rate('PT1M').service(['service_name'] , Layer.APISIX)
   - name: sv_http_latency_matched
-    exp: apisix_http_latency.tagNotEqual('route' , '' , 'node' , '').sum(['type','le','service_name']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX)
+    exp: apisix_http_latency.tagNotEqual('route' , '' , 'node' , '').sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).service(['service_name'] , Layer.APISIX)
 
   # Instance
   - name: instance_shared_dict_capacity_bytes
@@ -88,15 +88,15 @@ metricsRules:
   # Endpoint
     # Reorganize metrics that carry a `route` label into endpoints named `route/{route}`
   - name: endpoint_http_status
-    exp: apisix_http_status.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['code','service_name','route']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX)
+    exp: apisix_http_status.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['code','service_name','route','node']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX)
   - name: endpoint_bandwidth
-    exp: apisix_bandwidth.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','service_name','route']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX)
+    exp: apisix_bandwidth.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','service_name','route','node']).rate('PT1M').endpoint(['service_name'],['route'], Layer.APISIX)
   - name: endpoint_http_latency
-    exp: apisix_http_latency.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','le','service_name','route']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['route'], Layer.APISIX)
+    exp: apisix_http_latency.tagNotEqual('route','').tag({tags->tags.route = 'route/'+tags['route']}).sum(['type','le','service_name','route','node']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['route'], Layer.APISIX)
     # Reorganize metrics that carry a `node` label into endpoints named `upstream/{node}`
   - name: endpoint_http_status
     exp: apisix_http_status.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']}).sum(['code','service_name','node']).rate('PT1M').endpoint(['service_name'],['node'], Layer.APISIX)
   - name: endpoint_bandwidth
     exp: apisix_bandwidth.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']}).sum(['type','service_name','node']).rate('PT1M').endpoint(['service_name'],['node'], Layer.APISIX)
   - name: endpoint_http_latency
-    exp: apisix_http_latency.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']})sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['node'], Layer.APISIX)
+    exp: apisix_http_latency.tagNotEqual('node','').tag({tags->tags.node = 'upstream/'+tags['node']}).sum(['type','le','service_name','node']).histogram().histogram_percentile([50,70,90,99]).endpoint(['service_name'],['node'], Layer.APISIX)
diff --git a/...er/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-endpoint.json b/...er/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-endpoint.json
@@ -22,7 +22,7 @@
             "showYAxis": true
           },
           "expressions": [
-            "meter_apisix_endpoint_http_status"
+            "aggregate_labels(meter_apisix_endpoint_http_status,sum(code))"
           ],
           "associate": [
             {
@@ -55,7 +55,7 @@
             "showYAxis": true
           },
           "expressions": [
-            "meter_apisix_endpoint_http_latency"
+            "aggregate_labels(meter_apisix_endpoint_http_latency,avg(type,p))"
           ],
           "associate": [
             {
@@ -88,7 +88,7 @@
             "showYAxis": true
           },
           "expressions": [
-            "meter_apisix_endpoint_bandwidth/1024"
+            "aggregate_labels(meter_apisix_endpoint_bandwidth/1024,sum(type))"
           ],
           "associate": [
             {

diff --git a/...ver/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-service.json b/...ver/server-starter/src/main/resources/ui-initialized-templates/apisix/apisix-service.json
@@ -22,7 +22,7 @@
             "showYAxis": true
           },
           "expressions": [
-            "meter_apisix_sv_http_requests"
+            "aggregate_labels(meter_apisix_sv_http_requests,sum(service_instance_id))"
           ]
         },
         {
@@ -44,7 +44,7 @@
             "showYAxis": true
           },
           "expressions": [
-            "meter_apisix_sv_http_status_matched"
+            "aggregate_labels(meter_apisix_sv_http_status_matched,sum(code))"
           ],
           "associate": [
             {
@@ -89,7 +89,7 @@
             "showYAxis": true
           },
           "expressions": [
-            "meter_apisix_sv_http_latency_matched"
+            "aggregate_labels(meter_apisix_sv_http_latency_matched,avg(type,p))"
           ],
           "associate": [
             {
@@ -134,7 +134,7 @@
             "showYAxis": true
           },
           "expressions": [
-            "meter_apisix_sv_bandwidth_matched/1024"
+            "aggregate_labels(meter_apisix_sv_bandwidth_matched/1024,sum(type))"
           ],
           "associate": [
             {
@@ -168,7 +168,7 @@
           "i": "5",
           "type": "Widget",
           "expressions": [
-            "meter_apisix_sv_http_connections"
+            "aggregate_labels(meter_apisix_sv_http_connections,sum(state))"
           ],
           "graph": {
             "type": "Line",
@@ -224,7 +224,7 @@
             "showYAxis": true
           },
           "expressions": [
-            "meter_apisix_sv_http_status_unmatched"
+            "aggregate_labels(meter_apisix_sv_http_status_unmatched,sum(code))"
           ],
           "associate": [
             {
@@ -269,7 +269,7 @@
             "showYAxis": true
           },
           "expressions": [
-            "meter_apisix_sv_http_latency_unmatched"
+            "aggregate_labels(meter_apisix_sv_http_latency_unmatched,avg(type,p))"
           ],
           "associate": [
             {
@@ -314,7 +314,7 @@
             "showYAxis": true
           },
           "expressions": [
-            "meter_apisix_sv_bandwidth_unmatched/1024"
+            "aggregate_labels(meter_apisix_sv_bandwidth_unmatched/1024,sum(type))"
           ],
           "associate": [
             {
@@ -379,11 +379,11 @@
       "isRoot": false,
       "isDefault": true,
       "expressions": [
-        "avg(meter_apisix_sv_http_status_matched{code='200'})",
-        "avg(meter_apisix_sv_http_status_matched{code='304'})",
-        "avg(meter_apisix_sv_http_status_matched{code='404'})",
-        "avg(meter_apisix_sv_http_status_matched{code='499'})",
-        "avg(meter_apisix_sv_http_status_matched{code='503'})"
+        "avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='200'},sum(code)))",
+        "avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='304'},sum(code)))",
+        "avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='404'},sum(code)))",
+        "avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='499'},sum(code)))",
+        "avg(aggregate_labels(meter_apisix_sv_http_status_matched{code='503'},sum(code)))"
       ],
       "expressionsConfig": [
         {

diff --git a/test/e2e-v2/cases/apisix/apisix-cases.yaml b/test/e2e-v2/cases/apisix/apisix-cases.yaml
@@ -22,7 +22,7 @@
     - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql instance ls --service-name=APISIX::showcase-apisix-service
       expected:  expected/instance.yml
     # service metrics
-    - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression=meter_apisix_sv_http_connections --service-name=APISIX::showcase-apisix-service
+    - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression='aggregate_labels(meter_apisix_sv_http_connections,sum(state))' --service-name=APISIX::showcase-apisix-service
       expected: expected/metrics-has-connection-value-label.yml
     # instance metrics
     - query: |
@@ -31,5 +31,5 @@
         )
       expected: expected/metrics-has-status-value-label.yml
     # endpoint metrics
-    - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression=meter_apisix_endpoint_http_latency --endpoint-name='route/routes#1' --service-name=APISIX::showcase-apisix-service
+    - query: swctl --display yaml --base-url=http://${oap_host}:${oap_12800}/graphql metrics exec --expression='aggregate_labels(meter_apisix_endpoint_http_latency,avg(type,p))' --endpoint-name='route/routes#1' --service-name=APISIX::showcase-apisix-service
       expected: expected/metrics-has-latency-value-label.yml