Skip to content

Commit 0842bd4

Browse files
authored
chore: Migration from Datadog to Grafana (#42)
* migration from Datadog to grafana * migration from Datadog to grafana * you dont need to build your own agent anymore * how to configure the logs with fluentbit * how to configure the logs with fluentbit * how to configure the logs with fluentbit
1 parent f5509b8 commit 0842bd4

File tree

7 files changed

+160
-586
lines changed

7 files changed

+160
-586
lines changed

README.md

Lines changed: 101 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -242,27 +242,111 @@ Relayer configuration is done with `--config-url` flag on Relayer start and can
242242
This flag sets up shared configuration IPNS URL that is used by all Relayers in the MPC network and provided by Sygma.
243243
More on [shared configuration](https://github.com/sygmaprotocol/sygma-shared-configuration)
244244

245+
## Logs and Metrics
245246

246-
### OTLP AGENT
247-
We use OpenTelemetry Agent as a sidecar container for aggregating relayers metrics, for now. Read the followings to build the OpenTelemetry Agent
247+
### Logs
248+
Configure Fluent Bit as follows
249+
- Log Router
250+
- Log Configuration
248251

249-
**Two stages are required for the configuration**
250-
- Building OpenTelemetry Agent
251-
- Configuring Task Definition for ecs users
252-
253-
#### Building OpenTelemetry Agent
254-
See the otlp-agent directory [here](https://github.com/sygmaprotocol/sygma-relayer-deployment/tree/main/otlp-agent) br
255-
The agent require three major files
256-
- Builder: `otlp-builder.yml`
257-
- Config File: `otlp-config.yml`
258-
- Dockerfile
252+
1. Log Router
253+
```
254+
{
255+
"name": "log_router",
256+
"image": "grafana/fluent-bit-plugin-loki:2.9.3-amd64",
257+
"cpu": 0,
258+
"memoryReservation": 50,
259+
"portMappings": [],
260+
"essential": true,
261+
"environment": [],
262+
"mountPoints": [],
263+
"volumesFrom": [],
264+
"user": "0",
265+
"logConfiguration": {
266+
"logDriver": "awslogs",
267+
"options": {
268+
"awslogs-group": "/ecs/relayer-{{ relayerId }}-TESTNET",
269+
"awslogs-create-group": "true",
270+
"awslogs-region": "{{ awsRegion }}",
271+
"awslogs-stream-prefix": "ecs"
272+
}
273+
},
274+
"systemControls": [],
275+
"firelensConfiguration": {
276+
"type": "fluentbit",
277+
"options": {
278+
"enable-ecs-log-metadata": "true"
279+
}
280+
}
281+
},
282+
```
283+
2. Log Configuration - configure the Relayer container with these lines of code
284+
see here for example
285+
```
286+
"logConfiguration": {
287+
"logDriver": "awsfirelens",
288+
"options": {
289+
"tls.verify": "on",
290+
"remove_keys": "container_id,ecs_task_arn",
291+
"label_keys": "$source,$container_name,$ecs_task_definition,$ecs_cluster",
292+
"Port": "443",
293+
"host": " { request for the endpoint } ",
294+
"http_user": " { request for the userID } ",
295+
"tls": "on",
296+
"line_format": "json",
297+
"Name": "loki",
298+
"labels": "job=fluent-bit,env=testnet,project=sygma,service_name=relayer-{{ relayerId }}-container-TESTNET,image={{ imageTag }}"
299+
},
300+
"secretOptions": [
301+
{
302+
"name": "http_passwd",
303+
"valueFrom": "arn:aws:ssm:{{ awsRegion }}:{{ awsAccountId }}:parameter/sygma/logs/grafana"
304+
}
305+
]
306+
},
307+
```
308+
### OTLP AGENT for Metrics
309+
We use OpenTelemetry Agent as a sidecar container for aggregating relayers metrics, for now.
259310
311+
#### The OTLP Agent
312+
Configure the OTLP Agent as a sidecar container in the ECS task definition file
313+
```
314+
{
315+
"name": "otel-collector",
316+
"image": "ghcr.io/sygmaprotocol/sygma-opentelemetry-collector:v1.0.3",
317+
"essential": true,
318+
"secrets": [
319+
{
320+
"name": "GRAFANA_CLOUD",
321+
"valueFrom": "arn:aws:ssm:{{ awsRegion }}:{{ awsAccountId }}:parameter/sygma/auth/secrets"
322+
},
323+
{
324+
"name": "USER_ID",
325+
"valueFrom": "arn:aws:ssm:{{ awsRegion }}:{{ awsAccountId }}:parameter/sygma/auth/userid"
326+
},
327+
{
328+
"name": "ENDPOINT",
329+
"valueFrom": "arn:aws:ssm:{{ awsRegion }}:{{ awsAccountId }}:parameter/sygma/logs/grafana/endpoint"
330+
}
331+
],
332+
"logConfiguration": {
333+
"logDriver": "awslogs",
334+
"options": {
335+
"awslogs-group": "/ecs/{{ relayerName }}-{{ relayerId }}-{{ TESTNET }}",
336+
"awslogs-create-group": "True",
337+
"awslogs-region": "{{ awsRegion }}",
338+
"awslogs-stream-prefix": "ecs"
339+
}
340+
}
341+
}
260342

261-
#### Build The OTLP Agent
262-
The otlp-agent directory contains a CI workflow in .github directory to automate the build process. [Here](https://github.com/sygmaprotocol/sygma-relayer-deployment/blob/main/otlp-agent/.github/workflows/opentelemetry.yaml) is GitHub CI that build the image.
263-
You can use it as an example or use our build system of choice.
343+
```
344+
For K8s or other environments
345+
Here is the image ghcr.io/sygmaprotocol/sygma-opentelemetry-collector:v1.0.3
346+
- Run the Image as a sidecar container
347+
- Set these variables: `GRAFANA_CLOUD`, `USER_ID`, `ENDPOINT`
348+
- Sygma will share the values of these variables through secure channel(s)
264349
265-
After you have built your image, you should change [here](https://github.com/sygmaprotocol/sygma-relayer-deployment/blob/main/ecs/task_definition_PARTNERS.j2#L200) for your image path
266350
267351
#### The Integration of the OpenTelemetry Agent
268352
See the task Definition section for the integration [here](https://github.com/sygmaprotocol/sygma-relayer-deployment/blob/main/ecs/task_definition_PARTNERS.j2#L199)
@@ -282,4 +366,4 @@ Configure [this](https://github.com/sygmaprotocol/sygma-relayer-deployment/blob/
282366
You may choose to remove [this](https://github.com/sygmaprotocol/sygma-relayer-deployment/blob/main/ecs/task_definition_PARTNERS.j2#L201) for accessing a private repository.
283367
284368
285-
The Sygma Team Highly Recommend to use private repository for the otlp agent
369+
The Sygma Team highly recommends being security conscious when storing the shared credentials - store the credentials in a private and secure environment with least privilege. Use Vault or AWS Secrets Manager for storing credentials.

ecs/task_definition_PARTNERS.j2

Lines changed: 59 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
{
2-
"family": "{{ relayerName }}-{{ relayerId }}-container-{{ appTag }}",
2+
"family": "{{ relayerName }}-{{ relayerId }}-container-{{ TESTNET }}",
33
"containerDefinitions": [
44
{
5-
"name": "{{ relayerName }}-{{ relayerId }}-container-{{ appTag }}",
5+
"name": "{{ relayerName }}-{{ relayerId }}-container-{{ TESTNET }}",
66
"image": "ghcr.io/sygmaprotocol/sygma-relayer:{{ set Sygma release version }}",
77
"portMappings": [
88
{
@@ -40,7 +40,7 @@
4040
},
4141
{
4242
"name": "SYG_RELAYER_ID",
43-
"value": "5"
43+
"value": "{{ relayerId }}"
4444
},
4545
{
4646
"name": "SYG_RELAYER_ENV",
@@ -87,138 +87,85 @@
8787
"logConfiguration": {
8888
"logDriver": "awsfirelens",
8989
"options": {
90-
"provider": "ecs",
91-
"dd_service": "{{ env }}-relayers-{{ relayerId }}",
92-
"dd_tags": "env:{{ env }},project:chainbridge,relayerid:{{ relayerId }},image:{{ set Sygma release version }}",
93-
"dd_message_key": "log",
94-
"Host": "http-intake.logs.datadoghq.com",
95-
"TLS": "on",
96-
"dd_source": "{{ relayerName }}-{{ relayerId }}-container-{{ appTag }}",
97-
"Name": "datadog"
98-
},
99-
"secretOptions": [
100-
{
101-
"name": "apikey",
102-
"valueFrom": "arn:aws:ssm:{{ awsRegion }}:{{ awsAccountId }}:parameter/common/datadog/key"
103-
}
90+
"tls.verify": "on",
91+
"remove_keys": "container_id,ecs_task_arn",
92+
"label_keys": "$source,$container_name,$ecs_task_definition,$ecs_cluster",
93+
"Port": "443",
94+
"host": " { request for the Loging ENDPOINT } ",
95+
"http_user": " { request for the USER_ID } ",
96+
"tls": "on",
97+
"line_format": "json",
98+
"Name": "loki",
99+
"labels": "job=fluent-bit,env=testnet,project=sygma,service_name=relayer-{{ relayerId }}-container-TESTNET,image={{ imageTag }}"
100+
},
101+
"secretOptions": [
102+
{
103+
"name": "http_passwd",
104+
"valueFrom": "arn:aws:ssm:{{ awsRegion }}:{{ awsAccountId }}:parameter/sygma/logs/grafana"
105+
}
104106
]
105107
},
106-
"dependsOn": [
107-
{
108-
"containerName": "log_router",
109-
"condition": "START"
110-
}
111-
]
112108
},
113109
{
114-
"name": "datadog-agent",
115-
"image": "gcr.io/datadoghq/agent:latest",
116-
"essential": true,
117-
"logConfiguration": {
118-
"logDriver": "awslogs",
119-
"options": {
120-
"awslogs-group": "/ecs/{{ relayerName }}-{{ relayerId }}-{{ appTag }}",
121-
"awslogs-region": "{{ awsRegion }}",
122-
"awslogs-stream-prefix": "ecs"
123-
}
124-
},
125-
"healthCheck": {
126-
"retries": 3,
127-
"command": ["CMD-SHELL","agent health"],
128-
"timeout": 5,
129-
"interval": 30,
130-
"startPeriod": 15
131-
},
132-
"portMappings": [
133-
{
134-
"hostPort": 8126,
135-
"protocol": "tcp",
136-
"containerPort": 8126
137-
}
138-
],
139-
"command": [],
140-
"cpu": 0,
141-
"environment": [
142-
{
143-
"name": "DD_APM_ENABLED",
144-
"value": "true"
145-
},
146-
{
147-
"name": "DD_APM_NON_LOCAL_TRAFFIC",
148-
"value": "true"
149-
},
150-
{
151-
"name": "DD_TAGS",
152-
"value": "env:{{ env }},project:relayer-{{ relayerId }}"
153-
},
154-
{
155-
"name": "DD_LOG_LEVEL",
156-
"value": "INFO"
157-
},
158-
{
159-
"name": "ECS_FARGATE",
160-
"value": "true"
161-
},
162-
{
163-
"name": "ENV",
164-
"value": "{{ env }}"
165-
}
166-
],
167-
"secrets": [
168-
{
169-
"name": "DD_API_KEY",
170-
"valueFrom": "arn:aws:ssm:{{ awsRegion }}:{{ awsAccountId }}:parameter/common/datadog/key"
171-
}
172-
],
173-
"mountPoints": [],
174-
"volumesFrom": []
175-
},
176-
{
177-
"name": "log_router",
178-
"image": "amazon/aws-for-fluent-bit:latest",
179-
"essential": true,
180-
"firelensConfiguration": {
181-
"type": "fluentbit",
182-
"options": {
183-
"enable-ecs-log-metadata": "true"
184-
}
185-
},
186-
"logConfiguration": {
187-
"logDriver": "awslogs",
188-
"options": {
189-
"awslogs-group": "/ecs/{{ relayerName }}-{{ relayerId }}-{{ appTag }}",
190-
"awslogs-region": "{{ awsRegion }}",
191-
"awslogs-stream-prefix": "ecs"
192-
}
193-
},
194-
"portMappings": [],
195-
"command": [],
110+
"name": "log_router",
111+
"image": "grafana/fluent-bit-plugin-loki:2.9.3-amd64",
196112
"cpu": 0,
113+
"memoryReservation": 50,
114+
"portMappings": [],
115+
"essential": true,
197116
"environment": [],
198117
"mountPoints": [],
118+
"volumesFrom": [],
199119
"user": "0",
200-
"volumesFrom": []
120+
"logConfiguration": {
121+
"logDriver": "awslogs",
122+
"options": {
123+
"awslogs-group": "/ecs/relayer-{{ relayerId }}-TESTNET",
124+
"awslogs-create-group": "true",
125+
"awslogs-region": "{{ awsRegion }}",
126+
"awslogs-stream-prefix": "ecs"
127+
}
128+
},
129+
"systemControls": [],
130+
"firelensConfiguration": {
131+
"type": "fluentbit",
132+
"options": {
133+
"enable-ecs-log-metadata": "true"
134+
}
135+
}
201136
},
202137
{
203138
"name": "otel-collector",
204-
"image": "ghcr.io/sygmaprotocol/sygma-opentelemetry-collector:latest",
205-
"repositoryCredentials": {
206-
"credentialsParameter": "arn:aws:secretsmanager:{{ awsRegion }}:{{ awsAccountId }}:secret:sygma/opentelemetry-Z1wcYA"
207-
},
139+
"image": "ghcr.io/sygmaprotocol/sygma-opentelemetry-collector:v1.0.3",
208140
"cpu": 0,
209141
"portMappings": [],
210142
"essential": true,
211143
"environment": [],
212144
"mountPoints": [],
213145
"volumesFrom": [],
146+
"secrets": [
147+
{
148+
"name": "GRAFANA_CLOUD",
149+
"valueFrom": "arn:aws:ssm:{{ awsRegion }}:{{ awsAccountId }}:parameter/sygma/basicauth/secrets"
150+
},
151+
{
152+
"name": "USER_ID",
153+
"valueFrom": "arn:aws:ssm:{{ awsRegion }}:{{ awsAccountId }}:parameter/sygma/basicauth/userid"
154+
},
155+
{
156+
"name": "ENDPOINT",
157+
"valueFrom": "arn:aws:ssm:{{ awsRegion }}:{{ awsAccountId }}:parameter/sygma/logs/grafana/endpoint"
158+
}
159+
],
214160
"dockerLabels": {},
215161
"logConfiguration": {
216162
"logDriver": "awslogs",
217163
"options": {
218-
"awslogs-group": "/ecs/{{ relayerName }}-{{ relayerId }}-{{ appTag }}",
164+
"awslogs-group": "/ecs/{{ relayerName }}-{{ relayerId }}-{{ TESTNET }}",
165+
"awslogs-create-group": "True",
219166
"awslogs-region": "{{ awsRegion }}",
220167
"awslogs-stream-prefix": "ecs"
221-
}
168+
}
222169
}
223170
}
224171
],

0 commit comments

Comments
 (0)