# docker-compose.yml
services:
  # --- Data stores -----------------------------------------------------------

  postgres:
    image: postgres:16
    restart: always
    environment:
      POSTGRES_USER: conduit
      POSTGRES_PASSWORD: conduitpass
      POSTGRES_DB: conduitdb
    ports:
      - "5432:5432"
    volumes:
      - pgdata:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U conduit -d conduitdb"]
      interval: 5s
      timeout: 5s
      retries: 5

  redis:
    image: redis:alpine
    restart: always
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 5s
      retries: 5

  rabbitmq:
    build:
      context: ./docker/rabbitmq
      dockerfile: Dockerfile
    restart: always
    environment:
      RABBITMQ_DEFAULT_USER: conduit
      RABBITMQ_DEFAULT_PASS: conduitpass
      RABBITMQ_DEFAULT_VHOST: "/"
    ports:
      - "5672:5672"   # AMQP port
      - "15672:15672" # Management UI
    volumes:
      - rabbitmq-data:/var/lib/rabbitmq
    healthcheck:
      test: ["CMD", "rabbitmq-diagnostics", "-q", "ping"]
      interval: 5s
      timeout: 5s
      retries: 5

  # --- Application services --------------------------------------------------

  api:
    build:
      context: .
      dockerfile: Services/ConduitLLM.Gateway/Dockerfile
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      rabbitmq:
        condition: service_healthy
    environment:
      DATABASE_URL: postgresql://conduit:conduitpass@postgres:5432/conduitdb
      ASPNETCORE_ENVIRONMENT: Production
      # Using new REDIS_URL format (old variables still work for backward compatibility)
      REDIS_URL: "redis://redis:6379"
      CONDUIT_REDIS_INSTANCE_NAME: "conduit:"
      Conduit__PerformanceTracking__Enabled: "true"
      Conduit__PerformanceTracking__IncludeInResponse: "true"
      Conduit__PerformanceTracking__TrackStreamingMetrics: "true"
      # Media Lifecycle Configuration
      MediaLifecycle__Enabled: "false"                 # Cleanup runs on Admin API, not Gateway API
      MediaLifecycle__DryRunMode: "true"               # Set to false to enable actual deletions
      MediaLifecycle__ScheduleIntervalMinutes: "5"     # How often to run cleanup
      MediaLifecycle__EnableSoftDelete: "true"
      MediaLifecycle__SoftDeleteGracePeriodDays: "7"
      MediaLifecycle__TestVirtualKeyGroups__0: "1"     # Test with group ID 1
      MediaLifecycle__RequireManualApprovalForLargeBatches: "false"
      MediaLifecycle__LargeBatchThreshold: "100"
      MediaLifecycle__MaxBatchSize: "50"
      MediaLifecycle__DelayBetweenBatchesMs: "500"
      MediaLifecycle__MaxConcurrentBatches: "2"
      MediaLifecycle__MonthlyDeleteBudget: "500000"
      MediaLifecycle__EnableAuditLogging: "true"
      MediaLifecycle__EnableMetrics: "true"
      MediaLifecycle__R2OperationTimeoutSeconds: "30"
      Logging__LogLevel__Default: "Information"
      Logging__LogLevel__ConduitLLM: "Information"
      Logging__LogLevel__ConduitLLM__Providers: "Debug"
      Logging__LogLevel__ConduitLLM__Providers__MiniMaxClient: "Debug"
      # Configure the public base URL for media storage
      CONDUITLLM__MEDIA_BASE_URL: http://localhost:5000
      # S3/R2 configuration (required for production)
      CONDUIT_MEDIA_STORAGE_TYPE: "${CONDUIT_MEDIA_STORAGE_TYPE:-S3}"
      CONDUIT_S3_ENDPOINT: "${CONDUIT_S3_ENDPOINT}"
      CONDUIT_S3_ACCESS_KEY: "${CONDUIT_S3_ACCESS_KEY}"
      CONDUIT_S3_SECRET_KEY: "${CONDUIT_S3_SECRET_KEY}"
      CONDUIT_S3_BUCKET_NAME: "${CONDUIT_S3_BUCKET_NAME:-conduit-media}"
      CONDUIT_S3_REGION: "${CONDUIT_S3_REGION:-us-east-1}"
      # RabbitMQ configuration
      CONDUITLLM__RABBITMQ__HOST: rabbitmq
      CONDUITLLM__RABBITMQ__PORT: "5672"
      CONDUITLLM__RABBITMQ__USERNAME: conduit
      CONDUITLLM__RABBITMQ__PASSWORD: conduitpass
      CONDUITLLM__RABBITMQ__VHOST: "/"
      CONDUITLLM__RABBITMQ__PREFETCHCOUNT: "10"
      CONDUITLLM__RABBITMQ__PARTITIONCOUNT: "10"
      # Batch Cache Invalidation configuration
      CacheInvalidation__BatchingEnabled: "true"
      CacheInvalidation__BatchWindow: "00:00:00.100"   # 100ms
      CacheInvalidation__MaxBatchSize: "100"
      CacheInvalidation__EnableCoalescing: "true"
      # Discovery Cache configuration
      Discovery__CacheDurationMinutes: "360"           # 6 hours
      Discovery__EnableCaching: "true"
      Discovery__WarmCacheOnStartup: "true"
      Discovery__WarmupStartupDelaySeconds: "5"        # Configurable startup delay
      Discovery__UseDistributedLockForWarming: "true"  # Use Postgres advisory locks
      Discovery__DistributedLockTimeoutSeconds: "30"   # Lock acquisition timeout
      Discovery__PriorityModels__0: "gpt-4"
      Discovery__PriorityModels__1: "claude-3"
      Discovery__PriorityModels__2: "gemini-pro"
      # Batch Spending configuration
      BatchSpending__FlushIntervalSeconds: "30"
      BatchSpending__MinimumIntervalSeconds: "1"
      BatchSpending__MaximumIntervalSeconds: "21600"   # 6 hours
      BatchSpending__RedisTtlHours: "24"
      # Health monitoring configuration - API endpoint checks
      HealthMonitoring__ApiEndpoints__0__Name: AdminAPI
      HealthMonitoring__ApiEndpoints__0__Url: http://admin:8080/health
      HealthMonitoring__ApiEndpoints__0__TimeoutMs: "5000"
      HealthMonitoring__ApiEndpoints__0__WarningThresholdMs: "1000"
    ports:
      - "5000:8080"
    restart: on-failure
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health/ready"]
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 60s

  admin:
    build:
      context: .
      dockerfile: Services/ConduitLLM.Admin/Dockerfile
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      rabbitmq:
        condition: service_healthy
    environment:
      DATABASE_URL: postgresql://conduit:conduitpass@postgres:5432/conduitdb
      ASPNETCORE_ENVIRONMENT: Production
      # API-to-API authentication key for backend services
      CONDUIT_API_TO_API_BACKEND_AUTH_KEY: alpha
      # Redis cache configuration - using new REDIS_URL format
      REDIS_URL: "redis://redis:6379"
      CONDUIT_REDIS_INSTANCE_NAME: "conduit:"
      AdminApi__AllowedOrigins__0: http://localhost:3000
      # RabbitMQ configuration
      CONDUITLLM__RABBITMQ__HOST: rabbitmq
      CONDUITLLM__RABBITMQ__PORT: "5672"
      CONDUITLLM__RABBITMQ__USERNAME: conduit
      CONDUITLLM__RABBITMQ__PASSWORD: conduitpass
      CONDUITLLM__RABBITMQ__VHOST: "/"
      # Batch Cache Invalidation configuration
      CacheInvalidation__BatchingEnabled: "true"
      CacheInvalidation__BatchWindow: "00:00:00.100"   # 100ms
      CacheInvalidation__MaxBatchSize: "100"
      CacheInvalidation__EnableCoalescing: "true"
      # Media Lifecycle Configuration (Admin API runs the cleanup service)
      MediaLifecycle__Enabled: "true"                  # Enable cleanup on Admin API
      MediaLifecycle__DryRunMode: "true"               # Set to false to enable actual deletions
      MediaLifecycle__ScheduleIntervalMinutes: "5"     # How often to run cleanup
      MediaLifecycle__EnableSoftDelete: "true"
      MediaLifecycle__SoftDeleteGracePeriodDays: "7"
      MediaLifecycle__TestVirtualKeyGroups__0: "1"     # Test with group ID 1
      MediaLifecycle__MaxBatchSize: "50"
      MediaLifecycle__MonthlyDeleteBudget: "500000"
      # Discovery Cache configuration
      Discovery__CacheDurationMinutes: "360"           # 6 hours
      Discovery__EnableCaching: "true"
      Discovery__WarmCacheOnStartup: "true"
      Discovery__WarmupStartupDelaySeconds: "5"        # Configurable startup delay
      Discovery__UseDistributedLockForWarming: "true"  # Use Postgres advisory locks
      Discovery__DistributedLockTimeoutSeconds: "30"   # Lock acquisition timeout
      Discovery__PriorityModels__0: "gpt-4"
      Discovery__PriorityModels__1: "claude-3"
      Discovery__PriorityModels__2: "gemini-pro"
      # Batch Spending configuration
      BatchSpending__FlushIntervalSeconds: "30"
      BatchSpending__MinimumIntervalSeconds: "1"
      BatchSpending__MaximumIntervalSeconds: "21600"   # 6 hours
      BatchSpending__RedisTtlHours: "24"
      # S3/R2 configuration (required for production)
      CONDUIT_MEDIA_STORAGE_TYPE: "${CONDUIT_MEDIA_STORAGE_TYPE:-S3}"
      CONDUIT_S3_ENDPOINT: "${CONDUIT_S3_ENDPOINT}"
      CONDUIT_S3_ACCESS_KEY: "${CONDUIT_S3_ACCESS_KEY}"
      CONDUIT_S3_SECRET_KEY: "${CONDUIT_S3_SECRET_KEY}"
      CONDUIT_S3_BUCKET_NAME: "${CONDUIT_S3_BUCKET_NAME:-conduit-media}"
      CONDUIT_S3_REGION: "${CONDUIT_S3_REGION:-us-east-1}"
      # Disable IP banning for development
      CONDUIT_ADMIN_IP_BANNING_ENABLED: "false"
    ports:
      - "5002:8080"
    restart: on-failure
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health/ready"]
      interval: 60s
      timeout: 10s
      retries: 3
      start_period: 60s

  webadmin:
    build:
      context: .
      dockerfile: WebAdmin/Dockerfile
    depends_on:
      admin:
        condition: service_healthy
      api:
        condition: service_healthy
    environment:
      # Next.js specific environment variables
      NODE_ENV: development
      PORT: "3000"
      # Server-side environment variables
      CONDUIT_API_TO_API_BACKEND_AUTH_KEY: alpha
      CONDUIT_API_BASE_URL: http://api:8080
      CONDUIT_ADMIN_API_BASE_URL: http://admin:8080
      # External URLs for SignalR connections from browser
      CONDUIT_API_EXTERNAL_URL: http://localhost:5000
      CONDUIT_ADMIN_API_EXTERNAL_URL: http://localhost:5002
      # Clerk authentication for WebAdmin access (Required)
      # Get these values from your Clerk dashboard at https://dashboard.clerk.com
      # Users must have siteadmin: true in their public metadata to access the WebAdmin
      # Optional: Configure a redirect URL for users without admin access
      # If set, users without proper permissions will be redirected to this URL
      # instead of seeing the access denied page
      # Example: ACCESS_DENIED_REDIRECT: "https://your-main-site.com"
      # SECURITY NOTE(review): the defaults below are committed test keys; override
      # them via the host environment (or a .env file kept out of VCS) for any
      # non-development deployment, and rotate the committed test secret.
      NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY: "${NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY:-pk_test_cHJlY2lzZS1yaW5ndGFpbC0zMy5jbGVyay5hY2NvdW50cy5kZXYk}"
      CLERK_SECRET_KEY: "${CLERK_SECRET_KEY:-sk_test_oOI6FRt1EhRGaCVUc1zJVyWCNyqf3w1DSJnLZlxEQK}"
      # Disable Clerk authentication for development
      CLERK_AUTH_ENABLED: "false"
    ports:
      - "3000:3000"
    restart: on-failure
    healthcheck:
      test: ["CMD", "node", "-e", "require('http').get('http://localhost:3000/api/health', (res) => process.exit(res.statusCode === 200 ? 0 : 1))"]
      interval: 30s
      timeout: 3s
      retries: 3
      start_period: 40s

  # --- Monitoring ------------------------------------------------------------

  # Redis Exporter for Prometheus metrics
  # Note: No healthcheck - these are minimal scratch images without shell/wget
  # Prometheus scrape failures will indicate if the exporter is down
  redis-exporter:
    image: oliver006/redis_exporter:latest
    restart: always
    environment:
      REDIS_ADDR: "redis://redis:6379"
    ports:
      - "9121:9121"
    depends_on:
      redis:
        condition: service_healthy

  # RabbitMQ Exporter for Prometheus metrics
  rabbitmq-exporter:
    image: kbudde/rabbitmq-exporter:latest
    restart: always
    environment:
      RABBIT_URL: "http://rabbitmq:15672"
      RABBIT_USER: conduit
      RABBIT_PASSWORD: conduitpass
      PUBLISH_PORT: "9419"
      OUTPUT_FORMAT: "prometheus"
    ports:
      - "9419:9419"
    depends_on:
      rabbitmq:
        condition: service_healthy

  # PostgreSQL Exporter for Prometheus metrics
  postgres-exporter:
    image: prometheuscommunity/postgres-exporter:latest
    restart: always
    environment:
      # FIX(review): database name corrected from "conduit" to "conduitdb" to
      # match POSTGRES_DB on the postgres service; the old value pointed at a
      # non-existent database, so the exporter could never connect.
      DATA_SOURCE_NAME: "postgresql://conduit:conduitpass@postgres:5432/conduitdb?sslmode=disable"
    ports:
      - "9187:9187"
    depends_on:
      postgres:
        condition: service_healthy

  # Prometheus for metrics collection
  prometheus:
    image: prom/prometheus:latest
    restart: always
    ports:
      - "9090:9090"
    volumes:
      - ./grafana/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./grafana/rules:/etc/prometheus/rules:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Grafana for visualization
  grafana:
    image: grafana/grafana:latest
    restart: always
    ports:
      - "3001:3000"
    environment:
      # Mapping style used here for consistency with the other services.
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: conduitadmin
      GF_INSTALL_PLUGINS: ""
      GF_USERS_ALLOW_SIGN_UP: "false"
      # Enable unified alerting
      GF_UNIFIED_ALERTING_ENABLED: "true"
      GF_ALERTING_ENABLED: "false"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/provisioning/datasources/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro
      - ./grafana/provisioning/dashboards/grafana-dashboard-provider.yml:/etc/grafana/provisioning/dashboards/provider.yml:ro
      - ./grafana/provisioning/dashboards/grafana-conduit-overview-dashboard.json:/var/lib/grafana/dashboards/conduit-overview.json:ro
      - ./grafana/provisioning/dashboards/grafana-provider-performance-dashboard.json:/var/lib/grafana/dashboards/provider-performance.json:ro
      - ./grafana/provisioning/dashboards/grafana-virtual-key-analytics-dashboard.json:/var/lib/grafana/dashboards/virtual-key-analytics.json:ro
      - ./grafana/provisioning/dashboards/grafana-request-pipeline-dashboard.json:/var/lib/grafana/dashboards/request-pipeline.json:ro
      - ./grafana/provisioning/dashboards/grafana-infrastructure-dashboard.json:/var/lib/grafana/dashboards/infrastructure.json:ro
      - ./grafana/provisioning/dashboards/grafana-postgresql-dashboard.json:/var/lib/grafana/dashboards/postgresql.json:ro
      # Alerting rules - can be enabled once basic setup is verified
      # - ./grafana/provisioning/alerting/alerting-rules.yml:/etc/grafana/provisioning/alerting/alerting-rules.yml:ro
    depends_on:
      prometheus:
        condition: service_healthy
      postgres:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3000/api/health"]
      interval: 10s
      timeout: 5s
      retries: 5

volumes:
  pgdata:
  redis-data:
  rabbitmq-data:
  prometheus-data:
  grafana-data: