Skip to content

Commit 5ad419f

Browse files
committed
Add migration to include Provider fields in RequestLogs and create PostgreSQL dashboard for Grafana
- Added ProviderId and ProviderType columns to the RequestLogs table in the migration file 20251201072144_AddProviderFieldsToRequestLogs.cs.
- Created a comprehensive Grafana dashboard configuration for monitoring PostgreSQL metrics, including status, connections, transactions, performance, and block I/O.
1 parent 60e6279 commit 5ad419f

12 files changed

Lines changed: 2942 additions & 59 deletions

File tree

Services/ConduitLLM.Gateway/Middleware/UsageTrackingMiddleware.cs

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,16 @@ await ProcessVideoResponseAsync(context, responseBody, costCalculationService, b
280280

281281
UsageMetrics.UsageTrackingCosts.WithLabels(model, providerType, endpointType).Inc(Convert.ToDouble(totalCost));
282282

283+
// Record business metrics for Grafana dashboards (real-time counters)
284+
var requestStatus = context.Response.StatusCode >= 200 && context.Response.StatusCode < 300 ? "success" : "error";
285+
BusinessMetricsService.RecordModelRequest(model, providerType, requestStatus);
286+
BusinessMetricsService.RecordTokens(model, providerType, usage.PromptTokens ?? 0, usage.CompletionTokens ?? 0);
287+
BusinessMetricsService.RecordResponseTime(model, providerType, UsageExtractor.GetResponseTime(context) / 1000.0);
288+
if (totalCost > 0)
289+
{
290+
BusinessMetricsService.RecordCost(providerType, model, endpointType, Convert.ToDouble(totalCost));
291+
}
292+
283293
// Update spend using batch service only if there's a cost
284294
if (totalCost > 0)
285295
{
@@ -441,6 +451,16 @@ private async Task TrackStreamingUsageAsync(
441451

442452
UsageMetrics.UsageTrackingCosts.WithLabels(model, providerType, endpointType + "_stream").Inc(Convert.ToDouble(cost));
443453

454+
// Record business metrics for Grafana dashboards (real-time counters)
455+
var requestStatus = context.Response.StatusCode >= 200 && context.Response.StatusCode < 300 ? "success" : "error";
456+
BusinessMetricsService.RecordModelRequest(model, providerType, requestStatus);
457+
BusinessMetricsService.RecordTokens(model, providerType, usage.PromptTokens ?? 0, usage.CompletionTokens ?? 0);
458+
BusinessMetricsService.RecordResponseTime(model, providerType, UsageExtractor.GetResponseTime(context) / 1000.0);
459+
if (cost > 0)
460+
{
461+
BusinessMetricsService.RecordCost(providerType, model, endpointType, Convert.ToDouble(cost));
462+
}
463+
444464
// Update spend only if there's a cost
445465
if (cost > 0)
446466
{
@@ -473,10 +493,20 @@ private async Task LogRequestAsync(
473493
{
474494
var requestType = UsageExtractor.DetermineRequestType(context.Request.Path);
475495

496+
// Extract provider info from HttpContext.Items (set by controllers)
497+
int? providerId = context.Items.TryGetValue("ProviderId", out var providerIdObj) && providerIdObj is int pid
498+
? pid
499+
: null;
500+
var providerType = context.Items.TryGetValue("ProviderType", out var providerTypeObj)
501+
? providerTypeObj?.ToString()
502+
: null;
503+
476504
var logRequest = new LogRequestDto
477505
{
478506
VirtualKeyId = virtualKeyId,
479507
ModelName = model,
508+
ProviderId = providerId,
509+
ProviderType = providerType,
480510
RequestType = requestType,
481511
InputTokens = usage.PromptTokens ?? 0,
482512
OutputTokens = usage.CompletionTokens ?? 0,
@@ -580,6 +610,15 @@ private async Task ProcessFunctionResponseAsync(
580610
UsageMetrics.UsageTrackingRequests.WithLabels("function", "success").Inc();
581611
UsageMetrics.UsageTrackingCosts.WithLabels(functionName, providerType, "function").Inc(Convert.ToDouble(cost));
582612

613+
// Record business metrics for Grafana dashboards (real-time counters)
614+
var requestStatus = context.Response.StatusCode >= 200 && context.Response.StatusCode < 300 ? "success" : "error";
615+
BusinessMetricsService.RecordModelRequest(functionName, providerType, requestStatus);
616+
BusinessMetricsService.RecordResponseTime(functionName, providerType, UsageExtractor.GetResponseTime(context) / 1000.0);
617+
if (cost > 0)
618+
{
619+
BusinessMetricsService.RecordCost(providerType, functionName, "function", Convert.ToDouble(cost));
620+
}
621+
583622
// Update spend if there's a cost
584623
if (cost > 0)
585624
{
@@ -726,6 +765,15 @@ private async Task ProcessImageResponseAsync(
726765
UsageMetrics.UsageTrackingRequests.WithLabels("image", "success").Inc();
727766
UsageMetrics.UsageTrackingCosts.WithLabels(model, providerType, "image").Inc(Convert.ToDouble(cost));
728767

768+
// Record business metrics for Grafana dashboards (real-time counters)
769+
var requestStatus = context.Response.StatusCode >= 200 && context.Response.StatusCode < 300 ? "success" : "error";
770+
BusinessMetricsService.RecordModelRequest(model, providerType, requestStatus);
771+
BusinessMetricsService.RecordResponseTime(model, providerType, UsageExtractor.GetResponseTime(context) / 1000.0);
772+
if (cost > 0)
773+
{
774+
BusinessMetricsService.RecordCost(providerType, model, "image", Convert.ToDouble(cost));
775+
}
776+
729777
// Update spend if there's a cost
730778
if (cost > 0)
731779
{
@@ -912,6 +960,15 @@ private async Task ProcessVideoResponseAsync(
912960
UsageMetrics.UsageTrackingRequests.WithLabels("video", "success").Inc();
913961
UsageMetrics.UsageTrackingCosts.WithLabels(model, providerType, "video").Inc(Convert.ToDouble(cost));
914962

963+
// Record business metrics for Grafana dashboards (real-time counters)
964+
var requestStatus = context.Response.StatusCode >= 200 && context.Response.StatusCode < 300 ? "success" : "error";
965+
BusinessMetricsService.RecordModelRequest(model, providerType, requestStatus);
966+
BusinessMetricsService.RecordResponseTime(model, providerType, UsageExtractor.GetResponseTime(context) / 1000.0);
967+
if (cost > 0)
968+
{
969+
BusinessMetricsService.RecordCost(providerType, model, "video", Convert.ToDouble(cost));
970+
}
971+
915972
// Update spend if there's a cost
916973
if (cost > 0)
917974
{

Services/ConduitLLM.Gateway/Program.Monitoring.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,5 +103,16 @@ public static void ConfigureMonitoringServices(WebApplicationBuilder builder)
103103

104104
// Add cache statistics registration service
105105
builder.Services.AddHostedService<ConduitLLM.Gateway.Services.CacheStatisticsRegistrationService>();
106+
107+
// Add business metrics service for Prometheus/Grafana dashboards
108+
// Uses leader election to avoid duplicate metrics collection in scaled-out deployments
109+
builder.Services.AddLeaderElectedHostedService<ConduitLLM.Gateway.Services.BusinessMetricsService>(
110+
serviceProvider =>
111+
{
112+
var scopeFactory = serviceProvider.GetRequiredService<IServiceScopeFactory>();
113+
var logger = serviceProvider.GetRequiredService<ILogger<ConduitLLM.Gateway.Services.BusinessMetricsService>>();
114+
return new ConduitLLM.Gateway.Services.BusinessMetricsService(scopeFactory, logger);
115+
},
116+
"BusinessMetricsService");
106117
}
107118
}

Services/ConduitLLM.Gateway/Services/BusinessMetricsService.cs

Lines changed: 49 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,6 @@ public class BusinessMetricsService : BackgroundService
132132
LabelNames = new[] { "sla_type", "model" } // sla_type: latency, availability, error_rate
133133
});
134134

135-
private readonly Dictionary<string, DateTime> _lastCostUpdate = new();
136-
private readonly Dictionary<string, decimal> _lastCostValue = new();
137-
138135
public BusinessMetricsService(
139136
IServiceScopeFactory serviceScopeFactory,
140137
ILogger<BusinessMetricsService> logger)
@@ -201,21 +198,28 @@ private async Task CollectVirtualKeyMetrics(IServiceScope scope)
201198

202199
private async Task CollectModelUsageMetrics(IServiceScope scope)
203200
{
201+
// NOTE: Model/provider counters (conduit_model_requests_total, conduit_model_tokens_total)
202+
// are updated in REAL-TIME via static methods called from UsageTrackingMiddleware.
203+
// This background method only collects supplementary gauge metrics.
204+
//
205+
// DO NOT increment counters here - it would cause double-counting since the middleware
206+
// already records each request as it happens.
207+
204208
try
205209
{
206210
var dbContextFactory = scope.ServiceProvider.GetRequiredService<IDbContextFactory<ConduitLLM.Configuration.ConduitDbContext>>();
207211
await using var context = await dbContextFactory.CreateDbContextAsync();
208212

209-
// Get model usage statistics for the last hour
210-
var oneHourAgo = DateTime.UtcNow.AddHours(-1);
213+
// Get model usage statistics for the last 5 minutes to calculate current rates
214+
var fiveMinutesAgo = DateTime.UtcNow.AddMinutes(-5);
211215

212-
// First get the data, then process in memory to avoid expression tree issues
213216
var requestLogs = await context.RequestLogs
214-
.Where(r => r.Timestamp >= oneHourAgo)
217+
.Where(r => r.Timestamp >= fiveMinutesAgo)
215218
.ToListAsync();
216219

220+
// Use the new ProviderType field directly instead of parsing model names
217221
var modelStats = requestLogs
218-
.GroupBy(r => new { Model = r.ModelName, Provider = r.ModelName.Contains("/") ? r.ModelName.Split('/')[0] : "unknown" })
222+
.GroupBy(r => new { Model = r.ModelName, Provider = r.ProviderType ?? "unknown" })
219223
.Select(g => new
220224
{
221225
g.Key.Model,
@@ -227,23 +231,15 @@ private async Task CollectModelUsageMetrics(IServiceScope scope)
227231
})
228232
.ToList();
229233

234+
_logger.LogDebug("Collected model usage metrics: {Count} model/provider combinations in last 5 minutes",
235+
modelStats.Count);
236+
237+
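// Observe average response times (histograms are safe to update periodically)
// NOTE(review): ModelResponseTime is also observed per request via RecordResponseTime (called from
// UsageTrackingMiddleware), so each collection cycle adds one extra averaged sample per
// model/provider combination to the same metric - confirm this mixing is intended.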
230238
foreach (var stat in modelStats)
231239
{
232-
if (stat.TotalPromptTokens > 0)
233-
{
234-
ModelTokensProcessed.WithLabels(stat.Model ?? "unknown", stat.Provider ?? "unknown", "prompt")
235-
.Inc(stat.TotalPromptTokens);
236-
}
237-
238-
if (stat.TotalCompletionTokens > 0)
239-
{
240-
ModelTokensProcessed.WithLabels(stat.Model ?? "unknown", stat.Provider ?? "unknown", "completion")
241-
.Inc(stat.TotalCompletionTokens);
242-
}
243-
244240
if (stat.AvgResponseTime > 0)
245241
{
246-
ModelResponseTime.WithLabels(stat.Model ?? "unknown", stat.Provider ?? "unknown")
242+
ModelResponseTime.WithLabels(stat.Model ?? "unknown", stat.Provider)
247243
.Observe(stat.AvgResponseTime / 1000.0); // Convert ms to seconds
248244
}
249245
}
@@ -256,21 +252,25 @@ private async Task CollectModelUsageMetrics(IServiceScope scope)
256252

257253
private async Task CollectCostMetrics(IServiceScope scope)
258254
{
255+
// NOTE: Cost counters (conduit_cost_total_dollars) are updated in REAL-TIME via
256+
// static methods called from UsageTrackingMiddleware.
257+
// This background method only updates the CostRate gauge for rate calculations.
258+
259259
try
260260
{
261261
var dbContextFactory = scope.ServiceProvider.GetRequiredService<IDbContextFactory<ConduitLLM.Configuration.ConduitDbContext>>();
262262
await using var context = await dbContextFactory.CreateDbContextAsync();
263263

264-
// Calculate cost rate per provider
264+
// Calculate cost rate per provider using the ProviderType field
265265
var fiveMinutesAgo = DateTime.UtcNow.AddMinutes(-5);
266266

267-
// First get the data, then process in memory to avoid expression tree issues
268267
var costLogs = await context.RequestLogs
269268
.Where(r => r.Timestamp >= fiveMinutesAgo && r.Cost > 0)
270269
.ToListAsync();
271270

271+
// Use the new ProviderType field directly
272272
var costByProvider = costLogs
273-
.GroupBy(r => r.ModelName.Contains("/") ? r.ModelName.Split('/')[0] : "unknown")
273+
.GroupBy(r => r.ProviderType ?? "unknown")
274274
.Select(g => new
275275
{
276276
Provider = g.Key,
@@ -280,28 +280,15 @@ private async Task CollectCostMetrics(IServiceScope scope)
280280

281281
foreach (var providerCost in costByProvider)
282282
{
283-
var provider = providerCost.Provider ?? "unknown";
283+
var provider = providerCost.Provider;
284284
var costPerMinute = (double)(providerCost.TotalCost / 5); // 5-minute window
285285

286+
// Update the rate gauge (this is safe to update periodically)
286287
CostRate.WithLabels(provider).Set(costPerMinute);
287-
288-
// Track cost changes
289-
if (_lastCostUpdate.TryGetValue(provider, out var lastUpdate))
290-
{
291-
var timeDiff = (DateTime.UtcNow - lastUpdate).TotalMinutes;
292-
if (timeDiff > 0 && _lastCostValue.TryGetValue(provider, out var lastCost))
293-
{
294-
var costDiff = providerCost.TotalCost - lastCost;
295-
if (costDiff > 0)
296-
{
297-
CostTotal.WithLabels(provider, "all", "inference").Inc((double)costDiff);
298-
}
299-
}
300-
}
301-
302-
_lastCostUpdate[provider] = DateTime.UtcNow;
303-
_lastCostValue[provider] = providerCost.TotalCost;
304288
}
289+
290+
_logger.LogDebug("Collected cost metrics: {Count} providers with costs in last 5 minutes",
291+
costByProvider.Count);
305292
}
306293
catch (Exception ex)
307294
{
@@ -368,6 +355,26 @@ public static void RecordCost(string provider, string model, string operationTyp
368355
CostPerRequest.WithLabels(model, provider).Observe(costDollars);
369356
}
370357

358+
/// <summary>
/// Adds the prompt and completion token counts of a single request to the
/// ModelTokensProcessed counter.
/// </summary>
/// <param name="model">Passed as the first label value of the counter.</param>
/// <param name="provider">Passed as the second label value of the counter.</param>
/// <param name="promptTokens">Amount added to the series whose third label value is "prompt"; skipped unless greater than zero.</param>
/// <param name="completionTokens">Amount added to the series whose third label value is "completion"; skipped unless greater than zero.</param>
/// <remarks>
/// NOTE(review): model and provider are forwarded to WithLabels unchanged; the background
/// collector substitutes "unknown" for missing values, this method does not - confirm callers
/// never pass null.
/// </remarks>
public static void RecordTokens(string model, string provider, int promptTokens, int completionTokens)
359+
{
360+
if (promptTokens > 0)
361+
{
362+
ModelTokensProcessed.WithLabels(model, provider, "prompt").Inc(promptTokens);
363+
}
364+
if (completionTokens > 0)
365+
{
366+
ModelTokensProcessed.WithLabels(model, provider, "completion").Inc(completionTokens);
367+
}
368+
}
369+
370+
/// <summary>
/// Records one observation on the ModelResponseTime metric for the given model and provider.
/// </summary>
/// <param name="model">Passed as the first label value of the metric.</param>
/// <param name="provider">Passed as the second label value of the metric.</param>
/// <param name="responseTimeSeconds">
/// Value to observe; nothing is recorded unless it is greater than zero.
/// Callers in UsageTrackingMiddleware pass UsageExtractor.GetResponseTime(context) / 1000.0 -
/// presumably a millisecond value converted to seconds; TODO confirm the unit of GetResponseTime.
/// </param>
public static void RecordResponseTime(string model, string provider, double responseTimeSeconds)
371+
{
372+
if (responseTimeSeconds > 0)
373+
{
374+
ModelResponseTime.WithLabels(model, provider).Observe(responseTimeSeconds);
375+
}
376+
}
377+
371378
public static void RecordSLAViolation(string slaType, string model)
372379
{
373380
SLAViolations.WithLabels(slaType, model).Inc();

Shared/ConduitLLM.Configuration/DTOs/LogRequestDto.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,16 @@ public class LogRequestDto
2020
/// </summary>
2121
public string ModelName { get; set; } = string.Empty;
2222

23+
/// <summary>
24+
/// ID of the provider that processed the request.
25+
/// </summary>
26+
public int? ProviderId { get; set; }
27+
28+
/// <summary>
29+
/// Type of the provider that processed the request (e.g., "OpenAI", "Anthropic").
30+
/// </summary>
31+
public string? ProviderType { get; set; }
32+
2333
/// <summary>
2434
/// Type of the request (chat, completion, embedding, etc.)
2535
/// </summary>

Shared/ConduitLLM.Configuration/Entities/RequestLog.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,19 @@ public class RequestLog
3232
[MaxLength(100)]
3333
public string ModelName { get; set; } = string.Empty;
3434

35+
/// <summary>
36+
/// ID of the provider that processed the request.
37+
/// References the Provider entity for accurate provider tracking.
38+
/// </summary>
39+
public int? ProviderId { get; set; }
40+
41+
/// <summary>
42+
/// Type of the provider that processed the request.
43+
/// Stored as string for flexibility and query performance.
44+
/// </summary>
45+
[MaxLength(50)]
46+
public string? ProviderType { get; set; }
47+
3548
/// <summary>
3649
/// Type of the request (chat, completion, embedding, etc.)
3750
/// </summary>

0 commit comments

Comments
 (0)