UZMAN

Monitoring & Alerting

Production'da Hangfire'ı izlemek, sorunları erken tespit etmek için kritiktir.

Karar Rehberi

Durum	Öneri	Örnek veya gerekçe
Failed job oranı takibi	Uygun: Prometheus + Alert	>5% failure → Slack bildirimi
Queue derinliği izleme	Uygun: Gauge + threshold	10K+ birikme → worker ekle
Job süresi analizi	Uygun: Histogram + p95	Yavaşlayan job'ları tespit et
Tek seferlik debug	Uygun değil: Dashboard yeterli	Failed job'ın stack trace'ine bak
Development ortamı	Uygun değil: Overkill	InMemory + console log yeter

Health Check Integration

// NuGet: AspNetCore.HealthChecks.Hangfire
// Registers the Hangfire health check under the name "hangfire" with the "ready" tag.
var healthChecks = builder.Services.AddHealthChecks();
healthChecks.AddHangfire(
    hangfire =>
    {
        // Limits passed to the health check package: minimum available servers
        // and maximum failed jobs.
        hangfire.MinimumAvailableServers = 1;
        hangfire.MaximumJobsFailed = 50;
    },
    name: "hangfire",
    tags: new[] { "ready" });

// Custom health check — queue depth
/// <summary>
/// Health check that sums the length of every Hangfire queue reported by the
/// monitoring API and maps the total onto a health status.
/// </summary>
public class HangfireQueueHealthCheck : IHealthCheck
{
    /// <summary>
    /// Reads the queue lengths from the current job storage and reports
    /// Unhealthy above 10000 enqueued jobs, Degraded above 5000, Healthy otherwise.
    /// </summary>
    /// <param name="context">Health check context; not read by this check.</param>
    /// <param name="ct">Cancellation token; not observed, the check completes synchronously.</param>
    /// <returns>A completed task carrying the health result and a short description.</returns>
    public Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context, CancellationToken ct = default)
    {
        var pending = JobStorage.Current
            .GetMonitoringApi()
            .Queues()
            .Sum(queue => queue.Length);

        // Arms are evaluated top-down, so the stricter threshold wins.
        var verdict = pending switch
        {
            > 10000 => HealthCheckResult.Unhealthy("Queue depth too high: " + pending),
            > 5000 => HealthCheckResult.Degraded("Queue depth elevated: " + pending),
            _ => HealthCheckResult.Healthy("Queued: " + pending),
        };

        return Task.FromResult(verdict);
    }
}

Prometheus Metrics via Filter

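Hangfire native bir Prometheus endpoint'i sunmaz. Aşağıdaki PrometheusMetricsFilter custom bir çözümdür ve prometheus-net NuGet paketini gerektirir. Grafana dashboard'undaki metrik adları (hangfire_jobs_processed_total vb.) Hangfire'dan değil, bu filter'dan gelir. Not: hangfire_queue_depth gauge'ı filter içinde tanımlanır ancak bu örnekte hiçbir yerde güncellenmez; Queue Depth panelinin veri göstermesi için değerin ayrıca (örneğin periyodik bir arka plan görevinde monitoring API üzerinden) set edilmesi gerekir.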

/// <summary>
/// Hangfire job filter that publishes Prometheus metrics: a counter of
/// succeeded/failed state elections and a histogram of job execution time,
/// both labelled with the job's declaring type name.
/// </summary>
public class PrometheusMetricsFilter : JobFilterAttribute, IServerFilter, IElectStateFilter
{
    // Key under which the per-execution stopwatch is stored in the filter context items.
    private const string StartTimeKey = "StartTime";

    // Label value used when the background job carries no job definition.
    private const string UnknownJobType = "unknown";

    private static readonly Counter JobsProcessed = Metrics.CreateCounter(
        "hangfire_jobs_processed_total", "Total processed jobs",
        new CounterConfiguration { LabelNames = new[] { "job_type", "status" } });

    private static readonly Histogram JobDuration = Metrics.CreateHistogram(
        "hangfire_job_duration_seconds", "Job execution duration",
        new HistogramConfiguration
        {
            LabelNames = new[] { "job_type" },
            Buckets = new[] { 0.1, 0.5, 1, 5, 10, 30, 60, 120, 300 }
        });

    // NOTE(review): this gauge is declared but never set anywhere in this class,
    // so no hangfire_queue_depth value is recorded by the filter itself.
    private static readonly Gauge QueueDepth = Metrics.CreateGauge(
        "hangfire_queue_depth", "Current queue depth",
        new GaugeConfiguration { LabelNames = new[] { "queue" } });

    /// <summary>Starts a stopwatch and stashes it in the context items.</summary>
    /// <param name="context">Context of the job that is about to be performed.</param>
    public void OnPerforming(PerformingContext context)
    {
        context.Items[StartTimeKey] = Stopwatch.StartNew();
    }

    /// <summary>
    /// Stops the stopwatch stored by <see cref="OnPerforming"/> and records the
    /// elapsed seconds in the duration histogram. Does nothing when no stopwatch is found.
    /// </summary>
    /// <param name="context">Context of the job that has just been performed.</param>
    public void OnPerformed(PerformedContext context)
    {
        if (context.Items.TryGetValue(StartTimeKey, out var obj) && obj is Stopwatch sw)
        {
            sw.Stop();

            // Job can be null (e.g. the job definition could not be loaded);
            // fall back to a fixed label instead of throwing inside the filter.
            var jobType = context.BackgroundJob.Job?.Type.Name ?? UnknownJobType;
            JobDuration.WithLabels(jobType).Observe(sw.Elapsed.TotalSeconds);
        }
    }

    /// <summary>
    /// Increments the processed-jobs counter when the candidate state is
    /// succeeded or failed; other candidate states are ignored.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the candidate state is inspected as seen by this filter; if
    /// another filter later replaces a failed candidate (for example to schedule
    /// a retry), that attempt is still counted as "failed" — confirm filter order.
    /// </remarks>
    /// <param name="context">State election context for the background job.</param>
    public void OnStateElection(ElectStateContext context)
    {
        // Null-safe for the same reason as in OnPerformed: a job that failed to
        // load reaches this filter without a job definition.
        var jobType = context.BackgroundJob.Job?.Type.Name ?? UnknownJobType;

        if (context.CandidateState is SucceededState)
        {
            JobsProcessed.WithLabels(jobType, "succeeded").Inc();
        }
        else if (context.CandidateState is FailedState)
        {
            JobsProcessed.WithLabels(jobType, "failed").Inc();
        }
    }
}

// Global registration: adds the metrics filter to Hangfire's global filter collection.
var prometheusFilter = new PrometheusMetricsFilter();
GlobalJobFilters.Filters.Add(prometheusFilter);

Alerting Kuralları

Metrik	Warning	Critical	Aksiyon
Queue depth	> 5000	> 10000	Worker ekle veya sorunu bul
Failed job oranı	> 5%	> 15%	External dependency check
Job duration p95	> 30s	> 120s	Optimize veya timeout ayarla
Active servers	< 2	< 1	Server health kontrol
Scheduled backlog	> 1000	> 5000	Scheduler polling check

Grafana Dashboard JSON (4 Panel)

{
  "uid": "hangfire-overview-v1",
  "title": "Hangfire Monitoring",
  "version": 1,
  "timezone": "browser",
  "refresh": "30s",
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "panels": [
    {
      "id": 1,
      "title": "Jobs Processed (Rate)",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 0
      },
      "targets": [
        {
          "expr": "rate(hangfire_jobs_processed_total[5m])",
          "legendFormat": "{{job_type}} - {{status}}"
        }
      ],
      "description": "Job processing rate per second, grouped by type and status",
      "fieldConfig": {
        "defaults": {
          "unit": "ops"
        }
      }
    },
    {
      "id": 2,
      "title": "Job Duration (p95)",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      },
      "targets": [
        {
          "expr": "histogram_quantile(0.95, rate(hangfire_job_duration_seconds_bucket[5m]))",
          "legendFormat": "{{job_type}}"
        }
      ],
      "description": "95th percentile job execution time — alert if >30s sustained",
      "fieldConfig": {
        "defaults": {
          "unit": "s",
          "thresholds": {
            "steps": [
              {
                "value": 0,
                "color": "green"
              },
              {
                "value": 30,
                "color": "orange"
              },
              {
                "value": 120,
                "color": "red"
              }
            ]
          }
        }
      }
    },
    {
      "id": 3,
      "title": "Queue Depth",
      "type": "gauge",
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 8
      },
      "targets": [
        {
          "expr": "hangfire_queue_depth",
          "legendFormat": "{{queue}}"
        }
      ],
      "description": "Current enqueued jobs per queue — sustained >5000 means backlog",
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "steps": [
              {
                "value": 0,
                "color": "green"
              },
              {
                "value": 5000,
                "color": "orange"
              },
              {
                "value": 10000,
                "color": "red"
              }
            ]
          }
        }
      }
    },
    {
      "id": 4,
      "title": "Failed Job Rate (%)",
      "type": "timeseries",
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 8
      },
      "targets": [
        {
          "expr": "rate(hangfire_jobs_processed_total{status='failed'}[5m]) / rate(hangfire_jobs_processed_total[5m]) * 100",
          "legendFormat": "failure %"
        }
      ],
      "description": "Failure percentage over 5 minutes — alert if >5% sustained",
      "fieldConfig": {
        "defaults": {
          "unit": "percent",
          "thresholds": {
            "steps": [
              {
                "value": 0,
                "color": "green"
              },
              {
                "value": 5,
                "color": "orange"
              },
              {
                "value": 15,
                "color": "red"
              }
            ]
          }
        }
      }
    }
  ]
}

Örnek: Bir fintech'te gece 03:00'da recurring job başarısız oldu. Prometheus alert → PagerDuty → on-call engineer 5 dk'da haberi aldı. Sorun: 3rd party API maintenance window. Çözüm: job retry ile 30 dk sonra otomatik tamamlandı. Alert yoksa sabah keşfedilirdi.