149 lines
7.2 KiB
C#
149 lines
7.2 KiB
C#
using System;
|
|
using System.Linq;
|
|
using System.Net.Http;
|
|
using System.Threading.Tasks;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using Microsoft.Extensions.Logging;
|
|
using Sockeye.Models;
|
|
using Sockeye.Util;
|
|
|
|
namespace Sockeye.Biz
|
|
{
|
|
|
|
|
|
/// <summary>
/// Check the health of subscribers servers using the /health endpoint
/// trigger notification if any fail the test 3 times (some slack for intermittent comm. issues)
/// </summary>
/// <remarks>
/// Each failed check moves a server one rung up the ladder
/// ActiveHealthy -> FailFirstHealthCheck -> FailSecondHealthCheck -> FailedRequiresAttention;
/// a single passing check puts it back to ActiveHealthy.
/// This class only records the resulting state on the server record; the notification itself is
/// not raised here (presumably it is triggered downstream when the record is saved - confirm).
/// </remarks>
internal static class SockBotSubscriptionServerHealthChecks
{
    private static readonly ILogger log = Sockeye.Util.ApplicationLogging.CreateLogger("SockBotSubscriptionServerHealthChecks");

    //UTC time the last sweep finished; MinValue so the very first call always runs a sweep
    private static DateTime lastSweep = DateTime.MinValue;

    //every 5 minutes roughly meaning 15 minutes down is highest fail state
    private static readonly TimeSpan HEALTHCHECK_EVERY_INTERVAL = new TimeSpan(0, 5, 10);

    ////////////////////////////////////////////////////////////////////////////////////////////////
    // DoWorkAsync
    //
    /// <summary>
    /// Run one health check sweep over every subscription server that is not
    /// DeActivated, Destroyed or Requested, and persist the outcome on each server record.
    /// Returns immediately if less than HEALTHCHECK_EVERY_INTERVAL has passed since the last sweep.
    /// </summary>
    /// <returns>A task that completes when the sweep has finished (or was skipped).</returns>
    public static async Task DoWorkAsync()
    {
        //This will get triggered roughly every minute, but we don't want to check that frequently
        if (DateTime.UtcNow - lastSweep < HEALTHCHECK_EVERY_INTERVAL)
        {
            return;
        }

        log.LogDebug("Health check starting");
        using (AyContext ct = Sockeye.Util.ServiceProviderProvider.DBContext)
        {
            //get a list of all active server ID's
            var activeServerIdList = await ct.SubscriptionServer
                .AsNoTracking()
                .Where(z => z.ServerState != ServerState.DeActivated
                    && z.ServerState != ServerState.Destroyed
                    && z.ServerState != ServerState.Requested)
                .OrderBy(z => z.Id)
                .Select(z => z.Id)
                .ToListAsync();

            try
            {
                foreach (long serverId in activeServerIdList)
                {
                    //get the health and triage accordingly
                    var client = ServiceProviderProvider.HttpClientFactory.CreateClient();
                    var biz = SubscriptionServerBiz.GetBiz(ct);
                    var srv = await biz.GetAsync(serverId, false);
                    if (srv == null)
                    {
                        //this is a serious issue log and server ops it
                        var err = $"SockBotSubscriptionServerHealthChecks error running job, subscription server record id {serverId} could not be fetched {biz.GetErrorsAsString}";
                        await NotifyEventHelper.AddOpsProblemEvent(err);
                        log.LogError(err);
                        continue;
                    }

                    //space things out a bit to not "Hammer" out calls
                    await Task.Delay(1000);//1 second delay between calls

                    //https://learn.microsoft.com/en-us/aspnet/core/host-and-deploy/health-checks?source=recommendations&view=aspnetcore-7.0
                    //https://learn.microsoft.com/en-us/aspnet/core/host-and-deploy/health-checks?source=recommendations&view=aspnetcore-7.0#customize-the-http-status-code
                    //Basically it returns only plain text
                    //http status code = 200 for both Healthy and Degraded
                    //http status code = 503 for UnHealthy
                    //for our purposes anything other than 200 and "Healthy" is a problem
                    string responseText = string.Empty;
                    bool failed;
                    try
                    {
                        //dispose the response so its content / connection is released promptly
                        using (HttpResponseMessage res = await client.GetAsync($"https://{srv.Name}/health"))
                        {
                            responseText = await res.Content.ReadAsStringAsync();
                            //must be derived from the response: a 503 or a "Degraded"/"Unhealthy" body
                            //does not throw, so it has to be flagged as a failure explicitly here
                            failed = !(res.IsSuccessStatusCode && responseText == "Healthy");
                        }
                    }
                    catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
                    {
                        //HttpRequestException = could not connect / transport error
                        //TaskCanceledException = HttpClient timeout (no cancellation token is passed in here)
                        //either way it's a failed check for THIS server only, the sweep carries on
                        failed = true;
                        responseText = $"Failure {ex.Message}";
                    }

                    srv.LastHealthCheck = DateTime.UtcNow;
                    srv.LastHealthStatus = responseText;
                    log.LogDebug($"Health check server {srv.Name} response {responseText}");

                    if (!failed)
                    {
                        //a-ok
                        srv.ServerState = ServerState.ActiveHealthy;
                    }
                    else
                    {
                        //PROBLEM!
                        //progress through the ladder of severity
                        //3rd fail means triggers notification
                        srv.ServerState = EscalateFailureState(srv.ServerState);
                    }

                    //save changes
                    await biz.PutAsync(srv);
                }
            }
            catch (Exception ex)
            {
                var err = "SockBotSubscriptionServerHealthChecks error running job";
                //serious issue requires immediate notification
                await NotifyEventHelper.AddOpsProblemEvent(err, ex);
                log.LogError(ex, err);
            }
        }

        //stamped even when the sweep hit an error so a broken sweep is not retried every minute
        lastSweep = DateTime.UtcNow;
    }

    /// <summary>
    /// Return the next state on the failure ladder for a server that just failed a health check.
    /// States that are not on the ladder (including FailedRequiresAttention, which still
    /// requires attention) are returned unchanged.
    /// </summary>
    /// <param name="current">State of the server before this failed check.</param>
    /// <returns>State to record after this failed check.</returns>
    private static ServerState EscalateFailureState(ServerState current)
    {
        switch (current)
        {
            case ServerState.ActiveHealthy:
                return ServerState.FailFirstHealthCheck;
            case ServerState.FailFirstHealthCheck:
                return ServerState.FailSecondHealthCheck;
            case ServerState.FailSecondHealthCheck:
                return ServerState.FailedRequiresAttention;
            default:
                //no change
                return current;
        }
    }

    /////////////////////////////////////////////////////////////////////

}//eoc
|
|
|
|
|
|
}//eons
|
|
|