Files
sockeye/server/generator/SockBotSubscriptionServerHealthChecks.cs
2023-02-24 23:50:49 +00:00

136 lines
6.5 KiB
C#

using System;
using System.Linq;
using System.Net.Http;
using System.Threading.Tasks;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using Sockeye.Models;
using Sockeye.Util;
namespace Sockeye.Biz
{
/// <summary>
/// Checks the health of subscriber servers by calling each server's /health endpoint.
/// A server that fails consecutive checks is stepped through
/// FailFirstHealthCheck -> FailSecondHealthCheck -> FailedRequiresAttention
/// (some slack for intermittent comm. issues); any successful check resets it to ActiveHealthy.
/// An unreachable server (connection failure / timeout) counts as a failed check.
/// </summary>
internal static class SockBotSubscriptionServerHealthChecks
{
    private static readonly ILogger log = Sockeye.Util.ApplicationLogging.CreateLogger("SockBotSubscriptionServerHealthChecks");

    //UTC time the last sweep finished; MinValue so the very first call always sweeps
    private static DateTime lastSweep = DateTime.MinValue;

    //every 5 minutes roughly meaning 15 minutes down is highest fail state
    private static readonly TimeSpan HEALTHCHECK_EVERY_INTERVAL = new TimeSpan(0, 5, 10);

    ////////////////////////////////////////////////////////////////////////////////////////////////
    // DoWorkAsync
    //
    /// <summary>
    /// Runs one health check sweep over all active subscription servers, unless the previous
    /// sweep finished less than HEALTHCHECK_EVERY_INTERVAL ago in which case it returns immediately.
    /// For each server the result of the check is written to LastHealthCheck, LastHealthStatus
    /// and ServerState and saved through SubscriptionServerBiz.PutAsync.
    /// </summary>
    /// <returns>Task that completes when the sweep has finished (or was skipped)</returns>
    public static async Task DoWorkAsync()
    {
        //This will get triggered roughly every minute, but we don't want to check that frequently
        if (DateTime.UtcNow - lastSweep < HEALTHCHECK_EVERY_INTERVAL)
            return;

        log.LogDebug("Health check starting");
        using (AyContext ct = Sockeye.Util.ServiceProviderProvider.DBContext)
        {
            //get a list of all active server ID's
            var ActiveServerIdList = await ct.SubscriptionServer
                .AsNoTracking()
                .Where(z => z.ServerState != ServerState.DeActivated
                    && z.ServerState != ServerState.Destroyed
                    && z.ServerState != ServerState.Requested)
                .OrderBy(z => z.Id)
                .Select(z => z.Id)
                .ToListAsync();

            try
            {
                foreach (long serverId in ActiveServerIdList)
                {
                    //get the health and triage accordingly
                    var biz = SubscriptionServerBiz.GetBiz(ct);
                    var srv = await biz.GetAsync(serverId, false);
                    if (srv == null)
                    {
                        //this is a serious issue log and server ops it
                        //NOTE(review): if GetErrorsAsString is a method rather than a property this
                        //interpolates a delegate instead of the error text - confirm against SubscriptionServerBiz
                        var err = $"SockBotSubscriptionServerHealthChecks error running job, subscription server record id {serverId} could not be fetched {biz.GetErrorsAsString}";
                        await NotifyEventHelper.AddOpsProblemEvent(err);
                        log.LogError(err);
                        continue;
                    }

                    //space things out a bit to not "Hammer" out calls
                    await Task.Delay(1000);//1 second delay between calls

                    //https://learn.microsoft.com/en-us/aspnet/core/host-and-deploy/health-checks?source=recommendations&view=aspnetcore-7.0
                    //https://learn.microsoft.com/en-us/aspnet/core/host-and-deploy/health-checks?source=recommendations&view=aspnetcore-7.0#customize-the-http-status-code
                    //Basically it returns only plain text
                    //http status code = 200 for both Healthy and Degraded
                    //http status code = 503 for UnHealthy
                    //for our purposes anything other than 200 and "Healthy" is a problem
                    var client = ServiceProviderProvider.HttpClientFactory.CreateClient();
                    bool healthy;
                    string responseText;
                    try
                    {
                        //dispose the response so its content / connection is released promptly
                        using (var res = await client.GetAsync($"https://{srv.Name}/health"))
                        {
                            responseText = await res.Content.ReadAsStringAsync();
                            healthy = res.IsSuccessStatusCode && responseText == "Healthy";
                        }
                    }
                    catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
                    {
                        //Server could not be reached at all (refused, DNS failure, timeout).
                        //That is exactly the kind of failure this job exists to detect, so record it
                        //as a failed check for THIS server and carry on with the remaining servers
                        //rather than letting it abort the whole sweep.
                        healthy = false;
                        responseText = $"Unreachable: {ex.Message}";
                    }

                    srv.LastHealthCheck = DateTime.UtcNow;
                    srv.LastHealthStatus = responseText;
                    log.LogDebug($"Health check server {srv.Name} response {responseText}");

                    //a-ok resets to healthy, a PROBLEM progresses through the ladder of severity
                    srv.ServerState = healthy
                        ? ServerState.ActiveHealthy
                        : NextFailureState(srv.ServerState);

                    //save changes
                    await biz.PutAsync(srv);
                }
            }
            catch (Exception ex)
            {
                var err = "SockBotSubscriptionServerHealthChecks error running job";
                //serious issue requires immediate notification
                await NotifyEventHelper.AddOpsProblemEvent(err, ex);
                log.LogError(ex, err);
            }
        }
        lastSweep = DateTime.UtcNow;
    }

    /// <summary>
    /// Returns the state a server moves to after one more failed health check.
    /// States that are not part of the failure ladder are returned unchanged.
    /// </summary>
    /// <param name="current">State of the server before this failed check</param>
    /// <returns>Next state on the failure ladder, or <paramref name="current"/> if there is none</returns>
    private static ServerState NextFailureState(ServerState current)
    {
        switch (current)
        {
            case ServerState.ActiveHealthy:
                return ServerState.FailFirstHealthCheck;
            case ServerState.FailFirstHealthCheck:
                return ServerState.FailSecondHealthCheck;
            case ServerState.FailSecondHealthCheck:
                //3rd fail - presumably this is the state that triggers notification elsewhere; not visible from here
                return ServerState.FailedRequiresAttention;
            default:
                //FailedRequiresAttention still requires attention; any other state is left as it was
                return current;
        }
    }
    /////////////////////////////////////////////////////////////////////
}//eoc
}//eons