using System;
using System.Linq;
using System.Net.Http;
using System.Threading.Tasks;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using Sockeye.Models;
using Sockeye.Util;

namespace Sockeye.Biz
{
    /// <summary>
    /// Check the health of subscribers servers using the /health endpoint.
    /// A server that fails the test 3 times in a row ends up in
    /// <c>ServerState.FailedRequiresAttention</c> (some slack for intermittent comm. issues).
    /// NOTE(review): the notification itself is not raised in this class; presumably it is
    /// triggered elsewhere by the state change saved through the biz object - confirm.
    /// </summary>
    internal static class SockBotSubscriptionServerHealthChecks
    {
        private static readonly ILogger log = Sockeye.Util.ApplicationLogging.CreateLogger("SockBotSubscriptionServerHealthChecks");

        //UTC time the last sweep finished; MinValue guarantees the first call runs a sweep
        private static DateTime lastSweep = DateTime.MinValue;

        //every 5 minutes roughly meaning 15 minutes down is highest fail state
        private static readonly TimeSpan HEALTHCHECK_EVERY_INTERVAL = new TimeSpan(0, 5, 10);

        ////////////////////////////////////////////////////////////////////////////////////////////////
        // DoWorkAsync
        //
        /// <summary>
        /// Run one health check sweep over every active subscription server.
        /// Returns immediately when less than HEALTHCHECK_EVERY_INTERVAL has passed since the last sweep.
        /// For each server: GET https://{server name}/health, record the time and response text on the
        /// server record, then either reset the state to ActiveHealthy or step one rung up the failure
        /// ladder, and save the record.
        /// Exceptions thrown inside the per-server loop are caught, reported as an ops problem event and
        /// logged; they end the current sweep but do not propagate to the caller.
        /// </summary>
        public static async Task DoWorkAsync()
        {
            //This will get triggered roughly every minute, but we don't want to check that frequently
            if (DateTime.UtcNow - lastSweep < HEALTHCHECK_EVERY_INTERVAL)
                return;

            log.LogDebug("Health check starting");

            using (AyContext ct = Sockeye.Util.ServiceProviderProvider.DBContext)
            {
                //get a list of all active server ID's
                var ActiveServerIdList = await ct.SubscriptionServer
                    .AsNoTracking()
                    .Where(z => z.ServerState != ServerState.DeActivated
                        && z.ServerState != ServerState.Destroyed
                        && z.ServerState != ServerState.Requested)
                    .OrderBy(z => z.Id)
                    .Select(z => z.Id)
                    .ToListAsync();

                try
                {
                    foreach (long serverId in ActiveServerIdList)
                    {
                        //get the health and triage accordingly
                        var client = ServiceProviderProvider.HttpClientFactory.CreateClient();
                        var biz = SubscriptionServerBiz.GetBiz(ct);
                        var srv = await biz.GetAsync(serverId, false);
                        if (srv == null)
                        {
                            //this is a serious issue log and server ops it
                            var err = $"SockBotSubscriptionServerHealthChecks error running job, subscription server record id {serverId} could not be fetched {biz.GetErrorsAsString}";
                            await NotifyEventHelper.AddOpsProblemEvent(err);
                            log.LogError(err);
                        }
                        else
                        {
                            //space things out a bit to not "Hammer" out calls
                            await Task.Delay(1000);//1 second delay between calls

                            //https://learn.microsoft.com/en-us/aspnet/core/host-and-deploy/health-checks?source=recommendations&view=aspnetcore-7.0
                            //https://learn.microsoft.com/en-us/aspnet/core/host-and-deploy/health-checks?source=recommendations&view=aspnetcore-7.0#customize-the-http-status-code
                            //Basically it returns only plain text
                            //http status code = 200 for both Healthy and Degraded
                            //http status code = 503 for UnHealthy
                            //for our purposes anything other than 200 and "Healthy" is a problem
                            string responseText = string.Empty;

                            //Assume failure until the server proves otherwise. Previously this started
                            //as false and was never set to true for a non-200 or non-"Healthy" response,
                            //so only thrown exceptions were ever counted as a failed check.
                            bool Failed = true;
                            try
                            {
                                using (HttpResponseMessage res = await client.GetAsync($"https://{srv.Name}/health"))
                                {
                                    responseText = await res.Content.ReadAsStringAsync();
                                    Failed = !(res.IsSuccessStatusCode && responseText == "Healthy");
                                }
                            }
                            catch (Exception hex)//generic exception as I'm seeing various different exceptions on client get between dev and production
                            {
                                Failed = true;
                                responseText = $"Failure {hex.Message}";
                            }

                            srv.LastHealthCheck = DateTime.UtcNow;
                            srv.LastHealthStatus = responseText;
                            log.LogDebug($"Health check server {srv.Name} response {responseText}");

                            if (!Failed)
                            {
                                //a-ok
                                srv.ServerState = ServerState.ActiveHealthy;
                            }
                            else
                            {
                                //PROBLEM!
                                //progress through the ladder of severity
                                //3rd fail means triggers notification
                                switch (srv.ServerState)
                                {
                                    case ServerState.ActiveHealthy:
                                        srv.ServerState = ServerState.FailFirstHealthCheck;
                                        break;
                                    case ServerState.FailFirstHealthCheck:
                                        srv.ServerState = ServerState.FailSecondHealthCheck;
                                        break;
                                    case ServerState.FailSecondHealthCheck:
                                        srv.ServerState = ServerState.FailedRequiresAttention;
                                        break;
                                    case ServerState.FailedRequiresAttention:
                                        //no change, it still requires attention
                                        break;
                                        //any other state is left untouched on failure
                                }
                            }

                            //save changes
                            await biz.PutAsync(srv);
                        }
                    }
                }
                catch (Exception ex)
                {
                    var err = "SockBotSubscriptionServerHealthChecks error running job";
                    //serious issue requires immediate notification
                    await NotifyEventHelper.AddOpsProblemEvent(err, ex);
                    log.LogError(ex, err);
                }
            }

            //stamp completion (not start) so the next sweep is a full interval after this one ended
            lastSweep = DateTime.UtcNow;
        }

        /////////////////////////////////////////////////////////////////////

    }//eoc
}//eons