using System;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using Sockeye.Models;

namespace Sockeye.Biz
{
    /// <summary>
    /// Check the health of subscribers servers, basically a ping check;
    /// trigger notification if any fail the test excessively (some slack for intermittent comm. issues).
    /// </summary>
    /// <remarks>
    /// NOTE(review): in the code visible here only the throttling and the server query exist;
    /// the ping / notification step itself is not implemented yet. The private helpers below
    /// operate on OpsJob / OpsJobLog and are not called from anywhere in this class.
    /// </remarks>
    internal static class SockBotSubscriptionServerHealthChecks
    {
        private static readonly ILogger log = Sockeye.Util.ApplicationLogging.CreateLogger("SockBotSubscriptionServerHealthChecks");

        // Time of the last completed sweep (UTC); MinValue guarantees the first call runs.
        private static DateTime lastSweep = DateTime.MinValue;

        // Minimum time between two sweeps: 15 minutes and 10 seconds ("every 15 minutes roughly").
        private static readonly TimeSpan HEALTHCHECK_EVERY_INTERVAL = new TimeSpan(0, 15, 10);

        ////////////////////////////////////////////////////////////////////////////////////////////////
        // DoSweep
        //
        /// <summary>
        /// Entry point for the periodic health check. Returns immediately unless at least
        /// <see cref="HEALTHCHECK_EVERY_INTERVAL"/> has elapsed since the last completed sweep.
        /// </summary>
        /// <returns>A task that completes when the sweep (if any) has finished.</returns>
        public static async Task DoWorkAsync()
        {
            // This will get triggered roughly every minute, but we don't want to check that frequently
            if (DateTime.UtcNow - lastSweep < HEALTHCHECK_EVERY_INTERVAL)
            {
                return;
            }

            log.LogDebug("Health check starting");
            using (AyContext ct = Sockeye.Util.ServiceProviderProvider.DBContext)
            {
                // FIX: the original filter here was
                //   .Where(z => z.ServerState== < dtDeleteCutoff && z.JobStatus == jobStatus)
                // which is not valid C# (no right-hand side for the ServerState comparison) and
                // references dtDeleteCutoff / jobStatus, neither of which exists in this method.
                // NOTE(review): the intended ServerState value(s) cannot be determined from this
                // file, so all servers are fetched for now -- restore a ServerState filter once
                // the intended state is confirmed.
                var servers = await ct.SubscriptionServer
                    .AsNoTracking()
                    .OrderBy(z => z.Created)
                    .ToListAsync();

                // TODO: perform the actual health check against each entry in `servers`.
            }

            // Only recorded after a sweep that did not throw, so a failed sweep is retried on the next trigger.
            lastSweep = DateTime.UtcNow;
        }

        /// <summary>
        /// Remove, together with their logs, all jobs in the given status created before the cutoff.
        /// </summary>
        /// <param name="ct">Database context used to query the jobs.</param>
        /// <param name="dtDeleteCutoff">Jobs created before this moment are removed.</param>
        /// <param name="jobStatus">Only jobs in this status are removed.</param>
        /// <remarks>Any exception from the removal call is logged and rethrown.</remarks>
        private static async Task sweepAsync(AyContext ct, DateTime dtDeleteCutoff, JobStatus jobStatus)
        {
            // Get the deleteable jobs list.
            // FIX: this query was missing (`jobs` was used below without ever being declared);
            // it mirrors the query in killStuckJobsAsync, using this method's own parameters.
            var jobs = await ct.OpsJob
                .AsNoTracking()
                .Where(z => z.Created < dtDeleteCutoff && z.JobStatus == jobStatus)
                .OrderBy(z => z.Created)
                .ToListAsync();

            log.LogDebug($"SweepAsync processing: cutoff={dtDeleteCutoff.ToString()}, for {jobs.Count.ToString()} jobs of status {jobStatus.ToString()}");

            foreach (OpsJob j in jobs)
            {
                try
                {
                    await JobsBiz.RemoveJobAndLogsAsync(j.GId);
                }
                catch (Exception ex)
                {
                    log.LogError(ex, "sweepAsync exception calling JobsBiz.RemoveJobAndLogsAsync");
                    // for now just throw it but this needs to be removed when logging added and better handling
                    throw;
                }
            }
        }

        /// <summary>
        /// Kill jobs that have been stuck in "running" state for too long
        /// </summary>
        /// <param name="ct">Database context used to query the jobs.</param>
        /// <param name="dtRunningDeadline">Running jobs created before this moment are marked as failed.</param>
        private static async Task killStuckJobsAsync(AyContext ct, DateTime dtRunningDeadline)
        {
            // Get the list of jobs still in Running status that were created before the deadline
            var jobs = await ct.OpsJob
                .AsNoTracking()
                .Where(z => z.Created < dtRunningDeadline && z.JobStatus == JobStatus.Running)
                .OrderBy(z => z.Created)
                .ToListAsync();

            log.LogDebug($"killStuckJobsAsync processing: cutoff={dtRunningDeadline.ToString()}, for {jobs.Count.ToString()} jobs of status {JobStatus.Running.ToString()}");

            foreach (OpsJob j in jobs)
            {
                //OPSMETRIC
                // Write the job log entry first, then log the error, then flip the job to Failed.
                await JobsBiz.LogJobAsync(j.GId, "LT:JobFailed LT:TimedOut");
                log.LogError($"Job found job stuck in running status and set to failed: deadline={dtRunningDeadline.ToString()}, jobId={j.GId.ToString()}, jobname={j.Name}, jobtype={j.JobType.ToString()}, jobAType={j.SockType.ToString()}, jobObjectId={j.ObjectId.ToString()}");
                await JobsBiz.UpdateJobStatusAsync(j.GId, JobStatus.Failed);
            }
        }

        /// <summary>
        /// Delete job log entries created before the cutoff, one row at a time.
        /// </summary>
        /// <param name="ct">Database context used to query and delete the log entries.</param>
        /// <param name="dtDeleteCutoff">Log entries created before this moment are deleted.</param>
        /// <remarks>Any exception from a delete is logged and rethrown.</remarks>
        private static async Task SweepInternalJobsLogsAsync(AyContext ct, DateTime dtDeleteCutoff)
        {
            // Get the deleteable list (this is for reporting, could easily just do it in one go)
            var logs = await ct.OpsJobLog
                .AsNoTracking()
                .Where(z => z.Created < dtDeleteCutoff)
                .OrderBy(z => z.Created)
                .ToListAsync();

            log.LogDebug($"SweepInternalJobsLogsAsync processing: cutoff={dtDeleteCutoff.ToString()}, for {logs.Count.ToString()} log entries");

            foreach (OpsJobLog l in logs)
            {
                try
                {
                    // Interpolated form: EF Core sends {l.GId} as a SQL parameter, not as inlined text.
                    await ct.Database.ExecuteSqlInterpolatedAsync($"delete from aopsjoblog where gid = {l.GId}");
                }
                catch (Exception ex)
                {
                    log.LogError(ex, "SweepInternalJobsLogsAsync exception removed old log entries");
                    throw;
                }
            }
        }

        /////////////////////////////////////////////////////////////////////

    }//eoc
}//eons