This commit is contained in:
2023-01-18 03:12:39 +00:00
parent 1ff42db380
commit 126ae54f66
4 changed files with 93 additions and 148 deletions

View File

@@ -8,7 +8,7 @@ using Sockeye.Models;
namespace Sockeye.Biz
{
/// <summary>
/// Check the health of subscribers servers, basically a ping check
/// trigger notification if any fail the test excessively (some slack for intermittent comm. issues)
@@ -30,90 +30,31 @@ namespace Sockeye.Biz
log.LogDebug("Health check starting");
using (AyContext ct = Sockeye.Util.ServiceProviderProvider.DBContext)
{
var servers = await ct.SubscriptionServer
.AsNoTracking()
.Where(z => z.ServerState== < dtDeleteCutoff && z.JobStatus == jobStatus)
.OrderBy(z => z.Created)
.ToListAsync();
var ActiveServerIdList = await ct.SubscriptionServer
.AsNoTracking()
.Where(z => z.ServerState != ServerState.DeActivated && z.ServerState != ServerState.DeActivated)
.OrderBy(z => z.Id)
.Select(z=>z.Id)
.ToListAsync();
foreach(long serverId in ActiveServerIdList){
//get the health and triage accordingly
}
}
lastSweep = DateTime.UtcNow;
}
private static async Task sweepAsync(AyContext ct, DateTime dtDeleteCutoff, JobStatus jobStatus)
{
//Get the deleteable succeeded jobs list
log.LogDebug($"SweepAsync processing: cutoff={dtDeleteCutoff.ToString()}, for {jobs.Count.ToString()} jobs of status {jobStatus.ToString()}");
foreach (OpsJob j in jobs)
{
try
{
await JobsBiz.RemoveJobAndLogsAsync(j.GId);
}
catch (Exception ex)
{
log.LogError(ex, "sweepAsync exception calling JobsBiz.RemoveJobAndLogsAsync");
//for now just throw it but this needs to be removed when logging added and better handling
throw;
}
}
}
/// <summary>
/// Kill jobs that have been stuck in "running" state for too long
/// </summary>
private static async Task killStuckJobsAsync(AyContext ct, DateTime dtRunningDeadline)
{
//Get the deleteable succeeded jobs list
var jobs = await ct.OpsJob
.AsNoTracking()
.Where(z => z.Created < dtRunningDeadline && z.JobStatus == JobStatus.Running)
.OrderBy(z => z.Created)
.ToListAsync();
log.LogDebug($"killStuckJobsAsync processing: cutoff={dtRunningDeadline.ToString()}, for {jobs.Count.ToString()} jobs of status {JobStatus.Running.ToString()}");
foreach (OpsJob j in jobs)
{
//OPSMETRIC
await JobsBiz.LogJobAsync(j.GId, "LT:JobFailed LT:TimedOut");
log.LogError($"Job found job stuck in running status and set to failed: deadline={dtRunningDeadline.ToString()}, jobId={j.GId.ToString()}, jobname={j.Name}, jobtype={j.JobType.ToString()}, jobAType={j.SockType.ToString()}, jobObjectId={j.ObjectId.ToString()}");
await JobsBiz.UpdateJobStatusAsync(j.GId, JobStatus.Failed);
}
}
private static async Task SweepInternalJobsLogsAsync(AyContext ct, DateTime dtDeleteCutoff)
{
//Get the deleteable list (this is for reporting, could easily just do it in one go)
var logs = await ct.OpsJobLog
.AsNoTracking()
.Where(z => z.Created < dtDeleteCutoff)
.OrderBy(z => z.Created)
.ToListAsync();
log.LogDebug($"SweepInternalJobsLogsAsync processing: cutoff={dtDeleteCutoff.ToString()}, for {logs.Count.ToString()} log entries");
foreach (OpsJobLog l in logs)
{
try
{
await ct.Database.ExecuteSqlInterpolatedAsync($"delete from aopsjoblog where gid = {l.GId}");
}
catch (Exception ex)
{
log.LogError(ex, "SweepInternalJobsLogsAsync exception removed old log entries");
throw;
}
}
}
/////////////////////////////////////////////////////////////////////