Files
raven/server/AyaNova/generator/CoreJobSweeper.cs

128 lines
5.1 KiB
C#

using System;
using System.Linq;
using System.Threading.Tasks;
using System.Collections.Generic;
using Microsoft.EntityFrameworkCore;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Extensions.Logging;
using EnumsNET;
using AyaNova.Util;
using AyaNova.Api.ControllerHelpers;
using AyaNova.Biz;
using AyaNova.Models;
namespace AyaNova.Biz
{
/// <summary>
/// JobSweeper - called by Generator to clean out old jobs that are completed (and their logs)
/// and to mark jobs that have sat in the Running state for too long as Failed.
/// </summary>
internal static class CoreJobSweeper
{
    private static ILogger log = AyaNova.Util.ApplicationLogging.CreateLogger("CoreJobSweeper");

    //Time of the last sweep that ran to completion; MinValue means the first call always sweeps
    private static DateTime lastSweep = DateTime.MinValue;

    //Minimum time between sweeps
    private static readonly TimeSpan SWEEP_EVERY_INTERVAL = new TimeSpan(0, 30, 0);//30 minutes
    private static readonly TimeSpan SUCCEEDED_JOBS_DELETE_AFTER_THIS_TIMESPAN = new TimeSpan(24, 0, 0);//24 hours
    private static readonly TimeSpan FAILED_JOBS_DELETE_AFTER_THIS_TIMESPAN = new TimeSpan(14, 0, 0, 0);//14 days (gives people time to notice and look into it)
    private static readonly TimeSpan RUNNING_JOBS_BECOME_FAILED_AFTER_THIS_TIMESPAN = new TimeSpan(24, 0, 0);//24 hours (time running jobs are allowed to sit in "running" state before considered failed)

    ////////////////////////////////////////////////////////////////////////////////////////////////
    // DoSweep
    //
    /// <summary>
    /// Run a sweep if at least SWEEP_EVERY_INTERVAL has passed since the last completed sweep:
    /// delete old Completed jobs, delete old Failed jobs, then fail jobs stuck in Running.
    /// </summary>
    /// <param name="ct">Database context used for all queries and passed through to JobsBiz</param>
    /// <returns>Task that completes when the sweep is finished or was skipped</returns>
    public static async Task DoSweepAsync(AyContext ct)
    {
        //This will get triggered roughly every minute, but we don't want to sweep that frequently
        if (DateTime.UtcNow - lastSweep < SWEEP_EVERY_INTERVAL)
            return;

        log.LogTrace("Sweep starting");

        //SWEEP SUCCESSFUL JOBS
        //calculate cutoff to delete
        DateTime dtDeleteCutoff = DateTime.UtcNow - SUCCEEDED_JOBS_DELETE_AFTER_THIS_TIMESPAN;
        await sweepAsync(ct, dtDeleteCutoff, JobStatus.Completed);

        //SWEEP FAILED JOBS
        //calculate cutoff to delete
        dtDeleteCutoff = DateTime.UtcNow - FAILED_JOBS_DELETE_AFTER_THIS_TIMESPAN;
        await sweepAsync(ct, dtDeleteCutoff, JobStatus.Failed);

        //KILL STUCK JOBS
        //calculate the deadline before which a still-running job is considered stuck
        DateTime dtRunningDeadline = DateTime.UtcNow - RUNNING_JOBS_BECOME_FAILED_AFTER_THIS_TIMESPAN;
        await killStuckJobsAsync(ct, dtRunningDeadline);

        //Only stamped when every phase above completed without throwing,
        //so a sweep that failed part way is attempted again on the next call
        lastSweep = DateTime.UtcNow;
    }

    /// <summary>
    /// Delete every job (and its log) with the given status that was created before the cutoff
    /// </summary>
    /// <param name="ct">Database context</param>
    /// <param name="dtDeleteCutoff">Jobs with a Created value earlier than this are deleted</param>
    /// <param name="jobStatus">Only jobs currently in this status are considered</param>
    /// <returns></returns>
    private static async Task sweepAsync(AyContext ct, DateTime dtDeleteCutoff, JobStatus jobStatus)
    {
        //Get the list of deleteable jobs of the requested status, oldest first
        var jobs = await ct.OpsJob
            .AsNoTracking()
            .Where(c => c.Created < dtDeleteCutoff && c.JobStatus == jobStatus)
            .OrderBy(m => m.Created)
            .ToListAsync();

        log.LogTrace($"SweepAsync processing: cutoff={dtDeleteCutoff.ToString()}, for {jobs.Count.ToString()} jobs of status {jobStatus.ToString()}");

        foreach (OpsJob j in jobs)
        {
            try
            {
                await JobsBiz.DeleteJobAndLogAsync(j.GId, ct);
            }
            catch (Exception ex)
            {
                log.LogError(ex, "sweepAsync exception calling JobsBiz.DeleteJobAndLogAsync");
                //for now just rethrow but this needs to be removed when logging added and better handling
                //Bare throw (not "throw ex") so the original stack trace is preserved for the caller
                throw;
            }
        }
    }

    /// <summary>
    /// Kill jobs that have been stuck in "running" state for too long
    /// </summary>
    /// <param name="ct">Database context</param>
    /// <param name="dtRunningDeadline">Running jobs with a Created value earlier than this are set to Failed</param>
    /// <returns></returns>
    private static async Task killStuckJobsAsync(AyContext ct, DateTime dtRunningDeadline)
    {
        //Get the list of jobs still in Running status that were created before the deadline, oldest first
        //(age is measured from the job's Created value)
        var jobs = await ct.OpsJob
            .AsNoTracking()
            .Where(c => c.Created < dtRunningDeadline && c.JobStatus == JobStatus.Running)
            .OrderBy(m => m.Created)
            .ToListAsync();

        log.LogTrace($"killStuckJobsAsync processing: cutoff={dtRunningDeadline.ToString()}, for {jobs.Count.ToString()} jobs of status {JobStatus.Running.ToString()}");

        foreach (OpsJob j in jobs)
        {
            //OPSMETRIC
            //Record the reason in the job's own log, report it in the server log, then flip the status
            await JobsBiz.LogJobAsync(j.GId, "Job took too long to run - setting to failed", ct);
            log.LogError($"Job found job stuck in running status and set to failed: deadline={dtRunningDeadline.ToString()}, jobId={j.GId.ToString()}, jobname={j.Name}, jobtype={j.JobType.ToString()}, jobObjectType={j.ObjectType.ToString()}, jobObjectId={j.ObjectId.ToString()}");
            await JobsBiz.UpdateJobStatusAsync(j.GId, JobStatus.Failed, ct);
        }
    }

    /////////////////////////////////////////////////////////////////////
}//eoc
}//eons