using System;
using System.Linq;
using System.Threading.Tasks;
using System.Collections.Generic;
using Microsoft.EntityFrameworkCore;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.JsonPatch;
using Microsoft.Extensions.Logging;
using EnumsNET;
using AyaNova.Util;
using AyaNova.Api.ControllerHelpers;
using AyaNova.Biz;
using AyaNova.Models;

namespace AyaNova.Biz
{
    /// <summary>
    /// JobSweeper - called by Generator to clean out old jobs that are completed and their logs.
    /// Also flips jobs that have sat in the Running state for too long over to Failed.
    /// </summary>
    internal static class CoreJobSweeper
    {
        private static readonly ILogger log = AyaNova.Util.ApplicationLogging.CreateLogger("CoreJobSweeper");

        //UTC time the last sweep finished; MinValue guarantees the first call performs a sweep
        private static DateTime lastSweep = DateTime.MinValue;

        //minimum time between two sweeps
        private static readonly TimeSpan SWEEP_EVERY_INTERVAL = new TimeSpan(0, 30, 0);//30 minutes

        private static readonly TimeSpan SUCCEEDED_JOBS_DELETE_AFTER_THIS_TIMESPAN = new TimeSpan(24, 0, 0);//24 hours

        private static readonly TimeSpan FAILED_JOBS_DELETE_AFTER_THIS_TIMESPAN = new TimeSpan(14, 0, 0, 0);//14 days (gives people time to notice and look into it)

        private static readonly TimeSpan RUNNING_JOBS_BECOME_FAILED_AFTER_THIS_TIMESPAN = new TimeSpan(24, 0, 0);//24 hours (time running jobs are allowed to sit in "running" state before considered failed)

        ////////////////////////////////////////////////////////////////////////////////////////////////
        // DoSweep
        //
        /// <summary>
        /// Runs one sweep pass if at least SWEEP_EVERY_INTERVAL has elapsed since the last completed one:
        /// deletes old Completed jobs, deletes old Failed jobs, then marks over-age Running jobs as Failed.
        /// </summary>
        /// <param name="ct">Database context used for the job queries and handed on to JobsBiz</param>
        /// <returns>Task that completes when the sweep has finished or was skipped</returns>
        public static async Task DoSweepAsync(AyContext ct)
        {
            //This will get triggered roughly every minute, but we don't want to sweep that frequently
            if (DateTime.UtcNow - lastSweep < SWEEP_EVERY_INTERVAL)
                return;

            log.LogTrace("Sweep starting");

            //SWEEP SUCCESSFUL JOBS
            //calculate cutoff to delete
            DateTime dtDeleteCutoff = DateTime.UtcNow - SUCCEEDED_JOBS_DELETE_AFTER_THIS_TIMESPAN;
            await sweepAsync(ct, dtDeleteCutoff, JobStatus.Completed);

            //SWEEP FAILED JOBS
            //calculate cutoff to delete
            dtDeleteCutoff = DateTime.UtcNow - FAILED_JOBS_DELETE_AFTER_THIS_TIMESPAN;
            await sweepAsync(ct, dtDeleteCutoff, JobStatus.Failed);

            //KILL STUCK JOBS
            //calculate the deadline; running jobs created before it are treated as stuck
            DateTime dtRunningDeadline = DateTime.UtcNow - RUNNING_JOBS_BECOME_FAILED_AFTER_THIS_TIMESPAN;
            await killStuckJobsAsync(ct, dtRunningDeadline);

            //only reached when nothing above threw, so a failed sweep is attempted again on the next trigger
            lastSweep = DateTime.UtcNow;
        }

        /// <summary>
        /// Delete every job (and its log) that has the given status and was created before the cutoff,
        /// oldest first.
        /// </summary>
        /// <param name="ct">Database context</param>
        /// <param name="dtDeleteCutoff">Jobs created before this moment are deleted</param>
        /// <param name="jobStatus">Only jobs currently in this status are considered</param>
        /// <returns>Task that completes when all matching jobs have been deleted</returns>
        private static async Task sweepAsync(AyContext ct, DateTime dtDeleteCutoff, JobStatus jobStatus)
        {
            //Get the list of deleteable jobs of the requested status (not tracked, only the ids are needed)
            var jobs = await ct.OpsJob
                .AsNoTracking()
                .Where(c => c.Created < dtDeleteCutoff && c.JobStatus == jobStatus)
                .OrderBy(m => m.Created)
                .ToListAsync();

            log.LogTrace($"SweepAsync processing: cutoff={dtDeleteCutoff.ToString()}, for {jobs.Count.ToString()} jobs of status {jobStatus.ToString()}");

            foreach (OpsJob j in jobs)
            {
                try
                {
                    await JobsBiz.DeleteJobAndLogAsync(j.GId, ct);
                }
                catch (Exception ex)
                {
                    log.LogError(ex, "sweepAsync exception calling JobsBiz.DeleteJobAndLogAsync");
                    //rethrow with "throw;" rather than "throw ex;" so the original stack trace is preserved
                    //this aborts the remainder of the sweep, same as before
                    throw;
                }
            }
        }

        /// <summary>
        /// Kill jobs that have been stuck in "running" state for too long:
        /// each one gets a job log entry, an error log line and its status set to Failed.
        /// </summary>
        /// <param name="ct">Database context</param>
        /// <param name="dtRunningDeadline">Running jobs created before this moment are considered stuck</param>
        /// <returns>Task that completes when all stuck jobs have been updated</returns>
        private static async Task killStuckJobsAsync(AyContext ct, DateTime dtRunningDeadline)
        {
            //Get the list of jobs still in running status that were created before the deadline
            var jobs = await ct.OpsJob
                .AsNoTracking()
                .Where(c => c.Created < dtRunningDeadline && c.JobStatus == JobStatus.Running)
                .OrderBy(m => m.Created)
                .ToListAsync();

            log.LogTrace($"killStuckJobsAsync processing: cutoff={dtRunningDeadline.ToString()}, for {jobs.Count.ToString()} jobs of status {JobStatus.Running.ToString()}");

            foreach (OpsJob j in jobs)
            {
                //OPSMETRIC
                await JobsBiz.LogJobAsync(j.GId, "Job took too long to run - setting to failed", ct);
                log.LogError($"Job found job stuck in running status and set to failed: deadline={dtRunningDeadline.ToString()}, jobId={j.GId.ToString()}, jobname={j.Name}, jobtype={j.JobType.ToString()}, jobObjectType={j.ObjectType.ToString()}, jobObjectId={j.ObjectId.ToString()}");
                await JobsBiz.UpdateJobStatusAsync(j.GId, JobStatus.Failed, ct);
            }
        }

        /////////////////////////////////////////////////////////////////////

    }//eoc

}//eons