From ae4f09cc611e9b6bdfca579fd1d253eca2ad2447 Mon Sep 17 00:00:00 2001 From: John Cardinal Date: Thu, 28 Oct 2021 17:35:58 +0000 Subject: [PATCH] Solved zombie process and resource consumption overrun issue when rendering reports --- ...s-config-report-rendering-max-instances.md | 4 +- .../AyaNova/Controllers/ReportController.cs | 2 +- server/AyaNova/biz/ReportBiz.cs | 3 +- server/AyaNova/util/ReportProcessManager.cs | 122 ++---------------- 4 files changed, 17 insertions(+), 114 deletions(-) diff --git a/docs/8.0/ayanova/docs/ops-config-report-rendering-max-instances.md b/docs/8.0/ayanova/docs/ops-config-report-rendering-max-instances.md index b9ed7ed5..ec690369 100644 --- a/docs/8.0/ayanova/docs/ops-config-report-rendering-max-instances.md +++ b/docs/8.0/ayanova/docs/ops-config-report-rendering-max-instances.md @@ -2,10 +2,12 @@ The report rendering maximum instances value controls how many report rendering processors are allowed to be running when a report is submitted before AyaNova will start forcibly shutting down excess instances. -Report rendering is a very "expensive" process in terms of memory and cpu usage on the server and in some rare cases the report rendering process can get stuck in a loop or frozen due to a bug in a report template script or simply selecting too many records to report in a practical amount of time. If too much memory or CPU cycles are tied up by these "zombie" report rendering processes it can cause the AyaNova server to stop responding completely. +Report rendering is a relatively "expensive" process in terms of memory and cpu usage on the server taking far more resources than non reporting AyaNova operations and in some rare cases the report rendering process can get stuck in a loop or frozen due to a bug in a report template script or simply selecting too many records to report in a practical amount of time. 
If too much memory or too many CPU cycles are tied up by these "zombie" report rendering processes, it can cause the AyaNova server to stop responding completely. When AyaNova receives a report request it checks first to ensure there are not more report rendering processes active than are specified in this setting. If there are too many already running it will return a busy code to the end user to try again later and will attempt to forcibly shut down any of the processes that should have expired. Expired here means processes that have been running longer than [AYANOVA_REPORT_RENDERING_TIMEOUT](ops-config-report-rendering-timeout.md) number of milliseconds. +It's important to note that this setting does not prevent AyaNova from processing a large, time-consuming report for any length of time necessary to complete it; this setting only takes effect when another report is requested to be rendered and there are no slots free for report rendering. For this reason, if a User has to render an unusually large report that is known to take longer than the timeout period, they are advised to render that report when no one else is requesting a report (e.g. after hours). 
+ ## Default If no override is specified AyaNova will use the following default value: diff --git a/server/AyaNova/Controllers/ReportController.cs b/server/AyaNova/Controllers/ReportController.cs index b3ca0469..c3f43021 100644 --- a/server/AyaNova/Controllers/ReportController.cs +++ b/server/AyaNova/Controllers/ReportController.cs @@ -201,7 +201,7 @@ namespace AyaNova.Api.Controllers //handles it at a lower level //returning an OK method here allows the client to handle it at the level of the report dialog rather than the api handler which will short circuit if it was a 503 if (!Util.ReportRenderManager.RenderSlotAvailable(log)) - return Ok(ApiOkResponse.Response(new { busy = true, retryafter = DateTime.UtcNow.AddMilliseconds(ServerBootConfig.AYANOVA_REPORT_RENDERING_TIMEOUT) })); + return Ok(ApiOkResponse.Response(new { busy = true, retryms = ServerBootConfig.AYANOVA_REPORT_RENDERING_TIMEOUT })); ReportBiz biz = ReportBiz.GetBiz(ct, HttpContext); if (!Authorized.HasReadFullRole(HttpContext.Items, biz.BizType)) diff --git a/server/AyaNova/biz/ReportBiz.cs b/server/AyaNova/biz/ReportBiz.cs index 9149655f..70819f73 100644 --- a/server/AyaNova/biz/ReportBiz.cs +++ b/server/AyaNova/biz/ReportBiz.cs @@ -467,7 +467,8 @@ namespace AyaNova.Biz //however chrome may be problematic on linux, I'm seeing issues under load testing where it freezes out the droplet at 100% cpu and ram and it's //chrome processes piling up all churning away so experimenting with resolutions - //this guy says single process should NOT be used + //this guy says single process should NOT be used as recommended by Google + //and so far in testing I see no reason to doubt him so removed it //https://github.com/puppeteer/puppeteer/issues/1825#issuecomment-792817748 //testing lo.Args = new string[] { "--disable-dev-shm-usage --no-sandbox --disable-gpu --no-zygote " }; diff --git a/server/AyaNova/util/ReportProcessManager.cs b/server/AyaNova/util/ReportProcessManager.cs index e32d364c..9b65aff6 100644 --- 
a/server/AyaNova/util/ReportProcessManager.cs +++ b/server/AyaNova/util/ReportProcessManager.cs @@ -1,41 +1,24 @@ using System; using System.Collections.Concurrent; -using System.Collections.Generic; using System.Diagnostics; -using System.Threading; -using System.Threading.Tasks; using Microsoft.Extensions.Logging; namespace AyaNova.Util { /// - /// Used by reporting system to ensure headless browsers don't hang around in an untimely manner + /// Used by reporting system to ensure headless browsers don't hang around in an untimely manner chewing up resources /// needed due to bugs in puppeteersharp where it won't close the browser on timeout properly /// also zombie process issues in linux etc, this just ensures it's safe - /// This is triggered when a report is rendered + /// This is triggered when a report is rendered on demand + /// in other words demand drives whether it kills long running renders or not + /// this is by design to allow a scenario where a super long running report can still be run off hours (for example) /// internal static class ReportRenderManager { - - /* - Use thread safe concurrent dictionary collection to manage up to AYANOVA_REPORT_RENDERING_MAX_INSTANCES - (it's allowed to go slightly over, this is not ever going to be exactly right under heavy load, but it should always kill the old processes no matter what) - - Render route controller checks for an available slot with the reportprocessormanager first "ProcessSlotAvailable" - if there is no free slot it immediately callse Cleanup which looks for the oldest slot that is over the limit and attempts to shut it down, once shut down it returns true - if there is no free slot and none are over the limit it returns false signifying try again - - - If there *is* a free slot then it passes off to reportbiz as usual - Report biz reserves a slot when launches the browser process with here by adding it to the dictionary "AddProcess(processid)" - When the report is generated it will remove from 
the slot by calling into here "RemoveProcess(processId)" - Remove from slot here will confirm the process is no longer running and if it is kill it or if it's not remove it from the collection - + /* expired processes are removed by the act of tryign to get a new slot so in this way it still supports running super long reports overnight for example as long as there is no contention The other way was by a job that looks for expired processes but that would mean all old jobs would expire all the time so there would be an issue with huge reports never working - - */ //thread safe collection for unordered items, optimized for single thread produce/consume (which is the norm here) but supports multithread produce / consume (which is needed for separate cleanup job) @@ -58,9 +41,9 @@ namespace AyaNova.Util { log.LogTrace("RenderSlotAvailable check"); var count = _baginstances.Count; -#if (DEBUG) - log.LogInformation($"DBG: RenderSlotAvailable check, there are currently {count} instances in the bag"); -#endif +// #if (DEBUG) +// log.LogInformation($"DBG: RenderSlotAvailable check, there are currently {count} instances in the bag"); +// #endif if (count >= ServerBootConfig.AYANOVA_REPORT_RENDERING_MAX_INSTANCES) { log.LogTrace($"RenderSlotAvailable there are no free report rendering slots available, current count is {count}, checking for expired slots to force closed"); @@ -71,9 +54,9 @@ namespace AyaNova.Util { if (i.Expires < dtNow) { -#if (DEBUG) - log.LogInformation($"DBG: RenderSlotAvailable attempting kill of expired process {i.ReporterProcessId}"); -#endif +// #if (DEBUG) +// log.LogInformation($"DBG: RenderSlotAvailable attempting kill of expired process {i.ReporterProcessId}"); +// #endif ForceCloseProcess(i, log); } } @@ -117,7 +100,6 @@ namespace AyaNova.Util } } - internal static void AddProcess(int processId) { _baginstances.Add(new ReportRenderInstanceInfo(processId)); @@ -135,87 +117,5 @@ namespace AyaNova.Util } } - - - // internal async static Task 
EnsureReporterAvailableAsync(ILogger log) - // { - // Process reportProcess = ReporterProcess(); - // if (reportProcess == null) - // { - // return; - // } - // //await it's completion in the specified timeout - // int HardTimeout = ServerBootConfig.AYANOVA_REPORT_RENDERING_TIMEOUT; - // //don't wait forever, hard cap of 3 minutes regardless of setting - // if (HardTimeout > 180000) HardTimeout = 180000; - // bool keepOnWaiting = true; - // while (keepOnWaiting) - // { - // //don't check continually - // await Task.Delay(500); - // //check process is still running - // if (reportProcess?.HasExited == false) - // { - // //time to kill it? - // if ((DateTime.UtcNow - Started).TotalMilliseconds > HardTimeout) - // { - // log.LogInformation($"Report processor did not complete in {HardTimeout}ms and will be force stopped"); - // reportProcess.Kill(); - // keepOnWaiting = false; - // } - // } - // else - // { - // log.LogDebug($"EnsureReporterAvailableAsync Reporter processor completed normally"); - // keepOnWaiting = false; - // } - // }; - // ReporterProcessId = -1; - // Started = DateTime.MinValue; - // return; - // } - - // internal static void RecordNewReportGeneratorProcess(int processId) - // { - // ReporterProcessId = processId; - // Started = DateTime.UtcNow; - // } - - // private static Process ReporterProcess() - // { - // if (ReporterProcessId == -1) return null; - // try - // { - // return Process.GetProcessById(ReporterProcessId); - // } - // catch (ArgumentException) - // { - // return null;//no process available / not running - // } - // } - - - /* - //Is the report generator (browser) already running? 
- if(ReportingProcessCache.ReporterProcess()!=null){ - //there is an existing process in action, let's wait for timeout seconds and kill it if it's still running before proceeding - //first check to see if it's still actually running or not: - var process = System.Diagnostics.Process.GetProcessById(ReportingProcessCache.ReporterProcessId); - // if (ChromiumProcessID > 0 && process?.HasExited == false) - // { - // log.LogError($"Error during render, Chromium process (pid {ChromiumProcessID}) still active, forcing it to stop now"); - // process.Kill(); - // } - - bool keepOnWaiting=true; - while(keepOnWaiting){ - var v= DateTime.UtcNow-ReportingProcessCache.Started; - if(v.TotalSeconds> ServerBootConfig.REPORT_RENDERING_OPERATION_TIMEOUT){ - - } - } - - } - */ }//eoc }//eons \ No newline at end of file