From fc95aedaac2d3d4eb05bca27bd07818438efc472 Mon Sep 17 00:00:00 2001 From: John Cardinal Date: Mon, 18 May 2020 22:41:39 +0000 Subject: [PATCH] Search indexing improvements --- .vscode/launch.json | 2 +- server/AyaNova/biz/Search.cs | 97 ++++++++----------- .../SearchTranslationWordBreakDataCache.cs | 74 ++++++++++++++ 3 files changed, 118 insertions(+), 55 deletions(-) create mode 100644 server/AyaNova/biz/SearchTranslationWordBreakDataCache.cs diff --git a/.vscode/launch.json b/.vscode/launch.json index eeacb75a..1d722811 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -50,7 +50,7 @@ "AYANOVA_FOLDER_USER_FILES": "c:\\temp\\RavenTestData\\userfiles", "AYANOVA_FOLDER_BACKUP_FILES": "c:\\temp\\RavenTestData\\backupfiles", "AYANOVA_METRICS_USE_INFLUXDB": "false", - "AYANOVA_SERVER_TEST_MODE":"true", + "AYANOVA_SERVER_TEST_MODE":"false", "AYANOVA_SERVER_TEST_MODE_SEEDLEVEL":"small", "AYANOVA_SERVER_TEST_MODE_TZ_OFFSET":"-7" diff --git a/server/AyaNova/biz/Search.cs b/server/AyaNova/biz/Search.cs index 69132a5c..391a9a5e 100644 --- a/server/AyaNova/biz/Search.cs +++ b/server/AyaNova/biz/Search.cs @@ -910,19 +910,19 @@ cache or provide directly the translation to save time repeatedly fetching it wh #region Breaker - //Class to hold relevant translation data for breaking text - public class TranslationWordBreakingData - { - public bool CJKIndex { get; set; } - public List StopWords { get; set; } - public TranslationWordBreakingData() - { - CJKIndex = false; - StopWords = new List(); - } - } + // //Class to hold relevant translation data for breaking text + // public class TranslationWordBreakingData + // { + // public bool CJKIndex { get; set; } + // public List StopWords { get; set; } + // public TranslationWordBreakingData() + // { + // CJKIndex = false; + // StopWords = new List(); + // } + // } - private static Dictionary translationWordBreakingDataCache = new Dictionary(); + // private static Dictionary translationWordBreakingDataCache = new 
Dictionary(); // //called at startup to populate cache //WAS GOING TO ADD THIS IN RESPONSE TO AN ISSUE WITH EXCEPTION ATTEMPTING TO ADD ALREADY EXISTING DICTIONARY ID 1, BUT IT NEVER HAPPENED AGAIN, SO :SHRUGEMOJI: @@ -940,36 +940,36 @@ cache or provide directly the translation to save time repeatedly fetching it wh // }).ToListAsync(); // TranslationWordBreakingDataCache.Add(TranslationId, await GetTranslationSearchDataAsync(TranslationId)); // } - internal static async Task GetTranslationSearchDataAsync(long translationId, AyContext ct = null) - { - TranslationWordBreakingData LSD = new TranslationWordBreakingData(); - if (ct == null) - ct = ServiceProviderProvider.DBContext; - //Get stopwords - //Validate translation id, if not right then use default instead - var Param = new List(); - translationId = await TranslationBiz.ReturnSpecifiedTranslationIdIfExistsOrDefaultTranslationId(translationId, ct); - Param.Add("StopWords1"); - Param.Add("StopWords2"); - Param.Add("StopWords3"); - Param.Add("StopWords4"); - Param.Add("StopWords5"); - Param.Add("StopWords6"); - Param.Add("StopWords7"); - var Stops = await TranslationBiz.GetSubsetStaticAsync(Param, translationId); + // internal static async Task GetTranslationSearchDataAsync(long translationId, AyContext ct = null) + // { + // TranslationWordBreakingData LSD = new TranslationWordBreakingData(); + // if (ct == null) + // ct = ServiceProviderProvider.DBContext; + // //Get stopwords + // //Validate translation id, if not right then use default instead + // var Param = new List(); + // translationId = await TranslationBiz.ReturnSpecifiedTranslationIdIfExistsOrDefaultTranslationId(translationId, ct); + // Param.Add("StopWords1"); + // Param.Add("StopWords2"); + // Param.Add("StopWords3"); + // Param.Add("StopWords4"); + // Param.Add("StopWords5"); + // Param.Add("StopWords6"); + // Param.Add("StopWords7"); + // var Stops = await TranslationBiz.GetSubsetStaticAsync(Param, translationId); - foreach (KeyValuePair kvp in 
Stops) - { - //Each stopwords translation key is a space delimited list of words and in the case of an empty local string (i.e. StopWords7) it's value is a single question mark - if (kvp.Value != "?") - { - LSD.StopWords.AddRange(kvp.Value.Split(" ")); - } - } + // foreach (KeyValuePair kvp in Stops) + // { + // //Each stopwords translation key is a space delimited list of words and in the case of an empty local string (i.e. StopWords7) it's value is a single question mark + // if (kvp.Value != "?") + // { + // LSD.StopWords.AddRange(kvp.Value.Split(" ")); + // } + // } - LSD.CJKIndex = await TranslationBiz.GetCJKIndexAsync(translationId, ct); - return LSD; - } + // LSD.CJKIndex = await TranslationBiz.GetCJKIndexAsync(translationId, ct); + // return LSD; + // } public enum TokenTypes { Nothing, Separator, CJK, Latin }; @@ -1012,23 +1012,12 @@ cache or provide directly the translation to save time repeatedly fetching it wh return await BreakCoreAsync(translationId, true, textStrings, true); } - /// - /// Stop words list reset upon login or editing of Translation text - /// used for eliminating noise words from search dictionary - /// - + internal static async Task> BreakCoreAsync(long translationId, bool KeepWildCards, List textStrings, bool ignoreStopWords = false) { - //For stopwords and CJKIndex flag value - //if not provided (will be provided by seeder for performance but normally never) then fetch - - if (!translationWordBreakingDataCache.ContainsKey(translationId)) - { - translationWordBreakingDataCache.Add(translationId, await GetTranslationSearchDataAsync(translationId)); - } - var translationWordBreakData = translationWordBreakingDataCache[translationId]; - + //For stopwords and CJKIndex flag value + var translationWordBreakData = await SearchTranslationWordBreakDataCache.GetWordBreakData(translationId); int MAXWORDLENGTH = 255; int MINWORDLENGTH = 2;//A word isn't a word unless it's got at least two characters in it diff --git 
a/server/AyaNova/biz/SearchTranslationWordBreakDataCache.cs b/server/AyaNova/biz/SearchTranslationWordBreakDataCache.cs
new file mode 100644
index 00000000..718a55da
--- /dev/null
+++ b/server/AyaNova/biz/SearchTranslationWordBreakDataCache.cs
@@ -0,0 +1,95 @@
+using System.Threading.Tasks;
+using System.Threading;
+using System.Collections.Generic;
+using AyaNova.Util;
+using AyaNova.Models;
+namespace AyaNova.Biz
+{
+
+    /// <summary>
+    /// Static cache of the word breaking data (stop words and CJK index flag) used when breaking text for search,
+    /// keyed by the translation id it was requested for
+    /// </summary>
+    public class SearchTranslationWordBreakDataCache
+    {
+        //Gate around theCache; SemaphoreSlim rather than lock because the cache fill awaits inside the guarded region
+        private static readonly SemaphoreSlim semaphoreSlim = new SemaphoreSlim(1, 1);
+        private static readonly Dictionary<long, TranslationWordBreakingData> theCache = new Dictionary<long, TranslationWordBreakingData>();
+        public SearchTranslationWordBreakDataCache() { }
+
+        /// <summary>
+        /// Get the word breaking data for a translation, fetching and caching it the first time that id is requested
+        /// </summary>
+        /// <param name="id">Translation id, used as the cache key</param>
+        /// <returns>Word breaking data cached for that id</returns>
+        public static async Task<TranslationWordBreakingData> GetWordBreakData(long id)
+        {
+            await semaphoreSlim.WaitAsync();
+            try
+            {
+                //Check for the requested id, not a fixed key of 1: testing key 1 refetched on every call until
+                //translation 1 was cached and after that threw KeyNotFoundException for any id not already stored
+                if (!theCache.TryGetValue(id, out TranslationWordBreakingData data))
+                {
+                    data = await GetTranslationSearchDataAsync(id);
+                    theCache[id] = data;
+                }
+                return data;
+            }
+            finally
+            {
+                semaphoreSlim.Release();
+            }
+        }
+
+        /// <summary>
+        /// Build the word breaking data for a translation: its stop words and its CJK index flag
+        /// </summary>
+        /// <param name="translationId">Requested translation id; replaced by whatever ReturnSpecifiedTranslationIdIfExistsOrDefaultTranslationId returns for it</param>
+        /// <returns>Newly built word breaking data</returns>
+        internal static async Task<TranslationWordBreakingData> GetTranslationSearchDataAsync(long translationId)
+        {
+            TranslationWordBreakingData LSD = new TranslationWordBreakingData();
+            AyContext ct = ServiceProviderProvider.DBContext;
+            //Get stopwords
+            //Validate translation id, if not right then use default instead
+            var Param = new List<string>();
+            translationId = await TranslationBiz.ReturnSpecifiedTranslationIdIfExistsOrDefaultTranslationId(translationId, ct);
+            Param.Add("StopWords1");
+            Param.Add("StopWords2");
+            Param.Add("StopWords3");
+            Param.Add("StopWords4");
+            Param.Add("StopWords5");
+            Param.Add("StopWords6");
+            Param.Add("StopWords7");
+            var Stops = await TranslationBiz.GetSubsetStaticAsync(Param, translationId);
+
+            foreach (var kvp in Stops)
+            {
+                //Each stopwords translation key is a space delimited list of words and in the case of an empty local string (i.e. StopWords7) its value is a single question mark
+                if (kvp.Value != "?")
+                {
+                    LSD.StopWords.AddRange(kvp.Value.Split(" "));
+                }
+            }
+
+            LSD.CJKIndex = await TranslationBiz.GetCJKIndexAsync(translationId, ct);
+            return LSD;
+        }
+
+        //Class to hold relevant translation data for breaking text
+        public class TranslationWordBreakingData
+        {
+            //Set from TranslationBiz.GetCJKIndexAsync when the data is built; false until then
+            public bool CJKIndex { get; set; }
+            //Words gathered from the StopWords1 to StopWords7 translation keys; empty until filled
+            public List<string> StopWords { get; set; }
+            public TranslationWordBreakingData()
+            {
+                CJKIndex = false;
+                StopWords = new List<string>();
+            }
+        }
+
+    }//eoc
+}//eons
\ No newline at end of file