From 02d4084826ed577c578300396ef5a7b474fa75f1 Mon Sep 17 00:00:00 2001 From: John Cardinal Date: Tue, 18 Sep 2018 23:15:02 +0000 Subject: [PATCH] --- server/AyaNova/biz/LocaleBiz.cs | 10 + server/AyaNova/biz/Search.cs | 417 +++++++++++++++++++++++++++++++- 2 files changed, 422 insertions(+), 5 deletions(-) diff --git a/server/AyaNova/biz/LocaleBiz.cs b/server/AyaNova/biz/LocaleBiz.cs index 9a385791..e8b5fcaf 100644 --- a/server/AyaNova/biz/LocaleBiz.cs +++ b/server/AyaNova/biz/LocaleBiz.cs @@ -164,6 +164,16 @@ namespace AyaNova.Biz } + //Get the CJKIndex value for the locale specified + internal static async Task GetCJKIndex(long localeId, AyContext ct=null) + { + if(ct==null) + ct = ServiceProviderProvider.DBContext; + var ret = await ct.Locale.Where(x => x.Id == localeId).Select(m=>m.CjkIndex).SingleOrDefaultAsync(); + return ret; + } + + /// /// Get the value of the key provided in the default locale chosen /// diff --git a/server/AyaNova/biz/Search.cs b/server/AyaNova/biz/Search.cs index 39284ffa..34c601e1 100644 --- a/server/AyaNova/biz/Search.cs +++ b/server/AyaNova/biz/Search.cs @@ -13,12 +13,18 @@ namespace AyaNova.Biz //This class handles word breaking, processing keywords and searching for results public static class Search { + + //Initial keyword indexing consists of + //WordBreaker - break down into words + //ProcessKeywords into database + + /// /// Process the keywords into the dictionary /// public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name) { - var StopWords = GetLocaleSearchData(ct, localeId); + var LocaleSearchData = GetLocaleSearchData(ct, localeId); //Get CJK index bool flag. //TODO: should this be a property of the locale or a global setting as before?? 
@@ -53,8 +59,9 @@ namespace AyaNova.Biz //Get the current stopwords for the user's locale - private static List GetLocaleSearchData(AyContext ct, long localeId) + private static LocaleSearchData GetLocaleSearchData(AyContext ct, long localeId) { + LocaleSearchData LSD=new LocaleSearchData(); //Get stopwords //Validate locale id, if not right then use default instead var Param = new Api.Controllers.LocaleController.LocaleSubsetParam(); @@ -67,21 +74,421 @@ namespace AyaNova.Biz Param.Keys.Add("StopWords6"); Param.Keys.Add("StopWords7"); var Stops = LocaleBiz.GetSubsetStatic(Param).Result; - List StopWords = new List(); + foreach (KeyValuePair kvp in Stops) { //Each stopwords locale key is a space delimited list of words and in the case of an empty local string (i.e. StopWords7) it's value is a single question mark if (kvp.Value != "?") { - StopWords.AddRange(kvp.Value.Split(" ")); + LSD.StopWords.AddRange(kvp.Value.Split(" ")); } } - return StopWords; + + LSD.CJKIndex = LocaleBiz.GetCJKIndex(localeId,ct).Result; + return LSD; } + #region Breaker + + public enum TokenTypes + { Nothing, Separator, CJK, Latin }; + + /// + /// Take an array of strings and + /// return a single string + /// containing unique only, lowercase comma delimited + /// keywords suitable for passing to a + /// stored procedure or other function + /// + /// Use Locale setting CJKIndex=true to handle Chinese, Japanese, Korean etc + /// (languages with no easily identifiable word boundaries as in english) + /// + /// + + /// An array of 0 to * strings of text + /// + internal static string Break(params string[] text) + { + return BreakCore(false, text); + } + + /// + /// Used to Process users search phrase and preserve wild + /// cards entered + /// + /// + /// + internal static string BreakSearchPhrase(params string[] text) + { + return BreakCore(true, text); + } + + /// + /// Stop words list reset upon login or editing of localized text + /// used for eliminating noise words from search 
dictionary + /// + public static System.Collections.Generic.List StopList = null; + + internal static string BreakCore(bool KeepWildCards, params string[] text) + { + + ////case 1039 //log.Debug("Break"); + + #region stopwords + if (StopList == null) + { + StopList = new List(); + for (int stopkeys = 1; stopkeys < 8; stopkeys++) + { + MatchCollection mc = rxAllWords.Matches(LocalizedTextTable.GetLocalizedTextDirect("StopWords" + stopkeys.ToString())); + foreach (Match m in mc) + { + if (!string.IsNullOrEmpty(m.Value) && m.Value != "?" && !StopList.Contains(m.Value)) + StopList.Add(m.Value); + + } + } + } + #endregion + + bool CJK = GlobalSettings.CJKIndex; + int MAXWORDLENGTH = 255; + + StringBuilder sbResults = new StringBuilder(); + + //Hashtable to temporarily hold parsed words + //used to easily ensure unique words only + Hashtable ht = new Hashtable(); + + //Stuff required for creating xml fragment on the fly in memory (string) + StringBuilder sb = new StringBuilder(); + StringBuilder sbWord = new StringBuilder(); + System.IO.StringWriter sr = new System.IO.StringWriter(sb); + System.Xml.XmlTextWriter w = new System.Xml.XmlTextWriter(sr); + + w.Formatting = System.Xml.Formatting.Indented; + w.WriteStartElement("Items"); + + + //Loop through each of the passed in strings + foreach (string s in text) + { + if (s == null || s == "") continue; + //get all the characters in a unicode compliant manner... + TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s); + //start at the top + t.Reset(); + + TokenTypes LastToken = TokenTypes.Nothing; + + //Used by CJK + bool BasicLatinBlock = true; + + //Process each "character" (text element,glyph whatever) in the + //current string + while (t.MoveNext()) + { + //get it as a character + char c = t.GetTextElement()[0]; + + if (!CJK) + { + #region regular tokenizer + + //Is it a token we want to include? 
+ //Or a wildcard character + if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%')) + { + #region Include token + //All latin text is converted to lower case + c = char.ToLower(c); + + //Do we already have a word? + if (sbWord.Length > 0) + { + //Maybe we need to flush this word into the word list + //if we're over the word length limit + if (sbWord.Length >= MAXWORDLENGTH) + { + //flush away... + if (!ht.ContainsKey(sbWord.ToString())) + { + ht[sbWord.ToString()] = 1; + //sbTest.Append(sbWord.ToString()+"\r\n"); + } + sbWord.Length = 0; + sbWord.Append(c); + LastToken = TokenTypes.Latin; + continue; + + } + } + + //append character and go on to next one + sbWord.Append(c); + LastToken = TokenTypes.Latin; + continue; + #endregion + } + else + { + #region Word Boundary token + LastToken = TokenTypes.Separator; + if (sbWord.Length > 0) + { + //flush away... + if (!ht.ContainsKey(sbWord.ToString())) + { + ht[sbWord.ToString()] = 1; + //sbTest.Append(sbWord.ToString()+"\r\n"); + } + sbWord.Length = 0; + + continue; + + } + + #endregion + } + #endregion + } + else + { + #region CJK Tokenizer + + //Is it a basic latin character? (code point below 256, i.e. ASCII plus Latin-1) + //see: http://www.unicode.org/charts/index.html + //and here for a funky online viewer: + //http://www.fileformat.info/info/unicode/block/index.htm + //we need to know this so that regular English text + //within CJK text gets properly indexed as whole words + BasicLatinBlock = false; + if ((int)c < 256) BasicLatinBlock = true; + + if (BasicLatinBlock) + { + //Is it a token we want to include? + if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%')) + { + #region Latin Include token + //All latin text is converted to lower case + c = char.ToLower(c); + + //Do we already have a word? 
+ if (sbWord.Length > 0) + { + //Maybe we need to flush this word into the word list + //if we're over the word length limit or we are going from + //CJK to latin + if (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH) + { + //flush away... + if (!ht.ContainsKey(sbWord.ToString())) + { + ht[sbWord.ToString()] = 1; + //sbTest.Append(sbWord.ToString()+"\r\n"); + } + sbWord.Length = 0; + sbWord.Append(c); + LastToken = TokenTypes.Latin; + continue; + + } + } + + //append character and go on to next one + sbWord.Append(c); + LastToken = TokenTypes.Latin; + continue; + #endregion + } + else + { + #region Latin Word Boundary token + LastToken = TokenTypes.Separator; + if (sbWord.Length > 0) + { + //flush away... + if (!ht.ContainsKey(sbWord.ToString())) + { + ht[sbWord.ToString()] = 1; + //sbTest.Append(sbWord.ToString()+"\r\n"); + } + sbWord.Length = 0; + + continue; + + } + + #endregion + } + + } + else//CJK character + { + if (char.IsLetter(c) || (KeepWildCards && c == '%')) + { + #region CJK Include token + //Do we already have a word? + if (sbWord.Length > 0) + { + //Maybe we need to flush this word into the word list + //if we're over the word length limit or we are going from + //latin TO CJK + if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH) + { + //flush away... 
+ if (!ht.ContainsKey(sbWord.ToString())) + { + ht[sbWord.ToString()] = 1; + //sbTest.Append(sbWord.ToString()+"\r\n"); + } + sbWord.Length = 0; + sbWord.Append(c); + LastToken = TokenTypes.CJK; + continue; + + } + + if (LastToken == TokenTypes.CJK) + { + //we're here because there are more than zero characters already stored + //and the last was CJK so we need to append the current character + //and flush the resulting 2-character n-gram + sbWord.Append(c); + System.Diagnostics.Debug.Assert(sbWord.Length == 2); + if (!ht.ContainsKey(sbWord.ToString())) + { + ht[sbWord.ToString()] = 1; + //sbTest.Append(sbWord.ToString()+"\r\n"); + } + sbWord.Length = 0; + sbWord.Append(c); + LastToken = TokenTypes.CJK; + continue; + + } + } + + //append character and go on to next one + sbWord.Append(c); + LastToken = TokenTypes.CJK; + continue; + #endregion + + + } + else + { + #region CJK Word Boundary token + LastToken = TokenTypes.Separator; + if (sbWord.Length > 0) + { + //flush away... + if (!ht.ContainsKey(sbWord.ToString())) + { + ht[sbWord.ToString()] = 1; + //sbTest.Append(sbWord.ToString()+"\r\n"); + } + sbWord.Length = 0; + + continue; + + } + + #endregion + } + + + } + + + + + + + #endregion + } + } + + //Flush out the last word + if (sbWord.Length > 0) + { + //flush away... 
+ if (!ht.ContainsKey(sbWord.ToString())) + { + ht[sbWord.ToString()] = 1; + ////sbTest.Append(sbWord.ToString()+"\r\n"); + } + sbWord.Length = 0; + } + } + + + //bail early if there is nothing indexed + if (ht.Count == 0) return ""; + + if (AsXML) + { + //Make a return xml fragment + //from the word list + foreach (DictionaryEntry d in ht) + { + //Add only non stopwords + if (!StopList.Contains(d.Key.ToString())) + { + w.WriteStartElement("i"); + w.WriteAttributeString("w", d.Key.ToString()); + w.WriteEndElement(); + } + } + + w.WriteEndElement(); + sr.Close(); + return sr.ToString(); + } + else + { + //Make a return string array + //from the word list + foreach (DictionaryEntry d in ht) + { + //Add only non stopwords + if (!StopList.Contains(d.Key.ToString())) + { + sbResults.Append(d.Key.ToString()); + sbResults.Append(","); + + } + } + + //sometimes all the results are stop words so you end up + //here with nothing in sbResults. Removed some code that was + //causing a crash here + return sbResults.ToString().TrimEnd(','); + + } + //return sbTest.ToString(); + + } + + #endregion + + + public class LocaleSearchData + { + public bool CJKIndex { get; set; } + public List StopWords { get; set; } + public LocaleSearchData(){ + CJKIndex=false; + StopWords=new List(); + } + } + + }//eoc }//eons \ No newline at end of file