using System;
using System.Globalization;
using System.Text;
using System.Collections.Generic;
using System.IO;
using Newtonsoft.Json.Linq;
using Microsoft.Extensions.Logging;
using Microsoft.EntityFrameworkCore;
using AyaNova.Util;
using AyaNova.Models;

namespace AyaNova.Biz
{
    /// <summary>
    /// Handles word breaking, processing keywords and searching for results.
    ///
    /// Initial keyword indexing consists of:
    ///  - WordBreaker: break text down into words
    ///  - ProcessKeywords: store those words in the database
    /// </summary>
    public static class Search
    {
        #region ProcessKeywords into Database

        /// <summary>
        /// Index the keywords of a newly created object.
        /// </summary>
        /// <param name="ct">Database context</param>
        /// <param name="localeId">Locale whose stop words / CJK setting drive the word breaking</param>
        /// <param name="objectID">Id of the object being indexed</param>
        /// <param name="objectType">Type of the object being indexed</param>
        /// <param name="name">Name of the object (handled separately from the body text)</param>
        /// <param name="text">0 to * strings of text to index</param>
        public static void ProcessNewObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
        {
            ProcessKeywords(ct, localeId, objectID, objectType, true, name, text);
        }

        /// <summary>
        /// Re-index the keywords of an existing object; its previous search keys are removed first.
        /// </summary>
        /// <param name="ct">Database context</param>
        /// <param name="localeId">Locale whose stop words / CJK setting drive the word breaking</param>
        /// <param name="objectID">Id of the object being indexed</param>
        /// <param name="objectType">Type of the object being indexed</param>
        /// <param name="name">Name of the object (handled separately from the body text)</param>
        /// <param name="text">0 to * strings of text to index</param>
        public static void ProcessUpdatedObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
        {
            ProcessKeywords(ct, localeId, objectID, objectType, false, name, text);
        }

        /// <summary>
        /// Remove every search key row that belongs to the given object.
        /// </summary>
        /// <param name="ct">Database context</param>
        /// <param name="objectID">Id of the object whose keys are removed</param>
        /// <param name="objectType">Type of the object whose keys are removed</param>
        public static void ProcessDeletedObjectKeywords(AyContext ct, long objectID, AyaType objectType)
        {
            //Be careful in future, if you put ToString at the end of each object in the string interpolation
            //npgsql driver will assume it's a string and put quotes around it triggering an error that a string can't be compared to an int
            ct.Database.ExecuteSqlCommand($"delete from asearchkey where objectid={objectID} and objecttype={(int)objectType}");
        }

        /// <summary>
        /// Process the keywords into the dictionary.
        /// NOTE: not fully implemented yet, see the TODO items in the body.
        /// </summary>
        private static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string name, params string[] text)
        {
            //TODO: Code this, using method idea from v7 code and adding the handling of the new InName flag and Name separately.
            //Note that as initially coded in widget test class the context will be saved by the controller as it is also done with the event log
            //So theoretically I don't save here, but it may turn out that the code requires a save so in that case need to re-do the WidgetController calls to here to
            //account for the save

            //IF NOT NEW, DELETE ALL EXISTING ENTRIES FOR OBJECT TYPE AND ID
            if (!newRecord)
            {
                ProcessDeletedObjectKeywords(ct, objectID, objectType);
            }

            //BREAK STRING ARRAY INTO KEYWORD LIST
            List<string> KeyWordList = Break(localeId, text);

            //EARLY EXIT IF NO KEYWORDS OR NAME RECORD TO PROCESS
            if (KeyWordList.Count == 0 && string.IsNullOrWhiteSpace(name))
            {
                return;
            }

            //TODO: ITERATE ALL THE KEYWORDS, SEARCH IN THE SEARCHDICTIONARY TABLE AND COLLECT ID'S OF ANY PRE-EXISTING IN DB KEYWORDS

            //TODO: ITERATE THROUGH THE KEYWORDS THAT DO *NOT* HAVE MATCHES IN THE SEARCHDICTIONARY AND ADD THEM TO THE SEARCH DICTIONARY, COLLECTING THEIR ID'S

            //TODO: CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
        }

        #endregion

        #region Breaker

        /// <summary>
        /// Holds the locale data relevant to breaking text into words.
        /// </summary>
        public class LocaleWordBreakingData
        {
            /// <summary>True when text should be tokenized with the CJK (2 character n-gram) tokenizer</summary>
            public bool CJKIndex { get; set; }

            /// <summary>Noise words that are excluded from the keyword results</summary>
            public List<string> StopWords { get; set; }

            public LocaleWordBreakingData()
            {
                CJKIndex = false;
                StopWords = new List<string>();
            }
        }

        //Locale text keys that hold the stop word lists
        private static readonly string[] StopWordKeys =
        {
            "StopWords1", "StopWords2", "StopWords3", "StopWords4", "StopWords5", "StopWords6", "StopWords7"
        };

        //Longest word kept intact; longer runs of characters are split at this length
        private const int MAXWORDLENGTH = 255;

        /// <summary>
        /// Get the current stop words and CJKIndex flag for the user's locale.
        /// </summary>
        /// <param name="localeId">Locale to fetch the data for</param>
        /// <param name="ct">Database context; when null ServiceProviderProvider.DBContext is used</param>
        private static LocaleWordBreakingData GetLocaleSearchData(long localeId, AyContext ct = null)
        {
            LocaleWordBreakingData LSD = new LocaleWordBreakingData();
            if (ct == null)
            {
                ct = ServiceProviderProvider.DBContext;
            }

            //Get stopwords
            //Validate locale id, if not right then use default instead
            var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
            Param.LocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, ct);
            foreach (string key in StopWordKeys)
            {
                Param.Keys.Add(key);
            }

            //NOTE: .Result blocks on the async locale calls; kept because this method's callers are synchronous
            var Stops = LocaleBiz.GetSubsetStatic(Param).Result;

            foreach (var kvp in Stops)
            {
                //Each stopwords locale key is a space delimited list of words and in the case of an empty locale string
                //(i.e. StopWords7) its value is a single question mark
                if (!string.IsNullOrEmpty(kvp.Value) && kvp.Value != "?")
                {
                    LSD.StopWords.AddRange(kvp.Value.Split(" "));
                }
            }

            //NOTE(review): this uses the raw localeId rather than the ensured one above - confirm that is intended
            LSD.CJKIndex = LocaleBiz.GetCJKIndex(localeId, ct).Result;
            return LSD;
        }

        public enum TokenTypes { Nothing, Separator, CJK, Latin };

        /// <summary>
        /// Take an array of strings and return a list of unique, lowercase
        /// keywords with the stop words removed.
        ///
        /// Use Locale setting CJKIndex=true to handle Chinese, Japanese, Korean etc
        /// (languages with no easily identifiable word boundaries as in english)
        /// </summary>
        /// <param name="localeId">Locale whose stop words / CJK setting are used</param>
        /// <param name="text">An array of 0 to * strings of text</param>
        /// <returns>List of keywords; empty (never a single empty string) when nothing was indexed</returns>
        internal static List<string> Break(long localeId, params string[] text)
        {
            string keywords = BreakCore(localeId, false, text);

            //"".Split(',') yields one empty string which would look like a keyword to callers
            if (keywords.Length == 0)
            {
                return new List<string>();
            }

            return new List<string>(keywords.Split(','));
        }

        /// <summary>
        /// Used to process a user's search phrase and preserve the wild cards (%) entered.
        /// </summary>
        /// <param name="localeId">Locale whose stop words / CJK setting are used</param>
        /// <param name="text">An array of 0 to * strings of text</param>
        /// <returns>Comma delimited keywords, empty string when there are none</returns>
        internal static string BreakSearchPhrase(long localeId, params string[] text)
        {
            return BreakCore(localeId, true, text);
        }

        /// <summary>
        /// Optional extra stop word list. Nothing in this class assigns it; when it is non-null its
        /// words are excluded from the results in addition to the locale's own stop words.
        /// </summary>
        public static System.Collections.Generic.List<string> StopList = null;

        /// <summary>
        /// Break the passed in text into unique keywords and return them comma delimited,
        /// in order of first appearance, with stop words removed.
        /// </summary>
        /// <param name="localeId">Locale whose stop words / CJK setting are used</param>
        /// <param name="KeepWildCards">True to treat % as part of a word instead of a separator</param>
        /// <param name="text">An array of 0 to * strings of text; null or empty entries are skipped</param>
        /// <returns>Comma delimited keywords, empty string when there are none</returns>
        internal static string BreakCore(long localeId, bool KeepWildCards, params string[] text)
        {
            //Nothing to break, don't bother fetching the locale data
            if (text == null || text.Length == 0)
            {
                return "";
            }

            //Get stopwords and CJKIndex flag value
            LocaleWordBreakingData LSD = GetLocaleSearchData(localeId);

            //Parsed words in order of first appearance, the set is used to easily ensure unique words only
            List<string> parsedWords = new List<string>();
            HashSet<string> seenWords = new HashSet<string>();

            //Loop through each of the passed in strings
            foreach (string s in text)
            {
                if (string.IsNullOrEmpty(s))
                {
                    continue;
                }
                Tokenize(s, LSD.CJKIndex, KeepWildCards, seenWords, parsedWords);
            }

            //bail early if there is nothing indexed
            if (parsedWords.Count == 0)
            {
                return "";
            }

            //Stop words come from the locale; the static StopList is honoured as well when something has set it
            HashSet<string> stopWords = new HashSet<string>(LSD.StopWords ?? new List<string>());
            if (StopList != null)
            {
                stopWords.UnionWith(StopList);
            }

            //Add only non stopwords
            List<string> keywords = new List<string>(parsedWords.Count);
            foreach (string word in parsedWords)
            {
                if (!stopWords.Contains(word))
                {
                    keywords.Add(word);
                }
            }

            //sometimes all the results are stop words so you end up
            //here with nothing to return, Join then yields an empty string
            return string.Join(",", keywords);
        }

        /// <summary>
        /// Tokenize a single string, adding each not yet seen word to <paramref name="words"/>.
        /// </summary>
        /// <param name="source">Non empty string to tokenize</param>
        /// <param name="cjkIndex">True to index characters outside the first 256 code points as 2 character n-grams</param>
        /// <param name="keepWildCards">True to treat % as part of a word instead of a separator</param>
        /// <param name="seen">Set of the words collected so far</param>
        /// <param name="words">Words collected so far in order of first appearance</param>
        private static void Tokenize(string source, bool cjkIndex, bool keepWildCards, HashSet<string> seen, List<string> words)
        {
            StringBuilder word = new StringBuilder();
            TokenTypes lastToken = TokenTypes.Nothing;

            //get all the characters in a unicode compliant manner...
            TextElementEnumerator elements = StringInfo.GetTextElementEnumerator(source);

            //Process each "character" (text element, glyph whatever) in the current string
            while (elements.MoveNext())
            {
                //get it as a character (only the first char of a multi char text element is looked at)
                char c = elements.GetTextElement()[0];

                //When CJK indexing we need to know if it's a basic latin character (ascii basically)
                //so that regular english text within cjk text gets properly indexed as whole words
                //see: http://www.unicode.org/charts/index.html
                //Without CJK indexing every character goes through the regular tokenizer
                bool regularCharacter = !cjkIndex || (int)c < 256;

                if (regularCharacter)
                {
                    if (IsIndexable(c, keepWildCards))
                    {
                        //Flush the current word if we're over the word length limit
                        //or we are going from CJK to latin
                        if (lastToken == TokenTypes.CJK || word.Length >= MAXWORDLENGTH)
                        {
                            FlushWord(word, seen, words);
                        }

                        //All latin text is converted to lower case
                        word.Append(char.ToLower(c));
                        lastToken = TokenTypes.Latin;
                    }
                    else
                    {
                        //Word boundary
                        lastToken = TokenTypes.Separator;
                        FlushWord(word, seen, words);
                    }
                }
                else if (char.IsLetter(c))
                {
                    //CJK character (the % wildcard is below 256 so it can never arrive here)
                    if (word.Length > 0)
                    {
                        if (lastToken == TokenTypes.Latin || word.Length >= MAXWORDLENGTH)
                        {
                            //going from latin to CJK or over the word length limit
                            FlushWord(word, seen, words);
                        }
                        else if (lastToken == TokenTypes.CJK)
                        {
                            //there is a CJK character already stored so append the current
                            //character and flush the resultant 2 character n-gram
                            word.Append(c);
                            System.Diagnostics.Debug.Assert(word.Length == 2);
                            FlushWord(word, seen, words);
                        }
                    }

                    //the current character starts the next word / n-gram
                    word.Append(c);
                    lastToken = TokenTypes.CJK;
                }
                else
                {
                    //CJK word boundary
                    lastToken = TokenTypes.Separator;
                    FlushWord(word, seen, words);
                }
            }

            //Flush out the last word
            FlushWord(word, seen, words);
        }

        /// <summary>
        /// True when the character belongs inside a word: a letter, a digit or, when requested, the % wildcard.
        /// </summary>
        private static bool IsIndexable(char c, bool keepWildCards)
        {
            return char.IsLetterOrDigit(c) || (keepWildCards && c == '%');
        }

        /// <summary>
        /// Move the word being built (if any) into the word list when it has not been seen before, then reset the builder.
        /// </summary>
        private static void FlushWord(StringBuilder word, HashSet<string> seen, List<string> words)
        {
            if (word.Length == 0)
            {
                return;
            }

            string value = word.ToString();
            if (seen.Add(value))
            {
                words.Add(value);
            }
            word.Length = 0;
        }

        #endregion

    }//eoc
}//eons