using System;
using System.Linq;
using System.Globalization;
using System.Text;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using Newtonsoft.Json.Linq;
using Microsoft.Extensions.Logging;
using Microsoft.EntityFrameworkCore;
using AyaNova.Util;
using AyaNova.Models;

namespace AyaNova.Biz
{
    /// <summary>
    /// Handles word breaking, processing keywords into the search dictionary
    /// and searching for results.
    /// </summary>
    public static class Search
    {
        /*
        ISSUES:
        none at the moment
        */

        #region Search and return results

        /// <summary>
        /// Class to hold a search result.
        /// </summary>
        public class SearchResult
        {
            public bool CJKIndex { get; set; }
            public List<string> StopWords { get; set; }

            public SearchResult()
            {
                CJKIndex = false;
                StopWords = new List<string>();
            }
        }

        /// <summary>
        /// Search entry point. Currently a stub that always returns an empty list.
        /// </summary>
        /// <returns>An empty result list.</returns>
        // NOTE(review): the element type of the returned list was lost when this file was
        // flattened; SearchResult is assumed here - confirm against callers.
        public static async Task<List<SearchResult>> DoSearch(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
        {
            List<SearchResult> ResultList = new List<SearchResult>();

            //fake await to clear error
            // NOTE(review): this also persists any changes pending on ct - confirm that is acceptable for a stub.
            await ct.SaveChangesAsync();

            return ResultList;
        }

        #endregion

        #region ProcessKeywords into Database

        /// <summary>
        /// Index the keywords of a newly created object.
        /// </summary>
        public static void ProcessNewObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
        {
            ProcessKeywords(ct, localeId, objectID, objectType, true, name, text);
        }

        /// <summary>
        /// Re-index the keywords of an updated object (existing search keys are removed first).
        /// </summary>
        public static void ProcessUpdatedObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
        {
            ProcessKeywords(ct, localeId, objectID, objectType, false, name, text);
        }

        /// <summary>
        /// Remove all search keys recorded for the given object.
        /// </summary>
        public static void ProcessDeletedObjectKeywords(AyContext ct, long objectID, AyaType objectType)
        {
            //Be careful in future, if you put ToString at the end of each object in the string interpolation
            //npgsql driver will assume it's a string and put quotes around it triggering an error that a string can't be compared to an int
            ct.Database.ExecuteSqlCommand($"delete from asearchkey where objectid={objectID} and objecttype={(int)objectType}");
        }

        /// <summary>
        /// Process the keywords into the dictionary.
        /// NOTE: the NAME parameter is in ADDITION to the name also being one of
        /// the strings passed in the text parameter.
        /// </summary>
        /// <param name="ct">Database context used for all reads and writes</param>
        /// <param name="localeId">Locale used to select stop words and the word breaking mode</param>
        /// <param name="objectID">Id of the object being indexed</param>
        /// <param name="objectType">Type of the object being indexed</param>
        /// <param name="newRecord">False to delete the object's existing search keys first</param>
        /// <param name="name">Object name; keywords that also occur in it are flagged InName</param>
        /// <param name="text">All the text to index</param>
        private static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string name, params string[] text)
        {
            //IF NOT NEW, DELETE ALL EXISTING ENTRIES FOR OBJECT TYPE AND ID
            if (!newRecord)
            {
                ProcessDeletedObjectKeywords(ct, objectID, objectType);
            }

            //BREAK STRING ARRAY INTO KEYWORD LIST
            List<string> KeyWordList = Break(localeId, text);

            //BREAK NAME STRING (a set, since it is only ever used for membership tests)
            HashSet<string> NameKeyWords = new HashSet<string>(Break(localeId, name));

            //EARLY EXIT IF NO KEYWORDS OR NAME RECORD TO PROCESS
            if (KeyWordList.Count == 0 && string.IsNullOrWhiteSpace(name))
            {
                return;
            }

            //BUILD A LIST OF MatchingDictionaryEntry items FOR THE MATCHING WORDS
            List<MatchingDictionaryEntry> MatchingKeywordIdList = new List<MatchingDictionaryEntry>();

            //SEARCH IN THE SEARCHDICTIONARY TABLE AND COLLECT ID'S OF ANY PRE-EXISTING IN DB KEYWORDS
            var ExistingKeywordMatches = ct.SearchDictionary
                .Where(m => KeyWordList.Contains(m.Word))
                .ToDictionary(m => m.Id, m => m.Word);

            //Put the matching keyword ID's into the list
            foreach (var K in ExistingKeywordMatches)
            {
                MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = K.Key, InName = NameKeyWords.Contains(K.Value) });
            }

            //ADD THE KEYWORDS THAT DO *NOT* HAVE MATCHES IN THE SEARCHDICTIONARY, REMEMBERING THE NEW ENTITIES.
            //They are tracked in our own list rather than read back from ct.SearchDictionary.Local because
            //Local holds every tracked entry (including the pre-existing matches loaded by the query above),
            //which would put the same word into the id list twice and create duplicate search keys.
            HashSet<string> ExistingWords = new HashSet<string>(ExistingKeywordMatches.Values);
            List<SearchDictionary> AddedWords = new List<SearchDictionary>();
            foreach (string KeyWord in KeyWordList)
            {
                if (!ExistingWords.Contains(KeyWord))
                {
                    SearchDictionary NewWord = new SearchDictionary() { Word = KeyWord };
                    ct.SearchDictionary.Add(NewWord);
                    AddedWords.Add(NewWord);
                }
            }

            //Save the context in order to get the id's of the new words added
            ct.SaveChanges();

            //Now add the id's of the newly created words to the matching keyword id list for this object
            foreach (SearchDictionary SD in AddedWords)
            {
                MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = SD.Id, InName = NameKeyWords.Contains(SD.Word) });
            }

            //CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
            foreach (MatchingDictionaryEntry E in MatchingKeywordIdList)
            {
                ct.SearchKey.Add(new SearchKey() { WordId = E.DictionaryId, InName = E.InName, ObjectId = objectID, ObjectType = objectType });
            }
            ct.SaveChanges();
        }//eoc

        /// <summary>
        /// Class to hold temporary list of matching id.
        /// </summary>
        public class MatchingDictionaryEntry
        {
            public bool InName { get; set; }
            public long DictionaryId { get; set; }

            public MatchingDictionaryEntry()
            {
                InName = false;
                DictionaryId = -1;
            }
        }

        #endregion

        #region Breaker

        /// <summary>
        /// Class to hold relevant locale data for breaking text.
        /// </summary>
        public class LocaleWordBreakingData
        {
            public bool CJKIndex { get; set; }
            public List<string> StopWords { get; set; }

            public LocaleWordBreakingData()
            {
                CJKIndex = false;
                StopWords = new List<string>();
            }
        }

        /// <summary>
        /// Get the current stop words and CJK index flag for the user's locale.
        /// </summary>
        /// <param name="localeId">Locale to look up; validated via LocaleBiz.EnsuredLocaleIdStatic</param>
        /// <param name="ct">Optional context; ServiceProviderProvider.DBContext is used when null</param>
        // NOTE(review): blocks on async LocaleBiz calls via .Result; kept because this method
        // and its callers are synchronous.
        private static LocaleWordBreakingData GetLocaleSearchData(long localeId, AyContext ct = null)
        {
            LocaleWordBreakingData LSD = new LocaleWordBreakingData();
            if (ct == null)
            {
                ct = ServiceProviderProvider.DBContext;
            }

            //Get stopwords
            //Validate locale id, if not right then use default instead
            var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
            Param.LocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, ct);
            foreach (string StopWordKey in new[] { "StopWords1", "StopWords2", "StopWords3", "StopWords4", "StopWords5", "StopWords6", "StopWords7" })
            {
                Param.Keys.Add(StopWordKey);
            }

            var Stops = LocaleBiz.GetSubsetStatic(Param).Result;

            foreach (var kvp in Stops)
            {
                //Each stopwords locale key is a space delimited list of words and in the case of an
                //empty locale string (i.e. StopWords7) its value is a single question mark
                if (kvp.Value != "?")
                {
                    LSD.StopWords.AddRange(kvp.Value.Split(" "));
                }
            }

            LSD.CJKIndex = LocaleBiz.GetCJKIndex(localeId, ct).Result;
            return LSD;
        }

        public enum TokenTypes
        { Nothing, Separator, CJK, Latin };

        //Longest word stored as a single keyword; longer runs are split at this length
        private const int MaxWordLength = 255;

        /// <summary>
        /// Take an array of strings and return the unique, lowercase,
        /// non stop word keywords found in them.
        ///
        /// Use Locale setting CJKIndex=true to handle Chinese, Japanese, Korean etc
        /// (languages with no easily identifiable word boundaries as in english)
        /// </summary>
        /// <param name="localeId">Locale used to select stop words and the word breaking mode</param>
        /// <param name="text">An array of 0 to * strings of text</param>
        /// <returns>List of keywords; empty (never containing an empty string) when nothing is indexable</returns>
        internal static List<string> Break(long localeId, params string[] text)
        {
            //RemoveEmptyEntries matters: "".Split(',') yields one empty string which would
            //otherwise be treated as a keyword and defeat callers' "no keywords" checks
            string[] Words = BreakCore(localeId, false, text).Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
            return new List<string>(Words);
        }

        /// <summary>
        /// Used to process a user's search phrase and preserve wild
        /// cards (%) entered.
        /// </summary>
        /// <param name="localeId">Locale used to select stop words and the word breaking mode</param>
        /// <param name="text">Search phrase text</param>
        /// <returns>Comma delimited keywords, empty string if none</returns>
        internal static string BreakSearchPhrase(long localeId, params string[] text)
        {
            return BreakCore(localeId, true, text);
        }

        /// <summary>
        /// Break the passed in text into unique lowercase keywords, drop the locale's
        /// stop words and return what is left as a comma delimited string.
        /// </summary>
        /// <param name="localeId">Locale used to select stop words and the word breaking mode</param>
        /// <param name="KeepWildCards">True to treat % as part of a word instead of as a separator</param>
        /// <param name="text">Strings to break; null or empty entries are skipped</param>
        /// <returns>Comma delimited keywords in first-seen order, empty string if none</returns>
        internal static string BreakCore(long localeId, bool KeepWildCards, params string[] text)
        {
            //Get stopwords and CJKIndex flag value
            LocaleWordBreakingData LocaleSearchData = GetLocaleSearchData(localeId);

            //Parsed words in first-seen order plus a set to cheaply ensure unique words only
            List<string> ParsedWords = new List<string>();
            HashSet<string> SeenWords = new HashSet<string>();
            StringBuilder sbWord = new StringBuilder();

            //Loop through each of the passed in strings (nothing to do for a null array)
            foreach (string s in text ?? new string[0])
            {
                if (string.IsNullOrEmpty(s))
                {
                    continue;
                }

                //get all the characters in a unicode compliant manner...
                TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
                TokenTypes LastToken = TokenTypes.Nothing;

                //Process each "character" (text element,glyph whatever) in the current string
                while (t.MoveNext())
                {
                    //get it as a character
                    // NOTE(review): only the first UTF-16 unit of the text element is examined
                    char c = t.GetTextElement()[0];

                    //In CJK mode anything outside the first 256 code points is indexed as n-grams;
                    //everything else (and everything in non CJK mode) is indexed as whole words
                    //so that regular english text within cjk text gets properly indexed.
                    //see: http://www.unicode.org/charts/index.html
                    bool TreatAsCJK = LocaleSearchData.CJKIndex && (int)c >= 256;

                    LastToken = TreatAsCJK
                        ? ConsumeCjk(c, LastToken, sbWord, ParsedWords, SeenWords)
                        : ConsumeLatin(c, KeepWildCards, LastToken, sbWord, ParsedWords, SeenWords);
                }

                //Flush out the last word
                FlushWord(sbWord, ParsedWords, SeenWords);
            }

            //bail early if there is nothing indexed
            if (ParsedWords.Count == 0)
            {
                return "";
            }

            //Add only non stopwords.
            //sometimes all the results are stop words so you end up with an empty string here
            HashSet<string> StopWords = new HashSet<string>(LocaleSearchData.StopWords);
            return string.Join(",", ParsedWords.Where(w => !StopWords.Contains(w)));
        }

        /// <summary>
        /// Move the word under construction (if any) into the word list,
        /// skipping it if already present, and start a new empty word.
        /// </summary>
        private static void FlushWord(StringBuilder sbWord, List<string> parsedWords, HashSet<string> seenWords)
        {
            if (sbWord.Length == 0)
            {
                return;
            }

            string Word = sbWord.ToString();
            if (seenWords.Add(Word))
            {
                parsedWords.Add(Word);
            }
            sbWord.Length = 0;
        }

        /// <summary>
        /// Whole-word tokenizer step: letters, digits and (optionally) % extend the
        /// current word, anything else ends it.
        /// </summary>
        /// <returns>The token type of the character just consumed</returns>
        private static TokenTypes ConsumeLatin(char c, bool keepWildCards, TokenTypes lastToken, StringBuilder sbWord, List<string> parsedWords, HashSet<string> seenWords)
        {
            bool Include = char.IsLetterOrDigit(c) || (keepWildCards && c == '%');
            if (!Include)
            {
                //Word boundary
                FlushWord(sbWord, parsedWords, seenWords);
                return TokenTypes.Separator;
            }

            //Start a new word if we're over the word length limit
            //or we are going from CJK to latin
            if (lastToken == TokenTypes.CJK || sbWord.Length >= MaxWordLength)
            {
                FlushWord(sbWord, parsedWords, seenWords);
            }

            //All latin text is converted to lower case
            // NOTE(review): char.ToLower uses the current culture
            sbWord.Append(char.ToLower(c));
            return TokenTypes.Latin;
        }

        /// <summary>
        /// CJK tokenizer step: consecutive letters are indexed as overlapping
        /// 2 character n-grams, anything that is not a letter ends the current word.
        /// </summary>
        /// <returns>The token type of the character just consumed</returns>
        private static TokenTypes ConsumeCjk(char c, TokenTypes lastToken, StringBuilder sbWord, List<string> parsedWords, HashSet<string> seenWords)
        {
            if (!char.IsLetter(c))
            {
                //Word boundary
                FlushWord(sbWord, parsedWords, seenWords);
                return TokenTypes.Separator;
            }

            if (sbWord.Length > 0)
            {
                if (lastToken == TokenTypes.Latin || sbWord.Length >= MaxWordLength)
                {
                    //going from latin TO CJK (or over the length limit): end the current word
                    FlushWord(sbWord, parsedWords, seenWords);
                }
                else if (lastToken == TokenTypes.CJK)
                {
                    //the previous CJK character is waiting so append the current character
                    //and flush the resultant 2 character n-gram
                    sbWord.Append(c);
                    System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                    FlushWord(sbWord, parsedWords, seenWords);
                }
            }

            //the current character always starts (or continues) the next word
            sbWord.Append(c);
            return TokenTypes.CJK;
        }

        #endregion

    }//eoc
}//eons