using System;
using System.Linq;
using System.Globalization;
using System.Text;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using Newtonsoft.Json.Linq;
using Microsoft.Extensions.Logging;
using Microsoft.EntityFrameworkCore;
using AyaNova.Util;
using AyaNova.Models;

namespace AyaNova.Biz
{
    //This class handles word breaking, processing keywords and searching for results
    public static class Search
    {
        /*
        ISSUES:
        none at the moment
        */

        #region Search and return results

        /*
        Requirements:

        INPUT PARAMETERS
        - Search phrase (with wildcard support)
            - Can be empty if tags are specified, no tags and no phrase is an error condition
        - ObjectType: only return results for objects of this type
        - InName: flag that indicates only search in names
        - Tag ids that are also on result objects
            - Can be empty if a phrase is specified

        ACTION
        Find search matches, then find tag matches then intersect, then sort and return
        Filter OUT results that user is not permitted to read

        //TODO: proper testing of searching
        - SAMPLE DATA: Need a huge amount of sample data indexed to load test it
        - INDEXES: play with it and see what works best

        OUTPUT FORMAT
        - No localized text, up to client
        - Name of object in return result
        - Object Type and ID in return result
        - Group results by object type, then by object ID descending which will result in natural most recently created order

        result:[
            {
                name:"blah",
                type:2,
                id:210
            },
        ]
        */

        /// <summary>
        /// Class to hold search request parameters
        /// </summary>
        public class SearchRequestParameters
        {
            /// <summary>Search phrase; '*' is accepted as a wildcard. May be empty when tags are supplied.</summary>
            public string Phrase { get; set; }

            /// <summary>When true only keywords flagged as being in the object name are searched.</summary>
            public bool NameOnly { get; set; }

            /// <summary>Restrict results to this object type; AyaType.NoType means no restriction.</summary>
            public AyaType TypeOnly { get; set; }

            /// <summary>Ids of tags that must all be present on a result object. May be empty when a phrase is supplied.</summary>
            public List<long> Tags { get; set; }

            public SearchRequestParameters()
            {
                NameOnly = false;
                TypeOnly = AyaType.NoType;
                Tags = new List<long>();
            }

            /// <summary>
            /// True when the request contains a non blank phrase or at least one tag
            /// </summary>
            public bool IsValid
            {
                get
                {
                    //has a phrase?
                    if (!string.IsNullOrWhiteSpace(this.Phrase))
                    {
                        return true;
                    }

                    //has tags? (Tags has a public setter so it can be null, e.g. after deserialization)
                    return this.Tags != null && this.Tags.Count > 0;
                }
            }
        }

        /// <summary>
        /// Class to hold search result returned to client
        /// </summary>
        public class SearchResult
        {
            public string Name { get; set; }
            public AyaType Type { get; set; }
            public long Id { get; set; }
        }

        /// <summary>
        /// Search the keyword index and / or tag maps and return the matching objects.
        /// Phrase matches are found first, then restricted to objects carrying every requested tag;
        /// when no phrase is given every object carrying all requested tags is returned.
        /// Unless it is a name only search, objects of a type the roles can not read in full are removed.
        /// Results are ordered by object type then by object id descending.
        /// </summary>
        /// <param name="ct">Database context used for all queries</param>
        /// <param name="localeId">Locale used for word breaking the phrase</param>
        /// <param name="currentUserRoles">Roles checked for the right to read the full record of each matching type</param>
        /// <param name="searchParameters">Phrase, tags and filters; must contain a phrase or tags</param>
        /// <returns>Name, type and id of each matching object; empty list when nothing matches</returns>
        /// <exception cref="ArgumentNullException">searchParameters is null</exception>
        /// <exception cref="ArgumentException">searchParameters contains neither a phrase nor tags</exception>
        public static async Task<List<SearchResult>> DoSearch(AyContext ct, long localeId, AuthorizationRoles currentUserRoles, SearchRequestParameters searchParameters)
        {
            if (searchParameters == null)
            {
                throw new ArgumentNullException(nameof(searchParameters));
            }

            if (!searchParameters.IsValid)
            {
                throw new System.ArgumentException("Search::DoSearch - Search request parameters must contain a phrase or tags");
            }

            bool HasPhrase = !string.IsNullOrWhiteSpace(searchParameters.Phrase);

            //Duplicate tag ids would make the "has all tags" count comparisons below impossible to satisfy
            List<long> TagIds = searchParameters.Tags == null
                ? new List<long>()
                : searchParameters.Tags.Distinct().ToList();

            //list to hold temporary search/tag hits
            List<AyaTypeId> MatchingObjects = new List<AyaTypeId>();

            //IF PHRASE SPECIFIED
            if (HasPhrase)
            {
                MatchingObjects = await FindPhraseMatchesAsync(ct, localeId, searchParameters);
            }

            //IF TAGS SPECIFIED
            if (TagIds.Count > 0)
            {
                if (HasPhrase)
                {
                    //TAGS PLUS PHRASE SEARCH WITH NON MATCHING TAGS EXCLUSIVE
                    MatchingObjects = await FilterByAllTagsAsync(ct, MatchingObjects, TagIds);
                }
                else
                {
                    //TAGS ONLY SEARCH (NO PHRASE) ALL FULL MATCHES ARE INCLUSIVE
                    MatchingObjects = await FindObjectsWithAllTagsAsync(ct, TagIds);

                    //The phrase path filters by type in its query, the tag path has to do it here
                    if (searchParameters.TypeOnly != AyaType.NoType)
                    {
                        MatchingObjects = MatchingObjects.Where(x => x.ObjectType == searchParameters.TypeOnly).ToList();
                    }
                }
            }

            //REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
            //If it's a name only search then all is allowed
            //If it's not a name only search then rights need to be checked for full read because even if it's just a tags search that's part of the full record of the object
            if (!searchParameters.NameOnly)
            {
                MatchingObjects = MatchingObjects
                    .Where(x => AyaNova.Api.ControllerHelpers.Authorized.IsAuthorizedToReadFullRecord(currentUserRoles, x.ObjectType))
                    .ToList();
            }

            //Group by object type, then by object id descending (natural most recently created order)
            //and build the return list from the remaining matching objects
            List<SearchResult> ResultList = new List<SearchResult>(MatchingObjects.Count);
            foreach (AyaTypeId i in MatchingObjects.OrderBy(x => x.ObjectType).ThenByDescending(x => x.ObjectId))
            {
                SearchResult SR = new SearchResult();
                SR.Name = BizObjectNameFetcher.Name(i, ct);
                SR.Id = i.ObjectId;
                SR.Type = i.ObjectType;
                ResultList.Add(SR);
            }

            return ResultList;
        }

        /// <summary>
        /// Find the distinct objects that are indexed under any dictionary word matching a term of the phrase,
        /// honouring the NameOnly and TypeOnly filters.
        /// </summary>
        private static async Task<List<AyaTypeId>> FindPhraseMatchesAsync(AyContext ct, long localeId, SearchRequestParameters searchParameters)
        {
            //Replace wildcard * with % as breakcore expects sql style wildcards
            //(done on a copy so the caller's request object is not modified)
            string Phrase = searchParameters.Phrase.Replace("*", "%");

            //BREAK SEARCH PHRASE INTO SEPARATE TERMS
            List<string> PhraseItems = BreakSearchPhrase(localeId, Phrase);

            //SPLIT OUT WILDCARDS FROM NON WILDCARDS
            List<string> WildCardSearchTerms = PhraseItems.Where(x => x.Contains("%")).ToList();
            List<string> RegularSearchTerms = PhraseItems.Where(x => !x.Contains("%")).ToList();

            //Set so a word matched by more than one term is only collected once
            HashSet<long> DictionaryIds = new HashSet<long>();

            //GET LIST OF DICTIONARY ID'S THAT MATCH REGULAR SEARCH TERMS
            if (RegularSearchTerms.Count > 0)
            {
                DictionaryIds.UnionWith(await ct.SearchDictionary
                    .Where(m => RegularSearchTerms.Contains(m.Word))
                    .Select(m => m.Id)
                    .ToListAsync());
            }

            //GET LIST OF DICTIONARY ID'S THAT MATCH WILDCARD SEARCH TERMS
            //The term is already a sql LIKE pattern: the breaker only lets letters, digits and % through
            //so there is nothing to escape and one LIKE covers contains (%x%), starts with (x%),
            //ends with (%x) and interior (x%y) wildcards alike
            //NOTE(review): a term consisting only of % matches the entire dictionary
            foreach (string WildCardSearchTerm in WildCardSearchTerms)
            {
                string Pattern = WildCardSearchTerm;
                DictionaryIds.UnionWith(await ct.SearchDictionary
                    .Where(m => EF.Functions.Like(m.Word, Pattern))
                    .Select(m => m.Id)
                    .ToListAsync());
            }

            //No dictionary words matched so no object can match
            if (DictionaryIds.Count == 0)
            {
                return new List<AyaTypeId>();
            }

            //SEARCH SEARCHKEY FOR MATCHING WORDS AND OPTIONALLY TYPE AND INNAME
            List<long> DictionaryMatches = DictionaryIds.ToList();
            IQueryable<SearchKey> q = ct.SearchKey.Where(x => DictionaryMatches.Contains(x.WordId));

            //In name?
            //(Where returns a new query, it has to be assigned back or the filter is lost)
            if (searchParameters.NameOnly)
            {
                q = q.Where(m => m.InName == true);
            }

            //Of type?
            if (searchParameters.TypeOnly != AyaType.NoType)
            {
                AyaType TypeFilter = searchParameters.TypeOnly;
                q = q.Where(m => m.ObjectType == TypeFilter);
            }

            //NOTE(review): this returns objects that contain ANY of the matched words;
            //requiring ALL terms would need a per term match, confirm which is intended
            var SearchMatches = await q
                .Select(x => new { x.ObjectType, x.ObjectId })
                .Distinct()
                .ToListAsync();

            return SearchMatches.Select(x => new AyaTypeId(x.ObjectType, x.ObjectId)).ToList();
        }

        /// <summary>
        /// Find every object that carries all of the specified tags.
        /// tagIds must be non empty and free of duplicates.
        /// </summary>
        private static async Task<List<AyaTypeId>> FindObjectsWithAllTagsAsync(AyContext ct, List<long> tagIds)
        {
            List<AyaTypeId> Matches = new List<AyaTypeId>();
            Dictionary<long, long> TagCounts = new Dictionary<long, long>();

            //1) get counts for each tag specified from tagmap, if any are zero then none match and can bail early
            foreach (long SearchTagId in tagIds)
            {
                long MatchTagCount = await ct.TagMap.Where(m => m.TagId == SearchTagId).LongCountAsync();

                //zero tags matching here at any point means no results for the entire search and we can bail
                if (MatchTagCount == 0)
                {
                    return Matches;
                }

                //Save the matching count
                TagCounts.Add(SearchTagId, MatchTagCount);
            }

            //2) find smallest count match so we are working with the shortest list first
            long ShortestMatchingTag = TagCounts.OrderBy(x => x.Value).First().Key;

            //3) Generate the shortlist of items that match the shortest tag list
            List<TagMap> ShortList = await ct.TagMap.Where(x => x.TagId == ShortestMatchingTag).ToListAsync();

            //4) Iterate the shortlist and see if each item matches all other tags specified
            //if it does then put it into the matching objects list for return
            foreach (TagMap t in ShortList)
            {
                //the shortest tag is already known to match
                int matchCount = 1;

                foreach (long TagId in tagIds)
                {
                    //skipping already matched shortest tag
                    if (TagId == ShortestMatchingTag)
                    {
                        continue;
                    }

                    //Ok, does this object have this tag?
                    bool HasTag = await ct.TagMap
                        .Where(x => x.TagToObjectId == t.TagToObjectId && x.TagToObjectType == t.TagToObjectType && x.TagId == TagId)
                        .AnyAsync();
                    if (HasTag)
                    {
                        matchCount++;
                    }
                }

                //does it match all tags?
                if (matchCount == tagIds.Count)
                {
                    Matches.Add(new AyaTypeId(t.TagToObjectType, t.TagToObjectId));
                }
            }

            return Matches;
        }

        /// <summary>
        /// Return only those candidates that carry all of the specified tags.
        /// tagIds must be free of duplicates.
        /// </summary>
        private static async Task<List<AyaTypeId>> FilterByAllTagsAsync(AyContext ct, List<AyaTypeId> candidates, List<long> tagIds)
        {
            List<AyaTypeId> TagMatchingObjects = new List<AyaTypeId>();

            foreach (AyaTypeId i in candidates)
            {
                long matchCount = await ct.TagMap
                    .Where(x => x.TagToObjectId == i.ObjectId && x.TagToObjectType == i.ObjectType && tagIds.Contains(x.TagId))
                    .LongCountAsync();

                if (matchCount == tagIds.Count)
                {
                    TagMatchingObjects.Add(i);
                }
            }

            return TagMatchingObjects;
        }

        #endregion dosearch

        #region ProcessKeywords into Database

        /// <summary>
        /// Index the keywords of a newly created object
        /// </summary>
        public static void ProcessNewObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
        {
            ProcessKeywords(ct, localeId, objectID, objectType, true, name, text);
        }

        /// <summary>
        /// Replace the indexed keywords of an existing object
        /// </summary>
        public static void ProcessUpdatedObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
        {
            ProcessKeywords(ct, localeId, objectID, objectType, false, name, text);
        }

        /// <summary>
        /// Delete all search key records of an object
        /// </summary>
        public static void ProcessDeletedObjectKeywords(AyContext ct, long objectID, AyaType objectType)
        {
            //Be careful in future, if you put ToString at the end of each object in the string interpolation
            //npgsql driver will assume it's a string and put quotes around it triggering an error that a string can't be compared to an int
            //(the interpolated string must stay inline in the call for the same reason)
            ct.Database.ExecuteSqlCommand($"delete from asearchkey where objectid={objectID} and objecttype={(int)objectType}");
        }

        /// <summary>
        /// Process the keywords into the dictionary
        /// NOTE: NAME parameter is in ADDITION to the NAME also being on of the strings passed in text parameter
        /// </summary>
        private static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string name, params string[] text)
        {
            //IF NOT NEW, DELETE ALL EXISTING ENTRIES FOR OBJECT TYPE AND ID
            if (!newRecord)
            {
                ProcessDeletedObjectKeywords(ct, objectID, objectType);
            }

            //BREAK STRING ARRAY INTO KEYWORD LIST
            List<string> KeyWordList = Break(localeId, text);

            //BREAK NAME STRING
            List<string> NameKeyWordList = Break(localeId, name);

            //EARLY EXIT IF NO KEYWORDS OR NAME RECORD TO PROCESS
            if (KeyWordList.Count == 0 && string.IsNullOrWhiteSpace(name))
            {
                return;
            }

            HashSet<string> NameKeyWords = new HashSet<string>(NameKeyWordList);

            //BUILD A LIST OF MatchingDictionaryEntry items FOR THE MATCHING WORDS
            List<MatchingDictionaryEntry> MatchingKeywordIdList = new List<MatchingDictionaryEntry>();

            //SEARCH IN THE SEARCHDICTIONARY TABLE AND COLLECT ID'S OF ANY PRE-EXISTING IN DB KEYWORDS
            HashSet<string> ExistingWords = new HashSet<string>();
            if (KeyWordList.Count > 0)
            {
                var ExistingKeywordMatches = ct.SearchDictionary
                    .Where(m => KeyWordList.Contains(m.Word))
                    .Select(m => new { m.Id, m.Word })
                    .ToList();

                foreach (var K in ExistingKeywordMatches)
                {
                    ExistingWords.Add(K.Word);
                    MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = K.Id, InName = NameKeyWords.Contains(K.Word) });
                }
            }

            //ADD THE KEYWORDS THAT DO *NOT* HAVE MATCHES IN THE SEARCHDICTIONARY TO THE SEARCH DICTIONARY
            //The new entities are remembered here rather than read back from ct.SearchDictionary.Local
            //because Local holds every dictionary entity the context is tracking, including words that
            //belong to other objects processed earlier on the same context
            List<SearchDictionary> NewWords = new List<SearchDictionary>();
            foreach (string KeyWord in KeyWordList)
            {
                if (!ExistingWords.Contains(KeyWord))
                {
                    SearchDictionary NewWord = new SearchDictionary() { Word = KeyWord };
                    ct.SearchDictionary.Add(NewWord);
                    NewWords.Add(NewWord);
                }
            }

            //Save the context in order to get the id's of the new words added
            if (NewWords.Count > 0)
            {
                ct.SaveChanges();
            }

            //Now add the id's of the newly created words to the matching keyword id list for this object
            foreach (SearchDictionary SD in NewWords)
            {
                MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = SD.Id, InName = NameKeyWords.Contains(SD.Word) });
            }

            //CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
            foreach (MatchingDictionaryEntry E in MatchingKeywordIdList)
            {
                ct.SearchKey.Add(new SearchKey() { WordId = E.DictionaryId, InName = E.InName, ObjectId = objectID, ObjectType = objectType });
            }

            ct.SaveChanges();
        }//eoc

        /// <summary>
        /// Class to hold temporary list of matching id
        /// </summary>
        public class MatchingDictionaryEntry
        {
            public bool InName { get; set; }
            public long DictionaryId { get; set; }

            public MatchingDictionaryEntry()
            {
                InName = false;
                DictionaryId = -1;
            }
        }

        #endregion

        #region Breaker

        /// <summary>
        /// Class to hold relevant locale data for breaking text
        /// </summary>
        public class LocaleWordBreakingData
        {
            public bool CJKIndex { get; set; }
            public List<string> StopWords { get; set; }

            public LocaleWordBreakingData()
            {
                CJKIndex = false;
                StopWords = new List<string>();
            }
        }

        /// <summary>
        /// Get the current stopwords and CJKIndex flag for the user's locale
        /// </summary>
        /// <param name="localeId">Locale to fetch for; replaced by LocaleBiz.EnsuredLocaleIdStatic for the stop words lookup</param>
        /// <param name="ct">Database context; when null ServiceProviderProvider.DBContext is used</param>
        private static LocaleWordBreakingData GetLocaleSearchData(long localeId, AyContext ct = null)
        {
            LocaleWordBreakingData LSD = new LocaleWordBreakingData();
            if (ct == null)
            {
                ct = ServiceProviderProvider.DBContext;
            }

            //Get stopwords
            //Validate locale id, if not right then use default instead
            var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
            Param.LocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, ct);

            //Keys StopWords1 through StopWords7
            for (int i = 1; i <= 7; i++)
            {
                Param.Keys.Add("StopWords" + i);
            }

            //NOTE(review): .Result blocks on async code here and below; left as is because
            //making this async would change the signature of every Break caller
            var Stops = LocaleBiz.GetSubsetStatic(Param).Result;

            foreach (var kvp in Stops)
            {
                //Each stopwords locale key is a space delimited list of words and in the case of
                //an empty local string (i.e. StopWords7) it's value is a single question mark
                if (!string.IsNullOrWhiteSpace(kvp.Value) && kvp.Value != "?")
                {
                    LSD.StopWords.AddRange(kvp.Value.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
                }
            }

            LSD.CJKIndex = LocaleBiz.GetCJKIndex(localeId, ct).Result;
            return LSD;
        }

        public enum TokenTypes { Nothing, Separator, CJK, Latin };

        /// <summary>
        /// Take an array of strings and return a list of the unique, lower cased keywords
        /// found in them with the locale's stop words removed.
        ///
        /// Use Locale setting CJKIndex=true to handle Chinese, Japanese, Korean etc
        /// (languages with no easily identifiable word boundaries as in english)
        /// </summary>
        /// <param name="localeId">Locale supplying the stop words and CJKIndex setting</param>
        /// <param name="text">An array of 0 to * strings of text</param>
        /// <returns>List of strings</returns>
        internal static List<string> Break(long localeId, params string[] text)
        {
            return BreakCore(localeId, false, text);
        }

        /// <summary>
        /// Used to Process users search phrase and preserve wild
        /// cards entered
        /// </summary>
        /// <param name="localeId">Locale supplying the stop words and CJKIndex setting</param>
        /// <param name="text">Search phrase(s) with sql style % wildcards</param>
        /// <returns>List of strings</returns>
        internal static List<string> BreakSearchPhrase(long localeId, params string[] text)
        {
            return BreakCore(localeId, true, text);
        }

        //Words are cut into pieces of at most this many characters
        private const int MaxWordLength = 255;

        /// <summary>
        /// Add the word being built to the word list if it has not been seen before, then clear the builder.
        /// Does nothing when the builder is empty.
        /// </summary>
        private static void FlushWord(StringBuilder sbWord, List<string> words, HashSet<string> seenWords)
        {
            if (sbWord.Length == 0)
            {
                return;
            }

            string Word = sbWord.ToString();
            if (seenWords.Add(Word))
            {
                words.Add(Word);
            }
            sbWord.Length = 0;
        }

        /// <summary>
        /// Break text into unique keywords in order of first appearance, dropping the locale's stop words.
        ///
        /// Without CJKIndex a word is a run of letters / digits (and % when KeepWildCards is set),
        /// lower cased and cut at MaxWordLength characters; any other character is a separator.
        /// With CJKIndex characters below code 256 are handled the same way as whole words while runs of
        /// other letters are indexed as overlapping two character n-grams, with the remaining last
        /// character of a run also emitted on its own.
        /// </summary>
        /// <param name="localeId">Locale supplying the stop words and CJKIndex setting</param>
        /// <param name="KeepWildCards">True to treat % as part of a word</param>
        /// <param name="text">Strings to break; null or empty entries are skipped</param>
        /// <returns>List of keywords, empty when nothing is left to index</returns>
        internal static List<string> BreakCore(long localeId, bool KeepWildCards, params string[] text)
        {
            List<string> ReturnList = new List<string>();

            //Nothing to break so don't bother fetching the locale data from the db
            if (text == null || text.All(x => string.IsNullOrEmpty(x)))
            {
                return ReturnList;
            }

            //Get stopwords and CJKIndex flag value
            LocaleWordBreakingData LocaleSearchData = GetLocaleSearchData(localeId);

            //Parsed words in order of first appearance plus a set to easily ensure unique words only
            List<string> ParsedWords = new List<string>();
            HashSet<string> SeenWords = new HashSet<string>();

            StringBuilder sbWord = new StringBuilder();

            //Loop through each of the passed in strings
            foreach (string s in text)
            {
                if (string.IsNullOrEmpty(s))
                {
                    continue;
                }

                //get all the characters in a unicode compliant manner...
                TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);

                TokenTypes LastToken = TokenTypes.Nothing;

                //Process each "character" (text element,glyph whatever) in the current string
                while (t.MoveNext())
                {
                    //get it as a character
                    //NOTE(review): only the first UTF-16 unit of the text element is examined so
                    //combining marks are dropped and a surrogate pair acts as a separator
                    char c = t.GetTextElement()[0];

                    //In CJK mode basic latin characters (ascii basically) still need to be
                    //indexed as whole words so regular english text within cjk text works
                    //see: http://www.unicode.org/charts/index.html
                    bool WholeWordCharacter = !LocaleSearchData.CJKIndex || (int)c < 256;

                    if (WholeWordCharacter)
                    {
                        //Is it a token we want to include? Or a wildcard character
                        if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                        {
                            //All latin text is converted to lower case
                            c = char.ToLower(c);

                            //Flush the word in progress if we're over the word length limit
                            //or we are going from CJK to latin (LastToken is never CJK outside of CJK mode)
                            if (sbWord.Length > 0 && (LastToken == TokenTypes.CJK || sbWord.Length >= MaxWordLength))
                            {
                                FlushWord(sbWord, ParsedWords, SeenWords);
                            }

                            sbWord.Append(c);
                            LastToken = TokenTypes.Latin;
                        }
                        else
                        {
                            //Word Boundary token
                            LastToken = TokenTypes.Separator;
                            FlushWord(sbWord, ParsedWords, SeenWords);
                        }
                    }
                    else
                    {
                        //CJK character (a wildcard % is below 256 so it never gets here)
                        if (char.IsLetter(c))
                        {
                            if (sbWord.Length > 0)
                            {
                                if (LastToken == TokenTypes.Latin || sbWord.Length >= MaxWordLength)
                                {
                                    //going from latin TO CJK or over the word length limit
                                    FlushWord(sbWord, ParsedWords, SeenWords);
                                }
                                else if (LastToken == TokenTypes.CJK)
                                {
                                    //there is a CJK character already stored so append the current
                                    //character and flush the resultant 2 character n-gram
                                    sbWord.Append(c);
                                    System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                                    FlushWord(sbWord, ParsedWords, SeenWords);
                                }
                            }

                            //the current character starts the next word / n-gram
                            sbWord.Append(c);
                            LastToken = TokenTypes.CJK;
                        }
                        else
                        {
                            //CJK Word Boundary token
                            LastToken = TokenTypes.Separator;
                            FlushWord(sbWord, ParsedWords, SeenWords);
                        }
                    }
                }

                //Flush out the last word
                FlushWord(sbWord, ParsedWords, SeenWords);
            }

            //bail early if there is nothing indexed
            if (ParsedWords.Count == 0)
            {
                return ReturnList;
            }

            //Add only non stopwords
            HashSet<string> StopWords = new HashSet<string>(LocaleSearchData.StopWords);
            foreach (string Word in ParsedWords)
            {
                if (!StopWords.Contains(Word))
                {
                    ReturnList.Add(Word);
                }
            }

            //sometimes all the results are stop words so you end up here with nothing
            return ReturnList;
        }

        #endregion

    }//eoc
}//eons