using System.Linq;
using System.Globalization;
using System.Text;
using System.Collections.Generic;
using System.Threading.Tasks;
using Microsoft.EntityFrameworkCore;
using Sockeye.Util;
using Sockeye.Models;

namespace Sockeye.Biz
{
    /// <summary>
    /// Word breaking, keyword indexing and searching of the search dictionary
    /// (asearchdictionary / asearchkey tables).
    /// </summary>
    public static class Search
    {
        #region Search and return results

        /// <summary>
        /// Search request as submitted by the client.
        /// </summary>
        public class SearchRequestParameters
        {
            /// <summary>Search phrase; '*' is accepted as a wildcard.</summary>
            public string Phrase { get; set; }

            /// <summary>Restrict results to a single object type; SockType.NoType means no restriction.</summary>
            public SockType TypeOnly { get; set; }

            /// <summary>Maximum results to return. Note: 0 will get all results.</summary>
            public int MaxResults { get; set; }

            public SearchRequestParameters()
            {
                TypeOnly = SockType.NoType;
                MaxResults = 500;
            }

            /// <summary>True when the request carries a non-blank phrase.</summary>
            public bool IsValid
            {
                get { return !string.IsNullOrWhiteSpace(this.Phrase); }
            }
        }

        /// <summary>
        /// A single search hit returned to the client.
        /// </summary>
        public class SearchResult
        {
            public string Name { get; set; }
            public SockType Type { get; set; }
            public long Id { get; set; }
        }

        /// <summary>
        /// Container for the search hits returned to the client.
        /// </summary>
        public class SearchReturnObject
        {
            /// <summary>Count of readable matches before the MaxResults limit is applied.</summary>
            public long TotalResultsFound { get; set; }
            public List<SearchResult> SearchResults { get; set; }

            public SearchReturnObject()
            {
                TotalResultsFound = 0;
                SearchResults = new List<SearchResult>();
            }
        }

        /// <summary>
        /// Search the keyword dictionary for objects that match every term of the phrase
        /// and that the current user is permitted to read.
        /// </summary>
        /// <param name="ct">Database context used for the query and the permission lookups</param>
        /// <param name="translationId">Translation used for word breaking and name lookup</param>
        /// <param name="currentUserRoles">Roles of the searching user</param>
        /// <param name="currentUserId">Id of the searching user (memos and reminders are private to their owner)</param>
        /// <param name="searchParameters">The request; it is not modified</param>
        /// <returns>The matches; empty when the request is invalid or the phrase contains no usable terms</returns>
        public static async Task<SearchReturnObject> DoSearchAsync(AyContext ct, long translationId, AuthorizationRoles currentUserRoles, long currentUserId, SearchRequestParameters searchParameters)
        {
            var ReturnObject = new SearchReturnObject();

            //An invalid request is expected, don't throw, just return nothing
            if (!searchParameters.IsValid)
                return ReturnObject;

            //Work on a local copy so the caller's request object is left untouched.
            //Escape literal percentage signs first just in case they are searching for 50% off or something
            //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
            //The symbol is turned into text so the word breaker does not discard it
            string phrase = searchParameters.Phrase.Replace("%", "pctsym");

            //Replace wildcard * with % as the breaker expects sql style wildcards
            phrase = phrase.Replace("*", "%");

            //BREAK SEARCH PHRASE INTO SEPARATE TERMS
            List<string> PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);

            //Nothing searchable (e.g. only punctuation or a single character):
            //without terms the query below would end in an empty WHERE clause, so bail out here
            if (PhraseItems.Count == 0)
                return ReturnObject;

            //BUILD THE QUERY
            //One filtered count column per term; an object matches when every count is non zero.
            //Term values are bound as parameters, never written into the SQL text.
            StringBuilder q = new StringBuilder();
            List<KeyValuePair<string, string>> termParameters = new List<KeyValuePair<string, string>>();
            List<string> termConditions = new List<string>();

            q.Append("WITH qr AS (SELECT asearchkey.sockType, asearchkey.objectid, ");
            foreach (string PhraseItem in PhraseItems)
            {
                int termNumber = termParameters.Count + 1;
                //wildcard status is decided before the literal percentage symbol is put back
                string comparison = PhraseItem.Contains("%") ? "LIKE" : "=";
                //put back literal percentage symbol if necessary
                string termValue = PhraseItem.Replace("pctsym", @"\%");

                q.Append($"COUNT(*) FILTER (WHERE asearchdictionary.word {comparison} @term{termNumber}) AS st{termNumber}, ");
                termParameters.Add(new KeyValuePair<string, string>($"term{termNumber}", termValue));
                termConditions.Add($"st{termNumber} > 0");
            }
            q.Length = q.Length - 2;//trim the final comma and space

            var qTypeOnly = string.Empty;
            if (searchParameters.TypeOnly != SockType.NoType)
            {
                //INNER JOIN ASEARCHKEY ON ASEARCHDICTIONARY.ID = ASEARCHKEY.WORDID and asearchkey.sockType=20
                qTypeOnly = $"AND ASEARCHKEY.SOCKTYPE={(int)searchParameters.TypeOnly}";
            }

            q.Append($" FROM asearchdictionary INNER JOIN asearchkey ON asearchdictionary.id = asearchkey.wordid {qTypeOnly} GROUP BY asearchkey.objectid, asearchkey.sockType) SELECT sockType, objectid FROM qr WHERE ");
            q.Append(string.Join(" AND ", termConditions));

            //EXECUTE THE QUERY AND COLLECT THE RAW HITS
            List<SockTypeId> MatchingObjects = new List<SockTypeId>();
            using (var command = ct.Database.GetDbConnection().CreateCommand())
            {
                await ct.Database.OpenConnectionAsync();
                command.CommandText = q.ToString();
                foreach (KeyValuePair<string, string> termParameter in termParameters)
                {
                    var dbParameter = command.CreateParameter();
                    dbParameter.ParameterName = termParameter.Key;
                    dbParameter.Value = termParameter.Value;
                    command.Parameters.Add(dbParameter);
                }

                using (var dr = await command.ExecuteReaderAsync())
                {
                    while (await dr.ReadAsync())
                    {
                        MatchingObjects.Add(new SockTypeId((SockType)dr.GetInt32(0), dr.GetInt64(1)));
                    }
                }
            }

            //REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
            List<SockTypeId> CanReadMatchingObjects = new List<SockTypeId>();
            foreach (SockTypeId t in MatchingObjects)
            {
                if (t.SockType == SockType.FileAttachment)
                {
                    //Look up the underlying object the file is attached to, check that it is readable
                    //for the user and add the PARENT object type and id to the list.
                    //This means the user will not see it returned as an attachment, just as the object.
                    FileAttachment f = await ct.FileAttachment.AsNoTracking().FirstOrDefaultAsync(z => z.Id == t.ObjectId);

                    //a search key without an attachment record has nothing to show, skip it
                    if (f == null)
                        continue;

                    if (Sockeye.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, f.AttachToAType))
                    {
                        CanReadMatchingObjects.Add(new SockTypeId(f.AttachToAType, f.AttachToObjectId));
                    }
                }
                else if (t.SockType == SockType.Memo)
                {
                    //Users are only permitted to search their own memos
                    if (await ct.Memo.AsNoTracking().AnyAsync(z => z.Id == t.ObjectId && z.ToId == currentUserId))
                        CanReadMatchingObjects.Add(t);
                }
                else if (t.SockType == SockType.Reminder)
                {
                    //Users are only permitted to search their own reminders
                    if (await ct.Reminder.AsNoTracking().AnyAsync(z => z.Id == t.ObjectId && z.UserId == currentUserId))
                        CanReadMatchingObjects.Add(t);
                }
                else
                {
                    if (Sockeye.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, t.SockType))
                    {
                        CanReadMatchingObjects.Add(t);
                    }
                }
            }

            //The list of allowable objects is now the master matching objects list
            MatchingObjects = CanReadMatchingObjects;

            //TOTAL RESULTS
            ReturnObject.TotalResultsFound = MatchingObjects.Count;

            //MAXIMUM RESULTS FILTER
            //Filtered BEFORE sorting on purpose: the results are not ranked
            //so this gives the most random collection of results
            if (searchParameters.MaxResults > 0)//0 = all results
                MatchingObjects = MatchingObjects.Take(searchParameters.MaxResults).ToList();

            //Sort and group the matching objects list in return order
            var OrderedMatchingObjects = MatchingObjects.OrderBy(z => z.SockType).ThenByDescending(z => z.ObjectId);

            //Fetch the names over a single reused command
            using (var command = ct.Database.GetDbConnection().CreateCommand())
            {
                await ct.Database.OpenConnectionAsync();

                //Build the return list from the remaining matching objects list
                foreach (SockTypeId i in OrderedMatchingObjects)
                {
                    SearchResult SR = new SearchResult();
                    SR.Name = BizObjectNameFetcherDirect.Name(i.SockType, i.ObjectId, translationId, command);
                    SR.Id = i.ObjectId;
                    SR.Type = i.SockType;
                    ReturnObject.SearchResults.Add(SR);
                }
            }

            return ReturnObject;
        }
        #endregion dosearch

        #region Get info (excerpt)

        /// <summary>
        /// Get the excerpt of an object's searchable text that best reflects the search phrase.
        /// </summary>
        /// <param name="translationId">Translation used for word breaking</param>
        /// <param name="currentUserRoles">Roles handed to the business object factory</param>
        /// <param name="userId">User id handed to the business object factory</param>
        /// <param name="phrase">Search phrase as entered by the user; null is treated as empty</param>
        /// <param name="max">Maximum characters in the excerpt (values below 10 are raised to 10)</param>
        /// <param name="sockType">Type of the object to excerpt</param>
        /// <param name="id">Id of the object to excerpt</param>
        /// <param name="ct">Database context</param>
        /// <returns>The excerpt, or an empty string when nothing ranks above the extraction threshold</returns>
        public static async Task<string> GetInfoAsync(long translationId, AuthorizationRoles currentUserRoles, long userId, string phrase, int max, SockType sockType, long id, AyContext ct)
        {
            //escape literal percentage signs first just in case they are searching for 50% off or something
            //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
            //The symbol is turned into text so the word breaker does not discard it
            phrase = (phrase ?? string.Empty).Replace("%", "pctsym");

            //Replace wildcard * with % as the breaker expects sql style wildcards
            phrase = phrase.Replace("*", "%");

            //BREAK SEARCH PHRASE INTO SEPARATE TERMS
            //NOTE(review): terms that carried a literal percentage symbol still contain the
            //"pctsym" placeholder when they reach the ranking code below
            List<string> PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);

            //get text
            ISearchAbleObject o = (ISearchAbleObject)BizObjectFactory.GetBizObject(sockType, ct, userId, currentUserRoles, translationId);
            var searchParams = await o.GetSearchResultSummary(id, sockType);

            //extract and rank here
            ExtractAndRank er = new ExtractAndRank();
            er.Process(searchParams, PhraseItems.ToArray(), max);
            return er.Extract;
        }

        #region Search rank and extract
        /// <summary>
        /// Rank and extract best excerpt of specified text and search terms
        /// </summary>
        public sealed class ExtractAndRank
        {
            #region Fields
            private string[] searchTerms;
            private string rawtext;
            private string extract = "";
            private bool flattenExtract = true;
            private float ranking;
            private int extractionThresholdRank = 10;
            private int maximumCharactersToExtract = 40;
            #endregion

            #region Properties
            /// <summary>
            /// This is the ranking of the source text as it pertains to the
            /// search terms.
            ///
            /// A rank of zero means there was no match; no excerpt extraction is done
            /// unless the rank is above the threshold ranking.
            ///
            /// It is a percentage value on a scale of 0 to 100
            /// and is weighted:
            ///
            /// 75% of the score is the percentage of all search terms found in the text
            /// 25% of the score is the percentage of all characters in the text that are search term characters
            /// </summary>
            public float Ranking
            {
                get { return ranking; }
            }

            /// <summary>
            /// Maximum characters to appear in an extraction
            /// default is 40
            /// Minimum is 10
            /// </summary>
            public int MaximumCharactersToExtract
            {
                get { return maximumCharactersToExtract; }
                set
                {
                    if (value > 10)
                        maximumCharactersToExtract = value;
                    else
                        maximumCharactersToExtract = 10;
                }
            }

            /// <summary>
            /// ExtractionThresholdRank
            /// Extraction will only take place if the rank is
            /// higher than this value
            ///
            /// default is 10, maximum is 100 minimum is 0
            /// </summary>
            public int ExtractionThresholdRank
            {
                get { return extractionThresholdRank; }
                set
                {
                    if (value > 100)
                        extractionThresholdRank = 100;
                    else if (value < 0)
                        extractionThresholdRank = 0;
                    else
                        extractionThresholdRank = value;
                }
            }

            /// <summary>
            /// If true, carriage returns, line feeds and tabs will be removed from extract
            /// </summary>
            public bool FlattenExtract
            {
                get { return this.flattenExtract; }
                set { this.flattenExtract = value; }
            }

            /// <summary>
            /// Extracted text excerpt that best reflects search terms
            /// </summary>
            public string Extract
            {
                get { return extract; }
            }
            #endregion

            #region public methods
            /// <summary>
            /// Do the extraction and ranking
            /// </summary>
            /// <param name="searchObjectParams">Holder of the words making up the text to rank</param>
            /// <param name="searchTerms">Lower case search terms, may contain % wildcards</param>
            /// <param name="max">Maximum characters to extract (values below 10 are raised to 10)</param>
            public void Process(SearchIndexProcessObjectParameters searchObjectParams, string[] searchTerms, int max)
            {
                //go through the property so the documented minimum is enforced;
                //a zero or negative window would break the substring maths in DoExtract
                this.MaximumCharactersToExtract = max;
                ranking = 0;
                extract = "";

                if (searchObjectParams == null || searchObjectParams.Words == null)
                    return;

                string rawText = string.Join(" ", searchObjectParams.Words);
                if (string.IsNullOrEmpty(rawText))
                    return;
                this.rawtext = rawText;

                if (searchTerms == null || searchTerms.Length == 0)
                    return;
                this.searchTerms = searchTerms;

                ranking = score(0, this.rawtext.Length);
                if (ranking > extractionThresholdRank)
                    DoExtract();
            }
            #endregion

            #region Calculate score
            /// <summary>
            /// Give a percentage score for a given window of
            /// text in the raw text string
            /// 75% of the score is the percentage of all search terms found in the window
            /// 25% of the score is the percentage of all characters in the search window that are search term characters
            /// </summary>
            /// <param name="nStartPos">Inclusive start of the window, clamped to 0</param>
            /// <param name="nEndPos">Exclusive end of the window, clamped to the text length</param>
            /// <returns>Float value of zero to one hundred</returns>
            private float score(int nStartPos, int nEndPos)
            {
                System.Diagnostics.Debug.Assert(nStartPos < nEndPos);
                if (nStartPos < 0) nStartPos = 0;
                if (nEndPos > this.rawtext.Length) nEndPos = this.rawtext.Length;

                string window = this.rawtext.Substring(nStartPos, nEndPos - nStartPos).ToLower(System.Globalization.CultureInfo.CurrentCulture);

                int termCharsInWindow = 0;//how many of the characters in the window are matching term characters
                int termsMatched = 0;

                foreach (string term in searchTerms)
                {
                    if (term == null)
                        continue;

                    //remove the wild card character if present and set to lower case
                    string needle = term.ToLower(System.Globalization.CultureInfo.CurrentCulture).Replace("%", "");

                    //A term that was nothing but wildcards has no characters to look for.
                    //Searching for "" matches at every position and would walk past the end of the window.
                    if (needle.Length == 0)
                        continue;

                    //both sides are already lower cased so an ordinal comparison is sufficient
                    int location = window.IndexOf(needle, System.StringComparison.Ordinal);
                    if (location == -1)
                        continue;

                    termsMatched++;
                    while (location != -1)
                    {
                        termCharsInWindow += needle.Length;
                        location = window.IndexOf(needle, location + 1, System.StringComparison.Ordinal);
                    }
                }

                //If no matches then rank is automatically zero
                if (termsMatched == 0)
                    return 0;

                //Rank is calculated on a weighted scale
                //75% for matching all search terms
                //25% for the quantity of search terms versus other text found
                float termsFoundPct = 75 * ((float)termsMatched / (float)searchTerms.GetLength(0));
                float termsVsTextPct = 0;
                if (termCharsInWindow > 0)
                    termsVsTextPct = 25 * ((float)termCharsInWindow / (float)window.Length);

                return termsFoundPct + termsVsTextPct;
            }
            #endregion

            #region Extract best excerpt
            /// <summary>
            /// Extract the best scoring excerpt fragment of the raw text
            /// by sliding a window over it and keeping the highest scoring window
            /// </summary>
            private void DoExtract()
            {
                int windowLength = this.maximumCharactersToExtract;

                //If the whole thing fits in the window just save time and return the whole thing
                if (this.rawtext.Length <= windowLength)
                {
                    this.extract = this.rawtext;
                    return;
                }

                //Step by the length of the shortest search term (wildcards excluded):
                //faster than moving one character at a time. Never less than one
                //or the window would stop moving.
                int step = int.MaxValue;
                foreach (string s in this.searchTerms)
                {
                    if (s == null)
                        continue;
                    int effectiveLength = s.Replace("%", "").Length;
                    if (effectiveLength > 0 && effectiveLength < step)
                        step = effectiveLength;
                }
                if (step == int.MaxValue)
                    step = 1;

                string BestWindow = "";
                float BestScore = 0;
                int BestWindowStartPos = 0;
                int lastWindowStart = this.rawtext.Length - windowLength;//greater than zero here

                int z = 0;
                while (true)
                {
                    float thisscore = score(z, z + windowLength);
                    if (thisscore > 0)
                    {
                        if (thisscore > BestScore)
                        {
                            BestScore = thisscore;
                            BestWindow = this.rawtext.Substring(z, windowLength);
                            //Best window to get if a future score is equal
                            //I.E. put the terms in the center of the window
                            BestWindowStartPos = z + (windowLength / 2);
                        }
                        else if (thisscore == BestScore && z == BestWindowStartPos)
                        {
                            //Equal to the best and positioned over the best spot (terms in center): capture that
                            BestWindow = this.rawtext.Substring(z, windowLength);
                        }
                    }

                    if (z >= lastWindowStart)
                        break;

                    //clamp the final step so the window ending at the end of the text is always scored
                    z = System.Math.Min(z + step, lastWindowStart);
                }

                if (this.flattenExtract)
                    this.extract = "..." + BestWindow.Trim().Replace("\r", "").Replace("\n", "").Replace("\t", "") + "...";//case 1593 added tab character removal
                else
                    this.extract = "..." + BestWindow.Trim() + "...";
            }
            #endregion
        }
        #endregion Xtract
        #endregion

        #region ProcessKeywords into Database

        /// <summary>
        /// Holds the input for keyword processing of one object;
        /// also used by biz objects to hand back their text for search result summaries.
        /// </summary>
        public class SearchIndexProcessObjectParameters
        {
            public long TranslationId { get; set; }
            public long ObjectId { get; set; }
            public SockType SockType { get; set; }
            public List<string> Words { get; set; }

            public SearchIndexProcessObjectParameters(long translationId, long objectID, SockType aType)
            {
                Words = new List<string>();
                TranslationId = translationId;
                ObjectId = objectID;
                SockType = aType;
            }

            /// <summary>
            /// Format used for getsummary by biz objects
            /// </summary>
            public SearchIndexProcessObjectParameters()
            {
                Words = new List<string>();
                TranslationId = 0;
                ObjectId = 0;
                SockType = 0;
            }

            /// <summary>Add a text value; null and blank values are ignored.</summary>
            public SearchIndexProcessObjectParameters AddText(string s)
            {
                if (!string.IsNullOrWhiteSpace(s))
                {
                    Words.Add(s);
                }
                return this;
            }

            /// <summary>Add a numeric value as text.</summary>
            public SearchIndexProcessObjectParameters AddText(long l)
            {
                Words.Add(l.ToString());
                return this;
            }

            // public SearchIndexProcessObjectParameters AddText(decimal? d)
            // {
            //     if (d != null)
            //         Words.Add(d.ToString());
            //     return this;
            // }

            /// <summary>Add every non-blank entry of a list; a null list is ignored.</summary>
            public SearchIndexProcessObjectParameters AddText(List<string> lWords)
            {
                if (lWords != null)
                {
                    foreach (string s in lWords)
                    {
                        if (!string.IsNullOrWhiteSpace(s))
                        {
                            Words.Add(s);
                        }
                    }
                }
                return this;
            }

            /// <summary>Extract the text from a custom fields json fragment and add it.</summary>
            public SearchIndexProcessObjectParameters AddCustomFields(string jsonString)
            {
                AddText(JsonUtil.GetCustomFieldsAsStringArrayForSearchIndexing(jsonString));
                return this;
            }
        }

        /// <summary>Index the keywords of a newly created object.</summary>
        public static async Task ProcessNewObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
        {
            await ProcessKeywordsAsync(searchIndexObjectParameters, true);
        }

        /// <summary>Re-index the keywords of an updated object.</summary>
        public static async Task ProcessUpdatedObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
        {
            await ProcessKeywordsAsync(searchIndexObjectParameters, false);
        }

        /// <summary>Remove all search keys of a deleted object.</summary>
        public static async Task ProcessDeletedObjectKeywordsAsync(long objectID, SockType aType, AyContext ct)
        {
            //Be careful in future, if you put ToString at the end of each object in the string interpolation
            //npgsql driver will assume it's a string and put quotes around it triggering an error that a string can't be compared to an int
            await ct.Database.ExecuteSqlInterpolatedAsync($"delete from asearchkey where objectid={objectID} and socktype={(int)aType}");
            //nothing to save here, it's a direct command already executed
        }

        /// <summary>
        /// Process the keywords into the dictionary
        /// </summary>
        /// <param name="p">Object identity and the text to index</param>
        /// <param name="newRecord">True for a new object; the stored procedure receives the negation of this flag</param>
        private static async Task ProcessKeywordsAsync(SearchIndexProcessObjectParameters p, bool newRecord)
        {
            // #if (DEBUG)
            // if (!p.SockType.HasAttribute(typeof(CoreBizObjectAttribute)))
            //     throw new System.NotSupportedException($"Search::ProcessKeywords - Invalid type presented {p.SockType}");
            // #endif

            List<string> KeyWordList = await BreakAsync(p.TranslationId, p.Words);
            if (KeyWordList.Count == 0)
                return;

            //call stored procedure to do the work right at the server (fastest method by far)
            using (AyContext ct = ServiceProviderProvider.DBContext)
                await ct.Database.ExecuteSqlInterpolatedAsync($"call aydosearchindex({KeyWordList},{p.ObjectId},{p.SockType},{!newRecord})");
        }
        #endregion

        #region Breaker

        public enum TokenTypes { Nothing, Separator, CJK, Latin };

        /// <summary>
        /// Take a list of strings and return the unique, lower case keywords found in them
        /// (stop words and words shorter than two characters removed),
        /// suitable for passing to a stored procedure or other function
        ///
        /// Use Translation setting CJKIndex=true to handle Chinese, Japanese, Korean etc
        /// (languages with no easily identifiable word boundaries as in english)
        /// </summary>
        /// <returns>List of strings</returns>
        internal static async Task<List<string>> BreakAsync(long translationId, List<string> textStrings)
        {
            return await BreakCoreAsync(translationId, false, textStrings);
        }

        /// <summary>
        /// Single string version of the keyword breaker
        /// </summary>
        internal static async Task<List<string>> BreakAsync(long translationId, string textString)
        {
            List<string> textStrings = new List<string>(1);
            textStrings.Add(textString);
            return await BreakCoreAsync(translationId, false, textStrings);
        }

        /// <summary>
        /// Used to Process users search phrase and preserve wild
        /// cards entered
        /// </summary>
        internal static async Task<List<string>> BreakSearchPhraseAsync(long translationId, string searchPhrase)
        {
            List<string> textStrings = new List<string>();
            textStrings.Add(searchPhrase);
            //note: we want stopwords if this is a search phrase break because they might type "some" wanting awesome but some is a stopword so..
            return await BreakCoreAsync(translationId, true, textStrings, true);
        }

        /// <summary>
        /// Break text into unique lower case words in order of first appearance.
        /// </summary>
        /// <param name="translationId">Translation supplying the stop words and the CJKIndex flag</param>
        /// <param name="KeepWildCards">True to treat % as part of a word (search phrases)</param>
        /// <param name="textStrings">Strings to break; null or empty entries are skipped</param>
        /// <param name="ignoreStopWords">True to keep stop words in the result</param>
        /// <returns>The words; may be empty, never null</returns>
        internal static async Task<List<string>> BreakCoreAsync(long translationId, bool KeepWildCards, List<string> textStrings, bool ignoreStopWords = false)
        {
            //For stopwords and CJKIndex flag value
            var translationWordBreakData = await SearchTranslationWordBreakDataCache.GetWordBreakData(translationId);
            bool cjkIndex = translationWordBreakData.CJKIndex;

            const int MAXWORDLENGTH = 255;
            const int MINWORDLENGTH = 2;//A word isn't a word unless it's got at least two characters in it

            //Parsed words in order of first appearance plus a set for fast uniqueness checks
            List<string> parsedWords = new List<string>();
            HashSet<string> seenWords = new HashSet<string>();
            StringBuilder sbWord = new StringBuilder();
            List<string> ReturnList = new List<string>();

            if (textStrings == null)
                return ReturnList;

            //Loop through each of the passed in strings
            foreach (string s in textStrings)
            {
                if (string.IsNullOrEmpty(s))
                    continue;

                //get all the characters in a unicode compliant manner...
                TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
                TokenTypes LastToken = TokenTypes.Nothing;

                //Process each "character" (text element, glyph whatever) in the current string
                while (t.MoveNext())
                {
                    //NOTE(review): only the first UTF-16 unit of the text element is looked at,
                    //so combining marks and the second half of surrogate pairs are dropped
                    char c = t.GetTextElement()[0];

                    //When CJK indexing is on, anything outside the first 256 code points is handled as CJK.
                    //Basic latin text within cjk text is still indexed as whole words.
                    //see: http://www.unicode.org/charts/index.html
                    bool cjkCharacter = cjkIndex && (int)c >= 256;

                    if (!cjkCharacter)
                    {
                        //Is it a token we want to include? Or a wildcard character
                        if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                        {
                            //All latin text is converted to lower case
                            c = char.ToLower(c, System.Globalization.CultureInfo.CurrentCulture);

                            //Flush the word in progress if it hit the length limit
                            //or we are going from CJK to latin
                            if (sbWord.Length > 0 && (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH))
                                FlushWord(sbWord, seenWords, parsedWords);

                            sbWord.Append(c);
                            LastToken = TokenTypes.Latin;
                        }
                        else
                        {
                            //Word boundary
                            LastToken = TokenTypes.Separator;
                            FlushWord(sbWord, seenWords, parsedWords);
                        }
                    }
                    else
                    {
                        //a wildcard can never show up here, % is in the basic latin block
                        if (char.IsLetter(c))
                        {
                            if (sbWord.Length > 0)
                            {
                                if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                                {
                                    //going from latin TO CJK or over the word length limit
                                    FlushWord(sbWord, seenWords, parsedWords);
                                }
                                else if (LastToken == TokenTypes.CJK)
                                {
                                    //one CJK character is already stored so append the current character
                                    //and flush the resultant 2 character n-gram;
                                    //the current character then starts the next n-gram below
                                    sbWord.Append(c);
                                    System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                                    FlushWord(sbWord, seenWords, parsedWords);
                                }
                            }

                            sbWord.Append(c);
                            LastToken = TokenTypes.CJK;
                        }
                        else
                        {
                            //CJK word boundary
                            LastToken = TokenTypes.Separator;
                            FlushWord(sbWord, seenWords, parsedWords);
                        }
                    }
                }

                //Flush out the last word
                FlushWord(sbWord, seenWords, parsedWords);
            }

            //Build the return list from the word list
            foreach (string word in parsedWords)
            {
                //Filter out short words if we are breaking for indexing
                //but keep them if they are part of a wildcard search phrase
                bool longEnough = word.Length >= MINWORDLENGTH || (KeepWildCards && word.Contains('%'));
                if (!longEnough)
                    continue;

                //breaking of a search phrase keeps everything,
                //regular breaking of an object for dictionary entry adds only non stopwords
                if (ignoreStopWords || !translationWordBreakData.StopWords.Contains(word))
                    ReturnList.Add(word);
            }

            //sometimes all the results are stop words so you end up here with nothing
            return ReturnList;
        }

        /// <summary>
        /// Move the word in progress (if any) to the word list unless it is already there
        /// and clear the builder for the next word
        /// </summary>
        private static void FlushWord(StringBuilder word, HashSet<string> seenWords, List<string> orderedWords)
        {
            if (word.Length == 0)
                return;

            string completed = word.ToString();
            if (seenWords.Add(completed))
                orderedWords.Add(completed);

            word.Length = 0;
        }
        #endregion

    }//eoc
}//eons