raven/server/AyaNova/biz/Search.cs

using System;
using System.Linq;
using System.Globalization;
using System.Text;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using Newtonsoft.Json.Linq;
using Microsoft.Extensions.Logging;
using Microsoft.EntityFrameworkCore;
using AyaNova.Util;
using AyaNova.Models;


namespace AyaNova.Biz
{

    //This class handles word breaking, processing keywords and searching for results
    public static class Search
    {

        /*
        ISSUES:
           none at the moment


         */


        #region Search and return results

        /*
        Requirements:

        INPUT PARAMETERS
        - Search phrase (with wildcard support)
            - Can be empty if tags are specified, no tags and no phrase is an error condition
        - ObjectType: only return results for objects of this type
        - InName: flag that indicates only search in names
        - Tag ids that are also on result objects
            - Can be empty if a phrase is specified


        ACTION
        Find search matches, then find tag matches then intersect, then sort and return
        Filter OUT results that user is not permitted to read
        //TODO: proper testing of searching
        - SAMPLE DATA: Need a huge amount of sample data indexed to load test it
        - INDEXES: play with it and see what works best

        OUTPUT FORMAT
        - No localized text, up to client
        - Name of object in return result
        - Object Type and ID in return result
        - Group results by object type, then by object ID descending which will result in natural most recently created order

        result:[
        {
            name:"blah",
            type:2,
            id:210
        },
        ]


         */

        /// <summary>
        /// Holds the parameters of a search request.
        /// A request is only usable when it carries a phrase, at least one tag id, or both.
        /// </summary>
        public class SearchRequestParameters
        {
            /// <summary>Search phrase (wildcards supported); may be empty when tags are specified</summary>
            public string Phrase { get; set; }

            /// <summary>When true only search in object names</summary>
            public bool NameOnly { get; set; }

            /// <summary>Only return results for objects of this type</summary>
            public AyaType TypeOnly { get; set; }

            /// <summary>Tag ids that must also be on result objects; may be empty when a phrase is specified</summary>
            public List<long> Tags { get; set; }

            /// <summary>
            /// Defaults: search everywhere (not name only), no type restriction, no tags
            /// </summary>
            public SearchRequestParameters()
            {
                NameOnly = false;
                TypeOnly = AyaType.NoType;
                Tags = new List<long>();
            }

            /// <summary>
            /// True when the request has a non blank phrase or at least one tag id
            /// </summary>
            public bool IsValid
            {
                get
                {
                    //has a phrase?
                    if (!string.IsNullOrWhiteSpace(this.Phrase))
                        return true;

                    //has tags?
                    //Tags has a public setter so it can be replaced with null (for example by a
                    //deserializer handling an explicit null); treat that the same as "no tags"
                    //rather than throwing a NullReferenceException
                    return this.Tags != null && this.Tags.Count > 0;
                }
            }
        }


        /// <summary>
        /// A single search result item as returned by DoSearch:
        /// identifies the matching object by name, type and id.
        /// </summary>
        public class SearchResult
        {
            //Name of the object
            public string Name { get; set; }
            //Type of the object
            public AyaType Type { get; set; }
            //Id of the object
            public long Id { get; set; }
        }


        /// <summary>
        /// Search for objects matching the phrase and / or tags in the search parameters.
        ///
        /// NOTE: work in progress - the dictionary matching phase is implemented but the
        /// search key and tag phases are not, so the returned list is currently always empty.
        /// </summary>
        /// <param name="ct">Database context</param>
        /// <param name="localeId">Locale id used when breaking the phrase into search terms</param>
        /// <param name="searchParameters">
        /// Search request. If a phrase is present its '*' wildcards are replaced with '%' in place
        /// </param>
        /// <returns>List of search results</returns>
        /// <exception cref="System.ArgumentNullException">searchParameters is null</exception>
        /// <exception cref="System.ArgumentException">searchParameters has neither a phrase nor tags</exception>
        public static async Task<List<SearchResult>> DoSearch(AyContext ct, long localeId, SearchRequestParameters searchParameters)
        {
            if (searchParameters == null)
            {
                throw new System.ArgumentNullException(nameof(searchParameters));
            }

            List<SearchResult> ResultList = new List<SearchResult>();

            if (!searchParameters.IsValid)
            {
                throw new System.ArgumentException("Search::DoSearch - Search request parameters must contain a phrase or tags");
            }

            //IF PHRASE SPECIFIED
            //(a tags only request is valid and has no phrase, so the phrase must not be dereferenced unconditionally)
            if (!string.IsNullOrWhiteSpace(searchParameters.Phrase))
            {
                //Modify Phrase to replace wildcard * with % as breakcore expects sql style wildcards
                searchParameters.Phrase = searchParameters.Phrase.Replace("*", "%");

                //BREAK SEARCH PHRASE INTO SEPARATE TERMS
                var PhraseItems = BreakSearchPhrase(localeId, searchParameters.Phrase);

                //SPLIT OUT WILDCARDS FROM NON WILDCARDS
                List<string> WildCardSearchTerms = new List<string>();
                List<string> RegularSearchTerms = new List<string>();

                foreach (string PhraseItem in PhraseItems)
                {
                    if (PhraseItem.Contains("%"))
                        WildCardSearchTerms.Add(PhraseItem);
                    else
                        RegularSearchTerms.Add(PhraseItem);
                }

                //GET LIST OF DICTIONARY ENTRIES THAT MATCH REGULAR SEARCH TERMS
                List<SearchDictionary> RegularMatches = new List<SearchDictionary>();
                if (RegularSearchTerms.Count > 0)
                    RegularMatches = await ct.SearchDictionary.Where(m => RegularSearchTerms.Contains(m.Word)).ToListAsync();

                //GET LIST OF DICTIONARY ENTRIES THAT MATCH WILDCARD SEARCH TERMS
                List<SearchDictionary> WildCardMatches = new List<SearchDictionary>();
                if (WildCardSearchTerms.Count > 0)
                {
                    //The position of the wildcard decides which string method is used in the query:
                    //  %xyz%  -> Contains
                    //  xyz%   -> StartsWith
                    //  %xyz   -> EndsWith
                    foreach (string WildCardSearchTerm in WildCardSearchTerms)
                    {
                        bool LeadingWildCard = WildCardSearchTerm.StartsWith("%", StringComparison.Ordinal);
                        bool TrailingWildCard = WildCardSearchTerm.EndsWith("%", StringComparison.Ordinal);

                        //The text to look for, computed once here instead of inside the query expression
                        string Fragment = WildCardSearchTerm.Replace("%", "");

                        if (LeadingWildCard && TrailingWildCard)//CONTAINS?
                        {
                            WildCardMatches.AddRange(await ct.SearchDictionary.Where(m => m.Word.Contains(Fragment)).ToListAsync());
                        }
                        else if (TrailingWildCard)//STARTS WITH?
                        {
                            WildCardMatches.AddRange(await ct.SearchDictionary.Where(m => m.Word.StartsWith(Fragment)).ToListAsync());
                        }
                        else if (LeadingWildCard)//ENDS WITH?
                        {
                            WildCardMatches.AddRange(await ct.SearchDictionary.Where(m => m.Word.EndsWith(Fragment)).ToListAsync());
                        }
                        //NOTE: a term whose only wildcard is embedded (ab%cd) matches none of the above and is not searched for
                    }

                    //More than one wildcard term can match the same dictionary entry, keep each entry once only
                    WildCardMatches = WildCardMatches.GroupBy(m => m.Id).Select(g => g.First()).ToList();
                }

                //TODO: RegularMatches and WildCardMatches are not consumed yet, see the remaining steps below
            }

            //SEARCH SEARCHKEY FOR MATCHING WORDS AND OPTIONALLY TYPE AND INNAME

            //IF TAGS SPECIFIED
            //LOOP THROUGH SEARCHKEY MATCHES
            //FOREACH OBJECT SEARCH TAGMAP FOR MATCHING OBJECTTYPE AND ID
            //REMOVE RESULTS FROM SEARCH PHRASE PHASE THAT ARE NOT MATCHING

            return ResultList;
        }


        #endregion dosearch

        #region ProcessKeywords into Database

        /// <summary>
        /// Index the keywords of a newly created object.
        /// Forwards to ProcessKeywords flagged as a new record so no existing entries are deleted first.
        /// </summary>
        public static void ProcessNewObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
            => ProcessKeywords(ct, localeId, objectID, objectType, true, name, text);

        /// <summary>
        /// Re-index the keywords of an updated object.
        /// Forwards to ProcessKeywords flagged as not new so the object's existing entries are deleted first.
        /// </summary>
        public static void ProcessUpdatedObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
            => ProcessKeywords(ct, localeId, objectID, objectType, false, name, text);

        /// <summary>
        /// Delete all asearchkey rows for the object with the given id and type
        /// </summary>
        public static void ProcessDeletedObjectKeywords(AyContext ct, long objectID, AyaType objectType)
        {
            //The values in the interpolated string must stay numeric:
            //if ToString is put on the end of an object in the string interpolation the
            //npgsql driver will assume it's a string and put quotes around it, triggering an
            //error that a string can't be compared to an int
            int ObjectTypeValue = (int)objectType;

            //The interpolated string is deliberately passed inline and not via a string variable
            ct.Database.ExecuteSqlCommand($"delete from asearchkey where objectid={objectID} and objecttype={ObjectTypeValue}");
        }


        /// <summary>
        /// Process the keywords into the dictionary and create the search key records linking them to the object.
        /// NOTE: NAME parameter is in ADDITION to the NAME also being one of the strings passed in text parameter
        /// (only keywords found in text are indexed, name is used to flag which of them are in the name)
        /// </summary>
        /// <param name="ct">Database context; changes are saved by this method</param>
        /// <param name="localeId">Locale id used when breaking the text into keywords</param>
        /// <param name="objectID">Id of the object being indexed</param>
        /// <param name="objectType">Type of the object being indexed</param>
        /// <param name="newRecord">False to delete the object's existing search keys first</param>
        /// <param name="name">Name of the object</param>
        /// <param name="text">Text of the object to index</param>
        private static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string name, params string[] text)
        {

            //IF NOT NEW, DELETE ALL EXISTING ENTRIES FOR OBJECT TYPE AND ID
            if (!newRecord)
            {
                ProcessDeletedObjectKeywords(ct, objectID, objectType);
            }

            //BREAK STRING ARRAY INTO KEYWORD LIST
            List<string> KeyWordList = Break(localeId, text);

            //EARLY EXIT IF NO KEYWORDS OR NAME RECORD TO PROCESS
            if (KeyWordList.Count == 0 && string.IsNullOrWhiteSpace(name))
            {
                return;
            }

            //BREAK NAME STRING
            //kept as a set because it is only used to test whether a keyword is part of the name
            HashSet<string> NameKeyWords = new HashSet<string>(Break(localeId, name));

            //BUILD A LIST OF MatchingDictionaryEntry items FOR THE MATCHING WORDS
            List<MatchingDictionaryEntry> MatchingKeywordIdList = new List<MatchingDictionaryEntry>();

            //SEARCH IN THE SEARCHDICTIONARY TABLE AND COLLECT ID'S OF ANY PRE-EXISTING IN DB KEYWORDS
            var ExistingKeywordMatches = ct.SearchDictionary.Where(m => KeyWordList.Contains(m.Word)).ToDictionary(m => m.Id, m => m.Word);

            //Put the matching keyword ID's into the list
            HashSet<string> ExistingWords = new HashSet<string>();
            foreach (KeyValuePair<long, string> K in ExistingKeywordMatches)
            {
                ExistingWords.Add(K.Value);
                MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = K.Key, InName = NameKeyWords.Contains(K.Value) });
            }

            //ITERATE THROUGH THE KEYWORDS THAT DO *NOT* HAVE MATCHES IN THE SEARCHDICTIONARY AND ADD THEM TO THE SEARCH DICTIONARY
            //Remember exactly which entries were created here: the context's Local collection can NOT be
            //used for that as it holds every tracked dictionary entry (including the pre-existing matches
            //already collected above and anything tracked before this call)
            List<SearchDictionary> NewDictionaryEntries = new List<SearchDictionary>();
            foreach (string KeyWord in KeyWordList)
            {
                if (!ExistingWords.Contains(KeyWord))
                {
                    SearchDictionary NewEntry = new SearchDictionary() { Word = KeyWord };
                    ct.SearchDictionary.Add(NewEntry);
                    NewDictionaryEntries.Add(NewEntry);
                }
            }

            //Save the context in order to get the id's of the new words added
            ct.SaveChanges();

            //Now add the id's of the newly created words to the matching keyword id list for this object
            foreach (SearchDictionary SD in NewDictionaryEntries)
            {
                MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = SD.Id, InName = NameKeyWords.Contains(SD.Word) });
            }

            //CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
            foreach (MatchingDictionaryEntry E in MatchingKeywordIdList)
            {
                ct.SearchKey.Add(new SearchKey() { WordId = E.DictionaryId, InName = E.InName, ObjectId = objectID, ObjectType = objectType });
            }

            ct.SaveChanges();

        }//eoc

        /// <summary>
        /// Temporary holder pairing a search dictionary id with a flag
        /// indicating whether the word is part of the object's name
        /// </summary>
        public class MatchingDictionaryEntry
        {
            //Not in name unless flagged otherwise
            public bool InName { get; set; } = false;

            //-1 until a dictionary id is assigned
            public long DictionaryId { get; set; } = -1;
        }


        #endregion

        #region Breaker

        /// <summary>
        /// Locale specific settings used when breaking text into words:
        /// the CJK indexing flag and the list of stop words
        /// </summary>
        public class LocaleWordBreakingData
        {
            //Off unless the locale turns it on
            public bool CJKIndex { get; set; } = false;

            //Starts out empty, never null
            public List<string> StopWords { get; set; } = new List<string>();
        }

        /// <summary>
        /// Get the stop words and the CJKIndex flag for the user's locale
        /// </summary>
        /// <param name="localeId">Locale id; passed through LocaleBiz.EnsuredLocaleIdStatic before use</param>
        /// <param name="ct">Database context, when null ServiceProviderProvider.DBContext is used</param>
        /// <returns>Word breaking data for the locale</returns>
        private static LocaleWordBreakingData GetLocaleSearchData(long localeId, AyContext ct = null)
        {
            LocaleWordBreakingData LSD = new LocaleWordBreakingData();
            if (ct == null)
                ct = ServiceProviderProvider.DBContext;

            //Validate locale id, if not right then use default instead
            //The same validated id is used for both the stop words and the CJKIndex lookup
            //so the two can never come from different locales
            var EnsuredLocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, ct);

            //Get stopwords
            var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
            Param.LocaleId = EnsuredLocaleId;
            string[] StopWordKeys = { "StopWords1", "StopWords2", "StopWords3", "StopWords4", "StopWords5", "StopWords6", "StopWords7" };
            foreach (string StopWordKey in StopWordKeys)
            {
                Param.Keys.Add(StopWordKey);
            }

            //NOTE: blocks on the async call because this method and its callers are synchronous
            var Stops = LocaleBiz.GetSubsetStatic(Param).Result;

            foreach (KeyValuePair<string, string> kvp in Stops)
            {
                //Each stopwords locale key is a space delimited list of words and in the case of an empty locale string (i.e. StopWords7) its value is a single question mark
                if (string.IsNullOrWhiteSpace(kvp.Value) || kvp.Value == "?")
                {
                    continue;
                }

                //Drop the empty entries that doubled, leading or trailing spaces would otherwise produce
                LSD.StopWords.AddRange(kvp.Value.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
            }

            LSD.CJKIndex = LocaleBiz.GetCJKIndex(EnsuredLocaleId, ct).Result;
            return LSD;
        }

        /// <summary>
        /// Kind of the most recently processed character while breaking text into words
        /// </summary>
        public enum TokenTypes
        {
            Nothing,
            Separator,
            CJK,
            Latin
        }

        /// <summary>
        /// Break the passed in strings into a list of keywords for indexing.
        /// Wildcard characters are not preserved, see BreakSearchPhrase for that.
        ///
        /// Locale setting CJKIndex=true is used to handle Chinese, Japanese, Korean etc
        /// (languages with no easily identifiable word boundaries as in english)
        /// </summary>
        ///
        /// <param name="localeId">Locale id used to look up the word breaking settings</param>
        /// <param name="text">An array of 0 to * strings of text</param>
        /// <returns>List of strings</returns>
        internal static List<string> Break(long localeId, params string[] text)
            => BreakCore(localeId, false, text);

        /// <summary>
        /// Break a user's search phrase into a list of search terms,
        /// preserving any wildcards entered
        /// </summary>
        /// <param name="localeId">Locale id used to look up the word breaking settings</param>
        /// <param name="text">The search phrase text</param>
        /// <returns>List of strings</returns>
        internal static List<string> BreakSearchPhrase(long localeId, params string[] text)
            => BreakCore(localeId, true, text);

        /// <summary>
        /// Break the passed in strings into a list of unique, lower cased words with
        /// the locale's stop words removed. Words are returned in the order first seen.
        ///
        /// A word is a run of letter or digit characters, anything else is a word boundary.
        /// When the locale's CJKIndex flag is set, characters with a code of 256 and up are
        /// treated as CJK: runs of CJK letters are indexed as overlapping 2 character n-grams
        /// and a change between CJK and latin text is also a word boundary.
        /// </summary>
        /// <param name="localeId">Locale id used to look up the stop words and CJKIndex flag</param>
        /// <param name="KeepWildCards">True to treat the % wildcard character as part of a word</param>
        /// <param name="text">An array of 0 to * strings of text, null or empty strings are skipped</param>
        /// <returns>List of words, empty if there is nothing to index</returns>
        internal static List<string> BreakCore(long localeId, bool KeepWildCards, params string[] text)
        {
            //Words longer than this are split
            const int MAXWORDLENGTH = 255;

            List<string> ReturnList = new List<string>();

            //Nothing to break
            if (text == null) return ReturnList;

            //Get stopwords and CJKIndex flag value
            LocaleWordBreakingData LocaleSearchData = GetLocaleSearchData(localeId);

            //List to temporarily hold parsed words in the order found
            //and a set used to easily ensure unique words only
            List<string> tempParsedWords = new List<string>();
            HashSet<string> uniqueParsedWords = new HashSet<string>();

            //The word currently being built
            StringBuilder sbWord = new StringBuilder();

            //Loop through each of the passed in strings
            foreach (string s in text)
            {
                if (string.IsNullOrEmpty(s)) continue;

                //get all the characters in a unicode compliant manner...
                TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);

                TokenTypes LastToken = TokenTypes.Nothing;

                //Process each "character" (text element,glyph whatever) in the
                //current string
                while (t.MoveNext())
                {
                    //get it as a character
                    //NOTE: only the first char of the text element is looked at
                    char c = t.GetTextElement()[0];

                    //Outside the first 256 characters (basic latin, ascii basically) and CJK indexing is on?
                    //see: http://www.unicode.org/charts/index.html
                    //we need to know this so that regular english text
                    //within cjk text gets properly indexed as whole words
                    bool IsCJKCharacter = LocaleSearchData.CJKIndex && (int)c >= 256;

                    //Is it a token we want to include?
                    //Latin: letters, digits and optionally the wildcard character
                    //CJK: letters only (the wildcard character can never be a CJK character)
                    bool IncludeToken = IsCJKCharacter
                        ? char.IsLetter(c)
                        : (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'));

                    if (!IncludeToken)
                    {
                        //Word boundary: flush the word built so far, if any
                        LastToken = TokenTypes.Separator;
                        FlushWord(sbWord, tempParsedWords, uniqueParsedWords);
                        continue;
                    }

                    if (IsCJKCharacter)
                    {
                        if (sbWord.Length > 0)
                        {
                            if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                            {
                                //going from latin TO CJK or over the word length limit
                                FlushWord(sbWord, tempParsedWords, uniqueParsedWords);
                            }
                            else if (LastToken == TokenTypes.CJK)
                            {
                                //there is already a CJK character stored so append the current
                                //character and flush the resultant 2 character n-gram;
                                //the current character is then stored again below to start the next n-gram
                                sbWord.Append(c);
                                System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                                FlushWord(sbWord, tempParsedWords, uniqueParsedWords);
                            }
                        }

                        sbWord.Append(c);
                        LastToken = TokenTypes.CJK;
                    }
                    else
                    {
                        //All latin text is converted to lower case
                        c = char.ToLower(c);

                        //Flush the word built so far if we are going from CJK to latin
                        //(only possible when CJK indexing is on) or over the word length limit
                        if (sbWord.Length > 0 && (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH))
                        {
                            FlushWord(sbWord, tempParsedWords, uniqueParsedWords);
                        }

                        sbWord.Append(c);
                        LastToken = TokenTypes.Latin;
                    }
                }

                //Flush out the last word
                FlushWord(sbWord, tempParsedWords, uniqueParsedWords);
            }

            //bail early if there is nothing indexed
            if (tempParsedWords.Count == 0) return ReturnList;

            //Make the return list from the word list
            //Add only non stopwords
            HashSet<string> StopWords = new HashSet<string>(LocaleSearchData.StopWords);
            foreach (string s in tempParsedWords)
            {
                if (!StopWords.Contains(s))
                {
                    ReturnList.Add(s);
                }
            }

            //sometimes all the results are stop words so you end up here with nothing
            return ReturnList;

        }

        //Move the word being built (if there is one) into the parsed word list
        //unless that word is already in it, then reset the builder for the next word
        private static void FlushWord(StringBuilder sbWord, List<string> parsedWords, HashSet<string> uniqueWords)
        {
            if (sbWord.Length == 0) return;

            string Word = sbWord.ToString();
            if (uniqueWords.Add(Word))
            {
                parsedWords.Add(Word);
            }
            sbWord.Length = 0;
        }

        #endregion


    }//eoc

}//eons