// File: raven/server/AyaNova/biz/Search.cs (1313 lines, 52 KiB, C#)

using System;
using System.Linq;
using System.Globalization;
using System.Text;
using System.Collections.Generic;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.EntityFrameworkCore;
using AyaNova.Util;
using AyaNova.Models;
//using System.Diagnostics;
namespace AyaNova.Biz
{
//This class handles word breaking, processing keywords and searching for results
public static class Search
{
#region Search and return results
/*
Requirements:
INPUT PARAMETERS
- Search phrase (with wildcard support)
- Can be empty if tags are specified, no tags and no phrase is an error condition
- ObjectType: only return results for objects of this type
ACTION
Find search matches, then find tag matches then intersect, then sort and return
Filter OUT results that user is not permitted to read
//TODO: proper testing of searching
- SAMPLE DATA: Need a huge amount of sample data indexed to load test it
- INDEXES: play with it and see what works best
OUTPUT FORMAT
- No translated text, up to client
- Name of object in return result
- Object Type and ID in return result
- Group results by object type, then by object ID descending which will result in natural most recently created order
result:[
{
name:"blah",
type:2,
id:210
},
]
*/
//Class to hold search request parameters
public class SearchRequestParameters
{
    //The user's search phrase; may contain * wildcards and may be blank
    public string Phrase { get; set; }
    //Restrict results to a single object type; NoType means all types
    public AyaType TypeOnly { get; set; }
    //Note: maxresults of 0 will get all results
    public int MaxResults { get; set; }

    public SearchRequestParameters()
    {
        TypeOnly = AyaType.NoType;
        MaxResults = 500;
    }

    //A request is only actionable when it carries a non-blank phrase
    public bool IsValid => !string.IsNullOrWhiteSpace(this.Phrase);
}
//Classes to hold search results returned to client
public class SearchResult
{
//Display name of the matched object (no translated text - rendering is up to the client)
public string Name { get; set; }
//Object type of the match so the client can route to the right screen
public AyaType Type { get; set; }
//Database id of the matched object
public long Id { get; set; }
}
public class SearchReturnObject
{
    //Total number of matches found before the MaxResults cap was applied
    public long TotalResultsFound { get; set; } = 0;
    //The (possibly capped) result rows handed back to the client
    public List<SearchResult> SearchResults { get; set; } = new List<SearchResult>();
}
/// <summary>
/// Execute a search and return the matching objects the current user is permitted to read.
/// Pipeline: break the phrase into terms, resolve terms to dictionary word ids,
/// keep objects whose index rows cover ALL terms, filter by read permission,
/// cap to MaxResults, order by type then id descending, then resolve display names.
/// </summary>
/// <param name="ct">EF database context used for the index queries and name lookups</param>
/// <param name="translationId">Translation whose word-breaking rules apply to the phrase</param>
/// <param name="currentUserRoles">Roles used to filter out objects the user cannot read</param>
/// <param name="searchParameters">Phrase (may include * wildcards), optional type filter and result cap</param>
/// <returns>Total match count plus the (possibly capped) result list; empty when the request has no phrase</returns>
public static async Task<SearchReturnObject> DoSearchAsync(AyContext ct, long translationId, AuthorizationRoles currentUserRoles, SearchRequestParameters searchParameters)
{
var ReturnObject = new SearchReturnObject();
//list to hold temporary search/tag hits
List<AyaTypeId> MatchingObjects = new List<AyaTypeId>();
if (!searchParameters.IsValid)
{
//this is expected (e.g. an empty search box), don't throw, just return nothing
return ReturnObject;
}
//escape literal percentage signs first just in case they are searching for 50% off or something
//https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
//need to get around breaking possibly losing the symbol so make it text
searchParameters.Phrase = searchParameters.Phrase.Replace("%", "pctsym");
//Modify Phrase to replace wildcard * with % as breakcore expects sql style wildcards
searchParameters.Phrase = searchParameters.Phrase.Replace("*", "%");
//BREAK SEARCH PHRASE INTO SEPARATE TERMS
var PhraseItems = await BreakSearchPhraseAsync(translationId, searchParameters.Phrase);
//SPLIT OUT WILDCARDS FROM NON WILDCARDS
List<string> PreWildCardedSearchTerms = new List<string>();
List<string> SearchTerms = new List<string>();
foreach (string PhraseItem in PhraseItems)
{
if (PhraseItem.Contains("%"))
PreWildCardedSearchTerms.Add(PhraseItem.Replace("pctsym", @"\%"));//put back literal percentage symbol if necessary
else
SearchTerms.Add(PhraseItem.Replace("pctsym", @"\%"));//put back literal percentage symbol if necessary
}
//List holder for matching dictionary ID's
List<long> DictionaryMatches = new List<long>();
//GET LIST OF DICTIONARY ID'S THAT MATCH REGULAR SEARCH TERMS
//(a plain term matches any dictionary word that CONTAINS it)
if (SearchTerms.Count > 0)
foreach (string Term in SearchTerms)
{
DictionaryMatches.AddRange(await ct.SearchDictionary.Where(z => z.Word.Contains(Term)).Select(z => z.Id).ToListAsync());
}
//GET LIST OF DICTIONARY ID'S THAT MATCH WILDCARD SEARCH TERMS
if (PreWildCardedSearchTerms.Count > 0)
{
foreach (string WildCardSearchTerm in PreWildCardedSearchTerms)
{
//%term% -> Contains
if (WildCardSearchTerm.StartsWith("%") && WildCardSearchTerm.EndsWith("%"))
{
DictionaryMatches.AddRange(await ct.SearchDictionary.Where(z => z.Word.Contains(WildCardSearchTerm.Replace("%", ""))).Select(z => z.Id).ToListAsync());
}
else if (WildCardSearchTerm.EndsWith("%")) //term% -> STARTS WITH
{
DictionaryMatches.AddRange(await ct.SearchDictionary.Where(z => z.Word.StartsWith(WildCardSearchTerm.Replace("%", ""))).Select(z => z.Id).ToListAsync());
}
else if (WildCardSearchTerm.StartsWith("%"))//%term -> ENDS WITH
{
DictionaryMatches.AddRange(await ct.SearchDictionary.Where(z => z.Word.EndsWith(WildCardSearchTerm.Replace("%", ""))).Select(z => z.Id).ToListAsync());
}
//NOTE(review): a term with only an interior wildcard (e.g. "ca%t") matches none of the
//three branches above and is silently dropped - confirm this is intended
}
}
//SEARCH SEARCHKEY FOR MATCHING WORDS AND OPTIONALLY TYPE
var TotalSearchTermsToMatch = PreWildCardedSearchTerms.Count + SearchTerms.Count;
//Build search query based on searchParameters
var q = ct.SearchKey.Distinct().Where(z => DictionaryMatches.Contains(z.WordId));
//Of type?
if (searchParameters.TypeOnly != AyaType.NoType)
q = q.Where(z => z.ObjectType == searchParameters.TypeOnly);
//Find the records that have the search terms in searchkey
//(count of matching index rows per object; an object matching all terms has one row per term)
var SearchMatches = q.GroupBy(z => new { z.ObjectType, z.ObjectId }).Select(z => new { ObjectId = z.Key.ObjectId, ObjectType = z.Key.ObjectType, ObjectCount = z.LongCount() });
//PUT THE RESULTS INTO MATCHING OBJECTS LIST
//NOTE(review): enumerating SearchMatches executes the grouped query synchronously inside
//an async method; consider materializing with ToListAsync - confirm intended
//NOTE(review): if two different terms resolve to the SAME dictionary word the object has
//only one index row for it, so ObjectCount can undercount and drop a valid hit - verify
foreach (var SearchMatch in SearchMatches)
{
//keep any object that matches *all* the search terms
if (SearchMatch.ObjectCount == TotalSearchTermsToMatch)
MatchingObjects.Add(new AyaTypeId(SearchMatch.ObjectType, SearchMatch.ObjectId));
}
//REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
//list to hold temporary matches
List<AyaTypeId> CanReadMatchingObjects = new List<AyaTypeId>();
foreach (AyaTypeId t in MatchingObjects)
{
if (AyaNova.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, t.ObjectType))
{
CanReadMatchingObjects.Add(t);
}
}
//Ok, we're here with the list of allowable objects which is now the master matching objects list so...
MatchingObjects = CanReadMatchingObjects;
//TOTAL RESULTS
//we have the total results here so set accordingly (count is post-permission-filter, pre-cap)
ReturnObject.TotalResultsFound = MatchingObjects.Count;
//MAXIMUM RESULTS FILTER
//The theory is that it should be filtered BEFORE sorting so that you get the most random collection of results
//As the results are not ranked so...
if (searchParameters.MaxResults > 0)//0 = all results
MatchingObjects = MatchingObjects.Take(searchParameters.MaxResults).ToList();
//Sort the matching objects list into return order:
//grouped by object type, then newest (highest id) first within each type
var OrderedMatchingObjects = MatchingObjects.OrderBy(z => z.ObjectType).ThenByDescending(z => z.ObjectId);
//Get names using best performing technique: one shared DbCommand reused for every name lookup
using (var command = ct.Database.GetDbConnection().CreateCommand())
{
ct.Database.OpenConnection();
//Build the return list from the remaining matching objects list
foreach (AyaTypeId i in OrderedMatchingObjects)
{
SearchResult SR = new SearchResult();
SR.Name = BizObjectNameFetcherDirect.Name(i, command);//historically the slow spot when returning search results
SR.Id = i.ObjectId;
SR.Type = i.ObjectType;
ReturnObject.SearchResults.Add(SR);
}
}
return ReturnObject;
}
#endregion dosearch
#region Get info (excerpt)
/// <summary>
/// Build a short excerpt ("info") for a single search hit.
/// Breaks the user's phrase into terms, fetches the object's indexable text
/// summary and extracts the best-ranking snippet of at most <paramref name="max"/> characters.
/// </summary>
/// <param name="ct">EF database context</param>
/// <param name="translationId">Translation used for word breaking</param>
/// <param name="currentUserRoles">Roles passed through to the biz object factory</param>
/// <param name="userId">Current user id passed through to the biz object factory</param>
/// <param name="phrase">Raw search phrase as typed by the user (may contain * wildcards)</param>
/// <param name="max">Maximum characters in the returned excerpt</param>
/// <param name="ayaType">Type of the object the excerpt is for</param>
/// <param name="id">Id of the object the excerpt is for</param>
/// <returns>The extracted excerpt, or an empty string when nothing ranks high enough</returns>
public static async Task<string> GetInfoAsync(AyContext ct, long translationId,
AuthorizationRoles currentUserRoles, long userId, string phrase, int max, AyaType ayaType, long id)
{
    //escape literal percentage signs first just in case they are searching for 50% off or something
    //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
    //need to get around breaking possibly losing the symbol so make it text
    phrase = phrase.Replace("%", "pctsym");
    //Modify Phrase to replace wildcard * with % as breakcore expects sql style wildcards
    phrase = phrase.Replace("*", "%");
    //BREAK SEARCH PHRASE INTO SEPARATE TERMS
    //(BUGFIX: removed a stray no-op "PhraseItems.ToArray();" whose result was discarded)
    var PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);
    //get the biz object so we can ask it for its searchable text
    ISearchAbleObject o = (ISearchAbleObject)BizObjectFactory.GetBizObject(ayaType, ct, userId, currentUserRoles);
    //get the extract source text for the object
    var searchParams = await o.GetSearchResultSummary(id);
    //rank the text against the terms and extract the best excerpt
    ExtractAndRank er = new ExtractAndRank();
    er.Process(searchParams, PhraseItems.ToArray(), max);
    return er.Extract;
}
#region Search rank and extract
/// <summary>
/// Rank and extract the best excerpt of specified text for a set of search terms
/// </summary>
public sealed class ExtractAndRank
{
    #region Fields
    //terms being searched for; any % wildcard characters are stripped before matching
    private string[] searchTerms;
    //the full source text being ranked and excerpted
    private string rawtext;
    private string extract = "";
    private bool flattenExtract = true;
    private float ranking;
    private int extractionThresholdRank = 10;
    private int maximumCharactersToExtract = 40;
    #endregion
    #region Properties
    /// <summary>
    /// This is the ranking of the source text as it pertains to the search terms.
    ///
    /// A rank of zero means either there was no match or the rank that was calculated
    /// was lower than the threshold ranking, either way, no excerpt extraction is done.
    ///
    /// It is a percentage value on a scale of 0 to 100 and is weighted:
    /// 75% of the score is the percentage of all search terms found in the text
    /// 25% of the score is the percentage of all characters in the text that are search term characters
    /// </summary>
    public float Ranking
    {
        get { return ranking; }
    }
    /// <summary>
    /// Maximum characters to appear in an extraction.
    /// Default is 40 (doc previously said 80, which did not match the field default).
    /// Minimum is 10.
    /// </summary>
    public int MaximumCharactersToExtract
    {
        get { return maximumCharactersToExtract; }
        set
        {
            //clamp to the documented minimum of 10
            maximumCharactersToExtract = value > 10 ? value : 10;
        }
    }
    /// <summary>
    /// ExtractionThresholdRank
    /// Extraction will only take place if the rank is this value or higher.
    /// Default is 10, maximum is 100, minimum is 0.
    /// </summary>
    public int ExtractionThresholdRank
    {
        get { return extractionThresholdRank; }
        set
        {
            if (value > 100)
                extractionThresholdRank = 100;
            else if (value < 0)
                extractionThresholdRank = 0;
            else
                extractionThresholdRank = value;
        }
    }
    /// <summary>
    /// If true, carriage returns, line feeds and tabs will be removed from extract
    /// </summary>
    public bool FlattenExtract
    {
        get { return this.flattenExtract; }
        set { this.flattenExtract = value; }
    }
    /// <summary>
    /// Extracted text excerpt that best reflects search terms
    /// </summary>
    public string Extract
    {
        get { return extract; }
    }
    #endregion
    #region public methods
    /// <summary>
    /// Do the extraction and ranking.
    /// Joins the object's words into one raw text string, ranks it against the
    /// search terms and, when the rank meets the threshold, extracts the best
    /// excerpt of at most <paramref name="max"/> characters.
    /// </summary>
    public void Process(SearchIndexProcessObjectParameters searchObjectParams, string[] searchTerms, int max)
    {
        //NOTE(review): assigns the field directly, bypassing the min-10 clamp of the
        //MaximumCharactersToExtract property - confirm callers never pass max < 10
        this.maximumCharactersToExtract = max;
        ranking = 0;
        extract = "";
        string rawText = string.Join(" ", searchObjectParams.Words);
        if (string.IsNullOrEmpty(rawText)) return;
        this.rawtext = rawText;
        if (searchTerms == null || searchTerms.Length == 0) return;
        this.searchTerms = searchTerms;
        ranking = score(0, this.rawtext.Length);
        //BUGFIX: documented contract is "this value or higher" so compare with >= (was >)
        if (ranking >= extractionThresholdRank)
            DoExtract();
    }
    #endregion
    #region Calculate score
    /// <summary>
    /// Give a percentage score for a given window of text in the raw text string.
    /// 75% of the score is the percentage of all search terms found in the window;
    /// 25% of the score is the percentage of all characters in the search window
    /// that are search term characters.
    /// </summary>
    /// <param name="nStartPos">Inclusive start index of the window (clamped to 0)</param>
    /// <param name="nEndPos">Exclusive end index of the window (clamped to text length)</param>
    /// <returns>Float value of zero to one hundred</returns>
    private float score(int nStartPos, int nEndPos)
    {
        System.Diagnostics.Debug.Assert(nStartPos < nEndPos);
        if (nStartPos < 0) nStartPos = 0;
        if (nEndPos > this.rawtext.Length) nEndPos = this.rawtext.Length;
        //how many of the characters in the window are matching term characters
        int nTermCharsInWindow = 0;
        string SearchString = this.rawtext.Substring(nStartPos, nEndPos - nStartPos).ToLower(System.Globalization.CultureInfo.CurrentCulture);
        int nMatches = 0;
        foreach (string term in searchTerms)
        {
            //remove the wild card character if present and set to lower case
            string lTerm = term.ToLower(System.Globalization.CultureInfo.CurrentCulture).Replace("%", "");
            //BUGFIX: a term that was only wildcards becomes empty here; IndexOf("")
            //matches at every position and eventually throws ArgumentOutOfRangeException
            //in the scan loop below, so skip it
            if (lTerm.Length == 0) continue;
            int nLocation = SearchString.IndexOf(lTerm);
            if (nLocation != -1)
            {
                nMatches++;
                //count every occurrence's characters toward the density portion of the score
                while (nLocation != -1)
                {
                    nTermCharsInWindow += lTerm.Length;
                    nLocation = SearchString.IndexOf(lTerm, nLocation + 1);
                }
            }
        }
        //If no matches then rank is automatically zero
        if (nMatches == 0)
        {
            return 0;
        }
        //Rank is calculated on a weighted scale:
        //75% weight: fraction of all search terms found at least once
        float fTermsFoundPct = 75 * ((float)nMatches / (float)searchTerms.Length);
        //25% weight: fraction of window characters belonging to term matches
        float fTermsVsTextPct = 0;
        if (nTermCharsInWindow > 0)
            fTermsVsTextPct = 25 * ((float)nTermCharsInWindow / (float)SearchString.Length);
        return fTermsFoundPct + fTermsVsTextPct;
    }
    #endregion
    #region Extract best excerpt
    /// <summary>
    /// Extract the best scoring excerpt fragment of the raw text by sliding a
    /// window of maximumCharactersToExtract characters across it and keeping
    /// the highest scoring window.
    /// </summary>
    private void DoExtract()
    {
        //If the whole thing fits within the max to extract just return it all.
        //BUGFIX: was '<', which produced an empty "......" excerpt when the text
        //length exactly equalled the window size (loop below would never run)
        if (this.rawtext.Length <= this.maximumCharactersToExtract)
        {
            this.extract = this.rawtext;
            return;
        }
        string BestWindow = "";
        float BestScore = 0;
        float thisscore = 0;
        int BestWindowStartPos = 0;
        //Get the shortest search term length so we can stride the window faster
        //than one character at a time without ever skipping over a term
        int shortestSearchTermLength = int.MaxValue;
        foreach (string s in this.searchTerms)
        {
            if (s.Length < shortestSearchTermLength)
                shortestSearchTermLength = s.Length;
        }
        //slide a window over the text and check its score; the highest scoring window wins
        for (int z = 0; z < this.rawtext.Length - maximumCharactersToExtract; z += shortestSearchTermLength)
        {
            thisscore = score(z, z + (maximumCharactersToExtract));
            if (thisscore == 0) continue;
            if (thisscore > BestScore)
            {
                BestScore = thisscore;
                BestWindow = this.rawtext.Substring(z, maximumCharactersToExtract);
                //Preferred start for a future equal-scoring window:
                //i.e. put the terms in the center of the window if the score is equal
                BestWindowStartPos = z + (maximumCharactersToExtract / 2);
            }
            //If it's equal to the best and we're positioned over
            //the preferred spot (terms in center) then capture that
            if (thisscore == BestScore && z == BestWindowStartPos)
            {
                BestWindow = this.rawtext.Substring(z, maximumCharactersToExtract);
            }
        }
        if (this.flattenExtract)
            this.extract = "..." + BestWindow.Trim().Replace("\r", "").Replace("\n", "").Replace("\t", "") + "...";//case 1593 added tab character removal
        else
            this.extract = "..." + BestWindow.Trim() + "...";
    }
    #endregion
}
#endregion Xtract
#endregion
#region ProcessKeywords into Database
//Class to hold process input parameters
//also used for getting summary search results
public class SearchIndexProcessObjectParameters
{
    //Translation used when word-breaking the collected text
    public long TranslationId { get; set; }
    //Identity of the object whose text is being indexed
    public long ObjectId { get; set; }
    public AyaType ObjectType { get; set; }
    //Raw text fragments collected for indexing
    public List<string> Words { get; set; }

    public SearchIndexProcessObjectParameters(long translationId, long objectID, AyaType objectType)
    {
        Words = new List<string>();
        TranslationId = translationId;
        ObjectId = objectID;
        ObjectType = objectType;
    }

    //format used for getsummmary by biz objects
    public SearchIndexProcessObjectParameters() : this(0, 0, 0)
    {
    }

    //Add a single text fragment; blank/whitespace strings are ignored. Chainable.
    public SearchIndexProcessObjectParameters AddText(string s)
    {
        if (!string.IsNullOrWhiteSpace(s))
        {
            Words.Add(s);
        }
        return this;
    }

    //Numeric overloads route through the string overload (a number is never blank). Chainable.
    public SearchIndexProcessObjectParameters AddText(uint u) => AddText(u.ToString());

    public SearchIndexProcessObjectParameters AddText(long l) => AddText(l.ToString());

    //Add every non-blank string from the list. Chainable.
    public SearchIndexProcessObjectParameters AddText(List<string> lWords)
    {
        if (lWords != null)
        {
            foreach (string s in lWords)
            {
                AddText(s);
            }
        }
        return this;
    }

    //Extract the text from a custom fields json fragment as an array of strings and add it here. Chainable.
    public SearchIndexProcessObjectParameters AddCustomFields(string jsonString) =>
        AddText(JsonUtil.GetCustomFieldsAsStringArrayForSearchIndexing(jsonString));
}
/// <summary>
/// Index the keywords of a newly created object (no prior index rows exist to delete).
/// </summary>
public static async Task ProcessNewObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
{
await ProcessKeywordsAsync(searchIndexObjectParameters, true);
}
/// <summary>
/// Re-index the keywords of an updated object (its existing index rows are deleted first).
/// </summary>
public static async Task ProcessUpdatedObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
{
await ProcessKeywordsAsync(searchIndexObjectParameters, false);
}
/// <summary>
/// Remove every search index row for a deleted object.
/// Runs a direct SQL delete (values are parameterized by ExecuteSqlInterpolatedAsync,
/// not concatenated) so no SaveChanges call is needed afterwards.
/// </summary>
public static async Task ProcessDeletedObjectKeywordsAsync(long objectID, AyaType objectType)
{
//Be careful in future, if you put ToString at the end of each object in the string interpolation
//npgsql driver will assume it's a string and put quotes around it triggering an error that a string can't be compared to an int
AyContext ct = ServiceProviderProvider.DBContext;
await ct.Database.ExecuteSqlInterpolatedAsync($"delete from asearchkey where objectid={objectID} and objecttype={(int)objectType}");
//nothing to save here, it's a direct command already executed
}
/// <summary>
/// Process the keywords into the dictionary.
/// NOTE: NAME parameter is in ADDITION to the NAME also being one of the strings passed in text parameter
/// Breaks the object's text into keywords, inserts any words not yet in the shared
/// dictionary (tolerating duplicate-insert races from other threads via the unique
/// index on word), then writes one SearchKey row per keyword linking it to the object.
/// </summary>
/// <param name="p">Object identity (id/type/translation) plus the raw text strings to index</param>
/// <param name="newRecord">True for a brand-new object; false deletes the object's existing index rows first</param>
private static async Task ProcessKeywordsAsync(SearchIndexProcessObjectParameters p, bool newRecord)
{
#if (DEBUG)
    //These types are never indexed - catch programmer error early in debug builds
    if (p.ObjectType == AyaType.ServerJob || p.ObjectType == AyaType.Translation)
    {
        throw new System.NotSupportedException($"Search::ProcessKeywords - Invalid type presented {p.ObjectType}");
    }
#endif
    //IF NOT NEW, DELETE ALL EXISTING ENTRIES FOR OBJECT TYPE AND ID
    if (!newRecord)
    {
        await ProcessDeletedObjectKeywordsAsync(p.ObjectId, p.ObjectType);
    }
    //BREAK OBJECT TEXT STRINGS INTO KEYWORD LIST
    List<string> KeyWordList = await BreakAsync(p.TranslationId, p.Words);
    //EARLY EXIT IF NO KEYWORDS TO PROCESS
    if (KeyWordList.Count == 0)
    {
        return;
    }
    //Dictionary ids (new and pre-existing) that will get a SearchKey row for this object
    List<long> MatchingKeywordIdList = new List<long>();
    //Fetch all keywords already present in the dictionary in one query (id -> word),
    //e.g. SELECT a.id, a.word FROM asearchdictionary AS a WHERE a.word IN (...)
    var ExistingKeywordMatches = await ServiceProviderProvider.DBContext.SearchDictionary.AsNoTracking().Where(z => KeyWordList.Contains(z.Word)).ToDictionaryAsync(z => z.Id, z => z.Word);
    //Put the matching keyword ID's into the list
    foreach (KeyValuePair<long, string> K in ExistingKeywordMatches)
    {
        MatchingKeywordIdList.Add(K.Key);
    }
    //-------- START CRITICAL SECTION -----------
#if (DEBUG)
    var log = AyaNova.Util.ApplicationLogging.CreateLogger("### Search::ProcessKeywords ###");
#endif
    //PERFORMANCE NOTE: this loop is the slow part of bulk indexing (see repo history for
    //timing experiments; caching the translation word-break data roughly halved seed time).
    //Word-by-word insertion is deliberate: unique-index clashes with concurrent inserts
    //are expected and handled below rather than batched.
    foreach (string KeyWord in KeyWordList)
    {
        if (!ExistingKeywordMatches.ContainsValue(KeyWord))
        {
            //algorithm: attempt to add the word and get its id; if it fails with the expected
            //duplicate-insert exception, immediately read the word back and use that id
            SearchDictionary NewWord = new SearchDictionary();
            NewWord.Word = KeyWord;
            try
            {
                //ADD WORD TO DICTIONARY, SAVE THE ID INTO THE MATCHINGKEYWORDIDLIST
                var CtAdd = ServiceProviderProvider.DBContext;
                await CtAdd.SearchDictionary.AddAsync(NewWord);
                await CtAdd.SaveChangesAsync();
                //Add to matching keywords
                MatchingKeywordIdList.Add(NewWord.Id);
                //It exists now
                ExistingKeywordMatches.Add(NewWord.Id, NewWord.Word);
            }
            catch (Microsoft.EntityFrameworkCore.DbUpdateException ex)
            {
#if (DEBUG)
                log.LogInformation($"###################### Exception caught attempting to add word: '{KeyWord}' fetching instead...");
#endif
                //FAIL DUE TO OTHER CAUSE THAN WORD ALREADY ADDED?
                if (ex.InnerException == null || !ex.InnerException.Message.Contains("asearchdictionary_word_idx"))
                {
#if (DEBUG)
                    log.LogInformation($"###################### Unexpected inner exception on add word: '{KeyWord}'!?");
#endif
                    //BUGFIX: rethrow with 'throw;' - 'throw ex;' resets the stack trace
                    throw;
                }
                //FETCH THE WORD ID, PLACE IN MATCHINGKEYWORDLIST AND MOVE ON TO THE NEXT WORD
                var SearchDictionaryMatchFoundInDB = await ServiceProviderProvider.DBContext.SearchDictionary.AsNoTracking().Where(z => z.Word == KeyWord).FirstOrDefaultAsync();
                if (SearchDictionaryMatchFoundInDB != null)
                {
                    MatchingKeywordIdList.Add(SearchDictionaryMatchFoundInDB.Id);
                    //It exists now
                    ExistingKeywordMatches.Add(SearchDictionaryMatchFoundInDB.Id, SearchDictionaryMatchFoundInDB.Word);
                }
                else
                {
#if (DEBUG)
                    log.LogInformation($"###################### NULL when expected to find word: '{KeyWord}'!?");
#endif
                }
            }
            catch (Exception ex)
            {
#if (DEBUG)
                log.LogInformation(ex, $"###################### Unexpected exception adding word: '{KeyWord}'!?");
#endif
                //BUGFIX: 'throw;' keeps the original stack trace intact
                throw;
            }
        }
    }
    //-------- END CRITICAL SECTION -------------
    //CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
    var NewSearchKeyList = new List<SearchKey>();
    foreach (long E in MatchingKeywordIdList)
    {
        NewSearchKeyList.Add(new SearchKey() { WordId = E, ObjectId = p.ObjectId, ObjectType = p.ObjectType });
    }
    var CtSearchKeyAdd = ServiceProviderProvider.DBContext;
    await CtSearchKeyAdd.SearchKey.AddRangeAsync(NewSearchKeyList);
    await CtSearchKeyAdd.SaveChangesAsync();
}//eoc
#endregion
#region Breaker
// //Class to hold relevant translation data for breaking text
// public class TranslationWordBreakingData
// {
// public bool CJKIndex { get; set; }
// public List<string> StopWords { get; set; }
// public TranslationWordBreakingData()
// {
// CJKIndex = false;
// StopWords = new List<string>();
// }
// }
// private static Dictionary<long, TranslationWordBreakingData> translationWordBreakingDataCache = new Dictionary<long, TranslationWordBreakingData>();
// //called at startup to populate cache
//WAS GOING TO ADD THIS IN RESPONSE TO AN ISSUE WITH EXCEPTION ATTEMPTING TO ADD ALREADY EXISTING DICTIONARY ID 1, BUT IT NEVER HAPPENED AGAIN, SO :SHRUGEMOJI:
//IF IT DOES, MAKE THIS CODE AND POPULATE IT AT SERVER BOOT AND SHOULD BE ADEQUATE
//OR GO NUTS WITH A FULL MEMORY CACHE: https://docs.microsoft.com/en-us/aspnet/core/performance/caching/memory?view=aspnetcore-3.1
// internal static async Task CacheAllTranslationWordBreakingData(){
// //iterate all Translations, cache the word break data
// l = await ct.Translation
// .AsNoTracking()
// .OrderBy(z => z.Name)
// .Select(z => new NameIdItem()
// {
// Id = z.Id,
// Name = z.Name
// }).ToListAsync();
// TranslationWordBreakingDataCache.Add(TranslationId, await GetTranslationSearchDataAsync(TranslationId));
// }
// internal static async Task<TranslationWordBreakingData> GetTranslationSearchDataAsync(long translationId, AyContext ct = null)
// {
// TranslationWordBreakingData LSD = new TranslationWordBreakingData();
// if (ct == null)
// ct = ServiceProviderProvider.DBContext;
// //Get stopwords
// //Validate translation id, if not right then use default instead
// var Param = new List<string>();
// translationId = await TranslationBiz.ReturnSpecifiedTranslationIdIfExistsOrDefaultTranslationId(translationId, ct);
// Param.Add("StopWords1");
// Param.Add("StopWords2");
// Param.Add("StopWords3");
// Param.Add("StopWords4");
// Param.Add("StopWords5");
// Param.Add("StopWords6");
// Param.Add("StopWords7");
// var Stops = await TranslationBiz.GetSubsetStaticAsync(Param, translationId);
// foreach (KeyValuePair<string, string> kvp in Stops)
// {
// //Each stopwords translation key is a space delimited list of words and in the case of an empty local string (i.e. StopWords7) it's value is a single question mark
// if (kvp.Value != "?")
// {
// LSD.StopWords.AddRange(kvp.Value.Split(" "));
// }
// }
// LSD.CJKIndex = await TranslationBiz.GetCJKIndexAsync(translationId, ct);
// return LSD;
// }
//Classification of the previous text element seen by the word breaker state machine
public enum TokenTypes
{ Nothing, Separator, CJK, Latin };
/// <summary>
/// Take a list of strings and return the unique, lower-case keywords they
/// contain, suitable for indexing (wildcards are not preserved and stopwords
/// are removed).
///
/// Use Translation setting CJKIndex=true to handle Chinese, Japanese, Korean etc
/// (languages with no easily identifiable word boundaries as in english)
/// </summary>
/// <returns>List of unique keyword strings (may be empty)</returns>
internal static async Task<List<string>> BreakAsync(long translationId, List<string> textStrings)
{
return await BreakCoreAsync(translationId, false, textStrings);
}
/// <summary>
/// Convenience overload: break a single string into indexing keywords.
/// </summary>
internal static async Task<List<string>> BreakAsync(long translationId, string textString)
{
    var textStrings = new List<string>(1) { textString };
    return await BreakCoreAsync(translationId, false, textStrings);
}
/// <summary>
/// Break a user's search phrase into terms while preserving any '%' wildcards
/// they entered.
/// Stopwords are deliberately kept here: a user typing "some" may be hunting
/// for "awesome", so they cannot be dropped at this stage.
/// </summary>
internal static async Task<List<string>> BreakSearchPhraseAsync(long translationId, string searchPhrase)
{
    var textStrings = new List<string> { searchPhrase };
    return await BreakCoreAsync(translationId, true, textStrings, true);
}
/// <summary>
/// Core word breaker shared by object indexing and search-phrase breaking.
/// Walks every string text-element by text-element and emits an ordered,
/// de-duplicated list of lower-cased words. When the translation's CJKIndex
/// flag is set, CJK characters are emitted as overlapping 2-character n-grams
/// while embedded basic-latin runs are still broken as whole words.
/// </summary>
/// <param name="translationId">Translation supplying the stopword list and CJKIndex flag</param>
/// <param name="KeepWildCards">True when breaking a user search phrase so '%' survives as part of a word</param>
/// <param name="textStrings">The raw strings to break</param>
/// <param name="ignoreStopWords">True keeps stopwords (search phrases); false drops them (indexing)</param>
/// <returns>Unique keywords; may be empty when everything was too short or a stopword</returns>
internal static async Task<List<string>> BreakCoreAsync(long translationId, bool KeepWildCards, List<string> textStrings, bool ignoreStopWords = false)
{
    //For stopwords and CJKIndex flag value
    var translationWordBreakData = await SearchTranslationWordBreakDataCache.GetWordBreakData(translationId);
    int MAXWORDLENGTH = 255;
    int MINWORDLENGTH = 2;//A word isn't a word unless it's got at least two characters in it
    //Ordered list of unique parsed words; the HashSet makes the uniqueness check O(1)
    //instead of the former O(n) List.Contains scan per flushed word.
    //(Also removed two unused StringBuilders, sbResults and sb, from the original.)
    List<string> tempParsedWords = new List<string>();
    HashSet<string> seenWords = new HashSet<string>();
    StringBuilder sbWord = new StringBuilder();
    List<string> ReturnList = new List<string>();
    //Flush the word currently accumulating in sbWord into the parsed list (unique only)
    void FlushWord()
    {
        string word = sbWord.ToString();
        if (seenWords.Add(word))
        {
            tempParsedWords.Add(word);
        }
        sbWord.Length = 0;
    }
    //Loop through each of the passed in strings
    foreach (string s in textStrings)
    {
        if (string.IsNullOrEmpty(s)) continue;
        //get all the characters in a unicode compliant manner
        //(the enumerator starts positioned before the first element, so no Reset() is needed)
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
        TokenTypes LastToken = TokenTypes.Nothing;
        //Used by the CJK tokenizer to detect basic-latin runs inside CJK text
        bool BasicLatinBlock = true;
        //Process each "character" (text element/glyph) in the current string
        while (t.MoveNext())
        {
            //get it as a character
            char c = t.GetTextElement()[0];
            if (!translationWordBreakData.CJKIndex)
            {
                //---- regular tokenizer ----
                //Is it a token we want to include? Or a wildcard character
                if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                {
                    //All latin text is converted to lower case
                    c = char.ToLower(c);
                    //Flush an over-long word before starting a fresh one
                    if (sbWord.Length > 0 && sbWord.Length >= MAXWORDLENGTH)
                    {
                        FlushWord();
                    }
                    //append character and go on to next one
                    sbWord.Append(c);
                    LastToken = TokenTypes.Latin;
                    continue;
                }
                else
                {
                    //Word boundary token - flush any accumulated word
                    LastToken = TokenTypes.Separator;
                    if (sbWord.Length > 0)
                    {
                        FlushWord();
                    }
                    continue;
                }
            }
            else
            {
                //---- CJK tokenizer ----
                //Is it a basic latin character? (ascii basically)
                //see: http://www.unicode.org/charts/index.html
                //we need to know this so that regular english text within
                //cjk text gets properly indexed as whole words
                BasicLatinBlock = (int)c < 256;
                if (BasicLatinBlock)
                {
                    //Is it a token we want to include?
                    if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                    {
                        //All latin text is converted to lower case
                        c = char.ToLower(c);
                        //Flush when transitioning CJK -> latin or when the word is over-long
                        if (sbWord.Length > 0 && (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH))
                        {
                            FlushWord();
                        }
                        sbWord.Append(c);
                        LastToken = TokenTypes.Latin;
                        continue;
                    }
                    else
                    {
                        //Latin word boundary - flush any accumulated word
                        LastToken = TokenTypes.Separator;
                        if (sbWord.Length > 0)
                        {
                            FlushWord();
                        }
                        continue;
                    }
                }
                else//CJK character
                {
                    if (char.IsLetter(c) || (KeepWildCards && c == '%'))
                    {
                        if (sbWord.Length > 0)
                        {
                            //Flush when transitioning latin -> CJK or when over-long
                            if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                            {
                                FlushWord();
                            }
                            else if (LastToken == TokenTypes.CJK)
                            {
                                //A CJK character is already stored: append the current one,
                                //flush the resulting 2-character n-gram, then start the next
                                //n-gram with the current character (overlapping bigrams)
                                sbWord.Append(c);
                                System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                                FlushWord();
                            }
                        }
                        //append character and go on to next one
                        sbWord.Append(c);
                        LastToken = TokenTypes.CJK;
                        continue;
                    }
                    else
                    {
                        //CJK word boundary - flush any accumulated word
                        LastToken = TokenTypes.Separator;
                        if (sbWord.Length > 0)
                        {
                            FlushWord();
                        }
                        continue;
                    }
                }
            }
        }
        //Flush out the last word of this string
        if (sbWord.Length > 0)
        {
            FlushWord();
        }
    }
    //bail early if there is nothing indexed
    if (tempParsedWords.Count == 0) return ReturnList;
    //Build the return list from the parsed words
    foreach (string s in tempParsedWords)
    {
        //Filter out short words if we are breaking for indexing
        //but keep them if they are part of a wildcard search phrase
        if (s.Length >= MINWORDLENGTH || (KeepWildCards && s.Contains('%')))
        {
            if (ignoreStopWords)
            {
                //breaking of search phrase - stopwords stay in
                ReturnList.Add(s);
            }
            else if (!translationWordBreakData.StopWords.Contains(s))
            {
                //regular breaking of object for dictionary entry - stopwords dropped
                ReturnList.Add(s);
            }
        }
    }
    //sometimes all the results are stop words so you end up here with nothing
    return ReturnList;
}
#endregion
#region Utility
#endregion utility
}//eoc
}//eons