raven/server/AyaNova/biz/Search.cs

using System;
using System.Linq;
using System.Globalization;
using System.Text;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using Newtonsoft.Json.Linq;
using Microsoft.Extensions.Logging;
using Microsoft.EntityFrameworkCore;
using AyaNova.Util;
using AyaNova.Models;
using System.Diagnostics;


namespace AyaNova.Biz
{

    //This class handles word breaking, processing keywords and searching for results
    public static class Search
    {

        /*
        ISSUES:
           Search of big data a little slow, attempt to tweak indices


         */


        #region Search and return results

        /*
        Requirements:

        INPUT PARAMETERS
        - Search phrase (with wildcard support)
            - Can be empty if tags are specified, no tags and no phrase is an error condition
        - ObjectType: only return results for objects of this type
        - InName: flag that indicates only search in names
        - Tag ids that are also on result objects
            - Can be empty if a phrase is specified


        ACTION
        Find search matches, then find tag matches then intersect, then sort and return
        Filter OUT results that user is not permitted to read
        //TODO: proper testing of searching
        - SAMPLE DATA: Need a huge amount of sample data indexed to load test it
        - INDEXES: play with it and see what works best

        OUTPUT FORMAT
        - No localized text, up to client
        - Name of object in return result
        - Object Type and ID in return result
        - Group results by object type, then by object ID descending which will result in natural most recently created order

        result:[
        {
            name:"blah",
            type:2,
            id:210
        },
        ]


         */

        /// <summary>
        /// Holds the parameters of a single search request.
        /// A request is valid when it carries a phrase, at least one tag, or both.
        /// </summary>
        public class SearchRequestParameters
        {
            /// <summary>Text to search for; may be empty when tags are specified.</summary>
            public string Phrase { get; set; }

            /// <summary>When true only matches found in object names are considered.</summary>
            public bool NameOnly { get; set; }

            /// <summary>Restrict results to this object type; AyaType.NoType means no restriction.</summary>
            public AyaType TypeOnly { get; set; }

            /// <summary>Ids of tags that result objects must carry; may be empty when a phrase is specified.</summary>
            public List<long> Tags { get; set; }

            /// <summary>Maximum number of results to return. Note: 0 will get all results.</summary>
            public int MaxResults { get; set; }

            public SearchRequestParameters()
            {
                NameOnly = false;
                TypeOnly = AyaType.NoType;
                Tags = new List<long>();
                MaxResults = 500;
            }

            /// <summary>
            /// True when the request contains a non blank phrase or at least one tag.
            /// </summary>
            public bool IsValid
            {
                get
                {
                    //has a phrase?
                    if (!string.IsNullOrWhiteSpace(this.Phrase))
                        return true;

                    //has tags?
                    //Tags has a public setter so it can be replaced with null after construction;
                    //treat that the same as "no tags" rather than throwing
                    return this.Tags != null && this.Tags.Count > 0;
                }
            }
        }


        //Class to hold search result returned to client
        //One instance is built per matching object by DoSearch
        public class SearchResult
        {
            //Name of the matching object (not localized, that's left to the client)
            public string Name { get; set; }
            //Type of the matching object
            public AyaType Type { get; set; }
            //Id of the matching object
            public long Id { get; set; }
        }


        /// <summary>
        /// Search the index for objects matching a phrase and / or a set of tags.
        ///
        /// Phrase terms are looked up in the search dictionary ("*" is accepted as a wildcard at the
        /// start and / or end of a term), the matching objects are then narrowed to those carrying every
        /// requested tag. When only tags are supplied every object carrying all of them is returned.
        /// Unless the search is name only, objects of a type the user may not read in full are removed.
        /// Results are capped at MaxResults (0 = no cap) and ordered by object type, then by id descending.
        ///
        /// The passed in searchParameters object is not modified.
        /// </summary>
        /// <param name="ct">Database context used for all queries</param>
        /// <param name="localeId">Locale used to break the phrase into terms</param>
        /// <param name="currentUserRoles">Roles of the searching user, used for the read rights filter</param>
        /// <param name="searchParameters">Phrase, tags and options for this search</param>
        /// <returns>List of name / type / id results, empty if nothing matched</returns>
        /// <exception cref="System.ArgumentNullException">searchParameters is null</exception>
        /// <exception cref="System.ArgumentException">searchParameters has neither a phrase nor tags</exception>
        public static async Task<List<SearchResult>> DoSearch(AyContext ct, long localeId, AuthorizationRoles currentUserRoles, SearchRequestParameters searchParameters)
        {
            if (searchParameters == null)
            {
                throw new System.ArgumentNullException(nameof(searchParameters));
            }

            if (!searchParameters.IsValid)
            {
                throw new System.ArgumentException("Search::DoSearch - Search request parameters must contain a phrase or tags");
            }

            List<SearchResult> resultList = new List<SearchResult>();

            //list to hold temporary search/tag hits
            List<AyaTypeId> matchingObjects = new List<AyaTypeId>();

            //A tags only request legitimately has a null / blank phrase so all phrase handling is conditional
            bool hasPhrase = !string.IsNullOrWhiteSpace(searchParameters.Phrase);

            //Work with a de-duplicated copy of the tags: a repeated tag id must not inflate the
            //"has all tags" count and must not be added twice to the per tag count dictionary below
            List<long> searchTags = searchParameters.Tags == null
                ? new List<long>()
                : searchParameters.Tags.Distinct().ToList();

            //IF PHRASE SPECIFIED
            if (hasPhrase)
            {
                //Replace wildcard * with % as breakcore expects sql style wildcards
                //(local copy, the caller's object is left untouched)
                string phrase = searchParameters.Phrase.Replace("*", "%");

                //BREAK SEARCH PHRASE INTO SEPARATE TERMS
                var phraseItems = BreakSearchPhrase(localeId, phrase);

                //SPLIT OUT WILDCARDS FROM NON WILDCARDS
                List<string> wildCardSearchTerms = new List<string>();
                List<string> regularSearchTerms = new List<string>();
                foreach (string phraseItem in phraseItems)
                {
                    if (phraseItem.Contains("%"))
                        wildCardSearchTerms.Add(phraseItem);
                    else
                        regularSearchTerms.Add(phraseItem);
                }

                //List holder for matching dictionary ID's
                List<long> dictionaryMatches = new List<long>();

                //GET LIST OF DICTIONARY ID'S THAT MATCH REGULAR SEARCH TERMS
                if (regularSearchTerms.Count > 0)
                    dictionaryMatches = await ct.SearchDictionary.Where(m => regularSearchTerms.Contains(m.Word)).Select(m => m.Id).ToListAsync();

                //GET LIST OF DICTIONARY ID'S THAT MATCH WILDCARD SEARCH TERMS
                foreach (string wildCardSearchTerm in wildCardSearchTerms)
                {
                    bool leadingWildCard = wildCardSearchTerm.StartsWith("%", StringComparison.Ordinal);
                    bool trailingWildCard = wildCardSearchTerm.EndsWith("%", StringComparison.Ordinal);
                    string fragment = wildCardSearchTerm.Replace("%", "");

                    List<long> wildCardMatches;
                    if (leadingWildCard && trailingWildCard)//CONTAINS?
                        wildCardMatches = await ct.SearchDictionary.Where(m => m.Word.Contains(fragment)).Select(m => m.Id).ToListAsync();
                    else if (trailingWildCard)//STARTS WITH?
                        wildCardMatches = await ct.SearchDictionary.Where(m => m.Word.StartsWith(fragment)).Select(m => m.Id).ToListAsync();
                    else if (leadingWildCard)//ENDS WITH?
                        wildCardMatches = await ct.SearchDictionary.Where(m => m.Word.EndsWith(fragment)).Select(m => m.Id).ToListAsync();
                    else
                        continue;//wildcard only inside the term: no lookup is made, the term still counts below so nothing will match

                    dictionaryMatches.AddRange(wildCardMatches);
                }

                //SEARCH SEARCHKEY FOR MATCHING WORDS AND OPTIONALLY TYPE AND INNAME
                var totalSearchTermsToMatch = wildCardSearchTerms.Count + regularSearchTerms.Count;

                //No dictionary hits means no search key can match so the query can be skipped entirely
                if (dictionaryMatches.Count > 0)
                {
                    //Build search query based on searchParameters
                    var q = ct.SearchKey.Distinct().Where(x => dictionaryMatches.Contains(x.WordId));

                    //In name?
                    if (searchParameters.NameOnly)
                        q = q.Where(m => m.InName == true);

                    //Of type?
                    if (searchParameters.TypeOnly != AyaType.NoType)
                        q = q.Where(m => m.ObjectType == searchParameters.TypeOnly);

                    //Find the records that have the search terms in searchkey
                    //(materialized asynchronously rather than blocking on enumeration of the query)
                    var searchMatches = await q
                        .GroupBy(x => new { x.ObjectType, x.ObjectId })
                        .Select(x => new { ObjectId = x.Key.ObjectId, ObjectType = x.Key.ObjectType, ObjectCount = x.LongCount() })
                        .ToListAsync();

                    //PUT THE RESULTS INTO MATCHING OBJECTS LIST
                    //keep any object that matches *all* the search terms
                    //NOTE(review): this compares the number of matching key rows to the number of terms, so an object
                    //containing several different words that satisfy one wildcard term will not have an equal count - confirm intent
                    foreach (var searchMatch in searchMatches)
                    {
                        if (searchMatch.ObjectCount == totalSearchTermsToMatch)
                            matchingObjects.Add(new AyaTypeId(searchMatch.ObjectType, searchMatch.ObjectId));
                    }
                }
            }

            //IF TAGS SPECIFIED
            //BUGBUG: If no valid tags provided, i.e. a single tag of type or id 0 then can skip
            if (searchTags.Count > 0)
            {
                //get a count of the search tags (used by both paths below)
                var searchTagCount = searchTags.Count;

                if (!hasPhrase)
                {
                    #region TAGS ONLY SEARCH (NO PHRASE) ALL FULL MATCHES ARE INCLUSIVE
                    //NOTE(review): NameOnly and TypeOnly are not applied on this path - confirm that is intended

                    //1) get counts for each tag specified from tagmap, if any are zero then none match and can bail early
                    Dictionary<long, long> tagCounts = new Dictionary<long, long>();
                    foreach (long searchTagId in searchTags)
                    {
                        var matchTagCount = await ct.TagMap.Where(m => m.TagId == searchTagId).LongCountAsync();
                        if (matchTagCount == 0)
                        {
                            //return empty resultlist
                            return resultList;
                        }
                        tagCounts.Add(searchTagId, matchTagCount);
                    }

                    //2) find smallest count match so we are working with the shortest list first
                    var shortestMatchingTag = tagCounts.OrderBy(x => x.Value).First().Key;

                    //3) Generate the shortlist of items that match the shortest tag list
                    var shortList = await ct.TagMap.Where(x => x.TagId == shortestMatchingTag).ToListAsync();

                    //4) Iterate the shortlist and keep each item that also carries every other requested tag
                    foreach (TagMap t in shortList)
                    {
                        //starts at one for the already matched shortest tag
                        var matchCount = 1;
                        foreach (long tagId in searchTags)
                        {
                            if (tagId == shortestMatchingTag)
                                continue;

                            //Ok, does this object have this tag?
                            bool hasTag = await ct.TagMap.Where(x => x.TagToObjectId == t.TagToObjectId && x.TagToObjectType == t.TagToObjectType && x.TagId == tagId).AnyAsync();
                            if (hasTag)
                                matchCount++;
                        }

                        //does it match all tags?
                        if (matchCount == searchTagCount)
                            matchingObjects.Add(new AyaTypeId(t.TagToObjectType, t.TagToObjectId));
                    }
                    #endregion
                }
                else
                {
                    #region TAGS PLUS PHRASE SEARCH WITH NON MATCHING TAGS EXCLUSIVE
                    //list to hold temporary matches
                    List<AyaTypeId> tagMatchingObjects = new List<AyaTypeId>();

                    //Keep only the phrase matches that carry every requested tag
                    foreach (AyaTypeId i in matchingObjects)
                    {
                        var matchCount = await ct.TagMap.Where(x => x.TagToObjectId == i.ObjectId && x.TagToObjectType == i.ObjectType && searchTags.Contains(x.TagId)).LongCountAsync();
                        if (matchCount == searchTagCount)
                            tagMatchingObjects.Add(i);
                    }

                    //the objects that had all the tags are now the defacto return list
                    matchingObjects = tagMatchingObjects;
                    #endregion
                }
            }

            //REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
            //If it's a name only search then all is allowed
            //If it's not a name only search then rights need to be checked for full read because even if it's just a tags search that's part of the full record of the object
            //Note: in the interests of simplicity, even if the result was only found in the name, the user still needs full rights to read the object
            //if the type of search was not InNameOnly type.  This greatly simplifies processing.
            if (!searchParameters.NameOnly)
            {
                matchingObjects = matchingObjects
                    .Where(t => AyaNova.Api.ControllerHelpers.Authorized.IsAuthorizedToReadFullRecord(currentUserRoles, t.ObjectType))
                    .ToList();
            }

            //MAXIMUM RESULTS FILTER
            //Filtered BEFORE sorting as the results are not ranked
            if (searchParameters.MaxResults > 0)//0 = all results
                matchingObjects = matchingObjects.Take(searchParameters.MaxResults).ToList();

            //Sort and group the matching objects list in return order:
            //by object type, then id descending
            var orderedMatchingObjects = matchingObjects.OrderBy(x => x.ObjectType).ThenByDescending(x => x.ObjectId);

            //Build the return list from the remaining matching objects list
            foreach (AyaTypeId i in orderedMatchingObjects)
            {
                resultList.Add(new SearchResult
                {
                    Name = BizObjectNameFetcher.Name(i, ct),
                    Id = i.ObjectId,
                    Type = i.ObjectType
                });
            }

            return resultList;
        }


        #endregion dosearch

        #region ProcessKeywords into Database

        //Index the keywords of a newly created object
        //(flagged as a new record so ProcessKeywords does not first delete existing search keys)
        public static void ProcessNewObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
            => ProcessKeywords(ct, localeId, objectID, objectType, newRecord: true, name: name, text: text);

        //Re-index the keywords of an existing object
        //(flagged as not new so ProcessKeywords deletes the object's existing search keys before adding the current ones)
        public static void ProcessUpdatedObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
            => ProcessKeywords(ct, localeId, objectID, objectType, newRecord: false, name: name, text: text);

        //Remove every search key row recorded for the given object type and id
        public static void ProcessDeletedObjectKeywords(AyContext ct, long objectID, AyaType objectType)
        {
            var database = ct.Database;

            //The interpolated string is deliberately passed straight to ExecuteSqlCommand.
            //Be careful in future: putting ToString on the interpolated values makes the npgsql driver
            //treat them as strings and quote them, triggering an error that a string can't be compared to an int
            database.ExecuteSqlCommand(
                $"delete from asearchkey where objectid={objectID} and objecttype={(int)objectType}");
        }


        /// <summary>
        /// Process the keywords of an object into the search dictionary and search key tables.
        /// NOTE: NAME parameter is in ADDITION to the NAME also being one of the strings passed in text parameter;
        /// only words found in text are indexed, name is used to flag which of those words are in the name.
        /// </summary>
        /// <param name="ct">Database context; SaveChanges is called on it</param>
        /// <param name="localeId">Locale used for breaking the text into keywords</param>
        /// <param name="objectID">Id of the object being indexed</param>
        /// <param name="objectType">Type of the object being indexed</param>
        /// <param name="newRecord">False to first delete the existing search keys of the object</param>
        /// <param name="name">Name of the object</param>
        /// <param name="text">All the text of the object to index</param>
        private static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string name, params string[] text)
        {
            //IF NOT NEW, DELETE ALL EXISTING ENTRIES FOR OBJECT TYPE AND ID
            if (!newRecord)
            {
                ProcessDeletedObjectKeywords(ct, objectID, objectType);
            }

            //BREAK STRING ARRAY INTO KEYWORD LIST
            List<string> keyWordList = Break(localeId, text);

            //BREAK NAME STRING (a set, it's only used for membership checks)
            HashSet<string> nameKeyWords = new HashSet<string>(Break(localeId, name));

            //EARLY EXIT IF NO KEYWORDS OR NAME RECORD TO PROCESS
            if (keyWordList.Count == 0 && string.IsNullOrWhiteSpace(name))
            {
                return;
            }

            //LIST OF DICTIONARY ID'S (AND IN NAME FLAG) FOR ALL THE WORDS OF THIS OBJECT
            List<MatchingDictionaryEntry> matchingKeywordIdList = new List<MatchingDictionaryEntry>();

            //COLLECT ID'S OF ANY KEYWORDS ALREADY IN THE SEARCHDICTIONARY TABLE
            var existingKeywordMatches = ct.SearchDictionary.Where(m => keyWordList.Contains(m.Word)).ToDictionary(m => m.Id, m => m.Word);
            foreach (KeyValuePair<long, string> k in existingKeywordMatches)
            {
                matchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = k.Key, InName = nameKeyWords.Contains(k.Value) });
            }

            //ADD THE KEYWORDS THAT ARE *NOT* YET IN THE SEARCHDICTIONARY
            //knownWords starts as the words already in the dictionary; Add returns false for a word that is
            //already known so each missing word is queued exactly once
            HashSet<string> knownWords = new HashSet<string>(existingKeywordMatches.Values);
            var newSearchDictionaryWordsList = new List<SearchDictionary>();
            foreach (string keyWord in keyWordList)
            {
                if (knownWords.Add(keyWord))
                    newSearchDictionaryWordsList.Add(new SearchDictionary() { Word = keyWord });
            }

            if (newSearchDictionaryWordsList.Count > 0)
            {
                //adding in a range sped this up noticeably
                ct.SearchDictionary.AddRange(newSearchDictionaryWordsList);
                //Save the context in order to get the id's of the new words added
                ct.SaveChanges();

                //Take the id's from the entities just saved rather than from ct.SearchDictionary.Local:
                //Local holds every dictionary entity the context is tracking, which on a re-used context
                //includes words belonging to other objects
                foreach (SearchDictionary sd in newSearchDictionaryWordsList)
                {
                    matchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = sd.Id, InName = nameKeyWords.Contains(sd.Word) });
                }
            }

            //CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
            var newSearchKeyList = new List<SearchKey>(matchingKeywordIdList.Count);
            foreach (MatchingDictionaryEntry e in matchingKeywordIdList)
            {
                newSearchKeyList.Add(new SearchKey() { WordId = e.DictionaryId, InName = e.InName, ObjectId = objectID, ObjectType = objectType });
            }
            ct.SearchKey.AddRange(newSearchKeyList);
            ct.SaveChanges();

        }//eoc

        //Temporary holder pairing a search dictionary word id with a flag
        //indicating whether that word appears in the object's name
        public class MatchingDictionaryEntry
        {
            //True when the word is part of the object's name
            public bool InName { get; set; } = false;

            //Id of the word in the search dictionary, -1 until assigned
            public long DictionaryId { get; set; } = -1;
        }


        #endregion

        #region Breaker

        //Locale specific settings used when breaking text into words
        public class LocaleWordBreakingData
        {
            //Value of the locale's CJKIndex setting, false until set
            public bool CJKIndex { get; set; } = false;

            //Stop words for the locale, starts out as an empty list
            public List<string> StopWords { get; set; } = new List<string>();
        }

        //Fetch the stop words and the CJKIndex flag used when breaking text for the given locale
        //ct is optional, when not supplied the context from ServiceProviderProvider is used
        private static LocaleWordBreakingData GetLocaleSearchData(long localeId, AyContext ct = null)
        {
            ct = ct ?? ServiceProviderProvider.DBContext;

            //Ask for the stop word keys using a validated locale id
            //(if the id passed in is not right the default is used instead)
            var subsetRequest = new Api.Controllers.LocaleController.LocaleSubsetParam();
            subsetRequest.LocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, ct);
            string[] stopWordKeys =
            {
                "StopWords1",
                "StopWords2",
                "StopWords3",
                "StopWords4",
                "StopWords5",
                "StopWords6",
                "StopWords7"
            };
            foreach (string stopWordKey in stopWordKeys)
            {
                subsetRequest.Keys.Add(stopWordKey);
            }

            var wordBreakingData = new LocaleWordBreakingData();
            foreach (KeyValuePair<string, string> stopWordEntry in LocaleBiz.GetSubsetStatic(subsetRequest).Result)
            {
                //Each entry is a space delimited list of words; an unused key (i.e. StopWords7)
                //has a single question mark as its value and is skipped
                if (stopWordEntry.Value == "?")
                    continue;

                wordBreakingData.StopWords.AddRange(stopWordEntry.Value.Split(" "));
            }

            wordBreakingData.CJKIndex = LocaleBiz.GetCJKIndex(localeId, ct).Result;
            return wordBreakingData;
        }

        //Kind of the most recently processed text element while breaking text into words
        public enum TokenTypes
        {
            //Nothing processed yet
            Nothing,
            //A word boundary character
            Separator,
            //A CJK character
            CJK,
            //A latin letter or digit (or a kept wildcard)
            Latin
        };

        /// <summary>
        /// Break zero or more strings of text into a list of keywords
        /// for indexing. Wildcard characters are not preserved
        /// (see BreakSearchPhrase for the variant that keeps them).
        ///
        /// The Locale setting CJKIndex=true is used to handle Chinese, Japanese, Korean etc
        /// (languages with no easily identifiable word boundaries as in english)
        /// </summary>
        /// <param name="localeId">Locale whose word breaking settings are used</param>
        /// <param name="text">An array of 0 to * strings of text</param>
        /// <returns>List of keyword strings</returns>
        internal static List<string> Break(long localeId, params string[] text)
            => BreakCore(localeId, KeepWildCards: false, text: text);

        /// <summary>
        /// Break a user's search phrase into separate terms,
        /// preserving any wildcard characters entered
        /// </summary>
        /// <param name="localeId">Locale whose word breaking settings are used</param>
        /// <param name="text">The search phrase text</param>
        /// <returns>List of search terms</returns>
        internal static List<string> BreakSearchPhrase(long localeId, params string[] text)
            => BreakCore(localeId, KeepWildCards: true, text: text);

        /// <summary>
        /// Stop words list reset upon login or editing of localized text
        /// used for eliminating noise words from search dictionary
        /// </summary>
       // public static System.Collections.Generic.List<string> StopList = null;

        internal static List<string> BreakCore(long localeId, bool KeepWildCards, params string[] text)
        {
            //Get stopwords and CJKIndex flag value
            LocaleWordBreakingData LocaleSearchData = GetLocaleSearchData(localeId);
            int MAXWORDLENGTH = 255;
            int MINWORDLENGTH = 2;//A word isn't a word unless it's got at least two characters in it
            StringBuilder sbResults = new StringBuilder();
            //List to temporarily hold parsed words
            //used to easily ensure unique words only
            List<string> tempParsedWords = new List<string>();

            StringBuilder sb = new StringBuilder();
            StringBuilder sbWord = new StringBuilder();
            List<string> ReturnList = new List<string>();


            //Loop through each of the passed in strings
            foreach (string s in text)
            {
                if (s == null || s == "") continue;
                //get all the characters in a unicode compliant manner...
                TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
                //start at the top
                t.Reset();

                TokenTypes LastToken = TokenTypes.Nothing;

                //Used by CJK
                bool BasicLatinBlock = true;

                //Process each "character" (text element,glyph whatever) in the
                //current string
                while (t.MoveNext())
                {
                    //get it as a character
                    char c = t.GetTextElement()[0];

                    if (!LocaleSearchData.CJKIndex)
                    {
                        #region regular tokenizer

                        //Is it a token we want to include?
                        //Or a wildcard character
                        if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                        {
                            #region Include token
                            //All latin text is converted to lower case
                            c = char.ToLower(c);

                            //Do we already have a word?
                            if (sbWord.Length > 0)
                            {
                                //Maybe we need to flush this word into the word list
                                //if we're over the word length limit
                                if (sbWord.Length >= MAXWORDLENGTH)
                                {
                                    //flush away...
                                    if (!tempParsedWords.Contains(sbWord.ToString()))
                                    {
                                        tempParsedWords.Add(sbWord.ToString());
                                    }
                                    sbWord.Length = 0;
                                    sbWord.Append(c);
                                    LastToken = TokenTypes.Latin;
                                    continue;

                                }
                            }

                            //append character and go on to next one
                            sbWord.Append(c);
                            LastToken = TokenTypes.Latin;
                            continue;
                            #endregion
                        }
                        else
                        {
                            #region Word Boundary token
                            LastToken = TokenTypes.Separator;
                            if (sbWord.Length > 0)
                            {
                                //flush away...
                                if (!tempParsedWords.Contains(sbWord.ToString()))
                                {
                                    tempParsedWords.Add(sbWord.ToString());
                                }
                                sbWord.Length = 0;
                                continue;
                            }

                            #endregion
                        }
                        #endregion
                    }
                    else
                    {
                        #region CJK Tokenizer

                        //Is it a basic latin character? (ascii basically)
                        //see: http://www.unicode.org/charts/index.html
                        //and here for a funky online viewer:
                        //http://www.fileformat.info/info/unicode/block/index.htm
                        //we need to know this so that regular english text
                        //within cjk text gets properly indexed as whole words
                        BasicLatinBlock = false;
                        if ((int)c < 256) BasicLatinBlock = true;

                        if (BasicLatinBlock)
                        {
                            //Is it a token we want to include?
                            if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                            {
                                #region Latin Include token
                                //All latin text is converted to lower case
                                c = char.ToLower(c);

                                //Do we already have a word?
                                if (sbWord.Length > 0)
                                {
                                    //Maybe we need to flush this word into the word list
                                    //if we're over the word length limit or we are going from
                                    //CJK to latin
                                    if (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH)
                                    {
                                        //flush away...
                                        if (!tempParsedWords.Contains(sbWord.ToString()))
                                        {
                                            tempParsedWords.Add(sbWord.ToString());
                                        }
                                        sbWord.Length = 0;
                                        sbWord.Append(c);
                                        LastToken = TokenTypes.Latin;
                                        continue;

                                    }
                                }

                                //append character and go on to next one
                                sbWord.Append(c);
                                LastToken = TokenTypes.Latin;
                                continue;
                                #endregion
                            }
                            else
                            {
                                #region Latin Word Boundary token
                                LastToken = TokenTypes.Separator;
                                if (sbWord.Length > 0)
                                {
                                    //flush away...
                                    if (!tempParsedWords.Contains(sbWord.ToString()))
                                    {
                                        tempParsedWords.Add(sbWord.ToString());
                                    }
                                    sbWord.Length = 0;

                                    continue;

                                }

                                #endregion
                            }

                        }
                        else//CJK character
                        {
                            if (char.IsLetter(c) || (KeepWildCards && c == '%'))
                            {
                                #region CJK Include token
                                //Do we already have a word?
                                if (sbWord.Length > 0)
                                {
                                    //Maybe we need to flush this word into the word list
                                    //if we're over the word length limit or we are going from
                                    //latin TO CJK
                                    if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                                    {
                                        //flush away...
                                        if (!tempParsedWords.Contains(sbWord.ToString()))
                                        {
                                            tempParsedWords.Add(sbWord.ToString());
                                        }
                                        sbWord.Length = 0;
                                        sbWord.Append(c);
                                        LastToken = TokenTypes.CJK;
                                        continue;

                                    }

                                    if (LastToken == TokenTypes.CJK)
                                    {
                                        //We're here because the buffer already holds at least one character
                                        //and the last token was CJK, so append the current character
                                        //and flush the resulting 2-character n-gram
                                        sbWord.Append(c);
                                        System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                                        //flush away...
                                        if (!tempParsedWords.Contains(sbWord.ToString()))
                                        {
                                            tempParsedWords.Add(sbWord.ToString());
                                        }
                                        sbWord.Length = 0;
                                        sbWord.Append(c);
                                        LastToken = TokenTypes.CJK;
                                        continue;

                                    }
                                }

                                //append character and go on to next one
                                sbWord.Append(c);
                                LastToken = TokenTypes.CJK;
                                continue;
                                #endregion


                            }
                            else
                            {
                                #region CJK Word Boundary token
                                LastToken = TokenTypes.Separator;
                                if (sbWord.Length > 0)
                                {
                                    //flush away...
                                    if (!tempParsedWords.Contains(sbWord.ToString()))
                                    {
                                        tempParsedWords.Add(sbWord.ToString());
                                    }
                                    sbWord.Length = 0;
                                    continue;
                                }

                                #endregion
                            }

                        }

                        #endregion
                    }
                }

                //Flush out the last word
                if (sbWord.Length > 0)
                {
                    //flush away...
                    if (!tempParsedWords.Contains(sbWord.ToString()))
                    {
                        tempParsedWords.Add(sbWord.ToString());
                    }
                    sbWord.Length = 0;
                }
            }


            //bail early if there is nothing indexed
            if (tempParsedWords.Count == 0) return ReturnList;


            //Build the return list
            //from the parsed word list
            foreach (string s in tempParsedWords)
            {
                //Filter out short words if we are breaking for indexing
                //but keep them if they are part of a wildcard search phrase
                if (s.Length > MINWORDLENGTH || (KeepWildCards && s.Contains('%')))
                {
                    //Add only non stopwords
                    if (!LocaleSearchData.StopWords.Contains(s))
                    {
                        ReturnList.Add(s);
                    }
                }
            }

            //sometimes all the results are stop words so you end up here with nothing
            return ReturnList;

        }

        #endregion


    }//eoc

}//eons