sockeye/server/biz/Search.cs

using System.Linq;
using System.Globalization;
using System.Text;
using System.Collections.Generic;
using System.Threading.Tasks;
using Microsoft.EntityFrameworkCore;
using Sockeye.Util;
using Sockeye.Models;

namespace Sockeye.Biz
{

    //This class handles word breaking, processing keywords and searching for results
    public static class Search
    {

        #region Search and return results

        /// <summary>
        /// Parameters describing a single search request
        /// </summary>
        public class SearchRequestParameters
        {
            /// <summary>
            /// Phrase to search for; the request is only valid when this is not blank
            /// </summary>
            public string Phrase { get; set; }

            /// <summary>
            /// Restricts the search to a single object type; SockType.NoType (the initial value) means no restriction
            /// </summary>
            public SockType TypeOnly { get; set; }

            /// <summary>
            /// Maximum number of results to return, initially 500.
            /// Note: maxresults of 0 will get all results
            /// </summary>
            public int MaxResults { get; set; }

            public SearchRequestParameters()
            {
                MaxResults = 500;
                TypeOnly = SockType.NoType;
            }

            /// <summary>
            /// True when a non blank phrase has been supplied
            /// </summary>
            public bool IsValid => !string.IsNullOrWhiteSpace(Phrase);
        }


        //Classes to hold search results returned to client
        /// <summary>
        /// A single search hit: the display name, type and id of a matching object
        /// </summary>
        public class SearchResult
        {
            //Display name of the matching object (DoSearchAsync fills this from BizObjectNameFetcherDirect.Name)
            public string Name { get; set; }
            //Type of the matching object
            public SockType Type { get; set; }
            //Id of the matching object
            public long Id { get; set; }
        }

        /// <summary>
        /// Search response: the total number of hits found plus the list of results returned.
        /// A new instance starts with a count of zero and an empty (never null) result list.
        /// </summary>
        public class SearchReturnObject
        {
            public long TotalResultsFound { get; set; } = 0;
            public List<SearchResult> SearchResults { get; set; } = new List<SearchResult>();
        }


        /// <summary>
        /// Search the keyword index for objects that match every term of the search phrase
        /// and return the ones the current user is permitted to read.
        /// </summary>
        /// <param name="ct">Database context used for the index query, the permission look-ups and the name fetching</param>
        /// <param name="translationId">Translation used to break the phrase into terms; also passed to the name fetcher</param>
        /// <param name="currentUserRoles">Roles checked to decide which object types are readable</param>
        /// <param name="currentUserId">Memo and Reminder hits are only returned when they belong to this user</param>
        /// <param name="searchParameters">Phrase, optional type restriction and maximum result count; this object is not modified</param>
        /// <returns>
        /// Total number of readable matches and the (possibly truncated) result list.
        /// An empty result is returned when the request is missing, invalid or contains no searchable terms.
        /// </returns>
        public static async Task<SearchReturnObject> DoSearchAsync(AyContext ct, long translationId, AuthorizationRoles currentUserRoles, long currentUserId, SearchRequestParameters searchParameters)
        {
            var ReturnObject = new SearchReturnObject();

            //list to hold temporary search/tag hits
            List<SockTypeId> MatchingObjects = new List<SockTypeId>();

            //a missing or invalid request is expected, don't throw, just return nothing
            if (searchParameters == null || !searchParameters.IsValid)
                return ReturnObject;

            //Work on a local copy of the phrase so the caller's request object is left untouched

            //escape literal percentage signs first just in case they are searching for 50% off or something
            //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
            //need to get around breaking possibly losing the symbol so make it text
            string phrase = searchParameters.Phrase.Replace("%", "pctsym");

            //replace wildcard * with % as breakcore expects sql style wildcards
            phrase = phrase.Replace("*", "%");

            //BREAK SEARCH PHRASE INTO SEPARATE TERMS
            var PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);

            //SPLIT OUT WILDCARDS FROM NON WILDCARDS
            List<string> PreWildCardedSearchTerms = new List<string>();
            List<string> SearchTerms = new List<string>();

            foreach (string PhraseItem in PhraseItems)
            {
                //put back literal percentage symbol if necessary
                string restoredTerm = PhraseItem.Replace("pctsym", @"\%");
                if (PhraseItem.Contains("%"))
                    PreWildCardedSearchTerms.Add(restoredTerm);
                else
                    SearchTerms.Add(restoredTerm);
            }

            //Nothing searchable survived word breaking (e.g. the phrase held only separators).
            //Without this guard the generated statement would end in a dangling WHERE and fail at the server.
            if (SearchTerms.Count == 0 && PreWildCardedSearchTerms.Count == 0)
                return ReturnObject;

            //build and execute the query and iterate the results
            using (var command = ct.Database.GetDbConnection().CreateCommand())
            {
                StringBuilder q = new StringBuilder();
                int termCount = 0;

                //Terms originate from user input so they are bound as parameters, never spliced into the sql text
                //returns the parameter name to reference in the statement and advances termCount
                string BindTerm(string value)
                {
                    string parameterName = $"@term{++termCount}";
                    var parameter = command.CreateParameter();
                    parameter.ParameterName = parameterName;
                    parameter.Value = value;
                    command.Parameters.Add(parameter);
                    return parameterName;
                }

                q.Append("WITH qr AS (SELECT asearchkey.sockType, asearchkey.objectid, ");

                //EXACT MATCH SEARCH TERMS
                foreach (string Term in SearchTerms)
                {
                    string parameterName = BindTerm(Term);
                    q.Append($"COUNT(*) FILTER (WHERE asearchdictionary.word = {parameterName}) AS st{termCount}, ");
                }

                //WILDCARD SEARCH TERMS
                foreach (string WildCardSearchTerm in PreWildCardedSearchTerms)
                {
                    string parameterName = BindTerm(WildCardSearchTerm);
                    q.Append($"COUNT(*) FILTER (WHERE asearchdictionary.word LIKE {parameterName}) AS st{termCount}, ");
                }

                q.Length = q.Length - 2;//trim the final comma and space

                var qTypeOnly = string.Empty;
                if (searchParameters.TypeOnly != SockType.NoType)
                {
                    //INNER JOIN ASEARCHKEY ON ASEARCHDICTIONARY.ID = ASEARCHKEY.WORDID and asearchkey.sockType=20
                    qTypeOnly = $"AND ASEARCHKEY.SOCKTYPE={(int)searchParameters.TypeOnly}";
                }

                q.Append($" FROM asearchdictionary INNER JOIN asearchkey ON asearchdictionary.id = asearchkey.wordid {qTypeOnly} GROUP BY asearchkey.objectid, asearchkey.sockType) SELECT sockType, objectid FROM qr WHERE ");

                //every term must have at least one hit for an object to qualify
                for (; termCount > 0; termCount--)
                    q.Append($"st{termCount} > 0 {(termCount > 1 ? "AND " : "")}");

                await ct.Database.OpenConnectionAsync();
                command.CommandText = q.ToString();
                using (var dr = await command.ExecuteReaderAsync())
                {
                    while (await dr.ReadAsync())
                    {
                        MatchingObjects.Add(new SockTypeId((SockType)dr.GetInt32(0), dr.GetInt64(1)));
                    }
                }
            }


            //REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
            //list to hold temporary matches
            List<SockTypeId> CanReadMatchingObjects = new List<SockTypeId>();

            //Attachment hits are reported as their parent object, so the same parent can turn up more than once
            //(several attachments, or the parent matching in its own right); only list each object once
            var alreadyListed = new HashSet<(SockType, long)>();
            void AddReadable(SockTypeId item)
            {
                if (alreadyListed.Add((item.SockType, item.ObjectId)))
                    CanReadMatchingObjects.Add(item);
            }

            foreach (SockTypeId t in MatchingObjects)
            {
                if (t.SockType == SockType.FileAttachment)
                {
                    //have to look up the actual underlying object type and id here
                    //check if it's readable for user
                    //then add the PARENT object type and id to the CanReadMatchingObjects list
                    //this means user will not see it return as an attachment, just as the object
                    FileAttachment f = await ct.FileAttachment.AsNoTracking().FirstOrDefaultAsync(z => z.Id == t.ObjectId);

                    //the index can reference an attachment record that no longer exists, skip it rather than crash
                    if (f != null && Sockeye.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, f.AttachToAType))
                    {
                        AddReadable(new SockTypeId(f.AttachToAType, f.AttachToObjectId));
                    }
                }
                else if (t.SockType == SockType.Memo)
                {
                    //Users are only permitted to search their own memos
                    if (await ct.Memo.AsNoTracking().AnyAsync(z => z.Id == t.ObjectId && z.ToId == currentUserId))
                        AddReadable(t);
                }
                else if (t.SockType == SockType.Reminder)
                {
                    //Users are only permitted to search their own reminders
                    if (await ct.Reminder.AsNoTracking().AnyAsync(z => z.Id == t.ObjectId && z.UserId == currentUserId))
                        AddReadable(t);
                }
                else
                {
                    if (Sockeye.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, t.SockType))
                    {
                        AddReadable(t);
                    }
                }
            }

            //Ok, we're here with the list of allowable objects which is now the master matching objects list so...
            MatchingObjects = CanReadMatchingObjects;

            //TOTAL RESULTS
            //we have the total results here so set accordingly
            ReturnObject.TotalResultsFound = MatchingObjects.Count;

            //MAXIMUM RESULTS FILTER
            //The theory is that it should be filtered BEFORE sorting so that you get the most random collection of results
            //As the results are not ranked so...
            if (searchParameters.MaxResults > 0)//0 = all results
                MatchingObjects = MatchingObjects.Take(searchParameters.MaxResults).ToList();

            //Sort and group the matching objects list in return order
            var OrderedMatchingObjects = MatchingObjects.OrderBy(z => z.SockType).ThenByDescending(z => z.ObjectId);

            //Get names using best performing technique
            using (var command = ct.Database.GetDbConnection().CreateCommand())
            {
                await ct.Database.OpenConnectionAsync();

                //Build the return list from the remaining matching objects list
                foreach (SockTypeId i in OrderedMatchingObjects)
                {
                    ReturnObject.SearchResults.Add(new SearchResult
                    {
                        Name = BizObjectNameFetcherDirect.Name(i.SockType, i.ObjectId, translationId, command),
                        Id = i.ObjectId,
                        Type = i.SockType
                    });
                }
            }

            return ReturnObject;
        }


        #endregion dosearch

        #region Get info (excerpt)
        /// <summary>
        /// Get the excerpt of an object's searchable text that best reflects the search phrase.
        /// </summary>
        /// <param name="translationId">Translation used to break the phrase into terms and to create the business object</param>
        /// <param name="currentUserRoles">Passed to the business object factory</param>
        /// <param name="userId">Passed to the business object factory</param>
        /// <param name="phrase">Search phrase as typed by the user (* is the wildcard character)</param>
        /// <param name="max">Maximum number of characters to extract, handed to ExtractAndRank.Process</param>
        /// <param name="sockType">Type of the object to excerpt</param>
        /// <param name="id">Id of the object to excerpt</param>
        /// <param name="ct">Database context passed to the business object factory</param>
        /// <returns>The extract produced by ExtractAndRank; an empty string when the phrase is null or blank</returns>
        public static async Task<string> GetInfoAsync(long translationId, AuthorizationRoles currentUserRoles, long userId, string phrase, int max, SockType sockType, long id, AyContext ct)
        {
            //no phrase means there is nothing to rank against so there can be no extract
            if (string.IsNullOrWhiteSpace(phrase))
                return string.Empty;

            //escape literal percentage signs first just in case they are searching for 50% off or something
            //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
            //need to get around breaking possibly losing the symbol so make it text
            phrase = phrase.Replace("%", "pctsym");

            //Modify Phrase to replace wildcard * with % as breakcore expects sql style wildcards
            phrase = phrase.Replace("*", "%");

            //BREAK SEARCH PHRASE INTO SEPARATE TERMS
            var PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);

            //get the business object for this type so it can supply the text
            ISearchAbleObject o = (ISearchAbleObject)BizObjectFactory.GetBizObject(sockType, ct, userId, currentUserRoles, translationId);

            //get the searchable text of the object
            var searchParams = await o.GetSearchResultSummary(id, sockType);

            //extract and rank here
            ExtractAndRank er = new ExtractAndRank();
            er.Process(searchParams, PhraseItems.ToArray(), max);

            return er.Extract;
        }


        #region Search rank and extract
        /// <summary>
        /// Rank a piece of text against a set of search terms and extract the excerpt
        /// of that text which best reflects the terms
        /// </summary>
        public sealed class ExtractAndRank
        {

            #region Fields
            private string[] searchTerms;
            private string rawtext;
            private string extract = "";
            private bool flattenExtract = true;
            private float ranking;
            private int extractionThresholdRank = 10;
            private int maximumCharactersToExtract = 40;
            #endregion

            #region Properties

            /// <summary>
            /// This is the ranking of the source text as it pertains to the
            /// search terms
            ///
            /// A rank of zero means there was no match (or nothing to rank).
            /// No excerpt extraction is done unless the rank is higher than ExtractionThresholdRank.
            ///
            /// It is a percentage style value and is weighted:
            ///
            /// 75% of the score is the percentage of all search terms found in the text
            /// 25% of the score is the percentage of all characters in the text that are search term characters
            /// </summary>
            public float Ranking => ranking;

            /// <summary>
            /// Maximum characters to appear in an extraction
            /// default is 40
            /// Minimum is 10 (lower values are raised to 10)
            /// </summary>
            public int MaximumCharactersToExtract
            {
                get
                {
                    return maximumCharactersToExtract;
                }
                set
                {
                    maximumCharactersToExtract = value > 10 ? value : 10;
                }
            }

            /// <summary>
            /// ExtractionThresholdRank
            /// Extraction will only take place if the rank is
            /// higher than this value
            ///
            /// default is 10, maximum is 100 minimum is 0
            /// </summary>
            public int ExtractionThresholdRank
            {
                get
                {
                    return extractionThresholdRank;
                }
                set
                {
                    if (value > 100)
                        extractionThresholdRank = 100;
                    else if (value < 0)
                        extractionThresholdRank = 0;
                    else
                        extractionThresholdRank = value;
                }
            }

            /// <summary>
            /// If true, carriage returns, line feeds and tabs will be removed from extract
            /// </summary>
            public bool FlattenExtract
            {
                get
                {
                    return this.flattenExtract;
                }
                set
                {
                    this.flattenExtract = value;
                }
            }

            /// <summary>
            /// Extracted text excerpt that best reflects search terms
            /// (empty when nothing was extracted)
            /// </summary>
            public string Extract => extract;

            #endregion

            #region public methods
            /// <summary>
            /// Do the extraction and ranking
            /// </summary>
            /// <param name="searchObjectParams">Source of the text: its Words are joined with single spaces to form the text to rank</param>
            /// <param name="searchTerms">Terms to look for; any % wildcard characters are ignored when matching</param>
            /// <param name="max">Maximum characters to extract; subject to the same minimum as MaximumCharactersToExtract</param>
            public void Process(SearchIndexProcessObjectParameters searchObjectParams, string[] searchTerms, int max)
            {
                ranking = 0;
                extract = "";

                //Go through the property so the documented minimum applies;
                //a zero or negative value would otherwise turn into a negative length window during extraction
                MaximumCharactersToExtract = max;

                //no text source at all means nothing to rank
                if (searchObjectParams == null || searchObjectParams.Words == null) return;

                string rawText = string.Join(" ", searchObjectParams.Words);
                if (string.IsNullOrEmpty(rawText)) return;
                this.rawtext = rawText;

                if (searchTerms == null || searchTerms.Length == 0) return;
                this.searchTerms = searchTerms;

                ranking = score(0, this.rawtext.Length);
                if (ranking > extractionThresholdRank)
                    DoExtract();
            }
            #endregion

            #region Calculate score
            /// <summary>
            /// Give a percentage score for a given window of
            /// text in the raw text string
            /// 75% of the score is the percentage of all search terms found in the window
            /// 25% of the score is the percentage of all characters in the search window that are search term characters
            /// </summary>
            /// <param name="nStartPos">Start of the window (values below zero are treated as zero)</param>
            /// <param name="nEndPos">End of the window, exclusive (values past the end of the text are treated as the end)</param>
            /// <returns>Score for the window; zero when no term is found or the window is empty</returns>
            private float score(int nStartPos, int nEndPos)
            {
                if (nStartPos < 0) nStartPos = 0;
                if (nEndPos > this.rawtext.Length) nEndPos = this.rawtext.Length;

                //an empty window can't contain anything
                if (nStartPos >= nEndPos) return 0;

                int nTermCharsInWindow = 0;//how many of the characters in the window are matching term characters
                string SearchString = this.rawtext.Substring(nStartPos, nEndPos - nStartPos).ToLower(System.Globalization.CultureInfo.CurrentCulture);

                int nMatches = 0;

                foreach (string term in searchTerms)
                {
                    if (term == null) continue;

                    //remove the wild card character if present and set to lower case
                    string lTerm = term.ToLower(System.Globalization.CultureInfo.CurrentCulture).Replace("%", "");

                    //A term that was nothing but wildcards has no text left to find;
                    //searching for an empty string "matches" everywhere and would walk past the end of the window
                    if (lTerm.Length == 0) continue;

                    //Both sides are already lower case so a plain ordinal comparison is all that is required
                    int nLocation = SearchString.IndexOf(lTerm, System.StringComparison.Ordinal);
                    if (nLocation == -1) continue;

                    nMatches++;
                    while (nLocation != -1)
                    {
                        nTermCharsInWindow += lTerm.Length;
                        nLocation = SearchString.IndexOf(lTerm, nLocation + 1, System.StringComparison.Ordinal);
                    }
                }

                //If no matches then rank is automatically zero
                if (nMatches == 0)
                {
                    return 0;
                }

                //Rank is calculated on a weighted scale
                //75% for matching all search terms
                //25% for the quantity of search terms versus other text found
                float fTermsFoundPct = 75 * ((float)nMatches / (float)searchTerms.GetLength(0));
                float fTermsVsTextPct = 0;
                if (nTermCharsInWindow > 0)
                    fTermsVsTextPct = 25 * ((float)nTermCharsInWindow / (float)SearchString.Length);

                return fTermsFoundPct + fTermsVsTextPct;
            }
            #endregion

            #region Extract best excerpt
            /// <summary>
            /// Extract the best scoring excerpt fragment of
            /// raw text
            /// </summary>
            private void DoExtract()
            {
                //If the whole thing fits within the max to extract
                //just save time and return the whole thing
                if (this.rawtext.Length <= this.maximumCharactersToExtract)
                {
                    this.extract = this.rawtext;
                    return;
                }

                string BestWindow = "";
                float BestScore = 0;
                float thisscore = 0;
                int BestWindowStartPos = 0;

                //Get the shortest search term length so
                //we can save time iterating over the window below
                int shortestSearchTermLength = int.MaxValue;
                foreach (string s in this.searchTerms)
                {
                    if (s != null && s.Length < shortestSearchTermLength)
                        shortestSearchTermLength = s.Length;
                }

                //never step by less than one character or the window would never move
                int step = shortestSearchTermLength < 1 ? 1 : shortestSearchTermLength;

                //last position at which a full sized window still fits in the text
                int lastStart = this.rawtext.Length - maximumCharactersToExtract;

                //slide a window over the text and check it's score, the highest scoring window wins
                //move the length of the shortest search term so as to ensure we won't
                //miss it, but faster than moving one character at a time
                int z = 0;
                while (true)
                {
                    thisscore = score(z, z + maximumCharactersToExtract);

                    if (thisscore != 0)
                    {
                        if (thisscore > BestScore)
                        {
                            BestScore = thisscore;
                            BestWindow = this.rawtext.Substring(z, maximumCharactersToExtract);
                            //Best window to get if the future score is equal
                            //I.E. put the terms in the center of the window if
                            //the score is equal
                            BestWindowStartPos = z + (maximumCharactersToExtract / 2);
                        }

                        //If it's equal to the last and we're positioned over
                        //the best spot (terms in center) then capture that
                        if (thisscore == BestScore && z == BestWindowStartPos)
                        {
                            BestWindow = this.rawtext.Substring(z, maximumCharactersToExtract);
                        }
                    }

                    if (z >= lastStart) break;

                    //advance, but always finish on the final window so the tail of the text is never skipped
                    z = (lastStart - z > step) ? z + step : lastStart;
                }

                if (this.flattenExtract)
                    this.extract = "..." + BestWindow.Trim().Replace("\r", "").Replace("\n", "").Replace("\t", "") + "...";//case 1593 added tab character removal
                else
                    this.extract = "..." + BestWindow.Trim() + "...";
            }

            #endregion

        }
        #endregion Xtract


        #endregion

        #region ProcessKeywords into Database

        //Class to hold process input parameters
        //also used for getting summary search results
        public class SearchIndexProcessObjectParameters
        {
            public long TranslationId { get; set; }
            public long ObjectId { get; set; }
            public SockType SockType { get; set; }

            /// <summary>
            /// Text fragments collected so far; always starts as an empty list
            /// </summary>
            public List<string> Words { get; set; }


            /// <summary>
            /// Create parameters for the specified object with an empty word list
            /// </summary>
            public SearchIndexProcessObjectParameters(long translationId, long objectID, SockType aType)
            {
                TranslationId = translationId;
                ObjectId = objectID;
                SockType = aType;
                Words = new List<string>();
            }

            /// <summary>
            /// format used for getsummary by biz objects: everything zeroed, empty word list
            /// </summary>
            public SearchIndexProcessObjectParameters() : this(0, 0, 0)
            {
            }

            /// <summary>
            /// Add a piece of text; null, empty or whitespace only text is ignored
            /// </summary>
            /// <returns>This instance so calls can be chained</returns>
            public SearchIndexProcessObjectParameters AddText(string s)
            {
                if (string.IsNullOrWhiteSpace(s))
                    return this;

                Words.Add(s);
                return this;
            }


            /// <summary>
            /// Add a number as text
            /// </summary>
            /// <returns>This instance so calls can be chained</returns>
            public SearchIndexProcessObjectParameters AddText(long l)
            {
                string numberAsText = l.ToString();
                Words.Add(numberAsText);
                return this;
            }

            // public SearchIndexProcessObjectParameters AddText(decimal? d)
            // {
            //     if (d != null)
            //         Words.Add(d.ToString());
            //     return this;
            // }

            /// <summary>
            /// Add every non blank entry of a list of text; a null list is ignored
            /// </summary>
            /// <returns>This instance so calls can be chained</returns>
            public SearchIndexProcessObjectParameters AddText(List<string> lWords)
            {
                if (lWords == null)
                    return this;

                //the single string overload already skips blank entries
                foreach (string entry in lWords)
                    AddText(entry);

                return this;
            }

            /// <summary>
            /// Extract the text from a custom fields json fragment and add it
            /// </summary>
            /// <returns>This instance so calls can be chained</returns>
            public SearchIndexProcessObjectParameters AddCustomFields(string jsonString)
            {
                return AddText(JsonUtil.GetCustomFieldsAsStringArrayForSearchIndexing(jsonString));
            }
        }

        /// <summary>
        /// Index the keywords of a newly created object
        /// (forwards to ProcessKeywordsAsync flagged as a new record)
        /// </summary>
        public static async Task ProcessNewObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
            => await ProcessKeywordsAsync(searchIndexObjectParameters, newRecord: true);

        /// <summary>
        /// Re-index the keywords of an existing object that has been updated
        /// (forwards to ProcessKeywordsAsync flagged as not a new record)
        /// </summary>
        public static async Task ProcessUpdatedObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
            => await ProcessKeywordsAsync(searchIndexObjectParameters, newRecord: false);

        /// <summary>
        /// Remove every search key row that belongs to the specified object
        /// </summary>
        /// <param name="objectID">Id of the deleted object</param>
        /// <param name="aType">Type of the deleted object</param>
        /// <param name="ct">Database context the delete is executed on</param>
        public static async Task ProcessDeletedObjectKeywordsAsync(long objectID, SockType aType, AyContext ct)
        {
            //Keep the values numeric: if ToString is used on an interpolated value the npgsql driver
            //will assume it's a string and put quotes around it triggering an error that a string can't be compared to an int
            int sockTypeValue = (int)aType;

            //direct command, executed immediately, nothing to save afterwards
            await ct.Database.ExecuteSqlInterpolatedAsync($"delete from asearchkey where objectid={objectID} and socktype={sockTypeValue}");
        }


        /// <summary>
        /// Break the object's collected text into keywords and hand them to the
        /// aydosearchindex stored procedure. Nothing is sent when no keywords result.
        /// </summary>
        /// <param name="p">Object identity, translation and the text to index</param>
        /// <param name="newRecord">True for a newly created object; the procedure receives the inverse of this flag</param>
        private static async Task ProcessKeywordsAsync(SearchIndexProcessObjectParameters p, bool newRecord)
        {
            // #if (DEBUG)
            //             if (!p.SockType.HasAttribute(typeof(CoreBizObjectAttribute)))
            //                 throw new System.NotSupportedException($"Search::ProcessKeywords - Invalid type presented {p.SockType}");
            // #endif
            List<string> KeyWordList = await BreakAsync(p.TranslationId, p.Words);

            //NOTE(review): an update that leaves no keywords never reaches the procedure - confirm stale keys are not left behind
            if (KeyWordList.Count != 0)
            {
                //call stored procedure to do the work right at the server (fastest method by far)
                using (AyContext ct = ServiceProviderProvider.DBContext)
                {
                    await ct.Database.ExecuteSqlInterpolatedAsync($"call aydosearchindex({KeyWordList},{p.ObjectId},{p.SockType},{!newRecord})");
                }
            }
        }
        #endregion

        #region Breaker

        /// <summary>
        /// Kind of token most recently seen by the word breaker
        /// </summary>
        public enum TokenTypes
        {
            Nothing,
            Separator,
            CJK,
            Latin
        }

        /// <summary>
        /// Break a list of strings into keywords suitable for passing to
        /// the search index stored procedure or other function.
        /// Wildcard characters are not kept.
        ///
        /// Use Translation setting CJKIndex=true to handle Chinese, Japanese, Korean etc
        /// (languages with no easily identifiable word boundaries as in english)
        /// </summary>
        /// <param name="translationId">Translation whose word break settings are used</param>
        /// <param name="textStrings">Strings to break into keywords</param>
        /// <returns>List of keywords as produced by BreakCoreAsync</returns>
        internal static async Task<List<string>> BreakAsync(long translationId, List<string> textStrings)
            => await BreakCoreAsync(translationId, KeepWildCards: false, textStrings: textStrings);

        /// <summary>
        /// Break a single string into keywords; wildcard characters are not kept
        /// </summary>
        /// <param name="translationId">Translation whose word break settings are used</param>
        /// <param name="textString">String to break into keywords</param>
        /// <returns>List of keywords as produced by BreakCoreAsync</returns>
        internal static async Task<List<string>> BreakAsync(long translationId, string textString)
        {
            var singleItemList = new List<string>(1) { textString };
            return await BreakCoreAsync(translationId, KeepWildCards: false, textStrings: singleItemList);
        }

        /// <summary>
        /// Used to process a user's search phrase: wildcards entered
        /// are preserved and stop words are not filtered
        /// </summary>
        /// <param name="translationId">Translation whose word break settings are used</param>
        /// <param name="searchPhrase">Phrase typed by the user, with sql style (%) wildcards</param>
        /// <returns>List of search terms as produced by BreakCoreAsync</returns>
        internal static async Task<List<string>> BreakSearchPhraseAsync(long translationId, string searchPhrase)
        {
            var phraseAsList = new List<string> { searchPhrase };

            //note: we want stopwords if this is a search phrase break because they might type "some" wanting awesome but some is a stopword so..
            return await BreakCoreAsync(translationId, KeepWildCards: true, textStrings: phraseAsList, ignoreStopWords: true);
        }


        internal static async Task<List<string>> BreakCoreAsync(long translationId, bool KeepWildCards, List<string> textStrings, bool ignoreStopWords = false)
        {
            //For stopwords and CJKIndex flag value
            var translationWordBreakData = await SearchTranslationWordBreakDataCache.GetWordBreakData(translationId);

            int MAXWORDLENGTH = 255;
            int MINWORDLENGTH = 2;//A word isn't a word unless it's got at least two characters in it
            StringBuilder sbResults = new StringBuilder();
            //List to temporarily hold parsed words
            //used to easily ensure unique words only
            List<string> tempParsedWords = new List<string>();

            StringBuilder sb = new StringBuilder();
            StringBuilder sbWord = new StringBuilder();
            List<string> ReturnList = new List<string>();


            //Loop through each of the passed in strings
            foreach (string s in textStrings)
            {
                if (s == null || s == "") continue;
                //get all the characters in a unicode compliant manner...
                TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
                //start at the top
                t.Reset();

                TokenTypes LastToken = TokenTypes.Nothing;

                //Used by CJK
                bool BasicLatinBlock = true;

                //Process each "character" (text element,glyph whatever) in the
                //current string
                while (t.MoveNext())
                {
                    //get it as a character
                    char c = t.GetTextElement()[0];

                    if (!translationWordBreakData.CJKIndex)
                    {
                        #region regular tokenizer

                        //Is it a token we want to include?
                        //Or a wildcard character
                        if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                        {
                            #region Include token
                            //All latin text is converted to lower case
                            c = char.ToLower(c, System.Globalization.CultureInfo.CurrentCulture);

                            //Do we already have a word?
                            if (sbWord.Length > 0)
                            {
                                //Maybe we need to flush this word into the word list
                                //if we're over the word length limit
                                if (sbWord.Length >= MAXWORDLENGTH)
                                {
                                    //flush away...
                                    if (!tempParsedWords.Contains(sbWord.ToString()))
                                    {
                                        tempParsedWords.Add(sbWord.ToString());
                                    }
                                    sbWord.Length = 0;
                                    sbWord.Append(c);
                                    LastToken = TokenTypes.Latin;
                                    continue;

                                }
                            }

                            //append character and go on to next one
                            sbWord.Append(c);
                            LastToken = TokenTypes.Latin;
                            continue;
                            #endregion
                        }
                        else
                        {
                            #region Word Boundary token
                            LastToken = TokenTypes.Separator;
                            if (sbWord.Length > 0)
                            {
                                //flush away...
                                if (!tempParsedWords.Contains(sbWord.ToString()))
                                {
                                    tempParsedWords.Add(sbWord.ToString());
                                }
                                sbWord.Length = 0;
                                continue;
                            }

                            #endregion
                        }
                        #endregion
                    }
                    else
                    {
                        #region CJK Tokenizer

                        //Is it a Latin character, i.e. a code point below 256?
                        //(that covers the Basic Latin / ASCII block and the Latin-1 Supplement block)
                        //see: http://www.unicode.org/charts/index.html
                        //and here for a funky online viewer:
                        //http://www.fileformat.info/info/unicode/block/index.htm
                        //We need to know this so that regular English text
                        //within CJK text gets properly indexed as whole words
                        BasicLatinBlock = false;
                        if ((int)c < 256) BasicLatinBlock = true;

                        if (BasicLatinBlock)
                        {
                            //Is it a token we want to include?
                            if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                            {
                                #region Latin Include token
                                //All latin text is converted to lower case
                                c = char.ToLower(c, System.Globalization.CultureInfo.CurrentCulture);

                                //Do we already have a word?
                                if (sbWord.Length > 0)
                                {
                                    //Maybe we need to flush this word into the word list
                                    //if we're over the word length limit or we are going from
                                    //CJK to latin
                                    if (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH)
                                    {
                                        //flush away...
                                        if (!tempParsedWords.Contains(sbWord.ToString()))
                                        {
                                            tempParsedWords.Add(sbWord.ToString());
                                        }
                                        sbWord.Length = 0;
                                        sbWord.Append(c);
                                        LastToken = TokenTypes.Latin;
                                        continue;

                                    }
                                }

                                //append character and go on to next one
                                sbWord.Append(c);
                                LastToken = TokenTypes.Latin;
                                continue;
                                #endregion
                            }
                            else
                            {
                                #region Latin Word Boundary token
                                LastToken = TokenTypes.Separator;
                                if (sbWord.Length > 0)
                                {
                                    //flush away...
                                    if (!tempParsedWords.Contains(sbWord.ToString()))
                                    {
                                        tempParsedWords.Add(sbWord.ToString());
                                    }
                                    sbWord.Length = 0;

                                    continue;

                                }

                                #endregion
                            }

                        }
                        else//CJK character
                        {
                            if (char.IsLetter(c) || (KeepWildCards && c == '%'))
                            {
                                #region CJK Include token
                                //Do we already have a word?
                                if (sbWord.Length > 0)
                                {
                                    //Maybe we need to flush this word into the word list
                                    //if we're over the word length limit or we are going from
                                    //latin TO CJK
                                    if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                                    {
                                        //flush away...
                                        if (!tempParsedWords.Contains(sbWord.ToString()))
                                        {
                                            tempParsedWords.Add(sbWord.ToString());
                                        }
                                        sbWord.Length = 0;
                                        sbWord.Append(c);
                                        LastToken = TokenTypes.CJK;
                                        continue;

                                    }

                                    if (LastToken == TokenTypes.CJK)
                                    {
                                        //We're here because at least one character is already stored
                                        //and the last token was CJK, so we append the current character
                                        //and flush the resulting 2 character n-gram; the current character
                                        //is then kept as the start of the next n-gram
                                        sbWord.Append(c);
                                        System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                                        //flush away...
                                        if (!tempParsedWords.Contains(sbWord.ToString()))
                                        {
                                            tempParsedWords.Add(sbWord.ToString());
                                        }
                                        sbWord.Length = 0;
                                        sbWord.Append(c);
                                        LastToken = TokenTypes.CJK;
                                        continue;

                                    }
                                }

                                //append character and go on to next one
                                sbWord.Append(c);
                                LastToken = TokenTypes.CJK;
                                continue;
                                #endregion


                            }
                            else
                            {
                                #region CJK Word Boundary token
                                LastToken = TokenTypes.Separator;
                                if (sbWord.Length > 0)
                                {
                                    //flush away...
                                    if (!tempParsedWords.Contains(sbWord.ToString()))
                                    {
                                        tempParsedWords.Add(sbWord.ToString());
                                    }
                                    sbWord.Length = 0;
                                    continue;
                                }

                                #endregion
                            }

                        }

                        #endregion
                    }
                }

                //Flush out the last word
                if (sbWord.Length > 0)
                {
                    //flush away...
                    if (!tempParsedWords.Contains(sbWord.ToString()))
                    {
                        tempParsedWords.Add(sbWord.ToString());
                    }
                    sbWord.Length = 0;
                }
            }


            //bail early if there is nothing indexed
            if (tempParsedWords.Count == 0) return ReturnList;


            //Make a return string array
            //from the word list
            foreach (string s in tempParsedWords)
            {
                //Filter out short words if we are breaking for indexing
                //but keep them if they are part of a wildcard search phrase
                if (s.Length >= MINWORDLENGTH || (KeepWildCards && s.Contains('%')))
                {
                    if (ignoreStopWords)
                    {
                        //breaking of search phrase
                        ReturnList.Add(s);
                    }
                    else
                    {
                        //Add only non stopwords - regular breaking of object for dictionary entry
                        if (!translationWordBreakData.StopWords.Contains(s))
                        {
                            ReturnList.Add(s);
                        }
                    }
                }
            }

            //sometimes all the results are stop words so you end up here with nothing
            return ReturnList;

        }

        #endregion

    }//eoc

}//eons