2022-12-16 06:01:23 +00:00
parent 26c2ae5cc9
commit effd96143f
310 changed files with 48715 additions and 0 deletions
--- a/server/biz/Search.cs
+++ b/server/biz/Search.cs
@@ -0,0 +1,998 @@
+using System.Linq;
+using System.Globalization;
+using System.Text;
+using System.Collections.Generic;
+using System.Threading.Tasks;
+using Microsoft.EntityFrameworkCore;
+using Sockeye.Util;
+using Sockeye.Models;
+
+namespace Sockeye.Biz
+{
+
+    //This class handles word breaking, processing keywords and searching for results
+    public static class Search
+    {
+
+        #region Search and return results
+
+        /// <summary>
+        /// Input for a search request: the phrase to look for plus optional narrowing options
+        /// </summary>
+        public class SearchRequestParameters
+        {
+            /// <summary>Search phrase as supplied by the caller</summary>
+            public string Phrase { get; set; }
+
+            /// <summary>Restrict results to a single type; SockType.NoType (the default) applies no restriction</summary>
+            public SockType TypeOnly { get; set; } = SockType.NoType;
+
+            /// <summary>Maximum number of results to return, defaults to 500. Note: 0 will get all results</summary>
+            public int MaxResults { get; set; } = 500;
+
+            /// <summary>True when there is a non-blank phrase to search for</summary>
+            public bool IsValid => !string.IsNullOrWhiteSpace(Phrase);
+        }
+
+
+        //Classes to hold search results returned to client
+        public class SearchResult //one search hit as returned to the client
+        {
+            public string Name { get; set; } //name of the matching object (DoSearchAsync fills this via BizObjectNameFetcherDirect.Name)
+            public SockType Type { get; set; } //type of the matching object
+            public long Id { get; set; } //id of the matching object
+        }
+
+        /// <summary>
+        /// Envelope for a search response: the number of hits found and the list of results being returned
+        /// </summary>
+        public class SearchReturnObject
+        {
+            /// <summary>Total number of results found, starts at zero</summary>
+            public long TotalResultsFound { get; set; } = 0;
+
+            /// <summary>The results themselves, starts as an empty list</summary>
+            public List<SearchResult> SearchResults { get; set; } = new List<SearchResult>();
+        }
+
+
+        /// <summary>
+        /// Search the keyword index for objects that contain every term of the search phrase and
+        /// return the ones the current user is permitted to read.
+        /// </summary>
+        /// <param name="ct">Database context used for the index query, the permission lookups and the name fetching</param>
+        /// <param name="translationId">Translation used to break the phrase into terms; also handed to the name fetcher</param>
+        /// <param name="currentUserRoles">Roles checked to decide which object types the user may read</param>
+        /// <param name="currentUserId">Memos and reminders are only returned when they belong to this user</param>
+        /// <param name="searchParameters">Phrase (* is the wildcard character), optional type filter and maximum result count</param>
+        /// <returns>
+        /// Count of readable matches plus the (possibly capped) list of named results.
+        /// An empty result is returned when there is nothing to search for.
+        /// </returns>
+        public static async Task<SearchReturnObject> DoSearchAsync(AyContext ct, long translationId, AuthorizationRoles currentUserRoles, long currentUserId, SearchRequestParameters searchParameters)
+        {
+            var ReturnObject = new SearchReturnObject();
+
+            if (searchParameters == null || !searchParameters.IsValid)
+            {
+                //this is expected, don't throw, just return nothing
+                return ReturnObject;
+            }
+
+            //Work on a local copy of the phrase so the caller's request object is left untouched.
+            //escape literal percentage signs first just in case they are searching for 50% off or something
+            //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
+            //need to get around breaking possibly losing the symbol so make it text
+            string phrase = searchParameters.Phrase.Replace("%", "pctsym");
+
+            //replace wildcard * with % as breakcore expects sql style wildcards
+            phrase = phrase.Replace("*", "%");
+
+            //BREAK SEARCH PHRASE INTO SEPARATE TERMS
+            var PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);
+
+            //SPLIT OUT WILDCARDS FROM NON WILDCARDS
+            List<string> PreWildCardedSearchTerms = new List<string>();
+            List<string> SearchTerms = new List<string>();
+
+            foreach (string PhraseItem in PhraseItems)
+            {
+                //put back literal percentage symbol (escaped for LIKE) if necessary
+                string RestoredTerm = PhraseItem.Replace("pctsym", @"\%");
+                if (PhraseItem.Contains("%"))
+                    PreWildCardedSearchTerms.Add(RestoredTerm);
+                else
+                    SearchTerms.Add(RestoredTerm);
+            }
+
+            //Nothing searchable survived word breaking (e.g. the phrase was only punctuation).
+            //Without this check the query below would end in a bare WHERE and fail at the server.
+            if (SearchTerms.Count + PreWildCardedSearchTerms.Count == 0)
+                return ReturnObject;
+
+            //list to hold temporary search/tag hits
+            List<SockTypeId> MatchingObjects = new List<SockTypeId>();
+
+            //build and execute the query and iterate the results
+            using (var command = ct.Database.GetDbConnection().CreateCommand())
+            {
+                StringBuilder q = new StringBuilder();
+                int termCount = 0;
+
+                //Adds one "COUNT(*) FILTER (...) AS stN" column for a term.
+                //The term itself travels as a bound parameter rather than being spliced into the sql text
+                void AppendTermColumn(string comparison, string term)
+                {
+                    termCount++;
+                    var termParameter = command.CreateParameter();
+                    termParameter.ParameterName = $"@term{termCount}";
+                    termParameter.Value = term;
+                    command.Parameters.Add(termParameter);
+                    q.Append($"COUNT(*) FILTER (WHERE asearchdictionary.word {comparison} @term{termCount}) AS st{termCount}, ");
+                }
+
+                q.Append("WITH qr AS (SELECT asearchkey.sockType, asearchkey.objectid, ");
+
+                //EXACT MATCH SEARCH TERMS
+                foreach (string Term in SearchTerms)
+                    AppendTermColumn("=", Term);
+
+                //WILDCARD SEARCH TERMS
+                foreach (string WildCardSearchTerm in PreWildCardedSearchTerms)
+                    AppendTermColumn("LIKE", WildCardSearchTerm);
+
+                q.Length = q.Length - 2;//trim the final comma and space
+
+                var qTypeOnly = string.Empty;
+                if (searchParameters.TypeOnly != SockType.NoType)
+                {
+                    //NOTE(review): this filter names the column ATYPE while the rest of the query uses
+                    //asearchkey.sockType - confirm against the schema which name is correct
+                    qTypeOnly = $"AND ASEARCHKEY.ATYPE={(int)searchParameters.TypeOnly}";
+                }
+
+                q.Append($" FROM asearchdictionary INNER JOIN asearchkey ON asearchdictionary.id = asearchkey.wordid {qTypeOnly} GROUP BY asearchkey.objectid, asearchkey.sockType) SELECT sockType, objectid FROM qr WHERE ");
+
+                //an object only qualifies when every term matched at least once
+                q.Append(string.Join(" AND ", Enumerable.Range(1, termCount).Select(i => $"st{i} > 0")));
+
+                await ct.Database.OpenConnectionAsync();
+                command.CommandText = q.ToString();
+                using (var dr = await command.ExecuteReaderAsync())
+                {
+                    while (await dr.ReadAsync())
+                    {
+                        MatchingObjects.Add(new SockTypeId((SockType)dr.GetInt32(0), dr.GetInt64(1)));
+                    }
+                }
+            }
+
+            //REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
+            //list to hold temporary matches
+            List<SockTypeId> CanReadMatchingObjects = new List<SockTypeId>();
+            foreach (SockTypeId t in MatchingObjects)
+            {
+                if (t.SockType == SockType.FileAttachment)
+                {
+                    //have to look up the actual underlying object type and id here
+                    //check if it's readable for user
+                    //then add the PARENT object type and id to the CanReadMatchingObjects list
+                    //this means user will not see it return as an attachment, just as the object
+                    FileAttachment f = await ct.FileAttachment.AsNoTracking().FirstOrDefaultAsync(z => z.Id == t.ObjectId);
+
+                    //index row points at an attachment record that can't be found, nothing to return for it
+                    if (f == null)
+                        continue;
+
+                    if (Sockeye.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, f.AttachToAType))
+                    {
+                        CanReadMatchingObjects.Add(new SockTypeId(f.AttachToAType, f.AttachToObjectId));
+                    }
+                }
+                else if (t.SockType == SockType.Memo)
+                {
+                    //Users are only permitted to search their own memos
+                    if (await ct.Memo.AsNoTracking().AnyAsync(z => z.Id == t.ObjectId && z.ToId == currentUserId))
+                        CanReadMatchingObjects.Add(t);
+                }
+                else if (t.SockType == SockType.Reminder)
+                {
+                    //Users are only permitted to search their own reminders
+                    if (await ct.Reminder.AsNoTracking().AnyAsync(z => z.Id == t.ObjectId && z.UserId == currentUserId))
+                        CanReadMatchingObjects.Add(t);
+                }
+                else
+                {
+                    if (Sockeye.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, t.SockType))
+                    {
+                        CanReadMatchingObjects.Add(t);
+                    }
+                }
+            }
+
+            //Ok, we're here with the list of allowable objects which is now the master matching objects list so...
+            MatchingObjects = CanReadMatchingObjects;
+
+            //TOTAL RESULTS
+            //we have the total results here so set accordingly
+            ReturnObject.TotalResultsFound = MatchingObjects.Count;
+
+            //MAXIMUM RESULTS FILTER
+            //The theory is that it should be filtered BEFORE sorting so that you get the most random collection of results
+            //As the results are not ranked so...
+            if (searchParameters.MaxResults > 0)//0 = all results
+                MatchingObjects = MatchingObjects.Take(searchParameters.MaxResults).ToList();
+
+            //Sort and group the matching objects list in return order
+            var OrderedMatchingObjects = MatchingObjects.OrderBy(z => z.SockType).ThenByDescending(z => z.ObjectId);
+
+            //Get names using best performing technique (one command reused for every lookup)
+            using (var command = ct.Database.GetDbConnection().CreateCommand())
+            {
+                await ct.Database.OpenConnectionAsync();
+
+                //Build the return list from the remaining matching objects list
+                foreach (SockTypeId i in OrderedMatchingObjects)
+                {
+                    SearchResult SR = new SearchResult();
+                    SR.Name = BizObjectNameFetcherDirect.Name(i.SockType,
+                                                              i.ObjectId, translationId,
+                                                              command);
+                    SR.Id = i.ObjectId;
+                    SR.Type = i.SockType;
+                    ReturnObject.SearchResults.Add(SR);
+                }
+            }
+
+            return ReturnObject;
+        }
+
+
+        #endregion dosearch
+
+        #region Get info (excerpt)
+        /// <summary>
+        /// Get the excerpt of an object's searchable text that best reflects the search phrase
+        /// </summary>
+        /// <param name="translationId">Translation used to break the phrase into terms</param>
+        /// <param name="currentUserRoles">Passed to the biz object factory</param>
+        /// <param name="userId">Passed to the biz object factory</param>
+        /// <param name="phrase">Search phrase, * is the wildcard character</param>
+        /// <param name="max">Maximum number of characters to extract</param>
+        /// <param name="sockType">Type of the object to excerpt</param>
+        /// <param name="id">Id of the object to excerpt</param>
+        /// <param name="ct">Database context passed to the biz object factory</param>
+        /// <returns>The excerpt, or an empty string when there is no phrase or nothing was extracted</returns>
+        public static async Task<string> GetInfoAsync(long translationId, AuthorizationRoles currentUserRoles, long userId, string phrase, int max, SockType sockType, long id, AyContext ct)
+        {
+            //No phrase means no terms to rank against so there is nothing to extract
+            if (string.IsNullOrWhiteSpace(phrase))
+                return "";
+
+            //escape literal percentage signs first just in case they are searching for 50% off or something
+            //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
+            //need to get around breaking possibly losing the symbol so make it text
+            //NOTE(review): the pctsym placeholder is not converted back before ranking here - confirm that is intended
+            phrase = phrase.Replace("%", "pctsym");
+
+            //Modify Phrase to replace wildcard * with % as breakcore expects sql style wildcards
+            phrase = phrase.Replace("*", "%");
+
+            //BREAK SEARCH PHRASE INTO SEPARATE TERMS
+            var PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);
+
+            //get text
+            ISearchAbleObject o = (ISearchAbleObject)BizObjectFactory.GetBizObject(sockType, ct, userId, currentUserRoles, translationId);
+
+            //get extract
+            var searchParams = await o.GetSearchResultSummary(id, sockType);
+
+            //extract and rank here
+            ExtractAndRank er = new ExtractAndRank();
+            er.Process(searchParams, PhraseItems.ToArray(), max);
+
+            return er.Extract;
+
+        }
+
+
+
+        #region Search rank and extract
+        /// <summary>
+        /// Rank a block of text against a set of search terms and
+        /// extract the excerpt of that text that best reflects the terms
+        /// </summary>
+        public sealed class ExtractAndRank
+        {
+
+            #region Fields
+            //search terms in the form used for matching: lower case, wildcard characters removed, no empty entries
+            private string[] searchTerms;
+            private string rawtext;
+            private string extract = "";
+            private bool flattenExtract = true;
+            private float ranking;
+            private int extractionThresholdRank = 10;
+            private int maximumCharactersToExtract = 40;
+            #endregion
+
+            #region Properties
+
+            /// <summary>
+            /// This is the ranking of the source text as it pertains to the 
+            /// search terms
+            /// 
+            /// A rank of zero means there was no match (or nothing to process)
+            /// 
+            /// It is a percentage value on a scale of 0 to 100
+            ///	and is weighted:
+            ///	
+            /// 75% of the score is the percentage of all search terms found in the text
+            /// 25% of the score is the percentage of all characters in the text that are search term characters
+            /// </summary>
+            public float Ranking
+            {
+                get
+                {
+                    return ranking;
+                }
+            }
+
+            /// <summary>
+            /// Maximum characters to appear in an extraction 
+            /// default is 40
+            /// Minimum is 10
+            /// </summary>
+            public int MaximumCharactersToExtract
+            {
+                get
+                {
+                    return maximumCharactersToExtract;
+                }
+                set
+                {
+
+                    if (value > 10)
+                        maximumCharactersToExtract = value;
+                    else
+                        maximumCharactersToExtract = 10;
+
+                }
+            }
+
+            /// <summary>
+            /// ExtractionThresholdRank
+            /// Extraction will only take place if the rank is
+            /// higher than this value
+            /// 
+            /// default is 10, maximum is 100 minimum is 0
+            /// </summary>
+            public int ExtractionThresholdRank
+            {
+                get
+                {
+                    return extractionThresholdRank;
+                }
+                set
+                {
+                    if (value > 100)
+                        extractionThresholdRank = 100;
+                    else if (value < 0)
+                        extractionThresholdRank = 0;
+                    else
+                        extractionThresholdRank = value;
+                }
+            }
+
+
+
+            /// <summary>
+            /// If true, carriage returns, line feeds and tabs will be removed from extract
+            /// </summary>
+            public bool FlattenExtract
+            {
+                get
+                {
+                    return this.flattenExtract;
+                }
+                set
+                {
+                    this.flattenExtract = value;
+                }
+            }
+
+            /// <summary>
+            /// Extracted text excerpt that best reflects search terms
+            /// Empty when nothing was extracted
+            /// </summary>
+            public string Extract
+            {
+                get
+                {
+                    return extract;
+                }
+            }
+
+            #endregion
+
+            #region public methods
+            /// <summary>
+            /// Do the extraction and ranking
+            /// </summary>
+            /// <param name="searchObjectParams">Source of the text: its Words are joined with spaces to form the text that is ranked</param>
+            /// <param name="searchTerms">Terms to look for; % wildcard characters are ignored when matching</param>
+            /// <param name="max">Maximum characters to extract, subject to the MaximumCharactersToExtract minimum</param>
+            public void Process(SearchIndexProcessObjectParameters searchObjectParams, string[] searchTerms, int max)
+            {
+                //go through the property so the documented minimum is enforced,
+                //a zero or negative window would otherwise produce no usable excerpt
+                MaximumCharactersToExtract = max;
+
+                ranking = 0;
+                extract = "";
+
+                if (searchObjectParams == null || searchObjectParams.Words == null) return;
+
+                string rawText = string.Join(" ", searchObjectParams.Words);
+                if (string.IsNullOrEmpty(rawText)) return;
+                this.rawtext = rawText;
+
+                if (searchTerms == null || searchTerms.Length == 0) return;
+
+                //Normalize the terms once up front: lower case and without wildcard characters.
+                //Terms that end up empty (a term that was nothing but wildcards) are dropped,
+                //an empty term "matches" at every position and can't be scored or stepped over sensibly
+                List<string> usableTerms = new List<string>(searchTerms.Length);
+                foreach (string term in searchTerms)
+                {
+                    if (term == null) continue;
+                    string cleaned = term.ToLower(System.Globalization.CultureInfo.CurrentCulture).Replace("%", "");
+                    if (cleaned.Length > 0)
+                        usableTerms.Add(cleaned);
+                }
+                if (usableTerms.Count == 0) return;
+                this.searchTerms = usableTerms.ToArray();
+
+                ranking = score(0, this.rawtext.Length);
+                if (ranking > extractionThresholdRank)
+                    DoExtract();
+            }
+            #endregion
+
+            #region Calculate score
+            /// <summary>
+            /// Give a percentage score for a given window of
+            /// text in the raw text string
+            /// 75% of the score is the percentage of all search terms found in the window
+            /// 25% of the score is the percentage of all characters in the search window that are search term characters
+            /// </summary>
+            /// <param name="nStartPos">Start of the window, clamped to the start of the text</param>
+            /// <param name="nEndPos">End of the window (exclusive), clamped to the end of the text</param>
+            /// <returns>Float value of zero to one hundred</returns>
+            private float score(int nStartPos, int nEndPos)
+            {
+                if (nStartPos < 0) nStartPos = 0;
+                if (nEndPos > this.rawtext.Length) nEndPos = this.rawtext.Length;
+
+                //an empty window can't contain anything
+                if (nStartPos >= nEndPos) return 0;
+
+                string SearchString = this.rawtext.Substring(nStartPos, nEndPos - nStartPos).ToLower(System.Globalization.CultureInfo.CurrentCulture);
+
+                int nTermCharsInWindow = 0;//how many of the characters in the window are matching term characters
+                int nMatches = 0;//how many of the terms appear at least once in the window
+
+                foreach (string lTerm in searchTerms)
+                {
+                    //ordinal: both sides are already lower cased and this is a plain character match
+                    int nLocation = SearchString.IndexOf(lTerm, System.StringComparison.Ordinal);
+                    if (nLocation == -1) continue;
+
+                    nMatches++;
+                    while (nLocation != -1)
+                    {
+                        nTermCharsInWindow += lTerm.Length;
+                        //continue after the end of this hit so overlapping hits are not counted twice
+                        nLocation = SearchString.IndexOf(lTerm, nLocation + lTerm.Length, System.StringComparison.Ordinal);
+                    }
+                }
+
+                //If no matches then rank is automatically zero
+                if (nMatches == 0)
+                {
+                    return 0;
+                }
+
+                //Rank is calculated on a weighted scale
+                //75% for matching all search terms
+                //25% for the quantity of search terms versus other text found
+                float fTermsFoundPct = 75 * ((float)nMatches / (float)searchTerms.Length);
+
+                //different terms can cover the same characters so cap the coverage at the whole window
+                float fCoverage = System.Math.Min(1f, (float)nTermCharsInWindow / (float)SearchString.Length);
+                float fTermsVsTextPct = 25 * fCoverage;
+
+                return fTermsFoundPct + fTermsVsTextPct;
+
+            }
+            #endregion
+
+            #region Extract best excerpt
+            /// <summary>
+            /// Extract the best scoring excerpt fragment of 
+            /// raw text
+            /// </summary>
+            private void DoExtract()
+            {
+                int windowLength = this.maximumCharactersToExtract;
+
+                //If the whole thing fits in a single window
+                //just save time and return the whole thing
+                if (this.rawtext.Length <= windowLength)
+                {
+                    this.extract = this.rawtext;
+                    return;
+                }
+
+                string BestWindow = "";
+                float BestScore = 0;
+                int BestWindowStartPos = 0;
+
+                //Get the shortest search term length so 
+                //we can save time iterating over the window below
+                int step = int.MaxValue;
+                foreach (string s in this.searchTerms)
+                {
+                    if (s.Length < step)
+                        step = s.Length;
+                }
+                //a step of zero would never advance
+                if (step < 1) step = 1;
+
+                //slide a window over the text and check it's score, the highest scoring window wins
+                //move the length of the shortest search term so as to ensure we won't
+                //miss it, but faster than moving one character at a time.
+                //The final step is shortened so the window flush with the end of the text is always scored
+                int lastStart = this.rawtext.Length - windowLength;
+                int z = 0;
+                while (true)
+                {
+                    float thisscore = score(z, z + windowLength);
+
+                    if (thisscore > BestScore)
+                    {
+                        BestScore = thisscore;
+                        BestWindow = this.rawtext.Substring(z, windowLength);
+                        //Best window to get if the future score is equal
+                        //I.E. put the terms in the center of the window if 
+                        //the score is equal
+                        BestWindowStartPos = z + (windowLength / 2);
+                    }
+                    else if (thisscore > 0 && thisscore == BestScore && z == BestWindowStartPos)
+                    {
+                        //It's equal to the best and we're positioned over
+                        //the best spot (terms in center) so capture that
+                        BestWindow = this.rawtext.Substring(z, windowLength);
+                    }
+
+                    if (z >= lastStart) break;
+                    z = (lastStart - z > step) ? z + step : lastStart;
+                }
+
+                //No window scored (a term longer than the window can't fit in one),
+                //show the start of the text rather than an empty excerpt
+                if (BestWindow.Length == 0)
+                    BestWindow = this.rawtext.Substring(0, windowLength);
+
+                if (this.flattenExtract)
+                    this.extract = "..." + BestWindow.Trim().Replace("\r", "").Replace("\n", "").Replace("\t", "") + "...";//case 1593 added tab character removal
+                else
+                    this.extract = "..." + BestWindow.Trim() + "...";
+
+
+            }
+
+
+            //========================================================================
+
+            #endregion
+
+        }
+        #endregion Xtract
+
+
+        #endregion
+
+        #region ProcessKeywords into Database
+
+        //Class to hold process input parameters
+        //also used for getting summary search results
+        /// <summary>
+        /// Describes an object and the text collected from it.
+        /// Used as input when processing keywords and also for getting summary search results
+        /// </summary>
+        public class SearchIndexProcessObjectParameters
+        {
+            public long TranslationId { get; set; }
+            public long ObjectId { get; set; }
+            public SockType SockType { get; set; }
+
+            /// <summary>Text fragments collected so far, starts empty</summary>
+            public List<string> Words { get; set; } = new List<string>();
+
+
+            public SearchIndexProcessObjectParameters(long translationId, long objectID, SockType aType)
+            {
+                TranslationId = translationId;
+                ObjectId = objectID;
+                SockType = aType;
+            }
+
+            //format used for getsummmary by biz objects, everything zeroed
+            public SearchIndexProcessObjectParameters() : this(0, 0, 0)
+            {
+            }
+
+            /// <summary>Add a text fragment, null or blank text is skipped</summary>
+            public SearchIndexProcessObjectParameters AddText(string s)
+            {
+                if (string.IsNullOrWhiteSpace(s))
+                    return this;
+
+                Words.Add(s);
+                return this;
+            }
+
+
+            /// <summary>Add a number in its text form</summary>
+            public SearchIndexProcessObjectParameters AddText(long l)
+            {
+                string numberText = l.ToString();
+                Words.Add(numberText);
+                return this;
+            }
+
+            // public SearchIndexProcessObjectParameters AddText(decimal? d)
+            // {
+            //     if (d != null)
+            //         Words.Add(d.ToString());
+            //     return this;
+            // }
+
+            /// <summary>Add every non-blank entry of a list, a null list is ignored</summary>
+            public SearchIndexProcessObjectParameters AddText(List<string> lWords)
+            {
+                if (lWords != null)
+                    Words.AddRange(lWords.Where(word => !string.IsNullOrWhiteSpace(word)));
+
+                return this;
+            }
+
+            /// <summary>Add the text found in a custom fields json fragment</summary>
+            public SearchIndexProcessObjectParameters AddCustomFields(string jsonString)
+            {
+                //Extract the text from custom fields json fragment as an array of strings and add it here
+                return AddText(JsonUtil.GetCustomFieldsAsStringArrayForSearchIndexing(jsonString));
+            }
+        }
+
+        /// <summary>
+        /// Process the keywords of a newly created object
+        /// </summary>
+        public static async Task ProcessNewObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
+            => await ProcessKeywordsAsync(searchIndexObjectParameters, newRecord: true);
+
+        /// <summary>
+        /// Process the keywords of an object that already existed and has been updated
+        /// </summary>
+        public static async Task ProcessUpdatedObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
+            => await ProcessKeywordsAsync(searchIndexObjectParameters, newRecord: false);
+
+        /// <summary>
+        /// Delete the search key rows of an object directly at the database,
+        /// there is nothing to save afterwards as the command is executed immediately
+        /// </summary>
+        public static async Task ProcessDeletedObjectKeywordsAsync(long objectID, SockType aType, AyContext ct)
+        {
+            //Be careful in future: the interpolated values must stay numeric. If ToString is put on them
+            //the npgsql driver will assume a string and quote it, triggering an error that a string can't be compared to an int
+            //NOTE(review): the column is named aType here but sockType in the search query of DoSearchAsync - confirm against the schema
+            int typeValue = (int)aType;
+            await ct.Database.ExecuteSqlInterpolatedAsync($"delete from asearchkey where objectid={objectID} and aType={typeValue}");
+        }
+
+
+        /// <summary>
+        /// Break the text of an object into keywords and pass them to the
+        /// aydosearchindex stored procedure. Does nothing when there are no keywords
+        /// </summary>
+        /// <param name="p">Object identity and the text collected from it</param>
+        /// <param name="newRecord">True for a newly created object; the procedure receives the inverse as its last argument</param>
+        private static async Task ProcessKeywordsAsync(SearchIndexProcessObjectParameters p, bool newRecord)
+        {
+            // #if (DEBUG)
+            //             if (!p.SockType.HasAttribute(typeof(CoreBizObjectAttribute)))
+            //                 throw new System.NotSupportedException($"Search::ProcessKeywords - Invalid type presented {p.SockType}");
+            // #endif             
+            List<string> keywords = await BreakAsync(p.TranslationId, p.Words);
+
+            if (keywords.Count > 0)
+            {
+                //call stored procedure to do the work right at the server (fastest method by far)
+                using (AyContext ct = ServiceProviderProvider.DBContext)
+                {
+                    await ct.Database.ExecuteSqlInterpolatedAsync($"call aydosearchindex({keywords},{p.ObjectId},{p.SockType},{!newRecord})");
+                }
+            }
+        }//eoc
+        #endregion
+
+        #region Breaker
+
+        /// <summary>
+        /// Kind of token the word breaker handled last
+        /// </summary>
+        public enum TokenTypes
+        {
+            Nothing,
+            Separator,
+            CJK,
+            Latin
+        }
+
+        /// <summary>
+        /// Break a list of text strings into keywords
+        /// suitable for passing to a 
+        /// stored procedure or other function.
+        /// Wildcard characters are not kept.
+        /// 
+        /// Use Translation setting CJKIndex=true to handle Chinese, Japanese, Korean etc
+        /// (languages with no easily identifiable word boundaries as in english)
+        /// </summary>       
+        /// <returns>List of strings</returns>
+        internal static async Task<List<string>> BreakAsync(long translationId, List<string> textStrings)
+            => await BreakCoreAsync(translationId, KeepWildCards: false, textStrings: textStrings);
+
+        /// <summary>
+        /// Break a single text string into keywords, wildcard characters are not kept
+        /// </summary>        
+        /// <returns>List of strings</returns>
+        internal static async Task<List<string>> BreakAsync(long translationId, string textString)
+        {
+            var singleItemList = new List<string>(1) { textString };
+            return await BreakCoreAsync(translationId, false, singleItemList);
+        }
+
+        /// <summary>
+        /// Break a users search phrase into terms,
+        /// preserving the wild cards they entered
+        /// </summary>        
+        /// <returns>List of strings</returns>
+        internal static async Task<List<string>> BreakSearchPhraseAsync(long translationId, string searchPhrase)
+        {
+            var phraseAsList = new List<string> { searchPhrase };
+
+            //note: we want stopwords if this is a search phrase break because they might type "some" wanting awesome but some is a stopword so..
+            return await BreakCoreAsync(translationId, KeepWildCards: true, textStrings: phraseAsList, ignoreStopWords: true);
+        }
+
+
+
+        internal static async Task<List<string>> BreakCoreAsync(long translationId, bool KeepWildCards, List<string> textStrings, bool ignoreStopWords = false)
+        {
+            //For stopwords and CJKIndex flag value      
+            var translationWordBreakData = await SearchTranslationWordBreakDataCache.GetWordBreakData(translationId);
+
+            int MAXWORDLENGTH = 255;
+            int MINWORDLENGTH = 2;//A word isn't a word unless it's got at least two characters in it
+            StringBuilder sbResults = new StringBuilder();
+            //List to temporarily hold parsed words
+            //used to easily ensure unique words only
+            List<string> tempParsedWords = new List<string>();
+
+            StringBuilder sb = new StringBuilder();
+            StringBuilder sbWord = new StringBuilder();
+            List<string> ReturnList = new List<string>();
+
+
+            //Loop through each of the passed in strings
+            foreach (string s in textStrings)
+            {
+                if (s == null || s == "") continue;
+                //get all the characters in a unicode compliant manner...
+                TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
+                //start at the top
+                t.Reset();
+
+                TokenTypes LastToken = TokenTypes.Nothing;
+
+                //Used by CJK
+                bool BasicLatinBlock = true;
+
+                //Process each "character" (text element,glyph whatever) in the 
+                //current string
+                while (t.MoveNext())
+                {
+                    //get it as a character
+                    char c = t.GetTextElement()[0];
+
+                    if (!translationWordBreakData.CJKIndex)
+                    {
+                        #region regular tokenizer
+
+                        //Is it a token we want to include?
+                        //Or a wildcard character
+                        if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
+                        {
+                            #region Include token
+                            //All latin text is converted to lower case
+                            c = char.ToLower(c, System.Globalization.CultureInfo.CurrentCulture);
+
+                            //Do we already have a word?
+                            if (sbWord.Length > 0)
+                            {
+                                //Maybe we need to flush this word into the word list
+                                //if we're over the word length limit 
+                                if (sbWord.Length >= MAXWORDLENGTH)
+                                {
+                                    //flush away...
+                                    if (!tempParsedWords.Contains(sbWord.ToString()))
+                                    {
+                                        tempParsedWords.Add(sbWord.ToString());
+                                    }
+                                    sbWord.Length = 0;
+                                    sbWord.Append(c);
+                                    LastToken = TokenTypes.Latin;
+                                    continue;
+
+                                }
+                            }
+
+                            //append character and go on to next one
+                            sbWord.Append(c);
+                            LastToken = TokenTypes.Latin;
+                            continue;
+                            #endregion
+                        }
+                        else
+                        {
+                            #region Word Boundary token
+                            LastToken = TokenTypes.Separator;
+                            if (sbWord.Length > 0)
+                            {
+                                //flush away...
+                                if (!tempParsedWords.Contains(sbWord.ToString()))
+                                {
+                                    tempParsedWords.Add(sbWord.ToString());
+                                }
+                                sbWord.Length = 0;
+                                continue;
+                            }
+
+                            #endregion
+                        }
+                        #endregion
+                    }
+                    else
+                    {
+                        #region CJK Tokenizer
+
+                        //Is it a basic latin character? (code point below 256, i.e. ASCII plus Latin-1)
+                        //see: http://www.unicode.org/charts/index.html
+                        //and here for a funky online viewer:
+                        //http://www.fileformat.info/info/unicode/block/index.htm
+                        //we need to know this so that regular english text
+                        //within cjk text gets properly indexed as whole words
+                        BasicLatinBlock = false;
+                        if ((int)c < 256) BasicLatinBlock = true;
+
+                        if (BasicLatinBlock)
+                        {
+                            //Is it a token we want to include?
+                            if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
+                            {
+                                #region Latin Include token
+                                //All latin text is converted to lower case
+                                c = char.ToLower(c, System.Globalization.CultureInfo.CurrentCulture);
+
+                                //Do we already have a word?
+                                if (sbWord.Length > 0)
+                                {
+                                    //Maybe we need to flush this word into the word list
+                                    //if we're over the word length limit or we are going from 
+                                    //CJK to latin
+                                    if (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH)
+                                    {
+                                        //flush away...
+                                        if (!tempParsedWords.Contains(sbWord.ToString()))
+                                        {
+                                            tempParsedWords.Add(sbWord.ToString());
+                                        }
+                                        sbWord.Length = 0;
+                                        sbWord.Append(c);
+                                        LastToken = TokenTypes.Latin;
+                                        continue;
+
+                                    }
+                                }
+
+                                //append character and go on to next one
+                                sbWord.Append(c);
+                                LastToken = TokenTypes.Latin;
+                                continue;
+                                #endregion
+                            }
+                            else
+                            {
+                                #region Latin Word Boundary token
+                                LastToken = TokenTypes.Separator;
+                                if (sbWord.Length > 0)
+                                {
+                                    //flush away...
+                                    if (!tempParsedWords.Contains(sbWord.ToString()))
+                                    {
+                                        tempParsedWords.Add(sbWord.ToString());
+                                    }
+                                    sbWord.Length = 0;
+
+                                    continue;
+
+                                }
+
+                                #endregion
+                            }
+
+                        }
+                        else//CJK character
+                        {
+                            if (char.IsLetter(c) || (KeepWildCards && c == '%'))
+                            {
+                                #region CJK Include token
+                                //Do we already have a word?
+                                if (sbWord.Length > 0)
+                                {
+                                    //Maybe we need to flush this word into the word list
+                                    //if we're over the word length limit or we are going from 
+                                    //latin TO CJK 
+                                    if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
+                                    {
+                                        //flush away...
+                                        if (!tempParsedWords.Contains(sbWord.ToString()))
+                                        {
+                                            tempParsedWords.Add(sbWord.ToString());
+                                        }
+                                        sbWord.Length = 0;
+                                        sbWord.Append(c);
+                                        LastToken = TokenTypes.CJK;
+                                        continue;
+
+                                    }
+
+                                    if (LastToken == TokenTypes.CJK)
+                                    {
+                                        //we're here because there is at least one character already stored
+                                        //and the last was CJK, so we need to append the current character
+                                        //and flush the resulting 2-character n-gram
+                                        sbWord.Append(c);
+                                        System.Diagnostics.Debug.Assert(sbWord.Length == 2);
+                                        //flush away...
+                                        if (!tempParsedWords.Contains(sbWord.ToString()))
+                                        {
+                                            tempParsedWords.Add(sbWord.ToString());
+                                        }
+                                        sbWord.Length = 0;
+                                        sbWord.Append(c);
+                                        LastToken = TokenTypes.CJK;
+                                        continue;
+
+                                    }
+                                }
+
+                                //append character and go on to next one
+                                sbWord.Append(c);
+                                LastToken = TokenTypes.CJK;
+                                continue;
+                                #endregion
+
+
+                            }
+                            else
+                            {
+                                #region CJK Word Boundary token
+                                LastToken = TokenTypes.Separator;
+                                if (sbWord.Length > 0)
+                                {
+                                    //flush away...
+                                    if (!tempParsedWords.Contains(sbWord.ToString()))
+                                    {
+                                        tempParsedWords.Add(sbWord.ToString());
+                                    }
+                                    sbWord.Length = 0;
+                                    continue;
+                                }
+
+                                #endregion
+                            }
+
+                        }
+
+                        #endregion
+                    }
+                }
+
+                //Flush out the last word
+                if (sbWord.Length > 0)
+                {
+                    //flush away...
+                    if (!tempParsedWords.Contains(sbWord.ToString()))
+                    {
+                        tempParsedWords.Add(sbWord.ToString());
+                    }
+                    sbWord.Length = 0;
+                }
+            }
+
+
+            //bail early if there is nothing indexed
+            if (tempParsedWords.Count == 0) return ReturnList;
+
+
+            //Make a return string array
+            //from the word list
+            foreach (string s in tempParsedWords)
+            {
+                //Filter out short words if we are breaking for indexing
+                //but keep them if they are part of a wildcard search phrase
+                if (s.Length >= MINWORDLENGTH || (KeepWildCards && s.Contains('%')))
+                {
+                    if (ignoreStopWords)
+                    {
+                        //breaking of search phrase
+                        ReturnList.Add(s);
+                    }
+                    else
+                    {
+                        //Add only non stopwords - regular breaking of object for dictionary entry
+                        if (!translationWordBreakData.StopWords.Contains(s))
+                        {
+                            ReturnList.Add(s);
+                        }
+                    }
+                }
+            }
+
+            //sometimes all the results are stop words so you end up here with nothing
+            return ReturnList;
+
+        }
+
+        #endregion
+
+    }//eoc
+
+}//eons