From 02d4084826ed577c578300396ef5a7b474fa75f1 Mon Sep 17 00:00:00 2001
From: John Cardinal <support@ayanova.com>
Date: Tue, 18 Sep 2018 23:15:02 +0000
Subject: [PATCH]

---
 server/AyaNova/biz/LocaleBiz.cs |  10 +
 server/AyaNova/biz/Search.cs    | 417 +++++++++++++++++++++++++++++++-
 2 files changed, 422 insertions(+), 5 deletions(-)
diff --git a/server/AyaNova/biz/LocaleBiz.cs b/server/AyaNova/biz/LocaleBiz.cs
index 9a385791..e8b5fcaf 100644
--- a/server/AyaNova/biz/LocaleBiz.cs
+++ b/server/AyaNova/biz/LocaleBiz.cs
@@ -164,6 +164,16 @@ namespace AyaNova.Biz
         }
 
 
+        /// <summary>Look up the CjkIndex flag stored on the Locale row whose Id equals localeId.</summary>
+        /// <remarks>Uses ServiceProviderProvider.DBContext when ct is null; SingleOrDefaultAsync supplies the default value when no row matches.</remarks>
+        internal static async Task<bool> GetCJKIndex(long localeId, AyContext ct=null)
+        {
+            var context = ct ?? ServiceProviderProvider.DBContext;
+            var flagQuery = context.Locale.Where(locale => locale.Id == localeId).Select(locale => locale.CjkIndex);
+            return await flagQuery.SingleOrDefaultAsync();
+        }
+
+
         /// <summary>
         /// Get the value of the key provided in the default locale chosen
         /// </summary>
diff --git a/server/AyaNova/biz/Search.cs b/server/AyaNova/biz/Search.cs
index 39284ffa..34c601e1 100644
--- a/server/AyaNova/biz/Search.cs
+++ b/server/AyaNova/biz/Search.cs
@@ -13,12 +13,18 @@ namespace AyaNova.Biz
     //This class handles word breaking, processing keywords and searching for results
     public static class Search
     {
+
+        //Initial keyword indexing consists of 
+        //WordBreaker - break down into words
+        //ProcessKeywords into database
+
+
         /// <summary>
         /// Process the keywords into the dictionary
         /// </summary>    
         public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name)
         {
-            var StopWords = GetLocaleSearchData(ct, localeId);
+            var LocaleSearchData = GetLocaleSearchData(ct, localeId);
 
             //Get CJK index bool flag.
             //TODO: should this be a property of the locale or a global setting as before??
@@ -53,8 +59,9 @@ namespace AyaNova.Biz
 
 
         //Get the current stopwords for the user's locale
-        private static List<string> GetLocaleSearchData(AyContext ct, long localeId)
+        private static LocaleSearchData GetLocaleSearchData(AyContext ct, long localeId)
         {
+            LocaleSearchData LSD=new LocaleSearchData();
             //Get stopwords
             //Validate locale id, if not right then use default instead 
             var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
@@ -67,21 +74,421 @@ namespace AyaNova.Biz
             Param.Keys.Add("StopWords6");
             Param.Keys.Add("StopWords7");
             var Stops = LocaleBiz.GetSubsetStatic(Param).Result;
-            List<string> StopWords = new List<string>();
+            
             foreach (KeyValuePair<string, string> kvp in Stops)
             {
                 //Each stopwords locale key is a space delimited list of words and in the case of an empty local string (i.e. StopWords7) it's value is a single question mark
                 if (kvp.Value != "?")
                 {
-                    StopWords.AddRange(kvp.Value.Split(" "));
+                   LSD.StopWords.AddRange(kvp.Value.Split(" "));
                 }
             }
-            return StopWords;
+
+            LSD.CJKIndex = LocaleBiz.GetCJKIndex(localeId,ct).Result;
+            return LSD;
         }
 
 
 
 
+        #region Breaker
+
+        //Kind of the most recent character seen by the word breaker (see LastToken in BreakCore)
+        public enum TokenTypes { Nothing = 0, Separator = 1, CJK = 2, Latin = 3 }
+
+        /// <summary>
+        /// Break zero or more strings into keywords for indexing.
+        /// Simply forwards to BreakCore with wildcard handling switched off,
+        /// so a '%' character ends a word like any other non letter/digit character.
+        /// </summary>
+        /// <remarks>
+        /// Per the original notes the result is a single string of unique,
+        /// lowercase, comma delimited keywords suitable for passing to a
+        /// stored procedure or other function, and the CJKIndex setting is
+        /// what enables handling of Chinese, Japanese, Korean etc. (languages
+        /// with no easily identifiable word boundaries as in english).
+        /// </remarks>
+        /// <param name="text">An array of 0 to * strings of text</param>
+        /// <returns>Whatever BreakCore returns for the same strings</returns>
+        internal static string Break(params string[] text)
+        {
+            return BreakCore(KeepWildCards: false, text: text);
+        }
+
+        /// <summary>
+        /// Break a user's search phrase into keywords while preserving wildcards:
+        /// forwards to BreakCore with KeepWildCards on, so '%' stays inside words.
+        /// </summary>
+        /// <param name="text">The search phrase text, as 0 to * strings</param>
+        /// <returns>Whatever BreakCore returns for the same strings</returns>
+        internal static string BreakSearchPhrase(params string[] text)
+        {
+            return BreakCore(KeepWildCards: true, text: text);
+        }
+
+        /// <summary>Cached noise words; BreakCore fills this on first use (while null) and omits these words from its output.</summary>
+        /// <remarks>
+        /// Intended to be reset on login or after localized text is edited; nothing in BreakCore itself resets it.
+        /// </remarks>
+        public static List<string> StopList;
+
+        internal static string BreakCore(bool KeepWildCards, params string[] text)
+        {
+            //Shared worker behind Break and BreakSearchPhrase: splits each string in text into unique words and returns
+            //them with stop words removed. When KeepWildCards is true a '%' is kept as part of a word instead of ending it.
+            //First use only: build StopList from localized text keys StopWords1..StopWords7, skipping "?" and duplicates.
+            #region stopwords
+            if (StopList == null)
+            {
+                StopList = new List<string>();
+                for (int stopkeys = 1; stopkeys < 8; stopkeys++)
+                {
+                    MatchCollection mc = rxAllWords.Matches(LocalizedTextTable.GetLocalizedTextDirect("StopWords" + stopkeys.ToString()));
+                    foreach (Match m in mc)
+                    {
+                        if (!string.IsNullOrEmpty(m.Value) && m.Value != "?" && !StopList.Contains(m.Value))
+                            StopList.Add(m.Value);
+
+                    }
+                }
+            }
+            #endregion
+            //NOTE(review): GlobalSettings, AsXML, rxAllWords and LocalizedTextTable are not declared in the visible code - confirm they resolve.
+            bool CJK = GlobalSettings.CJKIndex;
+            int MAXWORDLENGTH = 255;
+
+            StringBuilder sbResults = new StringBuilder();
+
+            //Hashtable to temporarily hold parsed words
+            //used to easily ensure unique words only
+            Hashtable ht = new Hashtable();
+
+            //Stuff required for creating xml fragment on the fly in memory (string)
+            StringBuilder sb = new StringBuilder();
+            StringBuilder sbWord = new StringBuilder();
+            System.IO.StringWriter sr = new System.IO.StringWriter(sb);
+            System.Xml.XmlTextWriter w = new System.Xml.XmlTextWriter(sr);
+
+            w.Formatting = System.Xml.Formatting.Indented;
+            w.WriteStartElement("Items");
+
+
+            //Loop through each of the passed in strings
+            foreach (string s in text)
+            {
+                if (s == null || s == "") continue;
+                //get all the characters in a unicode compliant manner...
+                TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
+                //start at the top
+                t.Reset();
+
+                TokenTypes LastToken = TokenTypes.Nothing;
+
+                //Used by CJK
+                bool BasicLatinBlock = true;
+
+                //Process each "character" (text element,glyph whatever) in the 
+                //current string
+                while (t.MoveNext())
+                {
+                    //only the first UTF-16 char of the text element is examined; any further chars in the element are ignored
+                    char c = t.GetTextElement()[0];
+
+                    if (!CJK)
+                    {
+                        #region regular tokenizer
+
+                        //Is it a token we want to include?
+                        //Or a wildcard character
+                        if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
+                        {
+                            #region Include token
+                            //All latin text is converted to lower case
+                            c = char.ToLower(c);
+
+                            //Do we already have a word?
+                            if (sbWord.Length > 0)
+                            {
+                                //Maybe we need to flush this word into the word list
+                                //if we're over the word length limit 
+                                if (sbWord.Length >= MAXWORDLENGTH)
+                                {
+                                    //flush away...
+                                    if (!ht.ContainsKey(sbWord.ToString()))
+                                    {
+                                        ht[sbWord.ToString()] = 1;
+                                        //sbTest.Append(sbWord.ToString()+"\r\n");
+                                    }
+                                    sbWord.Length = 0;
+                                    sbWord.Append(c);
+                                    LastToken = TokenTypes.Latin;
+                                    continue;
+
+                                }
+                            }
+
+                            //append character and go on to next one
+                            sbWord.Append(c);
+                            LastToken = TokenTypes.Latin;
+                            continue;
+                            #endregion
+                        }
+                        else
+                        {
+                            #region Word Boundary token
+                            LastToken = TokenTypes.Separator;
+                            if (sbWord.Length > 0)
+                            {
+                                //flush away...
+                                if (!ht.ContainsKey(sbWord.ToString()))
+                                {
+                                    ht[sbWord.ToString()] = 1;
+                                    //sbTest.Append(sbWord.ToString()+"\r\n");
+                                }
+                                sbWord.Length = 0;
+
+                                continue;
+
+                            }
+
+                            #endregion
+                        }
+                        #endregion
+                    }
+                    else
+                    {
+                        #region CJK Tokenizer
+
+                        //Is it a basic latin character? (the test below is char code < 256, so wider than plain ascii)
+                        //see: http://www.unicode.org/charts/index.html
+                        //and here for a funky online viewer:
+                        //http://www.fileformat.info/info/unicode/block/index.htm
+                        //we need to know this so that regular english text
+                        //within cjk text gets properly indexed as whole words
+                        BasicLatinBlock = false;
+                        if ((int)c < 256) BasicLatinBlock = true;
+
+                        if (BasicLatinBlock)
+                        {
+                            //Is it a token we want to include?
+                            if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
+                            {
+                                #region Latin Include token
+                                //All latin text is converted to lower case
+                                c = char.ToLower(c);
+
+                                //Do we already have a word?
+                                if (sbWord.Length > 0)
+                                {
+                                    //Maybe we need to flush this word into the word list
+                                    //if we're over the word length limit or we are going from 
+                                    //CJK to latin
+                                    if (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH)
+                                    {
+                                        //flush away...
+                                        if (!ht.ContainsKey(sbWord.ToString()))
+                                        {
+                                            ht[sbWord.ToString()] = 1;
+                                            //sbTest.Append(sbWord.ToString()+"\r\n");
+                                        }
+                                        sbWord.Length = 0;
+                                        sbWord.Append(c);
+                                        LastToken = TokenTypes.Latin;
+                                        continue;
+
+                                    }
+                                }
+
+                                //append character and go on to next one
+                                sbWord.Append(c);
+                                LastToken = TokenTypes.Latin;
+                                continue;
+                                #endregion
+                            }
+                            else
+                            {
+                                #region Latin Word Boundary token
+                                LastToken = TokenTypes.Separator;
+                                if (sbWord.Length > 0)
+                                {
+                                    //flush away...
+                                    if (!ht.ContainsKey(sbWord.ToString()))
+                                    {
+                                        ht[sbWord.ToString()] = 1;
+                                        //sbTest.Append(sbWord.ToString()+"\r\n");
+                                    }
+                                    sbWord.Length = 0;
+
+                                    continue;
+
+                                }
+
+                                #endregion
+                            }
+
+                        }
+                        else//CJK character
+                        {
+                            if (char.IsLetter(c) || (KeepWildCards && c == '%'))
+                            {
+                                #region CJK Include token
+                                //Do we already have a word?
+                                if (sbWord.Length > 0)
+                                {
+                                    //Maybe we need to flush this word into the word list
+                                    //if we're over the word length limit or we are going from 
+                                    //latin TO CJK 
+                                    if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
+                                    {
+                                        //flush away...
+                                        if (!ht.ContainsKey(sbWord.ToString()))
+                                        {
+                                            ht[sbWord.ToString()] = 1;
+                                            //sbTest.Append(sbWord.ToString()+"\r\n");
+                                        }
+                                        sbWord.Length = 0;
+                                        sbWord.Append(c);
+                                        LastToken = TokenTypes.CJK;
+                                        continue;
+
+                                    }
+
+                                    if (LastToken == TokenTypes.CJK)
+                                    {
+                                        //we're here because there is more than zero characters already stored
+                                        //and the last was CJK so we need append current character
+                                        //and flush the resultant 2 character n-gram 
+                                        sbWord.Append(c);
+                                        System.Diagnostics.Debug.Assert(sbWord.Length == 2);
+                                        if (!ht.ContainsKey(sbWord.ToString()))
+                                        {
+                                            ht[sbWord.ToString()] = 1;
+                                            //sbTest.Append(sbWord.ToString()+"\r\n");
+                                        }
+                                        sbWord.Length = 0;
+                                        sbWord.Append(c);
+                                        LastToken = TokenTypes.CJK;
+                                        continue;
+
+                                    }
+                                }
+
+                                //append character and go on to next one
+                                sbWord.Append(c);
+                                LastToken = TokenTypes.CJK;
+                                continue;
+                                #endregion
+
+
+                            }
+                            else
+                            {
+                                #region CJK Word Boundary token
+                                LastToken = TokenTypes.Separator;
+                                if (sbWord.Length > 0)
+                                {
+                                    //flush away...
+                                    if (!ht.ContainsKey(sbWord.ToString()))
+                                    {
+                                        ht[sbWord.ToString()] = 1;
+                                        //sbTest.Append(sbWord.ToString()+"\r\n");
+                                    }
+                                    sbWord.Length = 0;
+
+                                    continue;
+
+                                }
+
+                                #endregion
+                            }
+
+
+                        }
+
+
+
+
+
+
+                        #endregion
+                    }
+                }
+
+                //Flush out the last word
+                if (sbWord.Length > 0)
+                {
+                    //flush away...
+                    if (!ht.ContainsKey(sbWord.ToString()))
+                    {
+                        ht[sbWord.ToString()] = 1;
+                        ////sbTest.Append(sbWord.ToString()+"\r\n");
+                    }
+                    sbWord.Length = 0;
+                }
+            }
+
+            //NOTE: the early return below happens before the xml writer / string writer created above are closed
+            //bail early if there is nothing indexed
+            if (ht.Count == 0) return "";
+            //Output is either an xml fragment (<Items> holding one <i w="word"/> per word) or a comma delimited string
+            if (AsXML)
+            {
+                //Make a return xml fragment
+                //from the word list
+                foreach (DictionaryEntry d in ht)
+                {
+                    //Add only non stopwords
+                    if (!StopList.Contains(d.Key.ToString()))
+                    {
+                        w.WriteStartElement("i");
+                        w.WriteAttributeString("w", d.Key.ToString());
+                        w.WriteEndElement();
+                    }
+                }
+
+                w.WriteEndElement();
+                sr.Close();
+                return sr.ToString();
+            }
+            else
+            {
+                //Make a return string array
+                //from the word list
+                foreach (DictionaryEntry d in ht)
+                {
+                    //Add only non stopwords
+                    if (!StopList.Contains(d.Key.ToString()))
+                    {
+                        sbResults.Append(d.Key.ToString());
+                        sbResults.Append(",");
+
+                    }
+                }
+
+                //sometimes all the results are stop words so you end up 
+                //here with nothing in sbResults.  Removed some code that was
+                //causing a crash here
+                return sbResults.ToString().TrimEnd(',');
+
+            }
+            //return sbTest.ToString();
+
+        }
+
+        #endregion
+
+
+        /// <summary>
+        /// Per-locale search settings: the CJK indexing flag plus the stop word list.
+        /// A new instance starts with CJKIndex false and an empty StopWords list.
+        /// </summary>
+        public class LocaleSearchData
+        {
+            public bool CJKIndex { get; set; } = false;
+            public List<string> StopWords { get; set; } = new List<string>();
+        }
+
+
     }//eoc
 
 }//eons
\ No newline at end of file