2018-09-18 23:15:02 +00:00
parent 6771c9f2a1
commit 02d4084826
2 changed files with 422 additions and 5 deletions
--- a/server/AyaNova/biz/LocaleBiz.cs
+++ b/server/AyaNova/biz/LocaleBiz.cs
@@ -164,6 +164,16 @@ namespace AyaNova.Biz
        }
         //Get the CJKIndex value for the locale specified
        /// <summary>
        /// Looks up the CjkIndex flag stored on the locale with the given id.
        /// </summary>
        /// <param name="localeId">Id of the locale row to read.</param>
        /// <param name="ct">Database context to query; when null, ServiceProviderProvider.DBContext is used instead.</param>
        /// <returns>The CjkIndex value of the matching locale, or false (the default) when no locale has that id.</returns>
        internal static async Task<bool> GetCJKIndex(long localeId, AyContext ct=null)
        {
            //Fall back to the provider's context only when the caller did not supply one
            var context = ct ?? ServiceProviderProvider.DBContext;

            return await context.Locale
                .Where(locale => locale.Id == localeId)
                .Select(locale => locale.CjkIndex)
                .SingleOrDefaultAsync();
        }
        /// <summary>
        /// Get the value of the key provided in the default locale chosen
        /// </summary>
--- a/server/AyaNova/biz/Search.cs
+++ b/server/AyaNova/biz/Search.cs
@@ -13,12 +13,18 @@ namespace AyaNova.Biz
    //This class handles word breaking, processing keywords and searching for results
    public static class Search
    {
        //Initial keyword indexing consists of 
        //WordBreaker - break down into words
        //ProcessKeywords into database
        /// <summary>
        /// Process the keywords into the dictionary
        /// </summary>    
        public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name)
        {
-            var StopWords = GetLocaleSearchData(ct, localeId);
+            var LocaleSearchData = GetLocaleSearchData(ct, localeId);
            //Get CJK index bool flag.
            //TODO: should this be a property of the locale or a global setting as before??
@@ -53,8 +59,9 @@ namespace AyaNova.Biz
        //Get the current stopwords for the user's locale
-        private static List<string> GetLocaleSearchData(AyContext ct, long localeId)
+        private static LocaleSearchData GetLocaleSearchData(AyContext ct, long localeId)
        {
            LocaleSearchData LSD=new LocaleSearchData();
            //Get stopwords
            //Validate locale id, if not right then use default instead 
            var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
@@ -67,21 +74,421 @@ namespace AyaNova.Biz
            Param.Keys.Add("StopWords6");
            Param.Keys.Add("StopWords7");
            var Stops = LocaleBiz.GetSubsetStatic(Param).Result;
-            List<string> StopWords = new List<string>();
+            
            foreach (KeyValuePair<string, string> kvp in Stops)
            {
                //Each stopwords locale key is a space delimited list of words; for an empty locale string (e.g. StopWords7) its value is a single question mark
                if (kvp.Value != "?")
                {
-                    StopWords.AddRange(kvp.Value.Split(" "));
+                   LSD.StopWords.AddRange(kvp.Value.Split(" "));
                }
            }
-            return StopWords;
+
            LSD.CJKIndex = LocaleBiz.GetCJKIndex(localeId,ct).Result;
            return LSD;
        }
        #region Breaker
        /// <summary>
        /// Classification of the character most recently handled by the word breaker.
        /// </summary>
        public enum TokenTypes
        {
            /// <summary>Starting state for each input string, before any character is handled.</summary>
            Nothing,

            /// <summary>The last character was not part of a word (a word boundary).</summary>
            Separator,

            /// <summary>The last character was a letter outside the first 256 code points while CJK indexing was on.</summary>
            CJK,

            /// <summary>The last character was a letter, digit or kept wildcard handled by the whole-word path.</summary>
            Latin
        };
        /// <summary>
        /// Breaks the supplied text into keywords for indexing.
        /// 
        /// Forwards to BreakCore with wildcard preservation switched off, so
        /// a '%' in the text is treated like any other separator.
        /// The result is a single string of unique, lowercase,
        /// comma delimited keywords suitable for passing to a
        /// stored procedure or other function.
        /// 
        /// CJK indexing (Chinese, Japanese, Korean etc - languages with no
        /// easily identifiable word boundaries as in english) is decided
        /// inside BreakCore.
        /// </summary>
        /// 
        /// <param name="text">An array of 0 to * strings of text</param>
        /// <returns>Whatever BreakCore returns for the same text with KeepWildCards set to false</returns>
        internal static string Break(params string[] text) => BreakCore(false, text);
        /// <summary>
        /// Breaks a user's search phrase into keywords while keeping any
        /// '%' wildcard characters the user typed as part of the words.
        /// Forwards to BreakCore with wildcard preservation switched on.
        /// </summary>
        /// <param name="text">An array of 0 to * strings making up the search phrase</param>
        /// <returns>Whatever BreakCore returns for the same text with KeepWildCards set to true</returns>
        internal static string BreakSearchPhrase(params string[] text) => BreakCore(true, text);
        /// <summary>
        /// Cached list of stop words (noise words) that BreakCore leaves out
        /// of its results.
        /// Starts out null; BreakCore fills it on first use from the
        /// localized text keys StopWords1 through StopWords7.
        /// NOTE(review): the original comment says this is reset upon login or
        /// editing of localized text - no such reset is visible here, confirm
        /// that something sets it back to null when the stop words change.
        /// </summary>
        public static System.Collections.Generic.List<string> StopList = null;
        /// <summary>
        /// Splits the supplied strings into unique, lower-cased keywords,
        /// drops stop words and returns what is left as a single string.
        /// </summary>
        /// <remarks>
        /// With GlobalSettings.CJKIndex off, a word is a run of letters and digits
        /// (plus '%' when <paramref name="KeepWildCards"/> is set) and every other
        /// character is a word boundary.
        /// With it on, characters below code point 256 are still handled that way,
        /// while letters at or above 256 are emitted as overlapping two character
        /// n-grams; the last character of such a run is also emitted on its own.
        /// A word that reaches 255 characters is cut and a new word is started.
        /// Only the first UTF-16 code unit of each text element is examined.
        /// NOTE(review): AsXML, rxAllWords, GlobalSettings and LocalizedTextTable are
        /// not declared in the part of the file visible here - confirm they resolve.
        /// </remarks>
        /// <param name="KeepWildCards">True to treat '%' as part of a word rather than as a boundary.</param>
        /// <param name="text">Zero or more strings to break up. Null or empty entries are skipped and a null array is treated as empty.</param>
        /// <returns>
        /// An empty string when no word was found. Otherwise the non stop words as a
        /// comma delimited string with no trailing comma (possibly empty when every
        /// word was a stop word), or an "Items" xml fragment holding one "i" element
        /// per word when AsXML is set.
        /// </returns>
        internal static string BreakCore(bool KeepWildCards, params string[] text)
        {
            const int MaxWordLength = 255;

            #region stopwords
            //Build the shared stop word list on first use.
            //It is filled in a local and only published once complete so a
            //failure part way through does not leave a partial list cached.
            if (StopList == null)
            {
                List<string> stops = new List<string>();
                for (int stopkeys = 1; stopkeys < 8; stopkeys++)
                {
                    MatchCollection mc = rxAllWords.Matches(LocalizedTextTable.GetLocalizedTextDirect("StopWords" + stopkeys.ToString()));
                    foreach (Match m in mc)
                    {
                        //An empty localized string is stored as a single question mark
                        if (!string.IsNullOrEmpty(m.Value) && m.Value != "?" && !stops.Contains(m.Value))
                            stops.Add(m.Value);
                    }
                }
                StopList = stops;
            }
            #endregion

            //An explicitly passed null array has nothing to index
            if (text == null) return "";

            bool CJK = GlobalSettings.CJKIndex;

            //Hashtable to temporarily hold parsed words
            //used to easily ensure unique words only
            Hashtable ht = new Hashtable();
            StringBuilder sbWord = new StringBuilder();

            //Loop through each of the passed in strings
            foreach (string s in text)
            {
                if (string.IsNullOrEmpty(s)) continue;

                TokenTypes LastToken = TokenTypes.Nothing;

                //get all the characters in a unicode compliant manner...
                TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
                while (t.MoveNext())
                {
                    //get it as a character
                    char c = t.GetTextElement()[0];

                    //Without CJK indexing everything goes down the whole word path.
                    //With it, only the first 256 code points do, so that regular
                    //english text within cjk text gets indexed as whole words.
                    if (!CJK || (int)c < 256)
                    {
                        if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                        {
                            //Start a new word when going from CJK to latin or when
                            //the current word has hit the length limit
                            if (LastToken == TokenTypes.CJK || sbWord.Length >= MaxWordLength)
                                FlushWord(sbWord, ht);

                            //All latin text is converted to lower case
                            sbWord.Append(char.ToLower(c));
                            LastToken = TokenTypes.Latin;
                        }
                        else
                        {
                            //Word boundary
                            LastToken = TokenTypes.Separator;
                            FlushWord(sbWord, ht);
                        }
                    }
                    else
                    {
                        //CJK character. No wildcard test here: '%' is below 256
                        //so it can never reach this branch.
                        if (char.IsLetter(c))
                        {
                            if (sbWord.Length > 0)
                            {
                                if (LastToken == TokenTypes.Latin || sbWord.Length >= MaxWordLength)
                                {
                                    //going from latin TO CJK, or over the length limit
                                    FlushWord(sbWord, ht);
                                }
                                else if (LastToken == TokenTypes.CJK)
                                {
                                    //the previous CJK character is still pending so
                                    //append this one and store the 2 character n-gram
                                    sbWord.Append(c);
                                    System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                                    FlushWord(sbWord, ht);
                                }
                            }

                            //this character starts the next word / n-gram
                            sbWord.Append(c);
                            LastToken = TokenTypes.CJK;
                        }
                        else
                        {
                            //Word boundary
                            LastToken = TokenTypes.Separator;
                            FlushWord(sbWord, ht);
                        }
                    }
                }

                //Flush out the last word
                FlushWord(sbWord, ht);
            }

            //bail early if there is nothing indexed
            if (ht.Count == 0) return "";

            if (AsXML)
            {
                //Make a return xml fragment from the word list.
                //The writers are only created when needed and are always disposed.
                StringBuilder sb = new StringBuilder();
                using (System.IO.StringWriter sr = new System.IO.StringWriter(sb))
                using (System.Xml.XmlTextWriter w = new System.Xml.XmlTextWriter(sr))
                {
                    w.Formatting = System.Xml.Formatting.Indented;
                    w.WriteStartElement("Items");
                    foreach (DictionaryEntry d in ht)
                    {
                        string word = d.Key.ToString();
                        //Add only non stopwords
                        if (!StopList.Contains(word))
                        {
                            w.WriteStartElement("i");
                            w.WriteAttributeString("w", word);
                            w.WriteEndElement();
                        }
                    }
                    w.WriteEndElement();
                }
                return sb.ToString();
            }

            //Make a comma delimited return string from the word list
            List<string> keep = new List<string>();
            foreach (DictionaryEntry d in ht)
            {
                string word = d.Key.ToString();
                //Add only non stopwords
                if (!StopList.Contains(word))
                    keep.Add(word);
            }

            //sometimes all the results are stop words so you end up
            //here with nothing to join, which yields an empty string
            return string.Join(",", keep);
        }

        /// <summary>
        /// Stores the pending word (if any) in the unique word table and clears the buffer.
        /// </summary>
        /// <param name="word">Buffer holding the word built so far; emptied on return.</param>
        /// <param name="words">Table keyed by word; storing a word already present changes nothing.</param>
        private static void FlushWord(StringBuilder word, Hashtable words)
        {
            if (word.Length == 0) return;
            words[word.ToString()] = 1;
            word.Length = 0;
        }
        #endregion
        /// <summary>
        /// Search related settings gathered for one locale:
        /// its CJK indexing flag and its stop words.
        /// </summary>
        public class LocaleSearchData
        {
            /// <summary>CJK indexing flag for the locale; false on a new instance.</summary>
            public bool CJKIndex { get; set; } = false;

            /// <summary>Stop words for the locale; an empty list on a new instance.</summary>
            public List<string> StopWords { get; set; } = new List<string>();
        }
    }//eoc
 }//eons