using System;
using System.Globalization;
using System.Text;
using System.Collections.Generic;
using System.IO;
using Newtonsoft.Json.Linq;
using Microsoft.Extensions.Logging;
using AyaNova.Util;
using AyaNova.Models;

namespace AyaNova.Biz
{
    /// <summary>
    /// Word breaking, keyword processing and searching.
    /// Initial keyword indexing consists of two steps: the word breaker
    /// (<see cref="Break"/> / <see cref="BreakSearchPhrase"/>) splits text into
    /// keywords, and <see cref="ProcessKeywords"/> is intended to store them.
    /// </summary>
    public static class Search
    {
        /// <summary>
        /// Longest keyword emitted by the breaker; a longer run of characters is
        /// split into consecutive chunks of this length.
        /// </summary>
        private const int MaxWordLength = 255;

        /// <summary>
        /// Process the keywords into the dictionary.
        /// Not implemented yet: the body contains no executable statements.
        /// </summary>
        public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name)
        {
            //TODO: Get CJK index bool flag.
            //Should this be a property of the locale or a global setting as before?
            //If it's a locale property it could be stored as just another word in the locale
            //dictionary, or maybe it belongs as a bool value on the locale record itself.
        }

        /// <summary>
        /// Load the stop words and the CJK indexing flag for a locale.
        /// </summary>
        /// <param name="localeId">Locale to load; passed through LocaleBiz.EnsuredLocaleIdStatic before the stop words are fetched.</param>
        /// <param name="ct">Database context; when null, ServiceProviderProvider.DBContext is used.</param>
        /// <returns>A populated <see cref="LocaleSearchData"/>.</returns>
        private static LocaleSearchData GetLocaleSearchData(long localeId, AyContext ct = null)
        {
            LocaleSearchData data = new LocaleSearchData();

            if (ct == null)
            {
                ct = ServiceProviderProvider.DBContext;
            }

            //Validate locale id, if not right then the ensured (default) one is used instead
            var subset = new Api.Controllers.LocaleController.LocaleSubsetParam();
            subset.LocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, ct);

            //The stop words are spread over the locale keys StopWords1 .. StopWords7
            for (int i = 1; i <= 7; i++)
            {
                subset.Keys.Add("StopWords" + i);
            }

            //NOTE(review): .Result blocks on the async call; kept because this method
            //and its callers are synchronous.
            var stops = LocaleBiz.GetSubsetStatic(subset).Result;
            foreach (var kvp in stops)
            {
                //Each stop words locale key is a space delimited list of words; an empty
                //locale string (i.e. StopWords7) has a single question mark as its value
                if (!string.IsNullOrEmpty(kvp.Value) && kvp.Value != "?")
                {
                    data.StopWords.AddRange(kvp.Value.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
                }
            }

            data.CJKIndex = LocaleBiz.GetCJKIndex(localeId, ct).Result;
            return data;
        }

        #region Breaker

        /// <summary>
        /// Classification of the most recently processed character.
        /// </summary>
        public enum TokenTypes { Nothing, Separator, CJK, Latin };

        /// <summary>
        /// Take an array of strings and return a single string containing
        /// unique, lowercase, comma delimited keywords suitable for passing
        /// to a stored procedure or other function.
        ///
        /// Use the locale setting CJKIndex=true to handle Chinese, Japanese, Korean etc
        /// (languages with no easily identifiable word boundaries as in English).
        /// </summary>
        /// <param name="localeId">Locale whose stop words and CJK flag are applied.</param>
        /// <param name="text">An array of 0 to * strings of text.</param>
        /// <returns>Comma delimited keywords, or an empty string when nothing is left to index.</returns>
        internal static string Break(long localeId, params string[] text)
        {
            return BreakCore(localeId, false, text);
        }

        /// <summary>
        /// Used to process a user's search phrase; works like <see cref="Break"/>
        /// but preserves the '%' wildcard characters entered.
        /// </summary>
        /// <param name="localeId">Locale whose stop words and CJK flag are applied.</param>
        /// <param name="text">An array of 0 to * strings of text.</param>
        /// <returns>Comma delimited keywords, or an empty string when nothing is left to search for.</returns>
        internal static string BreakSearchPhrase(long localeId, params string[] text)
        {
            return BreakCore(localeId, true, text);
        }

        /// <summary>
        /// Stop words list reset upon login or editing of localized text,
        /// used for eliminating noise words from the search dictionary.
        /// Nothing in this class assigns it; when a caller has set it, its words are
        /// filtered out in addition to the stop words loaded for the locale.
        /// </summary>
        public static System.Collections.Generic.List<string> StopList = null;

        /// <summary>
        /// Shared implementation of <see cref="Break"/> and <see cref="BreakSearchPhrase"/>.
        /// </summary>
        /// <param name="localeId">Locale whose stop words and CJK flag are applied.</param>
        /// <param name="KeepWildCards">When true, '%' is treated as part of a word instead of as a separator.</param>
        /// <param name="text">Strings to break; null or empty entries are skipped.</param>
        /// <returns>
        /// The unique keywords in order of first appearance, joined with commas and with
        /// stop words removed; an empty string when there are none.
        /// </returns>
        internal static string BreakCore(long localeId, bool KeepWildCards, params string[] text)
        {
            //Get stopwords and CJKIndex flag value
            LocaleSearchData lsd = GetLocaleSearchData(localeId);

            //Words to drop from the result: the locale's stop words plus the
            //static StopList when something has populated it
            var stopWords = new HashSet<string>(StringComparer.Ordinal);
            if (lsd.StopWords != null)
            {
                stopWords.UnionWith(lsd.StopWords);
            }
            if (StopList != null)
            {
                stopWords.UnionWith(StopList);
            }

            //Parsed words in order of first appearance; the set keeps them unique
            var words = new List<string>();
            var seen = new HashSet<string>(StringComparer.Ordinal);
            var word = new StringBuilder();

            if (text != null)
            {
                foreach (string s in text)
                {
                    if (string.IsNullOrEmpty(s))
                    {
                        continue;
                    }

                    //get all the characters in a unicode compliant manner...
                    TextElementEnumerator elements = StringInfo.GetTextElementEnumerator(s);
                    TokenTypes lastToken = TokenTypes.Nothing;

                    while (elements.MoveNext())
                    {
                        //Only the first UTF-16 code unit of the text element is examined
                        char c = elements.GetTextElement()[0];

                        //Classify the character.
                        //Without CJK indexing everything that is kept counts as "Latin".
                        //With CJK indexing, characters below 256 are still handled as
                        //Latin so that regular english text within CJK text gets indexed
                        //as whole words. See http://www.unicode.org/charts/index.html
                        TokenTypes token;
                        if (!lsd.CJKIndex || c < 256)
                        {
                            bool keep = char.IsLetterOrDigit(c) || (KeepWildCards && c == '%');
                            token = keep ? TokenTypes.Latin : TokenTypes.Separator;
                        }
                        else
                        {
                            //'%' is below 256 so a wildcard can never arrive here
                            token = char.IsLetter(c) ? TokenTypes.CJK : TokenTypes.Separator;
                        }

                        switch (token)
                        {
                            case TokenTypes.Latin:
                                //Flush first when going from CJK to latin or when the
                                //current word has hit the length limit
                                if (word.Length > 0 && (lastToken == TokenTypes.CJK || word.Length >= MaxWordLength))
                                {
                                    FlushWord(word, words, seen);
                                }
                                //All latin text is converted to lower case
                                //(char.ToLower uses the current culture)
                                word.Append(char.ToLower(c));
                                break;

                            case TokenTypes.CJK:
                                if (word.Length > 0)
                                {
                                    if (lastToken == TokenTypes.Latin || word.Length >= MaxWordLength)
                                    {
                                        //going from latin to CJK: the latin word is complete
                                        FlushWord(word, words, seen);
                                    }
                                    else if (lastToken == TokenTypes.CJK)
                                    {
                                        //the previous CJK character is pending, so pair it
                                        //with this one and emit the 2 character n-gram
                                        word.Append(c);
                                        System.Diagnostics.Debug.Assert(word.Length == 2);
                                        FlushWord(word, words, seen);
                                    }
                                }
                                //this character starts the next word / n-gram
                                word.Append(c);
                                break;

                            default:
                                //Word boundary
                                FlushWord(word, words, seen);
                                break;
                        }

                        lastToken = token;
                    }

                    //Flush out the last word of this string
                    FlushWord(word, words, seen);
                }
            }

            //bail early if there is nothing indexed
            if (words.Count == 0)
            {
                return "";
            }

            //Add only non stopwords
            var results = new List<string>(words.Count);
            foreach (string candidate in words)
            {
                if (!stopWords.Contains(candidate))
                {
                    results.Add(candidate);
                }
            }

            //sometimes all the results are stop words, in which case this is ""
            return string.Join(",", results);
        }

        /// <summary>
        /// Move the word being built (if any) into the ordered word list, unless it
        /// was already recorded, and clear the builder.
        /// </summary>
        private static void FlushWord(StringBuilder word, List<string> words, HashSet<string> seen)
        {
            if (word.Length == 0)
            {
                return;
            }

            string value = word.ToString();
            if (seen.Add(value))
            {
                words.Add(value);
            }
            word.Length = 0;
        }

        #endregion

        /// <summary>
        /// Per-locale search settings: the CJK indexing flag and the stop words.
        /// </summary>
        public class LocaleSearchData
        {
            public bool CJKIndex { get; set; }
            public List<string> StopWords { get; set; }

            public LocaleSearchData()
            {
                CJKIndex = false;
                StopWords = new List<string>();
            }
        }

    }//eoc
}//eons