This commit is contained in:
2018-09-18 23:15:02 +00:00
parent 6771c9f2a1
commit 02d4084826
2 changed files with 422 additions and 5 deletions

View File

@@ -164,6 +164,16 @@ namespace AyaNova.Biz
}
/// <summary>
/// Look up the CjkIndex flag stored on the locale with the given id.
/// </summary>
/// <param name="localeId">Id of the locale row to read.</param>
/// <param name="ct">Database context to query; when null, ServiceProviderProvider.DBContext is used instead.</param>
/// <returns>
/// The CjkIndex value of the matching locale. SingleOrDefaultAsync yields the default
/// for bool (false) when no locale has that id.
/// </returns>
internal static async Task<bool> GetCJKIndex(long localeId, AyContext ct=null)
{
    AyContext db = ct ?? ServiceProviderProvider.DBContext;

    return await db.Locale
        .Where(locale => locale.Id == localeId)
        .Select(locale => locale.CjkIndex)
        .SingleOrDefaultAsync();
}
/// <summary>
/// Get the value of the key provided in the default locale chosen
/// </summary>

View File

@@ -13,12 +13,18 @@ namespace AyaNova.Biz
//This class handles word breaking, processing keywords and searching for results
public static class Search
{
//Initial keyword indexing consists of
//WordBreaker - break down into words
//ProcessKeywords into database
/// <summary>
/// Process the keywords into the dictionary
/// </summary>
public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name)
{
var StopWords = GetLocaleSearchData(ct, localeId);
var LocaleSearchData = GetLocaleSearchData(ct, localeId);
//Get CJK index bool flag.
//TODO: should this be a property of the locale or a global setting as before??
@@ -53,8 +59,9 @@ namespace AyaNova.Biz
//Get the current stopwords for the user's locale
private static List<string> GetLocaleSearchData(AyContext ct, long localeId)
private static LocaleSearchData GetLocaleSearchData(AyContext ct, long localeId)
{
LocaleSearchData LSD=new LocaleSearchData();
//Get stopwords
//Validate locale id, if not right then use default instead
var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
@@ -67,21 +74,421 @@ namespace AyaNova.Biz
Param.Keys.Add("StopWords6");
Param.Keys.Add("StopWords7");
var Stops = LocaleBiz.GetSubsetStatic(Param).Result;
List<string> StopWords = new List<string>();
foreach (KeyValuePair<string, string> kvp in Stops)
{
//Each stopwords locale key is a space-delimited list of words; an empty locale string (e.g. StopWords7) has the value of a single question mark
if (kvp.Value != "?")
{
StopWords.AddRange(kvp.Value.Split(" "));
LSD.StopWords.AddRange(kvp.Value.Split(" "));
}
}
return StopWords;
LSD.CJKIndex = LocaleBiz.GetCJKIndex(localeId,ct).Result;
return LSD;
}
#region Breaker
/// <summary>
/// Classification of the most recently examined character while BreakCore
/// walks a string; used to detect transitions between scripts and word boundaries.
/// </summary>
public enum TokenTypes
{
    /// <summary>Initial state at the start of each input string, before any character is examined.</summary>
    Nothing,

    /// <summary>The last character was not part of a word (it ended any word in progress).</summary>
    Separator,

    /// <summary>The last character was a letter outside the first 256 code units, seen while CJK indexing is on.</summary>
    CJK,

    /// <summary>The last character was a letter, digit or kept wildcard handled by the regular (lower-casing) tokenizer.</summary>
    Latin
}
/// <summary>
/// Break the supplied strings into keywords for indexing.
///
/// Delegates to BreakCore with wildcard preservation turned off, so the
/// result is a single comma delimited string of unique, lower-cased
/// keywords with stop words removed.
///
/// When CJK indexing is enabled BreakCore also handles Chinese, Japanese,
/// Korean etc (languages with no easily identifiable word boundaries as in english)
/// </summary>
/// <param name="text">An array of 0 to * strings of text; null or empty entries are skipped by BreakCore</param>
/// <returns>Comma delimited keywords, or an empty string when nothing was indexed</returns>
internal static string Break(params string[] text) => BreakCore(false, text);
/// <summary>
/// Break a user's search phrase into keywords while keeping any
/// wildcard characters ('%') the user typed as part of the keywords.
///
/// Delegates to BreakCore with wildcard preservation turned on.
/// </summary>
/// <param name="text">An array of 0 to * strings making up the search phrase</param>
/// <returns>Comma delimited keywords, or an empty string when nothing was indexed</returns>
internal static string BreakSearchPhrase(params string[] text) => BreakCore(true, text);
/// <summary>
/// Stop words list reset upon login or editing of localized text
/// used for eliminating noise words from search dictionary.
///
/// BreakCore treats a null value as "not loaded yet": it rebuilds the list
/// from the localized text keys StopWords1 through StopWords7 the next time
/// it runs, skipping empty matches, the "?" placeholder and duplicates.
/// Words found in this list are left out of BreakCore's output.
/// </summary>
public static System.Collections.Generic.List<string> StopList = null;
/// <summary>
/// Core word breaker behind Break and BreakSearchPhrase.
///
/// Walks every supplied string one text element at a time, collects the
/// unique words it finds and returns them with stop words removed.
///
/// Regular mode (GlobalSettings.CJKIndex is false): a word is a run of
/// letters/digits (plus '%' when wildcards are kept), lower-cased, and cut
/// into pieces of at most 255 characters.
///
/// CJK mode (GlobalSettings.CJKIndex is true): characters below code unit
/// 256 are tokenized as in regular mode; letters at or above 256 are emitted
/// as overlapping two character n-grams, and the last character of such a
/// run is flushed on its own at the next boundary. A switch between the two
/// kinds of character always ends the word in progress.
/// </summary>
/// <param name="KeepWildCards">When true the '%' character is kept as part of a word instead of acting as a separator.</param>
/// <param name="text">Strings to break; null or empty entries are skipped. A null array yields an empty result.</param>
/// <returns>
/// Comma delimited unique keywords in order of first appearance (or an XML
/// "Items" fragment when AsXML is set); an empty string when no word was found.
/// </returns>
internal static string BreakCore(bool KeepWildCards, params string[] text)
{
    //Longest word stored before it is cut and a new word is started
    const int MaxWordLength = 255;

    //Work from a local reference so that a reset of the public StopList field
    //while this call is running cannot turn into a null dereference further down
    List<string> stops = StopList;
    if (stops == null)
    {
        //Build the list privately and publish it only once it is complete so
        //no other caller can observe a half filled stop list
        stops = new List<string>();
        for (int stopkeys = 1; stopkeys < 8; stopkeys++)
        {
            MatchCollection mc = rxAllWords.Matches(LocalizedTextTable.GetLocalizedTextDirect("StopWords" + stopkeys.ToString()));
            foreach (Match m in mc)
            {
                //"?" is the placeholder value of an unused stop words key
                if (!string.IsNullOrEmpty(m.Value) && m.Value != "?" && !stops.Contains(m.Value))
                    stops.Add(m.Value);
            }
        }
        StopList = stops;
    }

    bool CJK = GlobalSettings.CJKIndex;

    //Nothing to break (previously a null array caused a NullReferenceException)
    if (text == null) return "";

    //seen guarantees uniqueness, words remembers first-appearance order so the
    //output order is stable from run to run
    HashSet<string> seen = new HashSet<string>();
    List<string> words = new List<string>();
    StringBuilder sbWord = new StringBuilder();

    //Loop through each of the passed in strings
    foreach (string s in text)
    {
        if (string.IsNullOrEmpty(s)) continue;

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
        TokenTypes LastToken = TokenTypes.Nothing;

        while (t.MoveNext())
        {
            //NOTE(review): only the first UTF-16 code unit of each text element is
            //examined, so combining marks are dropped and a surrogate pair is seen
            //as its high surrogate (which is not a letter). Left unchanged because
            //altering it would change which keywords get indexed.
            char c = t.GetTextElement()[0];

            switch (BreakCoreClassify(c, CJK, KeepWildCards))
            {
                case TokenTypes.Latin:
                    //End the word in progress when it is full or when we are
                    //going from CJK to latin
                    if (sbWord.Length > 0 && (LastToken == TokenTypes.CJK || sbWord.Length >= MaxWordLength))
                        BreakCoreFlush(sbWord, seen, words);

                    //All latin text is converted to lower case
                    sbWord.Append(char.ToLower(c));
                    LastToken = TokenTypes.Latin;
                    break;

                case TokenTypes.CJK:
                    if (sbWord.Length > 0)
                    {
                        if (LastToken == TokenTypes.Latin || sbWord.Length >= MaxWordLength)
                        {
                            //going from latin TO CJK (or over the length limit)
                            BreakCoreFlush(sbWord, seen, words);
                        }
                        else if (LastToken == TokenTypes.CJK)
                        {
                            //One CJK character is already buffered: append the current
                            //one and flush the resultant 2 character n-gram
                            sbWord.Append(c);
                            System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                            BreakCoreFlush(sbWord, seen, words);
                        }
                    }

                    //The current character starts (or is alone in) the next n-gram
                    sbWord.Append(c);
                    LastToken = TokenTypes.CJK;
                    break;

                default:
                    //Word boundary: whatever was collected so far is a complete word
                    LastToken = TokenTypes.Separator;
                    BreakCoreFlush(sbWord, seen, words);
                    break;
            }
        }

        //Flush out the last word of this string
        BreakCoreFlush(sbWord, seen, words);
    }

    //bail early if there is nothing indexed
    if (words.Count == 0) return "";

    //Hashed copy of the stop words so each keyword is checked in constant time
    HashSet<string> stopWords = new HashSet<string>(stops);

    //NOTE(review): AsXML is not declared anywhere in this method; it is kept exactly
    //as the original referenced it and must be supplied by the surrounding class
    //(or this branch removed) - confirm.
    if (AsXML)
    {
        //Make a return xml fragment from the word list
        StringBuilder sb = new StringBuilder();
        using (System.IO.StringWriter sr = new System.IO.StringWriter(sb))
        using (System.Xml.XmlTextWriter w = new System.Xml.XmlTextWriter(sr))
        {
            w.Formatting = System.Xml.Formatting.Indented;
            w.WriteStartElement("Items");
            foreach (string word in words)
            {
                //Add only non stopwords
                if (!stopWords.Contains(word))
                {
                    w.WriteStartElement("i");
                    w.WriteAttributeString("w", word);
                    w.WriteEndElement();
                }
            }
            w.WriteEndElement();
            w.Flush();
            return sb.ToString();
        }
    }

    //Make a comma delimited return string from the word list.
    //Sometimes all the results are stop words, in which case this is empty.
    List<string> kept = new List<string>(words.Count);
    foreach (string word in words)
    {
        //Add only non stopwords
        if (!stopWords.Contains(word))
            kept.Add(word);
    }
    return string.Join(",", kept.ToArray());
}

/// <summary>
/// Decide how BreakCore should treat a single character: part of a latin
/// style word, part of a CJK n-gram, or a word separator.
/// </summary>
private static TokenTypes BreakCoreClassify(char c, bool cjkMode, bool keepWildCards)
{
    //In CJK mode anything outside the first 256 code units is handled by the
    //n-gram tokenizer; only letters count there (the '%' wildcard is below 256
    //so it can never reach this branch)
    if (cjkMode && (int)c >= 256)
        return char.IsLetter(c) ? TokenTypes.CJK : TokenTypes.Separator;

    if (char.IsLetterOrDigit(c) || (keepWildCards && c == '%'))
        return TokenTypes.Latin;

    return TokenTypes.Separator;
}

/// <summary>
/// Move the word collected in the buffer (if any) into the unique word list
/// and empty the buffer.
/// </summary>
private static void BreakCoreFlush(StringBuilder sbWord, HashSet<string> seen, List<string> words)
{
    if (sbWord.Length == 0) return;

    string word = sbWord.ToString();
    //HashSet.Add returns false for a word that was already recorded
    if (seen.Add(word))
        words.Add(word);
    sbWord.Length = 0;
}
#endregion
/// <summary>
/// Per-locale settings used by the search indexer: the locale's CJK
/// indexing flag and its list of stop words.
/// </summary>
public class LocaleSearchData
{
    /// <summary>CJK indexing flag for the locale; false until set.</summary>
    public bool CJKIndex { get; set; } = false;

    /// <summary>Stop words for the locale; starts out as an empty list.</summary>
    public List<string> StopWords { get; set; } = new List<string>();
}
}//eoc
}//eons