This commit is contained in:
2018-09-18 23:15:02 +00:00
parent 6771c9f2a1
commit 02d4084826
2 changed files with 422 additions and 5 deletions

View File

@@ -164,6 +164,16 @@ namespace AyaNova.Biz
} }
/// <summary>
/// Look up the CjkIndex flag stored on the locale with the given id.
/// </summary>
/// <param name="localeId">Id of the locale row to read</param>
/// <param name="ct">Database context to query; when null the context from ServiceProviderProvider.DBContext is used</param>
/// <returns>
/// The CjkIndex value of the matching locale. When no locale matches, SingleOrDefaultAsync
/// yields the default for the column type (false, assuming CjkIndex is a plain bool).
/// </returns>
internal static async Task<bool> GetCJKIndex(long localeId, AyContext ct=null)
{
    //Fall back to the shared context when the caller did not supply one
    AyContext db = ct ?? ServiceProviderProvider.DBContext;

    return await db.Locale
        .Where(x => x.Id == localeId)
        .Select(m => m.CjkIndex)
        .SingleOrDefaultAsync();
}
/// <summary> /// <summary>
/// Get the value of the key provided in the default locale chosen /// Get the value of the key provided in the default locale chosen
/// </summary> /// </summary>

View File

@@ -13,12 +13,18 @@ namespace AyaNova.Biz
//This class handles word breaking, processing keywords and searching for results //This class handles word breaking, processing keywords and searching for results
public static class Search public static class Search
{ {
//Initial keyword indexing consists of two steps:
//1. WordBreaker - break the text down into individual words
//2. ProcessKeywords - process those words into the database
/// <summary> /// <summary>
/// Process the keywords into the dictionary /// Process the keywords into the dictionary
/// </summary> /// </summary>
public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name) public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name)
{ {
var StopWords = GetLocaleSearchData(ct, localeId); var LocaleSearchData = GetLocaleSearchData(ct, localeId);
//Get CJK index bool flag. //Get CJK index bool flag.
//TODO: should this be a property of the locale or a global setting as before?? //TODO: should this be a property of the locale or a global setting as before??
@@ -53,8 +59,9 @@ namespace AyaNova.Biz
//Get the current stopwords for the user's locale //Get the current stopwords for the user's locale
private static List<string> GetLocaleSearchData(AyContext ct, long localeId) private static LocaleSearchData GetLocaleSearchData(AyContext ct, long localeId)
{ {
LocaleSearchData LSD=new LocaleSearchData();
//Get stopwords //Get stopwords
//Validate locale id, if not right then use default instead //Validate locale id, if not right then use default instead
var Param = new Api.Controllers.LocaleController.LocaleSubsetParam(); var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
@@ -67,21 +74,421 @@ namespace AyaNova.Biz
Param.Keys.Add("StopWords6"); Param.Keys.Add("StopWords6");
Param.Keys.Add("StopWords7"); Param.Keys.Add("StopWords7");
var Stops = LocaleBiz.GetSubsetStatic(Param).Result; var Stops = LocaleBiz.GetSubsetStatic(Param).Result;
List<string> StopWords = new List<string>();
foreach (KeyValuePair<string, string> kvp in Stops) foreach (KeyValuePair<string, string> kvp in Stops)
{ {
//Each stopwords locale key is a space delimited list of words and in the case of an empty local string (i.e. StopWords7) it's value is a single question mark //Each stopwords locale key is a space delimited list of words and in the case of an empty local string (i.e. StopWords7) it's value is a single question mark
if (kvp.Value != "?") if (kvp.Value != "?")
{ {
StopWords.AddRange(kvp.Value.Split(" ")); LSD.StopWords.AddRange(kvp.Value.Split(" "));
} }
} }
return StopWords;
LSD.CJKIndex = LocaleBiz.GetCJKIndex(localeId,ct).Result;
return LSD;
} }
#region Breaker
/// <summary>
/// Classification of the most recently processed character while breaking text into words.
/// </summary>
public enum TokenTypes
{
    //No character has been processed yet in the current string
    Nothing,
    //The last character was a word boundary (not part of any keyword)
    Separator,
    //The last character was outside the basic latin range (only used when CJK indexing is on)
    CJK,
    //The last character was a letter, digit or preserved wildcard handled as latin text
    Latin
}
/// <summary>
/// Break the supplied text into keywords for indexing.
///
/// Delegates to BreakCore with wildcard preservation switched off, so a '%'
/// in the text is treated as a word boundary like any other non letter/digit.
/// BreakCore collects each distinct word once, lower-cases latin text and
/// drops stop words before building the returned string.
///
/// When BreakCore's CJK flag is on (it reads GlobalSettings.CJKIndex), text in
/// languages with no easily identifiable word boundaries (Chinese, Japanese,
/// Korean etc) is indexed as overlapping character pairs instead.
/// </summary>
/// <param name="text">Zero or more strings of text to break into keywords</param>
/// <returns>The keyword string produced by BreakCore; empty when nothing was indexed</returns>
internal static string Break(params string[] text) => BreakCore(false, text);
/// <summary>
/// Break a user's search phrase into keywords while keeping any '%' wild
/// cards the user typed as part of the words they appear in.
///
/// Delegates to BreakCore with wildcard preservation switched on; apart from
/// that the text is processed exactly as it is for indexing.
/// </summary>
/// <param name="text">Zero or more strings making up the search phrase</param>
/// <returns>The keyword string produced by BreakCore; empty when nothing was indexed</returns>
internal static string BreakSearchPhrase(params string[] text) => BreakCore(true, text);
/// <summary>
/// Cached list of stop words (noise words that are never returned as keywords).
/// BreakCore builds it on first use whenever it is null, so setting it back to
/// null forces a rebuild on the next call. Per the original author's note this
/// is done on login and when localized text is edited (callers not visible here).
/// </summary>
public static System.Collections.Generic.List<string> StopList = null;

/// <summary>
/// Break text into unique keywords and return them as one string.
///
/// Every string in <paramref name="text"/> is scanned character by character.
/// Letters and digits (and '%' when <paramref name="KeepWildCards"/> is true)
/// are accumulated into lower-cased words; anything else ends the current word.
/// Words longer than 255 characters are split at that length.
///
/// When GlobalSettings.CJKIndex is true, characters outside the first 256 code
/// points are instead indexed as overlapping two character pairs, while latin
/// text embedded in them is still indexed as whole words.
///
/// Stop words (see StopList) are removed from the output.
/// </summary>
/// <param name="KeepWildCards">True to treat '%' as part of a word instead of as a separator</param>
/// <param name="text">Zero or more strings to break; null or empty entries are skipped</param>
/// <returns>
/// Empty string when no words were found. Otherwise, when AsXML is true, an
/// "Items" xml fragment with one "i" element (attribute "w") per keyword; when it
/// is false, the keywords joined with commas (empty if every word was a stop word).
/// The order of the keywords is the enumeration order of the internal Hashtable.
/// </returns>
internal static string BreakCore(bool KeepWildCards, params string[] text)
{
    //A null array (as opposed to an empty one) previously crashed in the loop below
    if (text == null) return "";

    //Work with a local snapshot: another thread may set StopList back to null at any time,
    //and the shared field must never point at a list that is still being filled
    List<string> stopList = StopList;
    if (stopList == null)
    {
        stopList = LoadStopList();
        StopList = stopList;
    }

    //NOTE(review): this still reads the global setting although GetLocaleSearchData
    //now fetches a per locale CJKIndex value - confirm which one is intended here
    bool CJK = GlobalSettings.CJKIndex;
    const int MaxWordLength = 255;

    //Holds each distinct word exactly once. Kept as a Hashtable (rather than a generic set)
    //so the order in which words come out is unchanged from the previous implementation
    Hashtable words = new Hashtable();
    StringBuilder sbWord = new StringBuilder();

    foreach (string s in text)
    {
        if (string.IsNullOrEmpty(s)) continue;

        //Walk the text elements so combining sequences / surrogate pairs count as one "character"
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
        TokenTypes LastToken = TokenTypes.Nothing;

        while (t.MoveNext())
        {
            //NOTE(review): only the first UTF-16 unit of each text element is examined and stored,
            //so combining marks and the low half of surrogate pairs are discarded - confirm intended
            char c = t.GetTextElement()[0];

            //Without CJK indexing every character goes through the latin rules. With it, only
            //the first 256 code points do, so english text inside CJK text is indexed as whole words
            if (!CJK || (int)c < 256)
            {
                if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                {
                    //Start a new word when switching from CJK to latin or when the word is full
                    if (LastToken == TokenTypes.CJK || sbWord.Length >= MaxWordLength)
                        FlushWord(sbWord, words);

                    //All latin text is converted to lower case
                    sbWord.Append(char.ToLower(c));
                    LastToken = TokenTypes.Latin;
                }
                else
                {
                    //Word boundary
                    FlushWord(sbWord, words);
                    LastToken = TokenTypes.Separator;
                }
            }
            else
            {
                //CJK character ('%' can never reach this branch because it is below 256)
                if (char.IsLetter(c))
                {
                    if (sbWord.Length > 0)
                    {
                        if (LastToken == TokenTypes.Latin || sbWord.Length >= MaxWordLength)
                        {
                            //Going from latin to CJK: the latin word is complete
                            FlushWord(sbWord, words);
                        }
                        else if (LastToken == TokenTypes.CJK)
                        {
                            //The previous CJK character is waiting in the buffer: add the current
                            //one and store the resulting two character n-gram
                            sbWord.Append(c);
                            System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                            FlushWord(sbWord, words);
                        }
                    }

                    //The current character always stays in the buffer so it can pair up with the next one
                    sbWord.Append(c);
                    LastToken = TokenTypes.CJK;
                }
                else
                {
                    //Word boundary
                    FlushWord(sbWord, words);
                    LastToken = TokenTypes.Separator;
                }
            }
        }

        //Flush out the last word of this string
        FlushWord(sbWord, words);
    }

    //bail early if there is nothing indexed
    if (words.Count == 0) return "";

    //Set based lookup so filtering costs one hash probe per word instead of a scan of the whole list
    HashSet<string> stopWords = new HashSet<string>(stopList);

    //NOTE(review): AsXML is not declared anywhere in this method; it has to resolve to a
    //member declared elsewhere in the class - confirm it exists
    if (AsXML)
    {
        //Make a return xml fragment from the word list
        StringBuilder sb = new StringBuilder();
        using (System.IO.StringWriter sr = new System.IO.StringWriter(sb))
        using (System.Xml.XmlTextWriter w = new System.Xml.XmlTextWriter(sr))
        {
            w.Formatting = System.Xml.Formatting.Indented;
            w.WriteStartElement("Items");
            foreach (DictionaryEntry d in words)
            {
                string word = d.Key.ToString();
                //Add only non stopwords
                if (stopWords.Contains(word)) continue;
                w.WriteStartElement("i");
                w.WriteAttributeString("w", word);
                w.WriteEndElement();
            }
            w.WriteEndElement();
        }
        return sb.ToString();
    }

    //Make a comma delimited return string from the word list.
    //Sometimes all the words are stop words, in which case this is simply empty
    StringBuilder sbResults = new StringBuilder();
    foreach (DictionaryEntry d in words)
    {
        string word = d.Key.ToString();
        //Add only non stopwords
        if (stopWords.Contains(word)) continue;
        if (sbResults.Length > 0) sbResults.Append(',');
        sbResults.Append(word);
    }
    return sbResults.ToString();
}

//Read the StopWords1..StopWords7 localized text keys and return every distinct word found in them
private static List<string> LoadStopList()
{
    List<string> stops = new List<string>();
    HashSet<string> seen = new HashSet<string>();
    for (int stopkeys = 1; stopkeys < 8; stopkeys++)
    {
        MatchCollection mc = rxAllWords.Matches(LocalizedTextTable.GetLocalizedTextDirect("StopWords" + stopkeys.ToString()));
        foreach (Match m in mc)
        {
            //A lone question mark is skipped, as are empty matches and repeats
            if (!string.IsNullOrEmpty(m.Value) && m.Value != "?" && seen.Add(m.Value))
                stops.Add(m.Value);
        }
    }
    return stops;
}

//Record the word currently in the buffer (if any, and if not already recorded) and empty the buffer
private static void FlushWord(StringBuilder sbWord, Hashtable words)
{
    if (sbWord.Length == 0) return;

    string word = sbWord.ToString();
    if (!words.ContainsKey(word))
        words[word] = 1;
    sbWord.Length = 0;
}
#endregion
/// <summary>
/// Search related settings gathered for one locale: whether CJK style
/// indexing is switched on and the list of stop words.
/// A new instance starts with CJK indexing off and an empty stop word list.
/// </summary>
public class LocaleSearchData
{
    /// <summary>CJK indexing flag for the locale; false until set</summary>
    public bool CJKIndex { get; set; } = false;

    /// <summary>Stop words for the locale; never null on a fresh instance</summary>
    public List<string> StopWords { get; set; } = new List<string>();
}
}//eoc }//eoc
}//eons }//eons