This commit is contained in:
@@ -164,6 +164,16 @@ namespace AyaNova.Biz
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
/// Look up the CjkIndex flag stored on the locale with the given id.
/// </summary>
/// <param name="localeId">Id of the locale row to read</param>
/// <param name="ct">Database context to query; when null, ServiceProviderProvider.DBContext is used instead</param>
/// <returns>The CjkIndex value of the matching locale, or false (the default) when no locale has that id</returns>
internal static async Task<bool> GetCJKIndex(long localeId, AyContext ct = null)
{
    //Fall back to the shared context only when the caller did not supply one
    AyContext context = ct ?? ServiceProviderProvider.DBContext;

    return await context.Locale
        .Where(x => x.Id == localeId)
        .Select(m => m.CjkIndex)
        .SingleOrDefaultAsync();
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Get the value of the key provided in the default locale chosen
|
||||
/// </summary>
|
||||
|
||||
@@ -13,12 +13,18 @@ namespace AyaNova.Biz
|
||||
//This class handles word breaking, processing keywords and searching for results
|
||||
public static class Search
|
||||
{
|
||||
|
||||
//Initial keyword indexing consists of
|
||||
//WordBreaker - break down into words
|
||||
//ProcessKeywords into database
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Process the keywords into the dictionary
|
||||
/// </summary>
|
||||
public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name)
|
||||
{
|
||||
var StopWords = GetLocaleSearchData(ct, localeId);
|
||||
var LocaleSearchData = GetLocaleSearchData(ct, localeId);
|
||||
|
||||
//Get CJK index bool flag.
|
||||
//TODO: should this be a property of the locale or a global setting as before??
|
||||
@@ -53,8 +59,9 @@ namespace AyaNova.Biz
|
||||
|
||||
|
||||
//Get the current stopwords for the user's locale
|
||||
private static List<string> GetLocaleSearchData(AyContext ct, long localeId)
|
||||
private static LocaleSearchData GetLocaleSearchData(AyContext ct, long localeId)
|
||||
{
|
||||
LocaleSearchData LSD=new LocaleSearchData();
|
||||
//Get stopwords
|
||||
//Validate locale id, if not right then use default instead
|
||||
var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
|
||||
@@ -67,21 +74,421 @@ namespace AyaNova.Biz
|
||||
Param.Keys.Add("StopWords6");
|
||||
Param.Keys.Add("StopWords7");
|
||||
var Stops = LocaleBiz.GetSubsetStatic(Param).Result;
|
||||
List<string> StopWords = new List<string>();
|
||||
|
||||
foreach (KeyValuePair<string, string> kvp in Stops)
|
||||
{
|
||||
//Each stopwords locale key is a space-delimited list of words; an empty locale string (e.g. StopWords7) has the single value "?"
|
||||
if (kvp.Value != "?")
|
||||
{
|
||||
StopWords.AddRange(kvp.Value.Split(" "));
|
||||
LSD.StopWords.AddRange(kvp.Value.Split(" "));
|
||||
}
|
||||
}
|
||||
return StopWords;
|
||||
|
||||
LSD.CJKIndex = LocaleBiz.GetCJKIndex(localeId,ct).Result;
|
||||
return LSD;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#region Breaker
|
||||
|
||||
/// <summary>
/// Kind of the most recently processed character while breaking text into words
/// </summary>
public enum TokenTypes
{
    //Nothing processed yet
    Nothing,
    //Word boundary character
    Separator,
    //Character handled by the CJK tokenizer branch
    CJK,
    //Character handled as regular (latin) text
    Latin
};
|
||||
|
||||
/// <summary>
/// Break text into keywords for indexing.
///
/// Delegates to BreakCore with wildcard preservation switched off,
/// so a '%' in the input is treated as a word boundary rather than
/// kept as part of a keyword.
/// </summary>
/// <param name="text">An array of 0 to * strings of text</param>
/// <returns>Whatever BreakCore produces for the supplied text</returns>
internal static string Break(params string[] text) => BreakCore(false, text);
|
||||
|
||||
/// <summary>
/// Break a user's search phrase into keywords.
///
/// Delegates to BreakCore with wildcard preservation switched on,
/// so any '%' the user typed stays inside the resulting keyword.
/// </summary>
/// <param name="text">The search phrase(s) to break</param>
/// <returns>Whatever BreakCore produces for the supplied text</returns>
internal static string BreakSearchPhrase(params string[] text) => BreakCore(true, text);
|
||||
|
||||
/// <summary>
/// Stop words list reset upon login or editing of localized text
/// used for eliminating noise words from search dictionary
/// </summary>
public static System.Collections.Generic.List<string> StopList = null;

/// <summary>
/// Break the supplied strings into a set of unique keywords with stop words removed.
///
/// Regular mode (GlobalSettings.CJKIndex false): a keyword is a run of
/// letters / digits, lower cased; any other character is a word boundary.
///
/// CJK mode (GlobalSettings.CJKIndex true): characters below code point 256
/// are handled as in regular mode; letters at or above 256 are emitted as
/// overlapping two character n-grams, and a run's last character is also
/// emitted on its own.
///
/// A keyword is cut and a new one started once it reaches 255 characters.
/// </summary>
/// <param name="KeepWildCards">When true a '%' is kept as part of a keyword instead of acting as a word boundary</param>
/// <param name="text">An array of 0 to * strings of text; null or empty entries are ignored</param>
/// <returns>
/// Empty string when no keywords were found; otherwise the keywords that are not
/// stop words, either comma delimited or as an "Items" xml fragment depending on AsXML
/// </returns>
internal static string BreakCore(bool KeepWildCards, params string[] text)
{
    const int MaxWordLength = 255;

    //Work from a local snapshot: StopList is a public static that other code may
    //reset to null at any time, and it must never be visible half-built
    List<string> stops = StopList;
    if (stops == null)
    {
        stops = LoadStopList();
        StopList = stops;
    }

    bool cjk = GlobalSettings.CJKIndex;

    //Nothing to break
    if (text == null)
    {
        return "";
    }

    //Unique keywords found so far
    HashSet<string> words = new HashSet<string>();

    //The keyword currently being assembled
    StringBuilder sbWord = new StringBuilder();

    //Loop through each of the passed in strings
    foreach (string s in text)
    {
        if (string.IsNullOrEmpty(s))
        {
            continue;
        }

        TokenTypes lastToken = TokenTypes.Nothing;

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator elements = StringInfo.GetTextElementEnumerator(s);
        while (elements.MoveNext())
        {
            //NOTE(review): only the first UTF-16 unit of each text element is used, so
            //combining marks and the low half of a surrogate pair are discarded - confirm intended
            char c = elements.GetTextElement()[0];

            //In CJK mode anything outside the first 256 code points is tokenized as CJK
            //so that regular english text within cjk text still gets indexed as whole words
            //see: http://www.unicode.org/charts/index.html
            bool cjkChar = cjk && (int)c >= 256;

            //Is it a character we want to include? (a wildcard can never be a CJK character)
            bool include = cjkChar
                ? char.IsLetter(c)
                : char.IsLetterOrDigit(c) || (KeepWildCards && c == '%');

            if (!include)
            {
                //Word boundary: whatever was collected so far is a complete keyword
                FlushWord(sbWord, words);
                lastToken = TokenTypes.Separator;
                continue;
            }

            if (cjkChar)
            {
                if (sbWord.Length > 0)
                {
                    if (lastToken == TokenTypes.Latin || sbWord.Length >= MaxWordLength)
                    {
                        //going from latin TO CJK (or over the length limit)
                        FlushWord(sbWord, words);
                    }
                    else if (lastToken == TokenTypes.CJK)
                    {
                        //previous character was CJK: emit the two character n-gram
                        sbWord.Append(c);
                        System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                        FlushWord(sbWord, words);
                    }
                }

                //current character starts the next n-gram
                sbWord.Append(c);
                lastToken = TokenTypes.CJK;
            }
            else
            {
                //flush if we're over the word length limit or going from CJK to latin
                if (sbWord.Length > 0 && (lastToken == TokenTypes.CJK || sbWord.Length >= MaxWordLength))
                {
                    FlushWord(sbWord, words);
                }

                //All latin text is converted to lower case
                sbWord.Append(char.ToLower(c));
                lastToken = TokenTypes.Latin;
            }
        }

        //Flush out the last word
        FlushWord(sbWord, words);
    }

    //bail early if there is nothing indexed
    if (words.Count == 0)
    {
        return "";
    }

    //Set based lookup so filtering does not rescan the stop list for every keyword
    HashSet<string> stopWords = new HashSet<string>(stops);

    //NOTE(review): AsXML is not declared anywhere in the visible code - confirm where it comes from
    if (AsXML)
    {
        //Make a return xml fragment from the word list
        StringBuilder sb = new StringBuilder();
        using (System.IO.StringWriter sr = new System.IO.StringWriter(sb))
        using (System.Xml.XmlTextWriter w = new System.Xml.XmlTextWriter(sr))
        {
            w.Formatting = System.Xml.Formatting.Indented;
            w.WriteStartElement("Items");

            foreach (string word in words)
            {
                //Add only non stopwords
                if (stopWords.Contains(word))
                {
                    continue;
                }
                w.WriteStartElement("i");
                w.WriteAttributeString("w", word);
                w.WriteEndElement();
            }

            w.WriteEndElement();
        }
        return sb.ToString();
    }

    //Make a comma delimited return string from the word list.
    //Sometimes all the results are stop words, in which case this is an empty string
    StringBuilder sbResults = new StringBuilder();
    foreach (string word in words)
    {
        //Add only non stopwords
        if (stopWords.Contains(word))
        {
            continue;
        }
        if (sbResults.Length > 0)
        {
            sbResults.Append(',');
        }
        sbResults.Append(word);
    }
    return sbResults.ToString();
}

//Add the keyword being assembled (if any) to the unique word set and start a new one
private static void FlushWord(StringBuilder word, HashSet<string> words)
{
    if (word.Length == 0)
    {
        return;
    }
    words.Add(word.ToString());
    word.Length = 0;
}

//Build the stop word list from localized text keys StopWords1 to StopWords7, skipping the "?" placeholder and duplicates
private static List<string> LoadStopList()
{
    List<string> list = new List<string>();
    HashSet<string> seen = new HashSet<string>();

    for (int stopkeys = 1; stopkeys < 8; stopkeys++)
    {
        MatchCollection mc = rxAllWords.Matches(LocalizedTextTable.GetLocalizedTextDirect("StopWords" + stopkeys.ToString()));
        foreach (Match m in mc)
        {
            if (!string.IsNullOrEmpty(m.Value) && m.Value != "?" && seen.Add(m.Value))
            {
                list.Add(m.Value);
            }
        }
    }

    return list;
}
|
||||
|
||||
#endregion
|
||||
|
||||
|
||||
/// <summary>
/// Search related settings gathered for one locale:
/// its CJK indexing flag and its list of stop words
/// </summary>
public class LocaleSearchData
{
    //Defaults to false until set by the caller
    public bool CJKIndex { get; set; } = false;

    //Starts out as an empty list, never null
    public List<string> StopWords { get; set; } = new List<string>();
}
|
||||
|
||||
|
||||
}//eoc
|
||||
|
||||
}//eons
|
||||
Reference in New Issue
Block a user