This commit is contained in:
@@ -164,6 +164,16 @@ namespace AyaNova.Biz
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// <summary>
/// Reads the CjkIndex flag stored on the locale with the given id.
/// </summary>
/// <param name="localeId">Id of the locale row to look up.</param>
/// <param name="ct">
/// Database context to query. When null, ServiceProviderProvider.DBContext is used instead.
/// </param>
/// <returns>
/// The CjkIndex value of the matching locale. SingleOrDefaultAsync yields the default
/// value (false) when no locale has that id.
/// </returns>
internal static async Task<bool> GetCJKIndex(long localeId, AyContext ct=null)
{
    AyContext db = ct ?? ServiceProviderProvider.DBContext;

    return await db.Locale
        .Where(locale => locale.Id == localeId)
        .Select(locale => locale.CjkIndex)
        .SingleOrDefaultAsync();
}
|
||||||
|
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the value of the key provided in the default locale chosen
|
/// Get the value of the key provided in the default locale chosen
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|||||||
@@ -13,12 +13,18 @@ namespace AyaNova.Biz
|
|||||||
//This class handles word breaking, processing keywords and searching for results
|
//This class handles word breaking, processing keywords and searching for results
|
||||||
public static class Search
|
public static class Search
|
||||||
{
|
{
|
||||||
|
|
||||||
|
//Initial keyword indexing consists of
|
||||||
|
//WordBreaker - break down into words
|
||||||
|
//ProcessKeywords into database
|
||||||
|
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Process the keywords into the dictionary
|
/// Process the keywords into the dictionary
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name)
|
public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name)
|
||||||
{
|
{
|
||||||
var StopWords = GetLocaleSearchData(ct, localeId);
|
var LocaleSearchData = GetLocaleSearchData(ct, localeId);
|
||||||
|
|
||||||
//Get CJK index bool flag.
|
//Get CJK index bool flag.
|
||||||
//TODO: should this be a property of the locale or a global setting as before??
|
//TODO: should this be a property of the locale or a global setting as before??
|
||||||
@@ -53,8 +59,9 @@ namespace AyaNova.Biz
|
|||||||
|
|
||||||
|
|
||||||
//Get the current stopwords for the user's locale
|
//Get the current stopwords for the user's locale
|
||||||
private static List<string> GetLocaleSearchData(AyContext ct, long localeId)
|
private static LocaleSearchData GetLocaleSearchData(AyContext ct, long localeId)
|
||||||
{
|
{
|
||||||
|
LocaleSearchData LSD=new LocaleSearchData();
|
||||||
//Get stopwords
|
//Get stopwords
|
||||||
//Validate locale id, if not right then use default instead
|
//Validate locale id, if not right then use default instead
|
||||||
var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
|
var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
|
||||||
@@ -67,21 +74,421 @@ namespace AyaNova.Biz
|
|||||||
Param.Keys.Add("StopWords6");
|
Param.Keys.Add("StopWords6");
|
||||||
Param.Keys.Add("StopWords7");
|
Param.Keys.Add("StopWords7");
|
||||||
var Stops = LocaleBiz.GetSubsetStatic(Param).Result;
|
var Stops = LocaleBiz.GetSubsetStatic(Param).Result;
|
||||||
List<string> StopWords = new List<string>();
|
|
||||||
foreach (KeyValuePair<string, string> kvp in Stops)
|
foreach (KeyValuePair<string, string> kvp in Stops)
|
||||||
{
|
{
|
||||||
//Each stopwords locale key is a space delimited list of words and in the case of an empty local string (i.e. StopWords7) it's value is a single question mark
|
//Each stopwords locale key is a space delimited list of words and in the case of an empty local string (i.e. StopWords7) it's value is a single question mark
|
||||||
if (kvp.Value != "?")
|
if (kvp.Value != "?")
|
||||||
{
|
{
|
||||||
StopWords.AddRange(kvp.Value.Split(" "));
|
LSD.StopWords.AddRange(kvp.Value.Split(" "));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return StopWords;
|
|
||||||
|
LSD.CJKIndex = LocaleBiz.GetCJKIndex(localeId,ct).Result;
|
||||||
|
return LSD;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#region Breaker
|
||||||
|
|
||||||
|
/// <summary>
/// Classification of the most recently processed character while breaking text into words.
/// </summary>
public enum TokenTypes
{
    /// <summary>No character has been processed yet.</summary>
    Nothing = 0,

    /// <summary>The last character was a word boundary.</summary>
    Separator = 1,

    /// <summary>The last character was handled by the CJK branch of the tokenizer.</summary>
    CJK = 2,

    /// <summary>The last character was handled by the Latin branch of the tokenizer.</summary>
    Latin = 3
}
|
||||||
|
|
||||||
|
/// <summary>
/// Breaks the supplied strings into keywords for indexing.
/// Wildcard characters are not preserved: this forwards to BreakCore with
/// KeepWildCards set to false.
///
/// BreakCore returns the unique, lower-cased, comma delimited keywords with
/// stop words removed. Its CJK mode handles Chinese, Japanese, Korean etc
/// (languages with no easily identifiable word boundaries as in english).
/// </summary>
/// <param name="text">An array of 0 to * strings of text</param>
/// <returns>Whatever BreakCore returns for the given text.</returns>
internal static string Break(params string[] text) => BreakCore(false, text);
|
||||||
|
|
||||||
|
/// <summary>
/// Breaks a user's search phrase into keywords while preserving any
/// wildcard characters entered: this forwards to BreakCore with
/// KeepWildCards set to true.
/// </summary>
/// <param name="text">An array of 0 to * strings making up the search phrase</param>
/// <returns>Whatever BreakCore returns for the given text.</returns>
internal static string BreakSearchPhrase(params string[] text) => BreakCore(true, text);
|
||||||
|
|
||||||
|
/// <summary>
/// Stop words used for eliminating noise words from the search dictionary.
/// BreakCore populates this list on first use whenever it is null.
/// Per the original note it is expected to be reset (set back to null) upon
/// login or editing of localized text; that reset is not visible in this file.
/// </summary>
public static System.Collections.Generic.List<string> StopList = null;

/// <summary>
/// Word breaker shared by Break and BreakSearchPhrase.
///
/// Walks every text element of every supplied string, collects the unique
/// words found and returns them with stop words removed.
///
/// In regular mode a word is a run of letters/digits, lower-cased.
/// In CJK mode characters below code point 256 are still treated that way
/// so that Latin text embedded in CJK text is indexed as whole words, while
/// runs of other letters are indexed as overlapping two character n-grams
/// (a lone character next to a boundary is indexed on its own).
/// </summary>
/// <param name="KeepWildCards">When true the '%' character is kept as part of a word.</param>
/// <param name="text">An array of 0 to * strings of text; null or empty entries are skipped.</param>
/// <returns>
/// An empty string when nothing was indexed (or text is null); otherwise the
/// comma delimited keywords, or an "Items" xml fragment when AsXML is true.
/// </returns>
internal static string BreakCore(bool KeepWildCards, params string[] text)
{
    //Build the stop word list on first use.
    //It is assembled in a local and only published once complete so that a
    //failure or a concurrent caller never sees a half filled list.
    List<string> stopList = StopList;
    if (stopList == null)
    {
        stopList = new List<string>();
        HashSet<string> seen = new HashSet<string>();
        for (int stopkeys = 1; stopkeys < 8; stopkeys++)
        {
            MatchCollection mc = rxAllWords.Matches(LocalizedTextTable.GetLocalizedTextDirect("StopWords" + stopkeys.ToString()));
            foreach (Match m in mc)
            {
                //An empty stop words key holds a single question mark
                if (!string.IsNullOrEmpty(m.Value) && m.Value != "?" && seen.Add(m.Value))
                    stopList.Add(m.Value);
            }
        }
        StopList = stopList;
    }

    //Nothing to break
    if (text == null) return "";

    bool CJK = GlobalSettings.CJKIndex;
    const int MAXWORDLENGTH = 255;

    //Hashtable to temporarily hold parsed words
    //used to easily ensure unique words only
    Hashtable ht = new Hashtable();

    //The word currently being accumulated
    StringBuilder sbWord = new StringBuilder();

    //Loop through each of the passed in strings
    foreach (string s in text)
    {
        if (string.IsNullOrEmpty(s)) continue;

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);

        TokenTypes LastToken = TokenTypes.Nothing;

        //Process each "character" (text element,glyph whatever) in the
        //current string
        while (t.MoveNext())
        {
            //NOTE(review): only the first UTF-16 code unit of the text element
            //is examined, so combining marks and low surrogates are dropped.
            char c = t.GetTextElement()[0];

            //Outside CJK mode everything goes through the Latin tokenizer.
            //In CJK mode only characters below 256 do, so that regular english
            //text within cjk text gets properly indexed as whole words.
            //see: http://www.unicode.org/charts/index.html
            if (!CJK || (int)c < 256)
            {
                if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                {
                    //All latin text is converted to lower case
                    c = char.ToLower(c);

                    //Flush the pending word if it hit the length limit or
                    //we are going from CJK to latin (LastToken is never CJK
                    //outside CJK mode)
                    if (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH)
                        FlushPendingWord(sbWord, ht);

                    sbWord.Append(c);
                    LastToken = TokenTypes.Latin;
                }
                else
                {
                    //Word boundary
                    FlushPendingWord(sbWord, ht);
                    LastToken = TokenTypes.Separator;
                }
            }
            else
            {
                //A '%' wildcard can never reach this branch because its code
                //point is below 256, so only letters are considered here.
                if (char.IsLetter(c))
                {
                    if (sbWord.Length > 0)
                    {
                        if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                        {
                            //going from latin TO CJK (or over the length limit)
                            FlushPendingWord(sbWord, ht);
                        }
                        else if (LastToken == TokenTypes.CJK)
                        {
                            //One CJK character is already stored so append the
                            //current one and flush the resultant 2 character n-gram
                            sbWord.Append(c);
                            System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                            FlushPendingWord(sbWord, ht);
                        }
                    }

                    //The current character starts the next word / n-gram
                    sbWord.Append(c);
                    LastToken = TokenTypes.CJK;
                }
                else
                {
                    //Word boundary
                    FlushPendingWord(sbWord, ht);
                    LastToken = TokenTypes.Separator;
                }
            }
        }

        //Flush out the last word
        FlushPendingWord(sbWord, ht);
    }

    //bail early if there is nothing indexed
    if (ht.Count == 0) return "";

    //Snapshot of the stop words for constant time lookups below
    HashSet<string> stopWords = new HashSet<string>(stopList);

    //NOTE(review): AsXML is not declared in this method's signature; it must
    //resolve to a member declared elsewhere in the class - confirm.
    if (AsXML)
    {
        //Make a return xml fragment from the word list
        StringBuilder sb = new StringBuilder();
        using (System.IO.StringWriter sr = new System.IO.StringWriter(sb))
        using (System.Xml.XmlTextWriter w = new System.Xml.XmlTextWriter(sr))
        {
            w.Formatting = System.Xml.Formatting.Indented;
            w.WriteStartElement("Items");

            foreach (DictionaryEntry d in ht)
            {
                string word = d.Key.ToString();

                //Add only non stopwords
                if (!stopWords.Contains(word))
                {
                    w.WriteStartElement("i");
                    w.WriteAttributeString("w", word);
                    w.WriteEndElement();
                }
            }

            w.WriteEndElement();
        }
        return sb.ToString();
    }

    //Make a comma delimited return string from the word list
    StringBuilder sbResults = new StringBuilder();
    foreach (DictionaryEntry d in ht)
    {
        string word = d.Key.ToString();

        //Add only non stopwords
        if (!stopWords.Contains(word))
        {
            sbResults.Append(word);
            sbResults.Append(",");
        }
    }

    //sometimes all the results are stop words so you end up
    //here with nothing in sbResults, which yields an empty string
    return sbResults.ToString().TrimEnd(',');
}

/// <summary>
/// Adds the word accumulated in the buffer (if any) to the unique word table
/// and empties the buffer.
/// </summary>
private static void FlushPendingWord(StringBuilder word, Hashtable words)
{
    if (word.Length == 0) return;

    string key = word.ToString();
    if (!words.ContainsKey(key))
    {
        words[key] = 1;
    }
    word.Length = 0;
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
|
||||||
|
/// <summary>
/// Search related settings gathered for one locale: its CJK indexing flag
/// and its stop words.
/// </summary>
public class LocaleSearchData
{
    /// <summary>CJK indexing flag for the locale; false until set.</summary>
    public bool CJKIndex { get; set; } = false;

    /// <summary>Stop words for the locale; starts as an empty list.</summary>
    public List<string> StopWords { get; set; } = new List<string>();
}
|
||||||
|
|
||||||
|
|
||||||
}//eoc
|
}//eoc
|
||||||
|
|
||||||
}//eons
|
}//eons
|
||||||
Reference in New Issue
Block a user