443 lines
17 KiB
C#
443 lines
17 KiB
C#
using System;
|
|
using System.Globalization;
|
|
using System.Text;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using Newtonsoft.Json.Linq;
|
|
using Microsoft.Extensions.Logging;
|
|
using AyaNova.Util;
|
|
using AyaNova.Models;
|
|
|
|
|
|
namespace AyaNova.Biz
|
|
{
|
|
|
|
//This class handles word breaking, processing keywords and searching for results
|
|
public static class Search
|
|
{
|
|
|
|
//Initial keyword indexing consists of
|
|
//WordBreaker - break down into words
|
|
//ProcessKeywords into database
|
|
|
|
|
|
/// <summary>
/// Process the keywords into the dictionary.
/// NOTE: this is currently a stub. The body contains no statements, so calling it has no effect
/// and none of the parameters are read yet.
/// </summary>
/// <param name="ct">Database context (unused by the current stub)</param>
/// <param name="localeId">Locale id (unused by the current stub)</param>
/// <param name="objectID">Id of the object being indexed (unused by the current stub)</param>
/// <param name="objectType">Type of the object being indexed (unused by the current stub)</param>
/// <param name="newRecord">Presumably true when the object was just created - TODO confirm once implemented</param>
/// <param name="keyWords">Keyword text (unused by the current stub)</param>
/// <param name="name">Object name (unused by the current stub)</param>
public static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string keyWords, string name)
{
    //TODO: not implemented yet.
    //Per the notes at the top of this class the intended flow is:
    //  WordBreaker - break down into words
    //  ProcessKeywords into database

    //Get CJK index bool flag.
    //TODO: should this be a property of the locale or a global setting as before??
    //If it's a locale property, it could be stored as just another word in the locale dictionary
    //rather than getting into other aspects, or maybe it belongs as a bool value on the locale record itself?
}
|
|
|
|
|
|
/// <summary>
/// Get the search settings for a locale: its stop words and its CJK index flag.
/// </summary>
/// <param name="localeId">Locale to look up; the stop word query uses the id returned by LocaleBiz.EnsuredLocaleIdStatic</param>
/// <param name="ct">Database context; when null ServiceProviderProvider.DBContext is used instead</param>
/// <returns>A populated LocaleSearchData</returns>
private static LocaleSearchData GetLocaleSearchData(long localeId, AyContext ct = null)
{
    //Fall back to the shared context when the caller did not supply one
    AyContext context = ct ?? ServiceProviderProvider.DBContext;

    //Validate locale id, if not right then use default instead
    var request = new Api.Controllers.LocaleController.LocaleSubsetParam
    {
        LocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, context)
    };

    //The stop words are spread over seven locale keys
    string[] stopWordKeys =
    {
        "StopWords1",
        "StopWords2",
        "StopWords3",
        "StopWords4",
        "StopWords5",
        "StopWords6",
        "StopWords7"
    };
    foreach (string key in stopWordKeys)
    {
        request.Keys.Add(key);
    }

    var searchData = new LocaleSearchData();

    //NOTE: blocks on the task result because this method is synchronous
    foreach (KeyValuePair<string, string> entry in LocaleBiz.GetSubsetStatic(request).Result)
    {
        //Each stopwords locale key is a space delimited list of words;
        //an empty locale string (i.e. StopWords7) is stored as a single question mark
        if (entry.Value == "?")
        {
            continue;
        }
        searchData.StopWords.AddRange(entry.Value.Split(" "));
    }

    searchData.CJKIndex = LocaleBiz.GetCJKIndex(localeId, context).Result;
    return searchData;
}
|
|
|
|
|
|
|
|
|
|
#region Breaker
|
|
|
|
/// <summary>
/// Classification of the most recently processed character while word breaking
/// </summary>
public enum TokenTypes
{
    /// <summary>No character processed yet (state at the start of each input string)</summary>
    Nothing,
    /// <summary>Last character was a word boundary</summary>
    Separator,
    /// <summary>Last character was accepted by the CJK branch of the tokenizer</summary>
    CJK,
    /// <summary>Last character was accepted by the latin / regular branch of the tokenizer</summary>
    Latin
}
|
|
|
|
/// <summary>
/// Break 0 or more strings of text down into a single string of
/// unique, lowercase, comma delimited keywords suitable for passing to a
/// stored procedure or other function.
///
/// Wildcard characters are NOT preserved (see BreakSearchPhrase for that).
///
/// The locale setting CJKIndex=true switches on handling for Chinese, Japanese, Korean etc
/// (languages with no easily identifiable word boundaries as in english)
/// </summary>
/// <param name="localeId">Locale whose stop words and CJKIndex setting apply</param>
/// <param name="text">An array of 0 to * strings of text</param>
/// <returns>Comma delimited keywords as produced by BreakCore</returns>
internal static string Break(long localeId, params string[] text) => BreakCore(localeId, false, text);
|
|
|
|
/// <summary>
/// Break a user's search phrase into keywords, keeping any
/// wildcard (%) characters the user entered
/// </summary>
/// <param name="localeId">Locale whose stop words and CJKIndex setting apply</param>
/// <param name="text">An array of 0 to * strings of search text</param>
/// <returns>Comma delimited keywords as produced by BreakCore</returns>
internal static string BreakSearchPhrase(long localeId, params string[] text) => BreakCore(localeId, true, text);
|
|
|
|
/// <summary>
/// Stop words list reset upon login or editing of localized text
/// used for eliminating noise words from search dictionary.
///
/// NOTE(review): nothing in this class assigns this field and BreakCore does not read it;
/// BreakCore filters with the per-locale stop words returned by GetLocaleSearchData.
/// The field is kept so that any external code referencing it keeps compiling.
/// </summary>
public static System.Collections.Generic.List<string> StopList = null;

/// <summary>
/// Core word breaker: splits the supplied text into unique keywords and returns them
/// as a single comma delimited string, in order of first appearance, with the locale's
/// stop words removed.
///
/// Regular mode (locale CJKIndex false): a word is a run of letters / digits
/// (plus % when KeepWildCards is true), lower cased; anything else is a word boundary.
///
/// CJK mode (locale CJKIndex true): characters below code 256 are handled as in regular mode;
/// other letters are indexed as overlapping 2 character n-grams, and the last character
/// of each such run is also indexed on its own.
///
/// Words are cut at 255 characters; the remainder starts a new word.
/// </summary>
/// <param name="localeId">Locale whose stop words and CJKIndex setting apply</param>
/// <param name="KeepWildCards">True to treat % as part of a word (search phrases)</param>
/// <param name="text">0 to * strings of text; null or empty entries are skipped</param>
/// <returns>Comma delimited keywords, or an empty string when nothing survives</returns>
internal static string BreakCore(long localeId, bool KeepWildCards, params string[] text)
{
    //Longest keyword kept as a single word
    const int MaxWordLength = 255;

    //Nothing to break (a null array previously crashed in the foreach below)
    if (text == null || text.Length == 0)
    {
        return "";
    }

    //Get stopwords and CJKIndex flag value
    LocaleSearchData LSD = GetLocaleSearchData(localeId);

    //Parsed words in order of first appearance; the set gives cheap uniqueness checks
    List<string> parsedWords = new List<string>();
    HashSet<string> seenWords = new HashSet<string>();

    //Word currently being accumulated
    StringBuilder sbWord = new StringBuilder();

    //Move the current word (if any) into the parsed list, once only, and start a fresh word
    void FlushWord()
    {
        if (sbWord.Length == 0)
        {
            return;
        }
        string word = sbWord.ToString();
        if (seenWords.Add(word))
        {
            parsedWords.Add(word);
        }
        sbWord.Length = 0;
    }

    //Loop through each of the passed in strings
    foreach (string s in text)
    {
        if (string.IsNullOrEmpty(s))
        {
            continue;
        }

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
        TokenTypes lastToken = TokenTypes.Nothing;

        //Process each "character" (text element, glyph whatever) in the current string
        while (t.MoveNext())
        {
            //Only the first UTF-16 code unit of the text element is examined;
            //a surrogate is not a letter or digit so it ends up as a word boundary
            char c = t.GetTextElement()[0];

            //In CJK mode anything outside the first 256 code points takes the n-gram path
            //so that regular english text within cjk text still gets indexed as whole words.
            //see: http://www.unicode.org/charts/index.html
            bool cjkCharacter = LSD.CJKIndex && (int)c >= 256;

            //Is it a token we want to include? (the % wildcard is below 256 so it can never be a CJK character)
            bool include = cjkCharacter
                ? char.IsLetter(c)
                : (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'));

            if (!include)
            {
                //Word boundary: whatever has accumulated is a complete word
                lastToken = TokenTypes.Separator;
                FlushWord();
                continue;
            }

            if (cjkCharacter)
            {
                if (sbWord.Length > 0)
                {
                    if (lastToken == TokenTypes.Latin || sbWord.Length >= MaxWordLength)
                    {
                        //going from latin TO CJK, or over the word length limit
                        FlushWord();
                    }
                    else if (lastToken == TokenTypes.CJK)
                    {
                        //previous CJK character is still buffered: append the current one
                        //and flush the resultant 2 character n-gram
                        sbWord.Append(c);
                        System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                        FlushWord();
                    }
                }

                //current character starts the next word / n-gram
                sbWord.Append(c);
                lastToken = TokenTypes.CJK;
            }
            else
            {
                //All latin text is converted to lower case
                c = char.ToLower(c);

                //flush first when going from CJK to latin or when over the word length limit
                if (sbWord.Length > 0 && (lastToken == TokenTypes.CJK || sbWord.Length >= MaxWordLength))
                {
                    FlushWord();
                }

                sbWord.Append(c);
                lastToken = TokenTypes.Latin;
            }
        }

        //Flush out the last word
        FlushWord();
    }

    //bail early if there is nothing indexed
    if (parsedWords.Count == 0)
    {
        return "";
    }

    //Add only non stopwords, using the stop words fetched for this locale
    //(the static StopList field is never populated in this class and caused a null reference here)
    HashSet<string> stopWords = new HashSet<string>(LSD.StopWords);
    List<string> keywords = parsedWords.FindAll(word => !stopWords.Contains(word));

    //sometimes all the results are stop words so you end up
    //here with nothing, in which case this is an empty string
    return string.Join(",", keywords);
}
|
|
|
|
#endregion
|
|
|
|
|
|
/// <summary>
/// Per-locale settings used by the word breaker
/// </summary>
public class LocaleSearchData
{
    /// <summary>
    /// True when text should be broken with the CJK tokenizer; starts out false
    /// </summary>
    public bool CJKIndex { get; set; } = false;

    /// <summary>
    /// Stop words for the locale; starts out as an empty list
    /// </summary>
    public List<string> StopWords { get; set; } = new List<string>();
}
|
|
|
|
|
|
}//eoc
|
|
|
|
}//eons |