458 lines
19 KiB
C#
458 lines
19 KiB
C#
using System;
|
|
using System.Globalization;
|
|
using System.Text;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using Newtonsoft.Json.Linq;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using AyaNova.Util;
|
|
using AyaNova.Models;
|
|
|
|
|
|
namespace AyaNova.Biz
|
|
{
|
|
|
|
//This class handles word breaking, processing keywords and searching for results
|
|
public static class Search
|
|
{
|
|
|
|
//Initial keyword indexing consists of
|
|
//WordBreaker - break down into words
|
|
//ProcessKeywords into database
|
|
|
|
#region ProcessKeywords into Database
|
|
|
|
/// <summary>
/// Index the keywords of a newly created object.
/// Forwards to ProcessKeywords with the new-record flag set, so no existing
/// search keys are deleted first.
/// </summary>
/// <param name="ct">Database context handed through to ProcessKeywords</param>
/// <param name="localeId">Locale id handed through to the word breaker</param>
/// <param name="objectID">Id of the object being indexed</param>
/// <param name="objectType">Type of the object being indexed</param>
/// <param name="name">Name of the object, passed separately from the rest of the text</param>
/// <param name="text">0 to * strings of text to break into keywords</param>
public static void ProcessNewObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
    => ProcessKeywords(ct, localeId, objectID, objectType, true, name, text);
|
|
|
|
/// <summary>
/// Re-index the keywords of an object that has been updated.
/// Forwards to ProcessKeywords with the new-record flag cleared, which makes it
/// delete the object's existing search keys before indexing again.
/// </summary>
/// <param name="ct">Database context handed through to ProcessKeywords</param>
/// <param name="localeId">Locale id handed through to the word breaker</param>
/// <param name="objectID">Id of the object being indexed</param>
/// <param name="objectType">Type of the object being indexed</param>
/// <param name="name">Name of the object, passed separately from the rest of the text</param>
/// <param name="text">0 to * strings of text to break into keywords</param>
public static void ProcessUpdatedObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
    => ProcessKeywords(ct, localeId, objectID, objectType, false, name, text);
|
|
|
|
/// <summary>
/// Remove every asearchkey row recorded for the given object.
/// </summary>
/// <param name="ct">Database context the delete statement is executed on</param>
/// <param name="objectID">Id of the object whose search keys are removed</param>
/// <param name="objectType">Type of the object; sent to the database as its integer value</param>
public static void ProcessDeletedObjectKeywords(AyContext ct, long objectID, AyaType objectType)
{
    //Keep the enum as a number: the statement compares it to an integer column value
    int objectTypeValue = (int)objectType;

    //CAUTION (original author's note): do not append ToString() to the interpolated values.
    //The npgsql driver would then treat them as strings and quote them, triggering an error
    //that a string can't be compared to an int.
    //The interpolated string must also be passed directly in the call (not stored in a string
    //variable first) so that the individual values stay visible to ExecuteSqlCommand.
    ct.Database.ExecuteSqlCommand($"delete from asearchkey where objectid={objectID} and objecttype={objectTypeValue}");
}
|
|
|
|
|
|
/// <summary>
/// Process the keywords of an object into the search dictionary.
/// NOT FULLY IMPLEMENTED YET: currently this only clears the existing keys of an
/// updated object and breaks the text into keywords; nothing is written.
/// </summary>
/// <param name="ct">Database context</param>
/// <param name="localeId">Locale id handed to the word breaker</param>
/// <param name="objectID">Id of the object being indexed</param>
/// <param name="objectType">Type of the object being indexed</param>
/// <param name="newRecord">True for a brand new object; false when existing keys must be removed first</param>
/// <param name="name">Name of the object, to be handled separately from the text</param>
/// <param name="text">0 to * strings of text to break into keywords</param>
private static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string name, params string[] text)
{
    //TODO: Code this, using the method idea from the v7 code and adding handling of the
    //new InName flag and of Name separately.
    //Note: as initially coded in the widget test class the context is saved by the controller
    //(as is also done with the event log), so theoretically nothing is saved here.
    //If it turns out this code requires a save, the WidgetController calls into here
    //need to be re-done to account for that.

    //An existing object is re-indexed from scratch: remove all its current entries first
    if (newRecord == false)
        ProcessDeletedObjectKeywords(ct, objectID, objectType);

    //Break the string array into a keyword list
    List<string> keywords = Break(localeId, text);

    //Early exit if there are neither keywords nor a name record to process
    bool nothingToIndex = keywords.Count == 0 && string.IsNullOrWhiteSpace(name);
    if (nothingToIndex)
        return;

    //REMAINING STEPS (not coded yet):
    // 1. Iterate all the keywords, search in the search dictionary table and collect the
    //    ids of any keywords that already exist in the db
    // 2. Iterate the keywords that do *not* have matches in the search dictionary, add
    //    them to the search dictionary and collect their ids
    // 3. Create the search key records for all the keywords
}
|
|
#endregion
|
|
|
|
|
|
|
|
|
|
#region Breaker
|
|
|
|
/// <summary>
/// Holds the locale specific settings the word breaker needs.
/// A new instance starts with CJK indexing off and an empty (never null) stop word list.
/// </summary>
public class LocaleWordBreakingData
{
    /// <summary>True when text should be broken with the CJK tokenizer</summary>
    public bool CJKIndex { get; set; } = false;

    /// <summary>Words to be left out of the keyword results</summary>
    public List<string> StopWords { get; set; } = new List<string>();
}
|
|
|
|
/// <summary>
/// Load the word breaking settings for a locale: the stop words (locale keys
/// StopWords1 through StopWords7) and the CJKIndex flag.
/// Both lookups are asynchronous calls that are waited on synchronously here.
/// </summary>
/// <param name="localeId">Requested locale id; validated before the stop words are fetched</param>
/// <param name="ct">Database context; when null ServiceProviderProvider.DBContext is used</param>
/// <returns>Populated word breaking data</returns>
private static LocaleWordBreakingData GetLocaleSearchData(long localeId, AyContext ct = null)
{
    ct = ct ?? ServiceProviderProvider.DBContext;

    //Ask for the stop word keys of the locale.
    //The locale id is validated first; if it's not right the default is used instead.
    var subsetRequest = new Api.Controllers.LocaleController.LocaleSubsetParam();
    subsetRequest.LocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, ct);
    string[] stopWordKeys =
    {
        "StopWords1",
        "StopWords2",
        "StopWords3",
        "StopWords4",
        "StopWords5",
        "StopWords6",
        "StopWords7"
    };
    foreach (string key in stopWordKeys)
    {
        subsetRequest.Keys.Add(key);
    }

    var breakingData = new LocaleWordBreakingData();
    foreach (KeyValuePair<string, string> entry in LocaleBiz.GetSubsetStatic(subsetRequest).Result)
    {
        //Each stop words key holds a space delimited list of words; an empty one
        //(i.e. StopWords7) has a single question mark as its value and is skipped
        if (entry.Value == "?")
            continue;

        breakingData.StopWords.AddRange(entry.Value.Split(" "));
    }

    //NOTE(review): this uses the caller's localeId as passed in, not the validated id
    //used for the stop words above - confirm GetCJKIndex copes with an invalid id
    breakingData.CJKIndex = LocaleBiz.GetCJKIndex(localeId, ct).Result;
    return breakingData;
}
|
|
|
|
/// <summary>
/// Kind of the character most recently seen by the word breaker.
/// </summary>
public enum TokenTypes
{
    /// <summary>No character processed yet (state at the start of each string)</summary>
    Nothing,

    /// <summary>A character that is not indexed and therefore ends the current word</summary>
    Separator,

    /// <summary>An indexed character handled by the CJK branch of the tokenizer</summary>
    CJK,

    /// <summary>An indexed character handled as regular (whole word) text</summary>
    Latin
}
|
|
|
|
/// <summary>
/// Take an array of strings and return the unique keywords found in them
/// as a list, suitable for indexing.
/// Wildcard characters are NOT preserved (see BreakSearchPhrase for that).
///
/// Use Locale setting CJKIndex=true to handle Chinese, Japanese, Korean etc
/// (languages with no easily identifiable word boundaries as in english)
/// </summary>
/// <param name="localeId">Locale whose stop words and CJKIndex setting are used</param>
/// <param name="text">An array of 0 to * strings of text</param>
/// <returns>
/// List of keywords; an EMPTY list when there is nothing to index
/// (no text, or every word was a stop word)
/// </returns>
internal static List<string> Break(long localeId, params string[] text)
{
    string keywords = BreakCore(localeId, false, text);

    //Splitting an empty string yields one empty entry, not zero entries, which would
    //make callers testing Count == 0 believe there is a keyword to process
    if (string.IsNullOrEmpty(keywords))
    {
        return new List<string>();
    }

    return new List<string>(keywords.Split(','));
}
|
|
|
|
/// <summary>
/// Break a user's search phrase into keywords while preserving any
/// wildcard characters (%) the user entered.
/// </summary>
/// <param name="localeId">Locale id handed through to BreakCore</param>
/// <param name="text">0 to * strings making up the search phrase</param>
/// <returns>The string produced by BreakCore with wildcards kept</returns>
internal static string BreakSearchPhrase(long localeId, params string[] text)
    => BreakCore(localeId, true, text);
|
|
|
|
/// <summary>
/// Stop words list used for eliminating noise words from the search dictionary.
/// Per the original note it is reset upon login or editing of localized text;
/// that reset is not performed in this class.
/// Starts out null, so it must be populated (or null checked) before use.
/// </summary>
public static List<string> StopList;
|
|
|
|
/// <summary>
/// Core word breaker used by Break and BreakSearchPhrase.
///
/// Breaks every supplied string into keywords and returns the unique keywords, in
/// order of first appearance, as one comma delimited string with the locale's stop
/// words removed.
///
/// Regular locales (CJKIndex false): a keyword is an unbroken run of letters and
/// digits, lower cased. Every other character is a word boundary.
///
/// CJK locales (CJKIndex true): characters below code point 256 are broken into whole
/// words exactly as above so that regular text inside CJK text is indexed as words.
/// Letters at or above code point 256 are indexed as overlapping 2 character n-grams,
/// plus the last character of each run on its own (a run of three characters ABC
/// produces AB, BC and C). Non letters in that range are word boundaries.
///
/// A keyword never exceeds 255 characters; longer runs are cut into consecutive pieces.
/// </summary>
/// <remarks>
/// Stop words come from the locale data returned by GetLocaleSearchData. Earlier code
/// filtered against the static StopList field, which is null unless something outside
/// this method assigns it, so any text containing a word failed with a
/// NullReferenceException.
///
/// Lower casing is culture invariant so that the keywords do not depend on the
/// culture the server happens to run under.
///
/// Only the first UTF-16 char of each text element is examined, so anything after it
/// in the same element (e.g. combining marks) is not part of the keyword.
/// </remarks>
/// <param name="localeId">Locale whose stop words and CJKIndex setting are used</param>
/// <param name="KeepWildCards">True to treat % as part of a word (user search phrases)</param>
/// <param name="text">0 to * strings of text; null or empty entries are skipped</param>
/// <returns>
/// Comma delimited keywords, or an empty string when there is nothing to index
/// (no text, or every word was a stop word)
/// </returns>
internal static string BreakCore(long localeId, bool KeepWildCards, params string[] text)
{
    //A params array can be passed explicitly as null; treat it like "no text"
    if (text == null || text.Length == 0) return "";

    //Get stopwords and CJKIndex flag value
    LocaleWordBreakingData LSD = GetLocaleSearchData(localeId);

    const int MAXWORDLENGTH = 255;

    //Parsed words in order of first appearance; the set gives a cheap uniqueness check
    List<string> parsedWords = new List<string>();
    HashSet<string> seenWords = new HashSet<string>();

    //The word currently being built
    StringBuilder sbWord = new StringBuilder();

    //Loop through each of the passed in strings
    foreach (string s in text)
    {
        if (string.IsNullOrEmpty(s)) continue;

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
        TokenTypes LastToken = TokenTypes.Nothing;

        //Process each "character" (text element, glyph, whatever) in the current string
        while (t.MoveNext())
        {
            char c = t.GetTextElement()[0];
            TokenTypes token = ClassifyCharacter(c, LSD.CJKIndex, KeepWildCards);

            switch (token)
            {
                case TokenTypes.Latin:
                    //Start a new word if the current one is at the length limit
                    //or if we are going from CJK to latin
                    if (sbWord.Length >= MAXWORDLENGTH || (sbWord.Length > 0 && LastToken == TokenTypes.CJK))
                    {
                        FlushWord(sbWord, parsedWords, seenWords);
                    }
                    //All latin text is converted to lower case
                    sbWord.Append(char.ToLowerInvariant(c));
                    break;

                case TokenTypes.CJK:
                    if (sbWord.Length > 0)
                    {
                        if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                        {
                            //going from latin to CJK (or over the limit): finish that word
                            FlushWord(sbWord, parsedWords, seenWords);
                        }
                        else if (LastToken == TokenTypes.CJK)
                        {
                            //A single CJK character is pending: append the current one and
                            //flush the resulting 2 character n-gram. The current character is
                            //re-added below so it also starts the next n-gram.
                            sbWord.Append(c);
                            System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                            FlushWord(sbWord, parsedWords, seenWords);
                        }
                    }
                    sbWord.Append(c);
                    break;

                default:
                    //Word boundary: whatever has been collected so far is a complete word
                    FlushWord(sbWord, parsedWords, seenWords);
                    break;
            }

            LastToken = token;
        }

        //Flush out the last word; words never continue from one string into the next
        FlushWord(sbWord, parsedWords, seenWords);
    }

    //bail early if there is nothing indexed
    if (parsedWords.Count == 0) return "";

    //Build the return string from the word list, adding only non stop words.
    //Sometimes all the results are stop words so the result can still be empty.
    HashSet<string> stopWords = new HashSet<string>(LSD.StopWords ?? new List<string>());
    StringBuilder sbResults = new StringBuilder();
    foreach (string word in parsedWords)
    {
        if (stopWords.Contains(word)) continue;

        if (sbResults.Length > 0) sbResults.Append(',');
        sbResults.Append(word);
    }

    return sbResults.ToString();
}

//Decide how the word breaker treats a single character: part of a whole word (Latin),
//part of a CJK n-gram (CJK) or a word boundary (Separator)
private static TokenTypes ClassifyCharacter(char c, bool cjkIndex, bool keepWildCards)
{
    //With CJK indexing on, everything at or above code point 256 goes the n-gram route.
    //see: http://www.unicode.org/charts/index.html
    //The wildcard % is below 256 so it can never show up here.
    if (cjkIndex && c >= 256)
    {
        return char.IsLetter(c) ? TokenTypes.CJK : TokenTypes.Separator;
    }

    //Is it a token we want to include? Or a wildcard character
    if (char.IsLetterOrDigit(c) || (keepWildCards && c == '%'))
    {
        return TokenTypes.Latin;
    }

    return TokenTypes.Separator;
}

//Move the word being built (if any) into the parsed word list unless it is already
//there, then clear the builder ready for the next word
private static void FlushWord(StringBuilder sbWord, List<string> parsedWords, HashSet<string> seenWords)
{
    if (sbWord.Length == 0) return;

    string word = sbWord.ToString();
    if (seenWords.Add(word))
    {
        parsedWords.Add(word);
    }
    sbWord.Length = 0;
}
|
|
|
|
#endregion
|
|
|
|
|
|
|
|
|
|
|
|
}//eoc
|
|
|
|
}//eons |