Files
raven/server/AyaNova/biz/Search.cs
2018-09-19 22:35:47 +00:00

668 lines
26 KiB
C#

using System;
using System.Linq;
using System.Globalization;
using System.Text;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using Newtonsoft.Json.Linq;
using Microsoft.Extensions.Logging;
using Microsoft.EntityFrameworkCore;
using AyaNova.Util;
using AyaNova.Models;
namespace AyaNova.Biz
{
//This class handles word breaking, processing keywords and searching for results
public static class Search
{
/*
ISSUES:
none at the moment
*/
#region Search and return results
/*
Requirements:
INPUT PARAMETERS
- Search phrase (with wildcard support)
- Can be empty if tags are specified, no tags and no phrase is an error condition
- ObjectType: only return results for objects of this type
- InName: flag that indicates only search in names
- Tag ids that are also on result objects
- Can be empty if a phrase is specified
ACTION
Find search matches, then find tag matches then intersect, then sort and return
Filter OUT results that user is not permitted to read
//TODO: proper testing of searching
- SAMPLE DATA: Need a huge amount of sample data indexed to load test it
- INDEXES: play with it and see what works best
OUTPUT FORMAT
- No localized text, up to client
- Name of object in return result
- Object Type and ID in return result
- Group results by object type, then by object ID descending which will result in natural most recently created order
result:[
{
name:"blah",
type:2,
id:210
},
]
*/
/// <summary>
/// Holds the parameters of a search request.
/// A request is valid when it carries a non-blank phrase or at least one tag id.
/// </summary>
public class SearchRequestParameters
{
    /// <summary>Search phrase; may be empty when tags are specified</summary>
    public string Phrase { get; set; }

    /// <summary>Flag indicating the search should only look in names</summary>
    public bool NameOnly { get; set; }

    /// <summary>Only return results for objects of this type (defaults to AyaType.NoType)</summary>
    public AyaType TypeOnly { get; set; }

    /// <summary>Tag ids that must also be on result objects; may be empty when a phrase is specified</summary>
    public List<long> Tags { get; set; }

    public SearchRequestParameters()
    {
        NameOnly = false;
        TypeOnly = AyaType.NoType;
        Tags = new List<long>();
    }

    /// <summary>
    /// True when the request has a non-blank phrase or at least one tag
    /// </summary>
    public bool IsValid
    {
        get
        {
            //has a phrase?
            if (!string.IsNullOrWhiteSpace(this.Phrase))
                return true;

            //has tags?
            //Tags has a public setter so it can have been replaced with null;
            //treat that the same as "no tags" rather than throwing
            return this.Tags != null && this.Tags.Count > 0;
        }
    }
}
/// <summary>
/// A single search result: the name of the matching object
/// together with its object type and id
/// </summary>
public class SearchResult
{
/// <summary>Name of the object in the result</summary>
public string Name { get; set; }
/// <summary>Object type of the result</summary>
public AyaType Type { get; set; }
/// <summary>Id of the result object</summary>
public long Id { get; set; }
}
/// <summary>
/// Break the search phrase into terms and look up the matching search dictionary words.
///
/// NOTE: this method is still a work in progress: the dictionary matches are collected
/// but not yet resolved into search keys / tags, so the returned list is currently always empty.
/// </summary>
/// <param name="ct">Database context used for the dictionary queries</param>
/// <param name="localeId">Locale used when breaking the phrase into terms</param>
/// <param name="searchParameters">Search request; must contain a phrase or tags</param>
/// <returns>List of search results (currently always empty, see note above)</returns>
/// <exception cref="System.ArgumentNullException">searchParameters is null</exception>
/// <exception cref="System.ArgumentException">searchParameters contains neither a phrase nor tags</exception>
public static async Task<List<SearchResult>> DoSearch(AyContext ct, long localeId, SearchRequestParameters searchParameters)
{
    if (searchParameters == null)
    {
        throw new System.ArgumentNullException(nameof(searchParameters));
    }

    List<SearchResult> ResultList = new List<SearchResult>();
    if (!searchParameters.IsValid)
    {
        throw new System.ArgumentException("Search::DoSearch - Search request parameters must contain a phrase or tags");
    }

    //SPLIT OUT WILDCARDS FROM NON WILDCARDS
    List<string> WildCardSearchTerms = new List<string>();
    List<string> RegularSearchTerms = new List<string>();

    //IF PHRASE SPECIFIED
    //(a request with tags only is valid and has no phrase to process)
    if (!string.IsNullOrWhiteSpace(searchParameters.Phrase))
    {
        //Replace wildcard * with % as breakcore expects sql style wildcards
        //Work on a local copy so the caller's request object is left untouched
        string Phrase = searchParameters.Phrase.Replace("*", "%");

        //BREAK SEARCH PHRASE INTO SEPARATE TERMS
        foreach (string PhraseItem in BreakSearchPhrase(localeId, Phrase))
        {
            if (PhraseItem.Contains("%"))
                WildCardSearchTerms.Add(PhraseItem);
            else
                RegularSearchTerms.Add(PhraseItem);
        }
    }

    //GET LIST OF DICTIONARY ENTRIES THAT MATCH REGULAR SEARCH TERMS
    List<SearchDictionary> RegularMatches = new List<SearchDictionary>();
    if (RegularSearchTerms.Count > 0)
        RegularMatches = await ct.SearchDictionary.Where(m => RegularSearchTerms.Contains(m.Word)).ToListAsync();

    //GET LIST OF DICTIONARY ENTRIES THAT MATCH WILDCARD SEARCH TERMS
    //Each term is already a sql style pattern (letters, digits and % only) so it is
    //handed to LIKE as is. This covers "starts with" (abc%), "ends with" (%abc),
    //"contains" (%abc%) and wildcards in the middle of a term (a%c) in one query shape.
    List<SearchDictionary> WildCardMatches = new List<SearchDictionary>();
    foreach (string WildCardSearchTerm in WildCardSearchTerms)
    {
        WildCardMatches.AddRange(await ct.SearchDictionary.Where(m => EF.Functions.Like(m.Word, WildCardSearchTerm)).ToListAsync());
    }

    //SEARCH SEARCHKEY FOR MATCHING WORDS AND OPTIONALLY TYPE AND INNAME
    //IF TAGS SPECIFIED
    //LOOP THROUGH SEARCHKEY MATCHES
    //FOREACH OBJECT SEARCH TAGMAP FOR MATCHING OBJECTTYPE AND ID
    //REMOVE RESULTS FROM SEARCH PHRASE PHASE THAT ARE NOT MATCHING
    return ResultList;
}
#endregion dosearch
#region ProcessKeywords into Database
/// <summary>
/// Index the keywords of a newly created object.
/// Forwards to ProcessKeywords with newRecord set to true.
/// </summary>
public static void ProcessNewObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
    => ProcessKeywords(ct, localeId, objectID, objectType, newRecord: true, name: name, text: text);
/// <summary>
/// Re-index the keywords of an updated object.
/// Forwards to ProcessKeywords with newRecord set to false.
/// </summary>
public static void ProcessUpdatedObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
    => ProcessKeywords(ct, localeId, objectID, objectType, newRecord: false, name: name, text: text);
/// <summary>
/// Delete the asearchkey rows stored for the given object id and object type
/// </summary>
public static void ProcessDeletedObjectKeywords(AyContext ct, long objectID, AyaType objectType)
{
    //Keep the interpolated values numeric: if ToString is put on the end of an object
    //in the string interpolation the npgsql driver will assume it's a string and put
    //quotes around it, triggering an error that a string can't be compared to an int
    int objectTypeValue = (int)objectType;

    //The interpolated string must be passed directly to ExecuteSqlCommand (not via a string variable)
    ct.Database.ExecuteSqlCommand($"delete from asearchkey where objectid={objectID} and objecttype={objectTypeValue}");
}
/// <summary>
/// Process the keywords into the dictionary
/// NOTE: NAME parameter is in ADDITION to the NAME also being one of the strings passed in text parameter
/// </summary>
/// <param name="ct">Database context used for the dictionary and search key tables</param>
/// <param name="localeId">Locale used when breaking the text into keywords</param>
/// <param name="objectID">Id of the object being indexed</param>
/// <param name="objectType">Type of the object being indexed</param>
/// <param name="newRecord">False to first delete the existing search keys for the object</param>
/// <param name="name">Name of the object; used to flag which keywords occur in the name</param>
/// <param name="text">All the text to index for the object</param>
private static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string name, params string[] text)
{
    //IF NOT NEW, DELETE ALL EXISTING ENTRIES FOR OBJECT TYPE AND ID
    if (!newRecord)
    {
        ProcessDeletedObjectKeywords(ct, objectID, objectType);
    }

    //BREAK STRING ARRAY INTO KEYWORD LIST
    List<string> KeyWordList = Break(localeId, text);

    //BREAK NAME STRING (as a set, it is only ever used for membership checks)
    HashSet<string> NameKeyWords = new HashSet<string>(Break(localeId, name));

    //EARLY EXIT IF NO KEYWORDS OR NAME RECORD TO PROCESS
    if (KeyWordList.Count == 0 && string.IsNullOrWhiteSpace(name))
    {
        return;
    }

    //BUILD A LIST OF MatchingDictionaryEntry items FOR THE MATCHING WORDS
    List<MatchingDictionaryEntry> MatchingKeywordIdList = new List<MatchingDictionaryEntry>();

    //SEARCH IN THE SEARCHDICTIONARY TABLE AND COLLECT ID'S OF ANY PRE-EXISTING IN DB KEYWORDS
    var ExistingKeywordMatches = ct.SearchDictionary.Where(m => KeyWordList.Contains(m.Word)).ToDictionary(m => m.Id, m => m.Word);
    HashSet<string> ExistingWords = new HashSet<string>(ExistingKeywordMatches.Values);

    //Put the matching keyword ID's into the list
    foreach (KeyValuePair<long, string> K in ExistingKeywordMatches)
    {
        MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = K.Key, InName = NameKeyWords.Contains(K.Value) });
    }

    //ADD THE KEYWORDS THAT DO *NOT* HAVE MATCHES IN THE SEARCHDICTIONARY TO THE SEARCH DICTIONARY
    //Keep hold of exactly the entities added here: the context's Local collection can also
    //contain dictionary entries tracked by the query above or by earlier work on the same
    //context, and those must not be turned into (duplicate or unrelated) search keys
    List<SearchDictionary> NewWords = new List<SearchDictionary>();
    foreach (string KeyWord in KeyWordList)
    {
        if (!ExistingWords.Contains(KeyWord))
        {
            SearchDictionary NewWord = new SearchDictionary() { Word = KeyWord };
            ct.SearchDictionary.Add(NewWord);
            NewWords.Add(NewWord);
        }
    }

    //Save the context in order to get the id's of the new words added
    ct.SaveChanges();

    //Now add the id's of the newly created words to the matching keyword id list for this object
    foreach (SearchDictionary SD in NewWords)
    {
        MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = SD.Id, InName = NameKeyWords.Contains(SD.Word) });
    }

    //CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
    foreach (MatchingDictionaryEntry E in MatchingKeywordIdList)
    {
        ct.SearchKey.Add(new SearchKey() { WordId = E.DictionaryId, InName = E.InName, ObjectId = objectID, ObjectType = objectType });
    }
    ct.SaveChanges();
}//eoc
/// <summary>
/// Temporary holder for a matching search dictionary id
/// and whether the matching word is in the object's name
/// </summary>
public class MatchingDictionaryEntry
{
    /// <summary>True when the word is one of the name keywords; defaults to false</summary>
    public bool InName { get; set; } = false;

    /// <summary>Id of the search dictionary word; defaults to -1</summary>
    public long DictionaryId { get; set; } = -1;
}
#endregion
#region Breaker
/// <summary>
/// Holds the locale data relevant to breaking text:
/// the CJKIndex flag and the list of stop words
/// </summary>
public class LocaleWordBreakingData
{
    /// <summary>CJK index flag for the locale; defaults to false</summary>
    public bool CJKIndex { get; set; } = false;

    /// <summary>Stop words for the locale; defaults to an empty list</summary>
    public List<string> StopWords { get; set; } = new List<string>();
}
/// <summary>
/// Get the stop words and CJKIndex flag for the user's locale
/// </summary>
/// <param name="localeId">Locale to fetch the data for</param>
/// <param name="ct">Database context; when null ServiceProviderProvider.DBContext is used</param>
/// <returns>Word breaking data for the locale</returns>
private static LocaleWordBreakingData GetLocaleSearchData(long localeId, AyContext ct = null)
{
    ct = ct ?? ServiceProviderProvider.DBContext;

    //Get stopwords
    //Validate locale id, if not right then use default instead
    var Param = new Api.Controllers.LocaleController.LocaleSubsetParam();
    Param.LocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, ct);
    string[] StopWordKeys =
    {
        "StopWords1",
        "StopWords2",
        "StopWords3",
        "StopWords4",
        "StopWords5",
        "StopWords6",
        "StopWords7"
    };
    foreach (string StopWordKey in StopWordKeys)
    {
        Param.Keys.Add(StopWordKey);
    }

    //NOTE: blocks on the async locale lookup because the callers are synchronous
    var Stops = LocaleBiz.GetSubsetStatic(Param).Result;

    LocaleWordBreakingData Data = new LocaleWordBreakingData();
    foreach (KeyValuePair<string, string> kvp in Stops)
    {
        //Each stopwords locale key is a space delimited list of words; an empty
        //locale string (i.e. StopWords7) has a single question mark as its value
        if (kvp.Value == "?")
        {
            continue;
        }
        Data.StopWords.AddRange(kvp.Value.Split(" "));
    }

    //NOTE(review): this uses the raw localeId while the stop words use the ensured id - confirm that is intended
    Data.CJKIndex = LocaleBiz.GetCJKIndex(localeId, ct).Result;
    return Data;
}
/// <summary>
/// Kind of the last token seen by the word breaker
/// </summary>
public enum TokenTypes
{
    Nothing,
    Separator,
    CJK,
    Latin
}
/// <summary>
/// Break an array of strings into a list of keywords for indexing.
/// Wildcard characters are not kept; see BreakSearchPhrase for the
/// variant used on a user's search phrase.
///
/// Use Locale setting CJKIndex=true to handle Chinese, Japanese, Korean etc
/// (languages with no easily identifiable word boundaries as in english)
/// </summary>
/// <param name="localeId">Locale used for the word breaking settings</param>
/// <param name="text">An array of 0 to * strings of text</param>
/// <returns>List of strings</returns>
internal static List<string> Break(long localeId, params string[] text)
    => BreakCore(localeId, KeepWildCards: false, text: text);
/// <summary>
/// Break a user's search phrase into a list of search terms,
/// preserving any wildcards entered
/// </summary>
/// <param name="localeId">Locale used for the word breaking settings</param>
/// <param name="text">The search phrase text</param>
/// <returns>List of strings</returns>
internal static List<string> BreakSearchPhrase(long localeId, params string[] text)
    => BreakCore(localeId, KeepWildCards: true, text: text);
/// <summary>
/// Break the passed in strings into a list of unique keywords with the
/// locale's stop words removed.
///
/// Without the locale's CJKIndex flag, runs of letters and digits become lower cased words.
/// With the CJKIndex flag, characters with a code below 256 are handled the same way
/// (so latin text inside CJK text is indexed as whole words) while all other letters
/// are indexed as overlapping 2 character n-grams.
/// Words are cut at 255 characters.
/// </summary>
/// <param name="localeId">Locale used to look up the stop words and the CJKIndex flag</param>
/// <param name="KeepWildCards">When true the % wildcard character is kept as part of a word</param>
/// <param name="text">Strings to break; a null array and null or empty entries are skipped</param>
/// <returns>Unique keywords in order of first appearance; empty when nothing indexable was found</returns>
internal static List<string> BreakCore(long localeId, bool KeepWildCards, params string[] text)
{
    List<string> ReturnList = new List<string>();

    //Nothing to break (a null array would otherwise throw in the loop below)
    if (text == null) return ReturnList;

    //Get stopwords and CJKIndex flag value
    LocaleWordBreakingData LocaleSearchData = GetLocaleSearchData(localeId);
    const int MAXWORDLENGTH = 255;

    //Parsed words in order of first appearance,
    //plus a set used to cheaply ensure unique words only
    List<string> ParsedWords = new List<string>();
    HashSet<string> SeenWords = new HashSet<string>();
    StringBuilder sbWord = new StringBuilder();

    //Loop through each of the passed in strings
    foreach (string s in text)
    {
        if (string.IsNullOrEmpty(s)) continue;

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
        TokenTypes LastToken = TokenTypes.Nothing;

        //Process each "character" (text element, glyph whatever) in the current string
        while (t.MoveNext())
        {
            //get it as a character
            //NOTE: only the first char of the text element is looked at
            char c = t.GetTextElement()[0];

            //In CJK mode anything outside the first 256 code points is treated as CJK;
            //we need to know this so that regular english text within cjk text
            //gets properly indexed as whole words
            //see: http://www.unicode.org/charts/index.html
            bool IsCJKChar = LocaleSearchData.CJKIndex && (int)c >= 256;

            //Is it a token we want to include? (the % wildcard is below 256 so never a CJK char)
            bool IsWordChar = IsCJKChar
                ? char.IsLetter(c)
                : (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'));

            if (!IsWordChar)
            {
                //Word boundary token
                LastToken = TokenTypes.Separator;
                FlushWord(sbWord, SeenWords, ParsedWords);
                continue;
            }

            if (IsCJKChar)
            {
                if (sbWord.Length > 0)
                {
                    if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                    {
                        //going from latin TO CJK (or over the word length limit) ends the current word
                        FlushWord(sbWord, SeenWords, ParsedWords);
                    }
                    else if (LastToken == TokenTypes.CJK)
                    {
                        //the last character was CJK so append the current one
                        //and flush the resultant 2 character n-gram
                        sbWord.Append(c);
                        System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                        FlushWord(sbWord, SeenWords, ParsedWords);
                    }
                }
                //the current character starts the next word / n-gram
                sbWord.Append(c);
                LastToken = TokenTypes.CJK;
            }
            else
            {
                //All latin text is converted to lower case
                c = char.ToLower(c);

                //Flush the current word first if we're at the word length limit
                //or (CJK mode only) we are going from CJK to latin
                if (sbWord.Length > 0 && (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH))
                {
                    FlushWord(sbWord, SeenWords, ParsedWords);
                }
                sbWord.Append(c);
                LastToken = TokenTypes.Latin;
            }
        }

        //Flush out the last word
        FlushWord(sbWord, SeenWords, ParsedWords);
    }

    //bail early if there is nothing indexed
    if (ParsedWords.Count == 0) return ReturnList;

    //Add only non stopwords
    HashSet<string> StopWords = new HashSet<string>(LocaleSearchData.StopWords);
    foreach (string Word in ParsedWords)
    {
        if (!StopWords.Contains(Word))
        {
            ReturnList.Add(Word);
        }
    }

    //sometimes all the results are stop words so you end up here with nothing
    return ReturnList;
}

//Move the word collected so far (if any) into the parsed word list, unless it was already seen, and clear the builder
private static void FlushWord(StringBuilder sbWord, HashSet<string> SeenWords, List<string> ParsedWords)
{
    if (sbWord.Length == 0) return;
    string Word = sbWord.ToString();
    if (SeenWords.Add(Word))
    {
        ParsedWords.Add(Word);
    }
    sbWord.Length = 0;
}
#endregion
}//eoc
}//eons