1046 lines
42 KiB
C#
1046 lines
42 KiB
C#
using System.Linq;
|
|
using System.Globalization;
|
|
using System.Text;
|
|
using System.Collections.Generic;
|
|
using System.Threading.Tasks;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using AyaNova.Util;
|
|
using AyaNova.Models;
|
|
//using System.Diagnostics;
|
|
|
|
|
|
namespace AyaNova.Biz
|
|
{
|
|
|
|
//This class handles word breaking, processing keywords and searching for results
|
|
public static class Search
|
|
{
|
|
|
|
#region Search and return results
|
|
|
|
/// <summary>
/// Input for a search request: the phrase to look for, an optional type
/// restriction and a cap on the number of results.
/// </summary>
public class SearchRequestParameters
{
    /// <summary>Phrase to search for; the request is only valid when this is not blank.</summary>
    public string Phrase { get; set; }

    /// <summary>Type restriction for the search; defaults to <c>AyaType.NoType</c>.</summary>
    public AyaType TypeOnly { get; set; } = AyaType.NoType;

    /// <summary>Maximum number of results to return (default 500). Note: a value of 0 will get all results.</summary>
    public int MaxResults { get; set; } = 500;

    /// <summary>True when the request carries a non-blank phrase.</summary>
    public bool IsValid => !string.IsNullOrWhiteSpace(Phrase);
}
|
|
|
|
|
|
//Classes to hold search results returned to client
|
|
/// <summary>
/// One search hit as returned to the client.
/// </summary>
public class SearchResult
{
    /// <summary>Display name of the matching object.</summary>
    public string Name { get; set; }

    /// <summary>Type of the matching object.</summary>
    public AyaType Type { get; set; }

    /// <summary>Id of the matching object.</summary>
    public long Id { get; set; }
}
|
|
|
|
/// <summary>
/// Envelope returned to the client: the individual hits plus the total
/// number of matches found.
/// </summary>
public class SearchReturnObject
{
    /// <summary>Total number of matches found; starts at zero.</summary>
    public long TotalResultsFound { get; set; } = 0;

    /// <summary>The hits being returned; starts as an empty list, never null by default.</summary>
    public List<SearchResult> SearchResults { get; set; } = new List<SearchResult>();
}
|
|
|
|
|
|
/// <summary>
/// Search the keyword index for objects matching the phrase in
/// <paramref name="searchParameters"/> and return the ones the current user is allowed to read.
/// </summary>
/// <param name="ct">Database context used for every lookup.</param>
/// <param name="translationId">Translation whose word-break settings are used to split the phrase.</param>
/// <param name="currentUserRoles">Roles used to drop object types the user may not read.</param>
/// <param name="currentUserId">Memo hits are only kept when the memo is addressed to this user.</param>
/// <param name="searchParameters">Phrase, optional type restriction and result limit. The object is not modified.</param>
/// <returns>
/// The total number of readable matches and the (optionally capped) list of named results.
/// An empty result is returned when the request is null/invalid or nothing matches.
/// </returns>
public static async Task<SearchReturnObject> DoSearchAsync(AyContext ct, long translationId, AuthorizationRoles currentUserRoles, long currentUserId, SearchRequestParameters searchParameters)
{
    var ReturnObject = new SearchReturnObject();

    //An invalid request is expected, don't throw, just return nothing
    if (searchParameters == null || !searchParameters.IsValid)
    {
        return ReturnObject;
    }

    //Work on a local copy so the caller's parameter object is left untouched.
    //Literal percentage signs are first turned into the text "pctsym" so they survive word breaking
    //(they might be searching for 50% off or something), then the user wildcard * becomes %
    //because the breaker expects sql style wildcards.
    //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
    string phrase = searchParameters.Phrase.Replace("%", "pctsym").Replace("*", "%");

    //BREAK SEARCH PHRASE INTO SEPARATE TERMS
    var PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);

    //SPLIT OUT WILDCARDS FROM NON WILDCARDS
    List<string> PreWildCardedSearchTerms = new List<string>();
    List<string> SearchTerms = new List<string>();
    foreach (string PhraseItem in PhraseItems)
    {
        //put back literal percentage symbol if necessary
        //NOTE(review): the restored "\%" is compared as plain text below; dictionary words produced by the
        //breaker contain only letters and digits, so such a term presumably never matches - confirm intent.
        string restored = PhraseItem.Replace("pctsym", @"\%");
        if (PhraseItem.Contains("%"))
            PreWildCardedSearchTerms.Add(restored);
        else
            SearchTerms.Add(restored);
    }

    /*
        Design note kept from the original implementation: the goal is AND matching, i.e. only objects
        where ALL terms match, which in SQL form looks like:

        WITH qr AS (SELECT aSearchKey.ObjectID, aSearchKey.aType,
            count(*) FILTER (WHERE asearchdictionary.word like '%1qt%') AS "st1",
            count(*) FILTER (WHERE asearchdictionary.word like '%44%') AS "st2"
            FROM aSearchDictionary INNER JOIN aSearchKey ON aSearchDictionary.id = aSearchKey.WordID
            group by asearchkey.objectid, asearchkey.atype)
        select objectid, atype FROM qr
        WHERE st1 > 0 and st2 > 0
        order by atype, objectid
    */

    //List holder for matching dictionary ID's
    List<long> DictionaryMatches = new List<long>();

    //GET LIST OF DICTIONARY ID'S THAT MATCH REGULAR SEARCH TERMS
    foreach (string Term in SearchTerms)
    {
        DictionaryMatches.AddRange(await ct.SearchDictionary.Where(z => z.Word == Term).Select(z => z.Id).ToListAsync());
    }

    //GET LIST OF DICTIONARY ID'S THAT MATCH WILDCARD SEARCH TERMS
    foreach (string WildCardSearchTerm in PreWildCardedSearchTerms)
    {
        bool leadingWildCard = WildCardSearchTerm.StartsWith("%");
        bool trailingWildCard = WildCardSearchTerm.EndsWith("%");
        string fragment = WildCardSearchTerm.Replace("%", "");

        if (leadingWildCard && trailingWildCard)//CONTAINS?
        {
            DictionaryMatches.AddRange(await ct.SearchDictionary.Where(z => z.Word.Contains(fragment)).Select(z => z.Id).ToListAsync());
        }
        else if (trailingWildCard)//STARTS WITH?
        {
            DictionaryMatches.AddRange(await ct.SearchDictionary.Where(z => z.Word.StartsWith(fragment)).Select(z => z.Id).ToListAsync());
        }
        else if (leadingWildCard)//ENDS WITH?
        {
            DictionaryMatches.AddRange(await ct.SearchDictionary.Where(z => z.Word.EndsWith(fragment)).Select(z => z.Id).ToListAsync());
        }
        //a wildcard only in the middle of a term is not looked up
    }

    //No dictionary word matched (or there were no terms at all) so there can be no results
    if (DictionaryMatches.Count == 0)
    {
        return ReturnObject;
    }

    //SEARCH SEARCHKEY FOR MATCHING WORDS AND OPTIONALLY TYPE
    var TotalSearchTermsToMatch = PreWildCardedSearchTerms.Count + SearchTerms.Count;

    //Build search query based on searchParameters
    var q = ct.SearchKey.Distinct().Where(z => DictionaryMatches.Contains(z.WordId));

    //Of type?
    if (searchParameters.TypeOnly != AyaType.NoType)
        q = q.Where(z => z.AType == searchParameters.TypeOnly);

    //Find the records that have the search terms in searchkey
    var SearchMatches = await q
        .GroupBy(z => new { z.AType, z.ObjectId })
        .Select(z => new { ObjectId = z.Key.ObjectId, AType = z.Key.AType, ObjectCount = z.LongCount() })
        .ToListAsync();

    //PUT THE RESULTS INTO MATCHING OBJECTS LIST
    //keep any object whose number of matching index rows is at least the number of search terms
    //NOTE(review): the count is per matching word, not per term, so one wildcard term hitting
    //several words of the same object may satisfy the count on its own - confirm this is acceptable
    List<AyaTypeId> MatchingObjects = new List<AyaTypeId>();
    foreach (var SearchMatch in SearchMatches)
    {
        if (SearchMatch.ObjectCount >= TotalSearchTermsToMatch)
            MatchingObjects.Add(new AyaTypeId(SearchMatch.AType, SearchMatch.ObjectId));
    }

    //REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
    List<AyaTypeId> CanReadMatchingObjects = new List<AyaTypeId>();
    foreach (AyaTypeId t in MatchingObjects)
    {
        if (t.AType == AyaType.FileAttachment)
        {
            //have to look up the actual underlying object type and id here,
            //check if it's readable for user,
            //then add the PARENT object type and id to the readable list.
            //This means user will not see it return as an attachment, just as the object
            FileAttachment f = await ct.FileAttachment.AsNoTracking().FirstOrDefaultAsync(z => z.Id == t.ObjectId);

            //the index can refer to an attachment record that is no longer there, skip it rather than crash
            if (f != null && AyaNova.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, f.AttachToAType))
            {
                CanReadMatchingObjects.Add(new AyaTypeId(f.AttachToAType, f.AttachToObjectId));
            }
        }
        else if (t.AType == AyaType.Memo)
        {
            //Users are only permitted to search their own memo's
            if (await ct.Memo.AsNoTracking().AnyAsync(z => z.Id == t.ObjectId && z.ToId == currentUserId))
                CanReadMatchingObjects.Add(t);
        }
        else if (AyaNova.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, t.AType))
        {
            CanReadMatchingObjects.Add(t);
        }
    }

    //The list of allowable objects is now the master matching objects list
    MatchingObjects = CanReadMatchingObjects;

    //TOTAL RESULTS
    ReturnObject.TotalResultsFound = MatchingObjects.Count;

    //MAXIMUM RESULTS FILTER
    //The theory is that it should be filtered BEFORE sorting so that you get the most random collection of results
    //as the results are not ranked
    if (searchParameters.MaxResults > 0)//0 = all results
        MatchingObjects = MatchingObjects.Take(searchParameters.MaxResults).ToList();

    //Sort and group the matching objects list in return order
    var OrderedMatchingObjects = MatchingObjects.OrderBy(z => z.AType).ThenByDescending(z => z.ObjectId);

    //Get names using a single command over an explicitly opened connection,
    //and hand the connection back when done
    await ct.Database.OpenConnectionAsync();
    try
    {
        using (var command = ct.Database.GetDbConnection().CreateCommand())
        {
            //Build the return list from the remaining matching objects list
            foreach (AyaTypeId i in OrderedMatchingObjects)
            {
                SearchResult SR = new SearchResult();
                SR.Name = BizObjectNameFetcherDirect.Name(i.AType, i.ObjectId, command);
                SR.Id = i.ObjectId;
                SR.Type = i.AType;
                ReturnObject.SearchResults.Add(SR);
            }
        }
    }
    finally
    {
        ct.Database.CloseConnection();
    }

    return ReturnObject;
}
|
|
|
|
|
|
#endregion dosearch
|
|
|
|
#region Get info (excerpt)
|
|
/// <summary>
/// Build the excerpt for one object: break the phrase into terms, fetch the
/// object's search summary text and return the best matching extract.
/// </summary>
/// <param name="translationId">Translation whose word-break settings are used to split the phrase.</param>
/// <param name="currentUserRoles">Passed to the business object factory.</param>
/// <param name="userId">Passed to the business object factory.</param>
/// <param name="phrase">Search phrase as typed by the user; null is treated as empty.</param>
/// <param name="max">Maximum excerpt length handed to <c>ExtractAndRank.Process</c>.</param>
/// <param name="ayaType">Type of the object to excerpt.</param>
/// <param name="id">Id of the object to excerpt.</param>
/// <param name="ct">Database context passed to the business object factory.</param>
/// <returns>The extract produced by <c>ExtractAndRank</c>; an empty string when nothing was extracted.</returns>
public static async Task<string> GetInfoAsync(long translationId, AuthorizationRoles currentUserRoles, long userId, string phrase, int max, AyaType ayaType, long id, AyContext ct)
{
    //escape literal percentage signs first just in case they are searching for 50% off or something
    //(made into text so the breaker does not lose it), then replace the wildcard * with %
    //as the breaker expects sql style wildcards
    //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
    phrase = (phrase ?? string.Empty).Replace("%", "pctsym").Replace("*", "%");

    //BREAK SEARCH PHRASE INTO SEPARATE TERMS
    var PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);

    //get the business object that knows how to summarize this type
    ISearchAbleObject o = (ISearchAbleObject)BizObjectFactory.GetBizObject(ayaType, ct, userId, currentUserRoles, translationId);

    //get the text to extract from
    var searchParams = await o.GetSearchResultSummary(id, ayaType);

    //extract and rank here
    ExtractAndRank er = new ExtractAndRank();
    er.Process(searchParams, PhraseItems.ToArray(), max);

    return er.Extract;
}
|
|
|
|
|
|
|
|
#region Search rank and extract
|
|
/// <summary>
/// Rank and extract best excerpt of specified text and search terms
/// </summary>
public sealed class ExtractAndRank
{

    #region Fields
    //Normalized search terms: lower case, wildcard character removed, empty terms dropped
    private string[] searchTerms;
    private string rawtext;
    private string extract = "";
    private bool flattenExtract = true;
    private float ranking;
    private int extractionThresholdRank = 10;
    private int maximumCharactersToExtract = 40;
    #endregion

    #region Properties

    /// <summary>
    /// This is the ranking of the source text as it pertains to the
    /// search terms
    ///
    /// A rank of zero means there was no match. When the rank is below
    /// <see cref="ExtractionThresholdRank"/> no excerpt extraction is done
    /// but the calculated rank is still reported here.
    ///
    /// It is a percentage value on a scale of 0 to 100
    /// and is weighted:
    ///
    /// 75% of the score is the percentage of all search terms found in the text
    /// 25% of the score is the percentage of all characters in the text that are search term characters
    /// </summary>
    public float Ranking
    {
        get
        {
            return ranking;
        }
    }

    /// <summary>
    /// Maximum characters to appear in an extraction
    /// default is 40
    /// Minimum is 10
    /// </summary>
    public int MaximumCharactersToExtract
    {
        get
        {
            return maximumCharactersToExtract;
        }
        set
        {
            if (value > 10)
                maximumCharactersToExtract = value;
            else
                maximumCharactersToExtract = 10;
        }
    }

    /// <summary>
    /// ExtractionThresholdRank
    /// Extraction will only take place if the rank is
    /// this value or higher
    ///
    /// default is 10, maximum is 100 minimum is 0
    /// </summary>
    public int ExtractionThresholdRank
    {
        get
        {
            return extractionThresholdRank;
        }
        set
        {
            if (value > 100)
                extractionThresholdRank = 100;
            else if (value < 0)
                extractionThresholdRank = 0;
            else
                extractionThresholdRank = value;
        }
    }

    /// <summary>
    /// If true, carriage returns, line feeds and tabs will be removed from extract
    /// </summary>
    public bool FlattenExtract
    {
        get
        {
            return this.flattenExtract;
        }
        set
        {
            this.flattenExtract = value;
        }
    }

    /// <summary>
    /// Extracted text excerpt that best reflects search terms;
    /// empty when no extraction took place
    /// </summary>
    public string Extract
    {
        get
        {
            return extract;
        }
    }

    #endregion

    #region public methods
    /// <summary>
    /// Do the extraction and ranking
    /// </summary>
    /// <param name="searchObjectParams">Source of the text; its Words are joined with single spaces.</param>
    /// <param name="searchTerms">Terms to look for; may contain the % wildcard character which is ignored when matching.</param>
    /// <param name="max">Maximum characters to extract; values below the minimum are raised to it.</param>
    public void Process(SearchIndexProcessObjectParameters searchObjectParams, string[] searchTerms, int max)
    {
        //go through the property so the documented minimum is enforced
        //(a zero or negative window can not be extracted)
        MaximumCharactersToExtract = max;

        ranking = 0;
        extract = "";

        if (searchObjectParams == null || searchObjectParams.Words == null) return;

        string rawText = string.Join(" ", searchObjectParams.Words);
        if (string.IsNullOrEmpty(rawText)) return;
        this.rawtext = rawText;

        if (searchTerms == null || searchTerms.Length == 0) return;

        //Normalize the terms once here rather than for every window scored:
        //lower case, wildcard removed. A term that is nothing but wildcard is dropped
        //because an empty term can not be located in the text.
        List<string> usableTerms = new List<string>(searchTerms.Length);
        foreach (string term in searchTerms)
        {
            if (term == null) continue;
            string cleaned = term.ToLower(System.Globalization.CultureInfo.CurrentCulture).Replace("%", "");
            if (cleaned.Length > 0)
                usableTerms.Add(cleaned);
        }
        if (usableTerms.Count == 0) return;
        this.searchTerms = usableTerms.ToArray();

        ranking = score(0, this.rawtext.Length);

        //"this value or higher", but never extract when nothing matched at all
        if (ranking > 0 && ranking >= extractionThresholdRank)
            DoExtract();
    }
    #endregion

    #region Calculate score
    /// <summary>
    /// Give a percentage score for a given window of
    /// text in the raw text string
    /// 75% of the score is the percentage of all search terms found in the window
    /// 25% of the score is the percentage of all characters in the search window that are search term characters
    /// </summary>
    /// <param name="nStartPos">Window start, clamped to 0</param>
    /// <param name="nEndPos">Window end (exclusive), clamped to the text length</param>
    /// <returns>Float value of zero to one hundred</returns>
    private float score(int nStartPos, int nEndPos)
    {
        System.Diagnostics.Debug.Assert(nStartPos < nEndPos);
        if (nStartPos < 0) nStartPos = 0;
        if (nEndPos > this.rawtext.Length) nEndPos = this.rawtext.Length;
        if (nEndPos <= nStartPos) return 0;//nothing to score in an empty window

        string SearchString = this.rawtext.Substring(nStartPos, nEndPos - nStartPos).ToLower(System.Globalization.CultureInfo.CurrentCulture);

        int nTermCharsInWindow = 0;//how many of the characters in the window are matching term characters
        int nMatches = 0;//how many distinct terms were found in the window

        foreach (string term in searchTerms)
        {
            int nLocation = SearchString.IndexOf(term, System.StringComparison.CurrentCulture);
            if (nLocation == -1) continue;

            nMatches++;
            while (nLocation != -1)
            {
                nTermCharsInWindow += term.Length;

                //never ask IndexOf to start beyond the end of the string, it throws
                if (nLocation + 1 >= SearchString.Length) break;
                nLocation = SearchString.IndexOf(term, nLocation + 1, System.StringComparison.CurrentCulture);
            }
        }

        //If no matches then rank is automatically zero
        if (nMatches == 0)
        {
            return 0;
        }

        //Rank is calculated on a weighted scale
        //75% for matching all search terms
        //25% for the quantity of search terms versus other text found
        float fTermsFoundPct = 75 * ((float)nMatches / (float)searchTerms.Length);
        float fTermsVsTextPct = 25 * ((float)nTermCharsInWindow / (float)SearchString.Length);

        //overlapping hits can count more characters than the window holds, keep the result on the 0-100 scale
        if (fTermsVsTextPct > 25) fTermsVsTextPct = 25;

        return fTermsFoundPct + fTermsVsTextPct;
    }
    #endregion

    #region Extract best excerpt
    /// <summary>
    /// Extract the best scoring excerpt fragments of
    /// raw text
    /// </summary>
    private void DoExtract()
    {
        int windowLength = this.maximumCharactersToExtract;

        //If the whole thing fits in the max to extract
        //just save time and return the whole thing
        if (this.rawtext.Length <= windowLength)
        {
            this.extract = this.rawtext;
            return;
        }

        //Get the shortest search term length so
        //we can save time iterating over the window below
        int shortestSearchTermLength = int.MaxValue;
        foreach (string s in this.searchTerms)
        {
            if (s.Length < shortestSearchTermLength)
                shortestSearchTermLength = s.Length;
        }
        //defensive: a step of zero would never advance
        if (shortestSearchTermLength < 1) shortestSearchTermLength = 1;

        string BestWindow = "";
        float BestScore = 0;
        int BestWindowStartPos = 0;

        //last position a full window can start at (greater than zero here)
        int lastWindowStart = this.rawtext.Length - windowLength;

        //slide a window over the text and check it's score, the highest scoring window wins
        //move the length of the shortest search term so as to ensure we won't
        //miss it, but faster than moving one character at a time.
        //The final window is always scored so the tail of the text is never skipped.
        int z = 0;
        while (true)
        {
            float thisscore = score(z, z + windowLength);

            if (thisscore > BestScore)
            {
                BestScore = thisscore;
                BestWindow = this.rawtext.Substring(z, windowLength);
                //Best window to get if the future score is equal
                //I.E. put the terms in the center of the window if
                //the score is equal
                BestWindowStartPos = z + (windowLength / 2);
            }
            else if (thisscore > 0 && thisscore == BestScore && z <= BestWindowStartPos)
            {
                //Equal to the best and still moving towards the spot that centers the terms:
                //capture it. Works for any step size, not only when a step lands exactly on the center.
                BestWindow = this.rawtext.Substring(z, windowLength);
            }

            if (z == lastWindowStart) break;
            z = System.Math.Min(z + shortestSearchTermLength, lastWindowStart);
        }

        //no window contained a whole term (e.g. term longer than the window): leave the extract empty
        if (BestScore == 0) return;

        string trimmed = BestWindow.Trim();
        if (this.flattenExtract)
            trimmed = trimmed.Replace("\r", "").Replace("\n", "").Replace("\t", "");//case 1593 added tab character removal

        this.extract = "..." + trimmed + "...";
    }
    #endregion

}
|
|
#endregion Xtract
|
|
|
|
|
|
#endregion
|
|
|
|
#region ProcessKeywords into Database
|
|
|
|
//Class to hold process input parameters
|
|
//also used for getting summary search results
|
|
/// <summary>
/// Carries the text of one object to the indexer; also used to hand an
/// object's summary text back for search result excerpts.
/// All AddXXX methods return this instance so calls can be chained.
/// </summary>
public class SearchIndexProcessObjectParameters
{
    public long TranslationId { get; set; }
    public long ObjectId { get; set; }
    public AyaType AType { get; set; }
    public List<string> Words { get; set; }

    /// <summary>Create for a specific object with an empty word list.</summary>
    public SearchIndexProcessObjectParameters(long translationId, long objectID, AyaType aType)
    {
        TranslationId = translationId;
        ObjectId = objectID;
        AType = aType;
        Words = new List<string>();
    }

    /// <summary>
    /// Format used for getsummary by biz objects: all ids zeroed, empty word list.
    /// </summary>
    public SearchIndexProcessObjectParameters() : this(0, 0, 0)
    {
    }

    /// <summary>Add a string unless it is null or blank.</summary>
    public SearchIndexProcessObjectParameters AddText(string s)
    {
        if (string.IsNullOrWhiteSpace(s))
            return this;

        Words.Add(s);
        return this;
    }

    /// <summary>Add the text form of a number.</summary>
    public SearchIndexProcessObjectParameters AddText(long l)
    {
        string asText = l.ToString();
        Words.Add(asText);
        return this;
    }

    /// <summary>Add the text form of a decimal when it has a value.</summary>
    public SearchIndexProcessObjectParameters AddText(decimal? d)
    {
        if (d == null)
            return this;

        Words.Add(d.ToString());
        return this;
    }

    /// <summary>Add every non-blank string in the list; a null list adds nothing.</summary>
    public SearchIndexProcessObjectParameters AddText(List<string> lWords)
    {
        if (lWords == null)
            return this;

        foreach (string candidate in lWords)
        {
            if (string.IsNullOrWhiteSpace(candidate))
                continue;
            Words.Add(candidate);
        }
        return this;
    }

    /// <summary>
    /// Extract the text from a custom fields json fragment and add it.
    /// </summary>
    public SearchIndexProcessObjectParameters AddCustomFields(string jsonString)
    {
        var customFieldText = JsonUtil.GetCustomFieldsAsStringArrayForSearchIndexing(jsonString);
        AddText(customFieldText);
        return this;
    }
}
|
|
|
|
/// <summary>
/// Index the keywords of a newly created object.
/// </summary>
public static async Task ProcessNewObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
    => await ProcessKeywordsAsync(searchIndexObjectParameters, newRecord: true);
|
|
|
|
/// <summary>
/// Re-index the keywords of an object that has been updated.
/// </summary>
public static async Task ProcessUpdatedObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
    => await ProcessKeywordsAsync(searchIndexObjectParameters, newRecord: false);
|
|
|
|
/// <summary>
/// Remove all search key rows of a deleted object.
/// </summary>
/// <param name="objectID">Id of the deleted object.</param>
/// <param name="aType">Type of the deleted object; sent to the database as its integer value.</param>
/// <param name="ct">Database context the delete command is executed on.</param>
public static async Task ProcessDeletedObjectKeywordsAsync(long objectID, AyaType aType, AyContext ct)
{
    //Be careful in future: if ToString is put on the end of an object in the string interpolation the
    //npgsql driver will assume it's a string and put quotes around it, triggering an error that a
    //string can't be compared to an int
    System.FormattableString deleteKeys = $"delete from asearchkey where objectid={objectID} and aType={(int)aType}";

    //nothing to save afterwards, it's a direct command that executes immediately
    await ct.Database.ExecuteSqlInterpolatedAsync(deleteKeys);
}
|
|
|
|
|
|
/// <summary>
/// Break the object's text into keywords and hand them to the
/// aydosearchindex stored procedure; does nothing when no keywords result.
/// </summary>
/// <param name="p">Object identity plus the text to index.</param>
/// <param name="newRecord">True for a new object; the procedure receives the negated value as its last argument.</param>
private static async Task ProcessKeywordsAsync(SearchIndexProcessObjectParameters p, bool newRecord)
{
    //(a DEBUG-only check that p.AType is a core biz object type used to live here and is disabled)
    List<string> KeyWordList = await BreakAsync(p.TranslationId, p.Words);

    if (KeyWordList.Count != 0)
    {
        //call stored procedure to do the work right at the server (fastest method by far)
        using (AyContext ct = ServiceProviderProvider.DBContext)
        {
            await ct.Database.ExecuteSqlInterpolatedAsync($"call aydosearchindex({KeyWordList},{p.ObjectId},{p.AType},{!newRecord})");
        }
    }
}//eoc
|
|
#endregion
|
|
|
|
#region Breaker
|
|
|
|
/// <summary>
/// Kind of the character most recently seen by the word breaker.
/// </summary>
public enum TokenTypes
{
    Nothing,
    Separator,
    CJK,
    Latin
}
|
|
|
|
/// <summary>
/// Break a list of strings into keywords for indexing.
/// Wildcards are not kept and stop words are not exempted
/// (delegates to BreakCoreAsync with KeepWildCards = false).
///
/// Use Translation setting CJKIndex=true to handle Chinese, Japanese, Korean etc
/// (languages with no easily identifiable word boundaries as in english)
/// </summary>
/// <returns>List of strings</returns>
internal static async Task<List<string>> BreakAsync(long translationId, List<string> textStrings)
    => await BreakCoreAsync(translationId, false, textStrings);
|
|
|
|
/// <summary>
/// Break a single string into keywords for indexing; the string is wrapped
/// in a one item list and broken the same way as the list overload.
/// </summary>
/// <returns>List of strings</returns>
internal static async Task<List<string>> BreakAsync(long translationId, string textString)
{
    var singleItem = new List<string>(1) { textString };
    return await BreakCoreAsync(translationId, false, singleItem);
}
|
|
|
|
/// <summary>
/// Used to Process users search phrase and preserve wild
/// cards entered
/// </summary>
/// <returns>List of strings</returns>
internal static async Task<List<string>> BreakSearchPhraseAsync(long translationId, string searchPhrase)
{
    var phraseAsList = new List<string> { searchPhrase };

    //note: stop words are wanted when breaking a search phrase because the user might type "some"
    //wanting awesome but some is a stopword so the stop word filter is switched off here
    return await BreakCoreAsync(translationId, true, phraseAsList, true);
}
|
|
|
|
|
|
|
|
/// <summary>
/// Break text into unique, lower cased words in first-seen order.
///
/// Regular mode: a word is a run of letters / digits (plus % when wildcards are kept);
/// any other character is a word boundary.
/// CJK mode (translation setting CJKIndex): characters below code point 256 are broken as
/// in regular mode so latin text inside CJK text is indexed as whole words; letters at or
/// above 256 are emitted as overlapping two character n-grams.
/// </summary>
/// <param name="translationId">Translation supplying the CJKIndex flag and the stop word list.</param>
/// <param name="KeepWildCards">True to treat % as part of a word and keep short words containing it.</param>
/// <param name="textStrings">Strings to break; null or empty entries are skipped, a null list yields no words.</param>
/// <param name="ignoreStopWords">True to skip the stop word filter (used for search phrases).</param>
/// <returns>The words found; empty when nothing qualifies (e.g. everything was a stop word).</returns>
internal static async Task<List<string>> BreakCoreAsync(long translationId, bool KeepWildCards, List<string> textStrings, bool ignoreStopWords = false)
{
    //For stopwords and CJKIndex flag value
    var translationWordBreakData = await SearchTranslationWordBreakDataCache.GetWordBreakData(translationId);

    const int MAXWORDLENGTH = 255;
    const int MINWORDLENGTH = 2;//A word isn't a word unless it's got at least two characters in it

    List<string> ReturnList = new List<string>();
    if (textStrings == null) return ReturnList;

    //Parsed words in first-seen order, with a set alongside for a cheap uniqueness check
    List<string> tempParsedWords = new List<string>();
    HashSet<string> seenWords = new HashSet<string>();

    //Word currently being built; always empty again at the end of each string
    StringBuilder sbWord = new StringBuilder();

    bool cjkIndex = translationWordBreakData.CJKIndex;

    //Loop through each of the passed in strings
    foreach (string s in textStrings)
    {
        if (string.IsNullOrEmpty(s)) continue;

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);

        TokenTypes LastToken = TokenTypes.Nothing;

        //Process each "character" (text element, glyph whatever) in the current string
        while (t.MoveNext())
        {
            //get it as a character (only the first char of the text element is looked at)
            char c = t.GetTextElement()[0];

            //In CJK mode basic latin characters (ascii basically) still follow the regular rules
            //so that regular english text within cjk text gets properly indexed as whole words
            //see: http://www.unicode.org/charts/index.html
            bool useLatinRules = !cjkIndex || (int)c < 256;

            if (useLatinRules)
            {
                //Is it a token we want to include? Or a wildcard character
                if (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'))
                {
                    //All latin text is converted to lower case
                    c = char.ToLower(c, System.Globalization.CultureInfo.CurrentCulture);

                    //Flush the word in progress first if it hit the word length limit
                    //or we are going from CJK to latin (only possible in CJK mode)
                    if (sbWord.Length > 0 && (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH))
                        FlushParsedWord(sbWord, tempParsedWords, seenWords);

                    sbWord.Append(c);
                    LastToken = TokenTypes.Latin;
                }
                else
                {
                    //Word boundary
                    LastToken = TokenTypes.Separator;
                    FlushParsedWord(sbWord, tempParsedWords, seenWords);
                }
            }
            else
            {
                //CJK character (the wildcard % is below 256 so it is never seen in this branch)
                if (char.IsLetter(c))
                {
                    if (sbWord.Length > 0)
                    {
                        if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                        {
                            //over the word length limit or going from latin TO CJK
                            FlushParsedWord(sbWord, tempParsedWords, seenWords);
                        }
                        else if (LastToken == TokenTypes.CJK)
                        {
                            //one CJK character is already stored so append the current character
                            //and flush the resultant 2 character n-gram
                            sbWord.Append(c);
                            System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                            FlushParsedWord(sbWord, tempParsedWords, seenWords);
                        }
                    }

                    //the current character starts (or continues) the next word / n-gram
                    sbWord.Append(c);
                    LastToken = TokenTypes.CJK;
                }
                else
                {
                    //CJK word boundary
                    LastToken = TokenTypes.Separator;
                    FlushParsedWord(sbWord, tempParsedWords, seenWords);
                }
            }
        }

        //Flush out the last word
        FlushParsedWord(sbWord, tempParsedWords, seenWords);
    }

    //Make the return list from the word list
    foreach (string word in tempParsedWords)
    {
        //Filter out short words if we are breaking for indexing
        //but keep them if they are part of a wildcard search phrase
        bool keep = word.Length >= MINWORDLENGTH || (KeepWildCards && word.Contains('%'));
        if (!keep) continue;

        //breaking of a search phrase keeps everything,
        //regular breaking of an object for dictionary entry adds only non stopwords
        if (ignoreStopWords || !translationWordBreakData.StopWords.Contains(word))
            ReturnList.Add(word);
    }

    //sometimes all the results are stop words so you end up here with nothing
    return ReturnList;
}

//Move the word in progress (if any) into the parsed list unless already seen, then clear the builder
private static void FlushParsedWord(StringBuilder sbWord, List<string> parsedWords, HashSet<string> seenWords)
{
    if (sbWord.Length == 0) return;

    string word = sbWord.ToString();
    if (seenWords.Add(word))
        parsedWords.Add(word);
    sbWord.Length = 0;
}
|
|
|
|
#endregion
|
|
|
|
}//eoc
|
|
|
|
}//eons |