This commit is contained in:
998
server/biz/Search.cs
Normal file
998
server/biz/Search.cs
Normal file
@@ -0,0 +1,998 @@
|
||||
using System.Linq;
|
||||
using System.Globalization;
|
||||
using System.Text;
|
||||
using System.Collections.Generic;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Sockeye.Util;
|
||||
using Sockeye.Models;
|
||||
|
||||
namespace Sockeye.Biz
|
||||
{
|
||||
|
||||
//This class handles word breaking, processing keywords and searching for results
|
||||
public static class Search
|
||||
{
|
||||
|
||||
#region Search and return results
|
||||
|
||||
/// <summary>
/// Parameters describing a search request
/// </summary>
public class SearchRequestParameters
{
    /// <summary>
    /// The text to search for
    /// </summary>
    public string Phrase { get; set; }

    /// <summary>
    /// Object type filter, starts out as SockType.NoType
    /// </summary>
    public SockType TypeOnly { get; set; } = SockType.NoType;

    //Note: maxresults of 0 will get all results
    /// <summary>
    /// Maximum number of results to return, starts out as 500
    /// </summary>
    public int MaxResults { get; set; } = 500;

    /// <summary>
    /// True when there is a phrase containing something other than white space
    /// </summary>
    public bool IsValid => !string.IsNullOrWhiteSpace(Phrase);
}
|
||||
|
||||
|
||||
//Classes to hold search results returned to client
|
||||
/// <summary>
/// A single search hit: the display name, type and id of the matching object
/// </summary>
public class SearchResult
{
    /// <summary>Display name of the matching object</summary>
    public string Name { get; set; }

    /// <summary>Type of the matching object</summary>
    public SockType Type { get; set; }

    /// <summary>Id of the matching object</summary>
    public long Id { get; set; }
}
|
||||
|
||||
/// <summary>
/// Container for the results of a search: a total count plus the list of hits
/// </summary>
public class SearchReturnObject
{
    /// <summary>Total count of results found, starts at zero</summary>
    public long TotalResultsFound { get; set; } = 0;

    /// <summary>The hits themselves, starts as an empty list (never null after construction)</summary>
    public List<SearchResult> SearchResults { get; set; } = new List<SearchResult>();
}
|
||||
|
||||
|
||||
/// <summary>
/// Search the keyword dictionary for objects containing every term in the search phrase
/// and return the ones the current user is permitted to read.
///
/// A * in the phrase is treated as a wildcard; a literal % is matched literally.
/// </summary>
/// <param name="ct">Database context used for all queries</param>
/// <param name="translationId">Translation used for word breaking and name fetching</param>
/// <param name="currentUserRoles">Roles used to filter out unreadable object types</param>
/// <param name="currentUserId">User id, memos and reminders are only returned to their owner</param>
/// <param name="searchParameters">Phrase, optional type filter and maximum result count (0 = all)</param>
/// <returns>Total readable match count plus the (possibly truncated) list of results; empty when there is nothing to search for</returns>
public static async Task<SearchReturnObject> DoSearchAsync(AyContext ct, long translationId, AuthorizationRoles currentUserRoles, long currentUserId, SearchRequestParameters searchParameters)
{
    var ReturnObject = new SearchReturnObject();

    //A missing or empty phrase is expected, don't throw, just return nothing
    if (searchParameters == null || !searchParameters.IsValid)
        return ReturnObject;

    //Work on a local copy so the caller's request object is left untouched
    //(modifying it in place would re-escape the phrase if the same request was searched twice)
    //
    //escape literal percentage signs first just in case they are searching for 50% off or something
    //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
    //need to get around breaking possibly losing the symbol so make it text,
    //then replace wildcard * with % as breakcore expects sql style wildcards
    string phrase = searchParameters.Phrase.Replace("%", "pctsym").Replace("*", "%");

    //BREAK SEARCH PHRASE INTO SEPARATE TERMS
    var PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);

    //Nothing searchable left after breaking (e.g. a single character or only punctuation)
    //Without this the query below would end in an empty WHERE clause and fail
    if (PhraseItems == null || PhraseItems.Count == 0)
        return ReturnObject;

    //BUILD ONE COUNT COLUMN PER TERM
    //Term text is passed as a command parameter rather than being written into the sql
    List<string> termValues = new List<string>(PhraseItems.Count);
    List<string> termColumns = new List<string>(PhraseItems.Count);
    List<string> termFilters = new List<string>(PhraseItems.Count);

    foreach (string PhraseItem in PhraseItems)
    {
        //a % still present at this point is a user wildcard (literal ones are "pctsym" right now)
        string comparison = PhraseItem.Contains("%") ? "LIKE" : "=";

        //put back literal percentage symbol if necessary
        //NOTE(review): for an exact (=) term the backslash is compared literally - confirm that is intended
        termValues.Add(PhraseItem.Replace("pctsym", @"\%"));

        int n = termValues.Count;
        termColumns.Add($"COUNT(*) FILTER (WHERE asearchdictionary.word {comparison} @term{n}) AS st{n}");
        termFilters.Add($"st{n} > 0");
    }

    //NOTE(review): the type column is called sockType in the select / group by but ATYPE in this filter
    //(and aType in ProcessDeletedObjectKeywordsAsync) - confirm which name the asearchkey table really uses
    var qTypeOnly = string.Empty;
    if (searchParameters.TypeOnly != SockType.NoType)
        qTypeOnly = $"AND ASEARCHKEY.ATYPE={(int)searchParameters.TypeOnly}";

    StringBuilder q = new StringBuilder();
    q.Append("WITH qr AS (SELECT asearchkey.sockType, asearchkey.objectid, ");
    q.Append(string.Join(", ", termColumns));
    q.Append($" FROM asearchdictionary INNER JOIN asearchkey ON asearchdictionary.id = asearchkey.wordid {qTypeOnly} GROUP BY asearchkey.objectid, asearchkey.sockType) SELECT sockType, objectid FROM qr WHERE ");
    //every term must be present for an object to match
    q.Append(string.Join(" AND ", termFilters));

    //list to hold temporary search/tag hits
    List<SockTypeId> MatchingObjects = new List<SockTypeId>();

    //execute the query and iterate the results
    using (var command = ct.Database.GetDbConnection().CreateCommand())
    {
        await ct.Database.OpenConnectionAsync();
        command.CommandText = q.ToString();

        for (int i = 0; i < termValues.Count; i++)
        {
            var termParameter = command.CreateParameter();
            termParameter.ParameterName = $"term{i + 1}";
            termParameter.Value = termValues[i];
            command.Parameters.Add(termParameter);
        }

        using (var dr = await command.ExecuteReaderAsync())
        {
            while (await dr.ReadAsync())
            {
                MatchingObjects.Add(new SockTypeId((SockType)dr.GetInt32(0), dr.GetInt64(1)));
            }
        }
    }

    //REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
    List<SockTypeId> CanReadMatchingObjects = new List<SockTypeId>();
    foreach (SockTypeId t in MatchingObjects)
    {
        if (t.SockType == SockType.FileAttachment)
        {
            //have to look up the actual underlying object type and id here
            //check if it's readable for user
            //then add the PARENT object type and id to the CanReadMatchingObjects list
            //this means user will not see it return as an attachment, just as the object
            FileAttachment f = await ct.FileAttachment.AsNoTracking().FirstOrDefaultAsync(z => z.Id == t.ObjectId);

            //the attachment row can be missing (FirstOrDefault returns null), in which case there is nothing to show
            if (f != null && Sockeye.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, f.AttachToAType))
            {
                CanReadMatchingObjects.Add(new SockTypeId(f.AttachToAType, f.AttachToObjectId));
            }
        }
        else if (t.SockType == SockType.Memo)
        {
            //Users are only permitted to search their own memos
            if (await ct.Memo.AsNoTracking().AnyAsync(z => z.Id == t.ObjectId && z.ToId == currentUserId))
                CanReadMatchingObjects.Add(t);
        }
        else if (t.SockType == SockType.Reminder)
        {
            //Users are only permitted to search their own reminders
            if (await ct.Reminder.AsNoTracking().AnyAsync(z => z.Id == t.ObjectId && z.UserId == currentUserId))
                CanReadMatchingObjects.Add(t);
        }
        else
        {
            if (Sockeye.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, t.SockType))
            {
                CanReadMatchingObjects.Add(t);
            }
        }
    }

    //Ok, we're here with the list of allowable objects which is now the master matching objects list so...
    MatchingObjects = CanReadMatchingObjects;

    //TOTAL RESULTS
    //counted before the maximum results filter so it reflects everything the user could read
    ReturnObject.TotalResultsFound = MatchingObjects.Count;

    //MAXIMUM RESULTS FILTER
    //The theory is that it should be filtered BEFORE sorting so that you get the most random collection of results
    //As the results are not ranked so...
    if (searchParameters.MaxResults > 0)//0 = all results
        MatchingObjects = MatchingObjects.Take(searchParameters.MaxResults).ToList();

    //Sort and group the matching objects list in return order
    var OrderedMatchingObjects = MatchingObjects.OrderBy(z => z.SockType).ThenByDescending(z => z.ObjectId);

    //Get names, one command object is shared by every name lookup
    using (var command = ct.Database.GetDbConnection().CreateCommand())
    {
        await ct.Database.OpenConnectionAsync();

        //Build the return list from the remaining matching objects list
        foreach (SockTypeId i in OrderedMatchingObjects)
        {
            SearchResult SR = new SearchResult();
            SR.Name = BizObjectNameFetcherDirect.Name(i.SockType, i.ObjectId, translationId, command);
            SR.Id = i.ObjectId;
            SR.Type = i.SockType;
            ReturnObject.SearchResults.Add(SR);
        }
    }

    return ReturnObject;
}
|
||||
|
||||
|
||||
#endregion dosearch
|
||||
|
||||
#region Get info (excerpt)
|
||||
/// <summary>
/// Get an excerpt of the specified object's searchable text that best reflects the search phrase
/// </summary>
/// <param name="translationId">Translation used for word breaking</param>
/// <param name="currentUserRoles">Passed through to the biz object factory</param>
/// <param name="userId">Passed through to the biz object factory</param>
/// <param name="phrase">Search phrase as typed by the user (* is the wildcard); null is treated as empty</param>
/// <param name="max">Maximum characters to extract</param>
/// <param name="sockType">Type of the object to get the excerpt for</param>
/// <param name="id">Id of the object to get the excerpt for</param>
/// <param name="ct">Database context</param>
/// <returns>The excerpt, or an empty string when nothing was extracted</returns>
public static async Task<string> GetInfoAsync(long translationId, AuthorizationRoles currentUserRoles, long userId, string phrase, int max, SockType sockType, long id, AyContext ct)
{
    //escape literal percentage signs first just in case they are searching for 50% off or something
    //https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-LIKE
    //need to get around breaking possibly losing the symbol so make it text,
    //then replace wildcard * with % as breakcore expects sql style wildcards
    phrase = (phrase ?? string.Empty).Replace("%", "pctsym").Replace("*", "%");

    //BREAK SEARCH PHRASE INTO SEPARATE TERMS
    var PhraseItems = await BreakSearchPhraseAsync(translationId, phrase);

    //get the biz object that knows how to summarize this type
    ISearchAbleObject o = (ISearchAbleObject)BizObjectFactory.GetBizObject(sockType, ct, userId, currentUserRoles, translationId);

    //get the text to extract from
    var searchParams = await o.GetSearchResultSummary(id, sockType);

    //NOTE(review): guards against a summary that could not be produced - confirm whether GetSearchResultSummary can return null
    if (searchParams == null)
        return string.Empty;

    //extract and rank here
    ExtractAndRank er = new ExtractAndRank();
    er.Process(searchParams, PhraseItems.ToArray(), max);

    return er.Extract;
}
|
||||
|
||||
|
||||
|
||||
#region Search rank and extract
|
||||
/// <summary>
/// Rank and extract best excerpt of specified text and search terms
/// </summary>
public sealed class ExtractAndRank
{
    #region Fields
    private string[] searchTerms;
    private string rawtext;
    private string extract = "";
    private bool flattenExtract = true;
    private float ranking;
    private int extractionThresholdRank = 10;
    private int maximumCharactersToExtract = 40;
    #endregion

    #region Properties

    /// <summary>
    /// This is the ranking of the source text as it pertains to the
    /// search terms
    ///
    /// A rank of zero means there was no match.
    /// Excerpt extraction is only done when the rank is above the threshold rank.
    ///
    /// It is a percentage value on a scale of 0 to 100
    /// and is weighted:
    ///
    /// 75% of the score is the percentage of all search terms found in the text
    /// 25% of the score is the percentage of all characters in the text that are search term characters
    /// </summary>
    public float Ranking
    {
        get { return ranking; }
    }

    /// <summary>
    /// Maximum characters to appear in an extraction
    /// default is 40
    /// Minimum is 10 (smaller values are raised to 10)
    /// </summary>
    public int MaximumCharactersToExtract
    {
        get { return maximumCharactersToExtract; }
        set { maximumCharactersToExtract = value > 10 ? value : 10; }
    }

    /// <summary>
    /// ExtractionThresholdRank
    /// Extraction will only take place if the rank is
    /// greater than this value
    ///
    /// default is 10, maximum is 100 minimum is 0
    /// </summary>
    public int ExtractionThresholdRank
    {
        get { return extractionThresholdRank; }
        set
        {
            if (value > 100)
                extractionThresholdRank = 100;
            else if (value < 0)
                extractionThresholdRank = 0;
            else
                extractionThresholdRank = value;
        }
    }

    /// <summary>
    /// If true, carriage returns, line feeds and tabs will be removed from extract
    /// </summary>
    public bool FlattenExtract
    {
        get { return this.flattenExtract; }
        set { this.flattenExtract = value; }
    }

    /// <summary>
    /// Extracted text excerpt that best reflects search terms,
    /// empty when nothing was extracted
    /// </summary>
    public string Extract
    {
        get { return extract; }
    }

    #endregion

    #region public methods
    /// <summary>
    /// Do the extraction and ranking
    /// </summary>
    /// <param name="searchObjectParams">Holds the words of the object, they are joined with spaces to form the text to rank</param>
    /// <param name="searchTerms">Broken search terms, may contain the % wildcard character</param>
    /// <param name="max">Maximum characters to extract (subject to the minimum of MaximumCharactersToExtract)</param>
    public void Process(SearchIndexProcessObjectParameters searchObjectParams, string[] searchTerms, int max)
    {
        //set through the property so the minimum is enforced,
        //a zero or negative window size can not be extracted
        this.MaximumCharactersToExtract = max;

        ranking = 0;
        extract = "";

        if (searchObjectParams == null || searchObjectParams.Words == null) return;

        string rawText = string.Join(" ", searchObjectParams.Words);
        if (string.IsNullOrEmpty(rawText)) return;
        this.rawtext = rawText;

        if (searchTerms == null || searchTerms.Length == 0) return;
        this.searchTerms = searchTerms;

        ranking = Score(0, this.rawtext.Length);
        if (ranking > extractionThresholdRank)
            DoExtract();
    }
    #endregion

    #region Calculate score
    /// <summary>
    /// Give a percentage score for a given window of
    /// text in the raw text string
    /// 75% of the score is the percentage of all search terms found in the window
    /// 25% of the score is the percentage of all characters in the search window that are search term characters
    /// </summary>
    /// <param name="nStartPos">Start of the window (inclusive), clamped to the text</param>
    /// <param name="nEndPos">End of the window (exclusive), clamped to the text</param>
    /// <returns>Float score, zero when no term is found in the window</returns>
    private float Score(int nStartPos, int nEndPos)
    {
        if (nStartPos < 0) nStartPos = 0;
        if (nEndPos > this.rawtext.Length) nEndPos = this.rawtext.Length;

        //an empty window can't contain anything
        if (nStartPos >= nEndPos) return 0;

        int nTermCharsInWindow = 0;//how many of the characters in the window are matching term characters
        string SearchString = this.rawtext.Substring(nStartPos, nEndPos - nStartPos).ToLower(System.Globalization.CultureInfo.CurrentCulture);

        int nMatches = 0;

        foreach (string term in searchTerms)
        {
            if (term == null) continue;

            //remove the wild card character if present and set to lower case
            string lTerm = term.ToLower(System.Globalization.CultureInfo.CurrentCulture).Replace("%", "");

            //a term that was only wildcards has nothing left to look for;
            //searching for an empty string "matches" at every position and then runs off the end of the window
            if (lTerm.Length == 0) continue;

            //both sides are already lower cased so compare character by character
            int nLocation = SearchString.IndexOf(lTerm, System.StringComparison.Ordinal);
            if (nLocation == -1) continue;

            nMatches++;
            while (nLocation != -1)
            {
                nTermCharsInWindow += lTerm.Length;
                nLocation = SearchString.IndexOf(lTerm, nLocation + 1, System.StringComparison.Ordinal);
            }
        }

        //If no matches then rank is automatically zero
        if (nMatches == 0)
            return 0;

        //Rank is calculated on a weighted scale
        //75% for matching all search terms
        //25% for the quantity of search terms versus other text found
        float fTermsFoundPct = 75 * ((float)nMatches / (float)searchTerms.Length);
        float fTermsVsTextPct = 0;
        if (nTermCharsInWindow > 0)
            fTermsVsTextPct = 25 * ((float)nTermCharsInWindow / (float)SearchString.Length);

        return fTermsFoundPct + fTermsVsTextPct;
    }
    #endregion

    #region Extract best excerpt
    /// <summary>
    /// Extract the best scoring excerpt fragment of
    /// raw text into the Extract property
    /// </summary>
    private void DoExtract()
    {
        //If the whole thing fits in the maximum to extract
        //just save time and return the whole thing
        if (this.rawtext.Length <= this.maximumCharactersToExtract)
        {
            this.extract = this.rawtext;
            return;
        }

        string BestWindow = "";
        float BestScore = 0;
        int BestWindowStartPos = 0;

        //Get the shortest search term length so
        //we can save time iterating over the window below
        int shortestSearchTermLength = int.MaxValue;
        foreach (string s in this.searchTerms)
        {
            if (s != null && s.Length > 0 && s.Length < shortestSearchTermLength)
                shortestSearchTermLength = s.Length;
        }
        //no usable term length: fall back to moving one character at a time
        //(a step of zero would never advance)
        if (shortestSearchTermLength == int.MaxValue)
            shortestSearchTermLength = 1;

        //slide a window over the text and check it's score, the highest scoring window wins
        //move the length of the shortest search term so as to ensure we won't
        //miss it, but faster than moving one character at a time
        //The final window is always placed flush with the end of the text so the tail is scored too
        int lastStart = this.rawtext.Length - maximumCharactersToExtract;
        int z = 0;
        while (true)
        {
            float thisscore = Score(z, z + maximumCharactersToExtract);

            if (thisscore > 0)
            {
                if (thisscore > BestScore)
                {
                    BestScore = thisscore;
                    BestWindow = this.rawtext.Substring(z, maximumCharactersToExtract);
                    //Best window to get if the future score is equal
                    //I.E. put the terms in the center of the window if
                    //the score is equal
                    BestWindowStartPos = z + (maximumCharactersToExtract / 2);
                }
                else if (thisscore == BestScore && z == BestWindowStartPos)
                {
                    //It's equal to the best and we're positioned over
                    //the best spot (terms in center) so capture that
                    BestWindow = this.rawtext.Substring(z, maximumCharactersToExtract);
                }
            }

            if (z >= lastStart) break;

            //written as a subtraction so the step can't overflow
            z = (lastStart - z <= shortestSearchTermLength) ? lastStart : z + shortestSearchTermLength;
        }

        if (this.flattenExtract)
            this.extract = "..." + BestWindow.Trim().Replace("\r", "").Replace("\n", "").Replace("\t", "") + "...";//case 1593 added tab character removal
        else
            this.extract = "..." + BestWindow.Trim() + "...";
    }
    #endregion
}
|
||||
#endregion Xtract
|
||||
|
||||
|
||||
#endregion
|
||||
|
||||
#region ProcessKeywords into Database
|
||||
|
||||
/// <summary>
/// Holds the input for keyword processing: the object identity plus the text to index.
/// Also used for getting summary search results.
/// The Add* methods return this instance so calls can be chained.
/// </summary>
public class SearchIndexProcessObjectParameters
{
    public long TranslationId { get; set; }
    public long ObjectId { get; set; }
    public SockType SockType { get; set; }
    public List<string> Words { get; set; }

    /// <summary>
    /// Create with an empty word list for the specified object
    /// </summary>
    public SearchIndexProcessObjectParameters(long translationId, long objectID, SockType aType)
    {
        TranslationId = translationId;
        ObjectId = objectID;
        SockType = aType;
        Words = new List<string>();
    }

    /// <summary>
    /// Create with an empty word list and all identity values zero,
    /// format used for getsummary by biz objects
    /// </summary>
    public SearchIndexProcessObjectParameters() : this(0, 0, 0)
    {
    }

    /// <summary>
    /// Add a string, null / empty / white space only strings are skipped
    /// </summary>
    public SearchIndexProcessObjectParameters AddText(string s)
    {
        if (string.IsNullOrWhiteSpace(s))
            return this;

        Words.Add(s);
        return this;
    }

    /// <summary>
    /// Add a number as its string representation
    /// </summary>
    public SearchIndexProcessObjectParameters AddText(long l)
    {
        string numberText = l.ToString();
        Words.Add(numberText);
        return this;
    }

    //A decimal? overload used to exist here and was disabled:
    // public SearchIndexProcessObjectParameters AddText(decimal? d)

    /// <summary>
    /// Add each usable string in the list, a null list is ignored
    /// and null / empty / white space only entries are skipped
    /// </summary>
    public SearchIndexProcessObjectParameters AddText(List<string> lWords)
    {
        if (lWords == null)
            return this;

        foreach (string candidate in lWords)
        {
            if (string.IsNullOrWhiteSpace(candidate))
                continue;
            Words.Add(candidate);
        }
        return this;
    }

    /// <summary>
    /// Extract the text from a custom fields json fragment and add it
    /// </summary>
    public SearchIndexProcessObjectParameters AddCustomFields(string jsonString)
    {
        return AddText(JsonUtil.GetCustomFieldsAsStringArrayForSearchIndexing(jsonString));
    }
}
|
||||
|
||||
/// <summary>
/// Process the keywords of a newly created object
/// (forwards to ProcessKeywordsAsync flagged as a new record)
/// </summary>
public static async Task ProcessNewObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
    => await ProcessKeywordsAsync(searchIndexObjectParameters, true);
|
||||
|
||||
/// <summary>
/// Process the keywords of an object that has been updated
/// (forwards to ProcessKeywordsAsync flagged as not a new record)
/// </summary>
public static async Task ProcessUpdatedObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
    => await ProcessKeywordsAsync(searchIndexObjectParameters, false);
|
||||
|
||||
/// <summary>
/// Delete the search key rows of the specified object.
/// This is a direct command, there is nothing to save afterwards.
/// </summary>
public static async Task ProcessDeletedObjectKeywordsAsync(long objectID, SockType aType, AyContext ct)
{
    //Keep the interpolated values numeric: if a value is turned into a string (ToString) first
    //the npgsql driver will assume it's a string and put quotes around it,
    //triggering an error that a string can't be compared to an int
    //NOTE(review): the column is aType here but sockType in the DoSearchAsync select - confirm against the schema
    int typeNumber = (int)aType;
    await ct.Database.ExecuteSqlInterpolatedAsync($"delete from asearchkey where objectid={objectID} and aType={typeNumber}");
}
|
||||
|
||||
|
||||
/// <summary>
/// Break the object's words into keywords and hand them to the
/// aydosearchindex stored procedure, does nothing when there are no keywords
/// </summary>
/// <param name="p">Object identity and the words to index</param>
/// <param name="newRecord">True for a newly created object; the procedure receives the inverse of this flag</param>
private static async Task ProcessKeywordsAsync(SearchIndexProcessObjectParameters p, bool newRecord)
{
    List<string> KeyWordList = await BreakAsync(p.TranslationId, p.Words);

    if (KeyWordList.Count > 0)
    {
        bool notNewRecord = !newRecord;

        //call stored procedure to do the work right at the server (fastest method by far)
        //uses its own context from the service provider rather than one passed in
        using (AyContext ct = ServiceProviderProvider.DBContext)
        {
            await ct.Database.ExecuteSqlInterpolatedAsync($"call aydosearchindex({KeyWordList},{p.ObjectId},{p.SockType},{notNewRecord})");
        }
    }
}//eoc
|
||||
#endregion
|
||||
|
||||
#region Breaker
|
||||
|
||||
/// <summary>
/// Kind of the most recently processed character while word breaking
/// </summary>
public enum TokenTypes
{
    //nothing processed yet
    Nothing,
    //a word boundary character
    Separator,
    //a character handled by the CJK branch of the breaker
    CJK,
    //a character handled by the regular (latin) branch of the breaker
    Latin
}
|
||||
|
||||
/// <summary>
/// Take a list of strings and break them into keywords
/// for indexing (wildcards are not kept and stop words
/// are removed, see BreakCoreAsync)
///
/// Use Translation setting CJKIndex=true to handle Chinese, Japanese, Korean etc
/// (languages with no easily identifiable word boundaries as in english)
/// </summary>
/// <returns>List of keyword strings</returns>
internal static async Task<List<string>> BreakAsync(long translationId, List<string> textStrings)
{
    List<string> keywords = await BreakCoreAsync(translationId, false, textStrings);
    return keywords;
}
|
||||
|
||||
/// <summary>
/// Single string version of BreakAsync: wraps the string
/// in a list and breaks it for indexing
/// </summary>
/// <returns>List of keyword strings</returns>
internal static async Task<List<string>> BreakAsync(long translationId, string textString)
{
    var singleItem = new List<string>(1) { textString };
    return await BreakCoreAsync(translationId, false, singleItem);
}
|
||||
|
||||
/// <summary>
/// Break a user's search phrase into terms, preserving
/// any wild cards entered
/// </summary>
/// <returns>List of search terms</returns>
internal static async Task<List<string>> BreakSearchPhraseAsync(long translationId, string searchPhrase)
{
    var phraseAsList = new List<string> { searchPhrase };

    //note: stop words are not filtered out of a search phrase because they might type "some" wanting awesome but some is a stopword so..
    return await BreakCoreAsync(translationId, true, phraseAsList, ignoreStopWords: true);
}
|
||||
|
||||
|
||||
|
||||
/// <summary>
/// Break text into unique lower case words.
///
/// Letters and digits make up words, anything else is a word boundary.
/// When the translation has CJKIndex set, characters outside the first 256 code units
/// are indexed as overlapping two character pairs while characters inside that
/// range are still indexed as whole words.
/// </summary>
/// <param name="translationId">Translation whose stop words and CJKIndex flag are used</param>
/// <param name="KeepWildCards">True to treat % as part of a word (search phrases)</param>
/// <param name="textStrings">Text to break, null or empty entries are skipped; a null list gives an empty result</param>
/// <param name="ignoreStopWords">True to skip stop word filtering (search phrases), false to drop stop words (indexing)</param>
/// <returns>Unique words in order of first appearance, shorter than two characters only when they contain a kept wildcard</returns>
internal static async Task<List<string>> BreakCoreAsync(long translationId, bool KeepWildCards, List<string> textStrings, bool ignoreStopWords = false)
{
    const int MAXWORDLENGTH = 255;
    const int MINWORDLENGTH = 2;//A word isn't a word unless it's got at least two characters in it

    List<string> ReturnList = new List<string>();
    if (textStrings == null) return ReturnList;

    //For stopwords and CJKIndex flag value
    var translationWordBreakData = await SearchTranslationWordBreakDataCache.GetWordBreakData(translationId);

    //Parsed words in order of first appearance plus a set for fast uniqueness checks
    List<string> tempParsedWords = new List<string>();
    HashSet<string> seenWords = new HashSet<string>();

    StringBuilder sbWord = new StringBuilder();

    //Loop through each of the passed in strings
    foreach (string s in textStrings)
    {
        if (string.IsNullOrEmpty(s)) continue;

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
        //start at the top
        t.Reset();

        TokenTypes LastToken = TokenTypes.Nothing;

        //Process each "character" (text element,glyph whatever) in the
        //current string
        while (t.MoveNext())
        {
            //get it as a character
            //NOTE(review): only the first char of each text element is used, so combining marks
            //and the second half of surrogate pairs are dropped - confirm this is intended
            char c = t.GetTextElement()[0];

            bool isKeptWildCard = KeepWildCards && c == '%';

            //In CJK mode anything outside the first 256 characters gets the CJK treatment
            //so that regular english text within cjk text gets properly indexed as whole words
            //see: http://www.unicode.org/charts/index.html
            bool cjkCharacter = translationWordBreakData.CJKIndex && (int)c >= 256;

            if (!cjkCharacter)
            {
                //Is it a token we want to include?
                //Or a wildcard character
                if (char.IsLetterOrDigit(c) || isKeptWildCard)
                {
                    //All latin text is converted to lower case
                    c = char.ToLower(c, System.Globalization.CultureInfo.CurrentCulture);

                    //Flush the word in progress if it's at the word length limit
                    //or we are going from CJK to latin (LastToken is never CJK outside of CJK mode)
                    if (sbWord.Length > 0 && (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH))
                        FlushParsedWord(sbWord, tempParsedWords, seenWords);

                    sbWord.Append(c);
                    LastToken = TokenTypes.Latin;
                }
                else
                {
                    //Word boundary
                    LastToken = TokenTypes.Separator;
                    FlushParsedWord(sbWord, tempParsedWords, seenWords);
                }
            }
            else
            {
                if (char.IsLetter(c) || isKeptWildCard)
                {
                    if (sbWord.Length > 0)
                    {
                        if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                        {
                            //going from latin TO CJK or over the word length limit
                            FlushParsedWord(sbWord, tempParsedWords, seenWords);
                        }
                        else if (LastToken == TokenTypes.CJK)
                        {
                            //there is already a CJK character stored so append the current one
                            //and flush the resultant 2 character n-gram;
                            //the current character is then stored again below to start the next pair
                            sbWord.Append(c);
                            System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                            FlushParsedWord(sbWord, tempParsedWords, seenWords);
                        }
                    }

                    sbWord.Append(c);
                    LastToken = TokenTypes.CJK;
                }
                else
                {
                    //Word boundary
                    LastToken = TokenTypes.Separator;
                    FlushParsedWord(sbWord, tempParsedWords, seenWords);
                }
            }
        }

        //Flush out the last word of this string
        FlushParsedWord(sbWord, tempParsedWords, seenWords);
    }

    //bail early if there is nothing indexed
    if (tempParsedWords.Count == 0) return ReturnList;

    //Make the return list from the word list
    foreach (string s in tempParsedWords)
    {
        //Filter out short words
        //but keep them if they are part of a wildcard search phrase
        bool longEnough = s.Length >= MINWORDLENGTH || (KeepWildCards && s.Contains('%'));
        if (!longEnough) continue;

        //breaking of search phrase keeps everything,
        //regular breaking of object for dictionary entry adds only non stopwords
        if (ignoreStopWords || !translationWordBreakData.StopWords.Contains(s))
            ReturnList.Add(s);
    }

    //sometimes all the results are stop words so you end up here with nothing
    return ReturnList;
}

//Move the word in progress (if any) into the parsed word list, keeping unique words only, then clear it
private static void FlushParsedWord(StringBuilder word, List<string> parsedWords, HashSet<string> seenWords)
{
    if (word.Length == 0) return;

    string completed = word.ToString();
    if (seenWords.Add(completed))
        parsedWords.Add(completed);

    word.Length = 0;
}
|
||||
|
||||
#endregion
|
||||
|
||||
}//eoc
|
||||
|
||||
}//eons
|
||||
Reference in New Issue
Block a user