// Listing metadata: 1088 lines, 46 KiB, C#
using System;
|
|
using System.Linq;
|
|
using System.Globalization;
|
|
using System.Text;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using System.Threading.Tasks;
|
|
using Newtonsoft.Json.Linq;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using AyaNova.Util;
|
|
using AyaNova.Models;
|
|
//using System.Diagnostics;
|
|
|
|
|
|
namespace AyaNova.Biz
|
|
{
|
|
|
|
//This class handles word breaking, processing keywords and searching for results
|
|
public static class Search
|
|
{
|
|
|
|
#region Search and return results
|
|
|
|
/*
Requirements:

INPUT PARAMETERS
- Search phrase (with wildcard support)
    - Required: a request without a phrase is an error condition
      (tags are no longer searched separately, they are indexed as part of the text of the object)
- ObjectType: only return results for objects of this type
- InName: flag that indicates only search in names

ACTION
Find the objects that match the search phrase, then sort and return them
Filter OUT results that user is not permitted to read
TODO: proper testing of searching
    - SAMPLE DATA: Need a huge amount of sample data indexed to load test it
    - INDEXES: play with it and see what works best

OUTPUT FORMAT
- No localized text, up to client
- Name of object in return result
- Object Type and ID in return result
- Group results by object type, then by object ID descending which will result in natural most recently created order

result:[
    {
        name:"blah",
        type:2,
        id:210
    },
]

*/
|
|
|
|
//Class to hold search request parameters
|
|
/// <summary>
/// Parameters supplied by the caller for one search request.
/// </summary>
public class SearchRequestParameters
{
    /// <summary>Text to search for; DoSearchAsync treats '*' as a wildcard.</summary>
    public string Phrase { get; set; }

    /// <summary>When true only matches found in object names are considered.</summary>
    public bool NameOnly { get; set; } = false;

    /// <summary>Restrict results to this object type; AyaType.NoType means no restriction.</summary>
    public AyaType TypeOnly { get; set; } = AyaType.NoType;

    /// <summary>Maximum number of results to return. Note: 0 will get all results.</summary>
    public int MaxResults { get; set; } = 500;

    /// <summary>True when the request carries a non-blank phrase.</summary>
    public bool IsValid => !string.IsNullOrWhiteSpace(this.Phrase);
}
|
|
|
|
|
|
//Classes to hold search results returned to client
|
|
/// <summary>
/// One search hit returned to the client: the object's name, type and id.
/// </summary>
public class SearchResult
{
    /// <summary>Name of the matching object.</summary>
    public string Name { get; set; }

    /// <summary>Type of the matching object.</summary>
    public AyaType Type { get; set; }

    /// <summary>Id of the matching object.</summary>
    public long Id { get; set; }
}
|
|
|
|
/// <summary>
/// Envelope returned to the client: the total number of hits found plus
/// the list of results actually returned.
/// </summary>
public class SearchReturnObject
{
    /// <summary>Number of matches found; starts at zero.</summary>
    public long TotalResultsFound { get; set; } = 0;

    /// <summary>The results being returned; starts as an empty list, never null by default.</summary>
    public List<SearchResult> SearchResults { get; set; } = new List<SearchResult>();
}
|
|
|
|
|
|
/// <summary>
/// Search the index for objects that contain every term of the search phrase.
/// </summary>
/// <remarks>
/// The phrase is broken into terms with the same word breaker used for indexing.
/// A term containing a wildcard ('*' typed by the user, '%' internally) may match many
/// dictionary words; an object is a hit only when, for EVERY term, it is linked to at
/// least one word matching that term.
/// Tags are not searched separately: they are indexed as part of the text of the object
/// like any other text.
/// </remarks>
/// <param name="ct">Database context used for all queries</param>
/// <param name="localeId">Locale used to break the phrase (stop words, CJK handling)</param>
/// <param name="currentUserRoles">Roles of the searching user, used to filter out unreadable objects</param>
/// <param name="searchParameters">Phrase and filters; not modified by this method</param>
/// <returns>Total number of readable hits and up to MaxResults results ordered by type then id descending</returns>
/// <exception cref="System.ArgumentException">searchParameters is null or has no phrase</exception>
public static async Task<SearchReturnObject> DoSearchAsync(AyContext ct, long localeId, AuthorizationRoles currentUserRoles, SearchRequestParameters searchParameters)
{
    if (searchParameters == null || !searchParameters.IsValid)
    {
        throw new System.ArgumentException("Search::DoSearch - Search request parameters must contain a phrase or tags");
    }

    var ReturnObject = new SearchReturnObject();

    //Replace wildcard * with % as BreakCore expects sql style wildcards
    //(work on a local copy so the caller's parameter object is left untouched)
    string Phrase = searchParameters.Phrase.Replace("*", "%");

    //BREAK SEARCH PHRASE INTO SEPARATE TERMS
    List<string> PhraseItems = BreakSearchPhrase(localeId, Phrase);
    if (PhraseItems.Count == 0)
    {
        //nothing searchable left (e.g. only stop words), so nothing can match
        return ReturnObject;
    }

    //SPLIT OUT WILDCARDS FROM NON WILDCARDS
    List<string> RegularSearchTerms = PhraseItems.Where(Item => !Item.Contains("%")).ToList();
    List<string> WildCardSearchTerms = PhraseItems.Where(Item => Item.Contains("%")).ToList();

    //One set of matching dictionary word id's PER TERM
    //(kept separate so that "matches all terms" can be decided correctly even when a wildcard term hits many words)
    var WordIdsPerTerm = new List<HashSet<long>>(PhraseItems.Count);

    //GET DICTIONARY ID'S THAT MATCH REGULAR SEARCH TERMS (single query for all of them)
    if (RegularSearchTerms.Count > 0)
    {
        var RegularMatches = await ct.SearchDictionary
            .Where(m => RegularSearchTerms.Contains(m.Word))
            .Select(m => new { m.Id, m.Word })
            .ToListAsync();

        foreach (string RegularSearchTerm in RegularSearchTerms)
        {
            WordIdsPerTerm.Add(new HashSet<long>(
                RegularMatches.Where(m => string.Equals(m.Word, RegularSearchTerm, StringComparison.Ordinal)).Select(m => m.Id)));
        }
    }

    //GET DICTIONARY ID'S THAT MATCH WILDCARD SEARCH TERMS
    //The term is already a valid LIKE pattern: the word breaker only lets letters, digits and % through,
    //so there is nothing that needs escaping and wildcards in any position (start, end, both, middle) work
    foreach (string WildCardSearchTerm in WildCardSearchTerms)
    {
        string Pattern = WildCardSearchTerm;
        List<long> WildCardMatches = await ct.SearchDictionary
            .Where(m => EF.Functions.Like(m.Word, Pattern))
            .Select(m => m.Id)
            .ToListAsync();
        WordIdsPerTerm.Add(new HashSet<long>(WildCardMatches));
    }

    //If any single term matches no word at all then no object can match *all* the terms
    if (WordIdsPerTerm.Any(TermIds => TermIds.Count == 0))
    {
        return ReturnObject;
    }

    List<long> DictionaryMatches = WordIdsPerTerm.SelectMany(TermIds => TermIds).Distinct().ToList();

    //Build search query based on searchParameters
    var q = ct.SearchKey.Where(x => DictionaryMatches.Contains(x.WordId));

    //In name?
    if (searchParameters.NameOnly)
        q = q.Where(m => m.InName == true);

    //Of type?
    AyaType TypeOnly = searchParameters.TypeOnly;
    if (TypeOnly != AyaType.NoType)
        q = q.Where(m => m.ObjectType == TypeOnly);

    //Find the records that have the search terms in searchkey
    var KeyHits = await q
        .Select(x => new { x.ObjectType, x.ObjectId, x.WordId })
        .Distinct()
        .ToListAsync();

    //keep any object that matches *all* the search terms
    List<AyaTypeId> MatchingObjects = KeyHits
        .GroupBy(Hit => new { Hit.ObjectType, Hit.ObjectId })
        .Where(ObjectHits => WordIdsPerTerm.All(TermIds => ObjectHits.Any(Hit => TermIds.Contains(Hit.WordId))))
        .Select(ObjectHits => new AyaTypeId(ObjectHits.Key.ObjectType, ObjectHits.Key.ObjectId))
        .ToList();

    //REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
    //If it's a name only search then all is allowed
    //If it's not a name only search then rights need to be checked for full read.
    //Note: in the interests of simplicity, even if the result was only found in the name, the user still
    //needs full rights to read the object if the type of search was not a name only search.
    if (!searchParameters.NameOnly)
    {
        MatchingObjects = MatchingObjects
            .Where(t => AyaNova.Api.ControllerHelpers.Authorized.HasReadFullRole(currentUserRoles, t.ObjectType))
            .ToList();
    }

    //TOTAL RESULTS
    ReturnObject.TotalResultsFound = MatchingObjects.Count;

    //MAXIMUM RESULTS FILTER
    //Filtered BEFORE sorting because the results are not ranked
    if (searchParameters.MaxResults > 0)//0 = all results
        MatchingObjects = MatchingObjects.Take(searchParameters.MaxResults).ToList();

    //Sort and group the matching objects list in return order
    var OrderedMatchingObjects = MatchingObjects.OrderBy(x => x.ObjectType).ThenByDescending(x => x.ObjectId);

    //Get names using a single reusable command on the context's connection
    using (var command = ct.Database.GetDbConnection().CreateCommand())
    {
        await ct.Database.OpenConnectionAsync();
        try
        {
            //Build the return list from the remaining matching objects list
            foreach (AyaTypeId i in OrderedMatchingObjects)
            {
                ReturnObject.SearchResults.Add(new SearchResult
                {
                    Name = BizObjectNameFetcherDirect.Name(i, command),//NOTE: original author flagged this name fetch as the slow part of returning results
                    Id = i.ObjectId,
                    Type = i.ObjectType
                });
            }
        }
        finally
        {
            //pair the explicit open above so the connection is not held open after the search
            ct.Database.CloseConnection();
        }
    }

    return ReturnObject;
}
|
|
|
|
|
|
#endregion dosearch
|
|
|
|
#region ProcessKeywords into Database
|
|
|
|
//Class to hold process input parameters
|
|
/// <summary>
/// Input for indexing one object: its identity, locale, name and the text to index.
/// The Add* methods return the same instance so calls can be chained.
/// </summary>
public class SearchIndexProcessObjectParameters
{
    public long LocaleId { get; set; }
    public long ObjectId { get; set; }
    public AyaType ObjectType { get; set; }
    public string Name { get; set; }

    /// <summary>Raw text fragments collected so far; broken into keywords later.</summary>
    public List<string> Words { get; set; } = new List<string>();

    public SearchIndexProcessObjectParameters(long localeId, long objectID, AyaType objectType, string name)
    {
        LocaleId = localeId;
        ObjectId = objectID;
        ObjectType = objectType;
        Name = name;
    }

    /// <summary>Add one text fragment; null, empty and whitespace-only strings are ignored.</summary>
    public SearchIndexProcessObjectParameters AddText(string s)
    {
        if (string.IsNullOrWhiteSpace(s))
        {
            return this;
        }

        Words.Add(s);
        return this;
    }

    /// <summary>Add a number as text.</summary>
    public SearchIndexProcessObjectParameters AddText(uint u)
    {
        Words.Add(u.ToString());
        return this;
    }

    /// <summary>Add every non-blank string of the list; a null list is ignored.</summary>
    public SearchIndexProcessObjectParameters AddText(List<string> lWords)
    {
        if (lWords == null)
        {
            return this;
        }

        foreach (string Fragment in lWords)
        {
            AddText(Fragment);
        }

        return this;
    }

    /// <summary>
    /// Extract the text from a custom fields json fragment and add it.
    /// </summary>
    public SearchIndexProcessObjectParameters AddCustomFields(string jsonString)
    {
        return AddText(JsonUtil.GetCustomFieldsAsStringArrayForSearchIndexing(jsonString));
    }
}
|
|
|
|
|
|
/// <summary>
/// Index the keywords of a newly created object (no existing search keys are deleted first).
/// </summary>
public static async Task ProcessNewObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
    => await ProcessKeywordsAsync(searchIndexObjectParameters, true);
|
|
|
|
/// <summary>
/// Re-index the keywords of an updated object (its existing search keys are deleted first).
/// </summary>
public static async Task ProcessUpdatedObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
    => await ProcessKeywordsAsync(searchIndexObjectParameters, false);
|
|
|
|
/// <summary>
/// Remove every search key row that belongs to the given object.
/// </summary>
/// <param name="objectID">Id of the object whose keys are removed</param>
/// <param name="objectType">Type of the object whose keys are removed</param>
public static async Task ProcessDeletedObjectKeywordsAsync(long objectID, AyaType objectType)
{
    //CAREFUL: per the original author's note, do not put ToString() on the interpolated values;
    //the npgsql driver would then treat them as strings and quote them,
    //triggering an error that a string can't be compared to an int
    await ServiceProviderProvider.DBContext.Database.ExecuteSqlInterpolatedAsync(
        $"delete from asearchkey where objectid={objectID} and objecttype={(int)objectType}");
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Process the keywords of one object into the search dictionary and search key tables.
/// NOTE: NAME parameter is in ADDITION to the NAME also being one of the strings passed in text parameter
/// </summary>
/// <remarks>
/// Steps: (optionally) delete the object's existing keys, break text and name into keywords,
/// look up which keywords already exist in the dictionary, insert the missing ones one at a time
/// (tolerating a duplicate-word clash by reading the word back), then insert one search key per keyword.
///
/// PERFORMANCE NOTES (from the original author, kept for future tuning):
/// - The "add missing words" loop is where the slowness is; it mostly affects bulk operations such as seeding.
/// - Baseline 2020-01-21: 500 widgets with full paragraphs of text seeded in 21968 ms (indexing ~16 of 22 seconds).
/// - 2020-01-23 after less seed text and not calling savechanges/add async: 500 widgets in 14958 ms
///   (about 2 seconds from the async db changes, the rest from less text which is only a workaround).
/// - ms per widget is linear in widget count: 100=32, 500=29 (27 non debug), 5000=29.
/// - Single letter of text: 500=20 ms each; 10 paragraphs in two fields: 500=59 ms each,
///   so the quantity of text directly affects performance.
/// - Caching the locale word breaking data instead of re-fetching it: 500=21 ms each
///   (2020-01-24: 500 widgets in 10649 ms).
/// - Ideas not yet tried: store a digest per record and search the digests
///   (https://stackoverflow.com/a/15089664/8939), db index tuning, fire-and-forget indexing,
///   replacing EF with direct sql or stored procedures.
/// </remarks>
/// <param name="p">Identity, locale, name and text of the object to index</param>
/// <param name="newRecord">false when the object already existed, in which case its current keys are deleted first</param>
private static async Task ProcessKeywordsAsync(SearchIndexProcessObjectParameters p, bool newRecord)
{
#if (DEBUG)
    if (p.ObjectType == AyaType.JobOperations || p.ObjectType == AyaType.Locale)
    {
        throw new System.NotSupportedException($"Search::ProcessKeywords - Invalid type presented {p.ObjectType}");
    }
#endif

    //IF NOT NEW, DELETE ALL EXISTING ENTRIES FOR OBJECT TYPE AND ID
    if (!newRecord)
    {
        await ProcessDeletedObjectKeywordsAsync(p.ObjectId, p.ObjectType);
    }

    //BREAK OBJECT TEXT STRINGS INTO KEYWORD LIST
    List<string> KeyWordList = Break(p.LocaleId, p.Words);

    //BREAK NAME STRING (a set, because it is only ever used for "is this word in the name?" checks)
    HashSet<string> NameKeyWords = new HashSet<string>(Break(p.LocaleId, p.Name));

    //EARLY EXIT IF NO KEYWORDS OR NAME TO PROCESS
    if (KeyWordList.Count == 0 && string.IsNullOrWhiteSpace(p.Name))
    {
        return;
    }

    //LIST OF dictionary id / in-name flag FOR EVERY KEYWORD OF THIS OBJECT
    List<MatchingDictionaryEntry> MatchingKeywordIdList = new List<MatchingDictionaryEntry>();

    //COLLECT ID'S OF ANY KEYWORDS THAT ARE ALREADY IN THE SEARCHDICTIONARY TABLE
    //(one query of the form: select id, word from asearchdictionary where word in (...keywords...))
    var ExistingKeywordMatches = await ServiceProviderProvider.DBContext.SearchDictionary
        .AsNoTracking()
        .Where(m => KeyWordList.Contains(m.Word))
        .ToDictionaryAsync(m => m.Id, m => m.Word);

    //Words known to have a dictionary id; replaces a linear ContainsValue scan per keyword
    HashSet<string> KnownWords = new HashSet<string>();
    foreach (KeyValuePair<long, string> K in ExistingKeywordMatches)
    {
        MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = K.Key, InName = NameKeyWords.Contains(K.Value) });
        KnownWords.Add(K.Value);
    }

    //-------- START CRITICAL SECTION -----------
    //NEW WORD ADDITION: do it word by word, accept clashes with concurrent inserts and handle them
#if (DEBUG)
    var log = AyaNova.Util.ApplicationLogging.CreateLogger("### Search::ProcessKeywords ###");
#endif

    foreach (string KeyWord in KeyWordList)
    {
        if (KnownWords.Contains(KeyWord))
        {
            continue;
        }

        //algorithm: attempt to add the word and get its id; if that fails with the expected exception
        //for a duplicate word, immediately read that word back instead
        SearchDictionary NewWord = new SearchDictionary();
        NewWord.Word = KeyWord;
        var CtAdd = ServiceProviderProvider.DBContext;

        try
        {
            //ADD WORD TO DICTIONARY, SAVE THE ID INTO THE MATCHINGKEYWORDIDLIST
            await CtAdd.SearchDictionary.AddAsync(NewWord);
            await CtAdd.SaveChangesAsync();

            MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = NewWord.Id, InName = NameKeyWords.Contains(KeyWord) });

            //It exists now
            KnownWords.Add(NewWord.Word);
        }
        catch (Microsoft.EntityFrameworkCore.DbUpdateException ex)
        {
            //Exceptions from word already existing (added maybe in another thread)
#if (DEBUG)
            log.LogInformation($"###################### Exception caught attempting to add word: '{KeyWord}' fetching instead...");
#endif
            //FAIL DUE TO OTHER CAUSE THAN WORD ALREADY ADDED?
            if (ex.InnerException == null || !ex.InnerException.Message.Contains("asearchdictionary_word_idx"))
            {
#if (DEBUG)
                log.LogInformation($"###################### Unexpected inner exception on add word: '{KeyWord}'!?");
#endif
                //rethrow preserving the original stack trace
                throw;
            }

            //NOTE(review): if ServiceProviderProvider.DBContext hands back a shared context the failed
            //entity would still be tracked as Added and be retried by the next SaveChanges, so stop
            //tracking it; this is a no-op concern if a fresh context is returned each time - confirm
            CtAdd.Entry(NewWord).State = EntityState.Detached;

            //FETCH THE WORD ID, PLACE IN MATCHINGKEYWORDLIST AND MOVE ON TO THE NEXT WORD
            var SearchDictionaryMatchFoundInDB = await ServiceProviderProvider.DBContext.SearchDictionary
                .AsNoTracking()
                .Where(x => x.Word == KeyWord)
                .FirstOrDefaultAsync();

            if (SearchDictionaryMatchFoundInDB != null)
            {
                MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = SearchDictionaryMatchFoundInDB.Id, InName = NameKeyWords.Contains(KeyWord) });

                //It exists now
                KnownWords.Add(SearchDictionaryMatchFoundInDB.Word);
            }
            else
            {
#if (DEBUG)
                log.LogInformation($"###################### NULL when expected to find word: '{KeyWord}'!?");
#endif
            }
        }
#if (DEBUG)
        catch (Exception ex)
        {
            //debug builds log anything unexpected; the exception always propagates with its stack trace intact
            log.LogInformation(ex, $"###################### Unexpected exception adding word: '{KeyWord}'!?");
            throw;
        }
#endif
    }
    //-------- END CRITICAL SECTION -------------

    //CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
    List<SearchKey> NewSearchKeyList = MatchingKeywordIdList
        .Select(E => new SearchKey() { WordId = E.DictionaryId, InName = E.InName, ObjectId = p.ObjectId, ObjectType = p.ObjectType })
        .ToList();

    var CtSearchKeyAdd = ServiceProviderProvider.DBContext;
    await CtSearchKeyAdd.SearchKey.AddRangeAsync(NewSearchKeyList);
    await CtSearchKeyAdd.SaveChangesAsync();
}//eoc
|
|
|
|
//Class to hold temporary list of matching id
|
|
/// <summary>
/// Temporary pairing of a search dictionary word id with a flag saying whether
/// that word is one of the object's name keywords.
/// </summary>
public class MatchingDictionaryEntry
{
    /// <summary>True when the word is one of the name keywords; false by default.</summary>
    public bool InName { get; set; } = false;

    /// <summary>Id of the word in the search dictionary; -1 until assigned.</summary>
    public long DictionaryId { get; set; } = -1;
}
|
|
|
|
|
|
#endregion
|
|
|
|
#region Breaker
|
|
|
|
//Class to hold relevant locale data for breaking text
|
|
/// <summary>
/// Locale specific settings the word breaker needs: the CJK indexing flag and the stop word list.
/// </summary>
public class LocaleWordBreakingData
{
    /// <summary>When true the word breaker uses its CJK tokenizer; false by default.</summary>
    public bool CJKIndex { get; set; } = false;

    /// <summary>Words that are never returned by the word breaker; empty by default.</summary>
    public List<string> StopWords { get; set; } = new List<string>();
}
|
|
|
|
//Get the current stopwords for the user's locale
|
|
//called in here in this class and also by any bulk ops like seeding etc
|
|
/// <summary>
/// Load the stop words and the CJK index flag for a locale.
/// Called from within this class and also by any bulk ops like seeding etc.
/// </summary>
/// <param name="localeId">Requested locale; replaced by whatever LocaleBiz.EnsuredLocaleIdStatic returns for it</param>
/// <param name="ct">Context to use; when null ServiceProviderProvider.DBContext is used</param>
/// <returns>A new LocaleWordBreakingData for the locale</returns>
internal static LocaleWordBreakingData GetLocaleSearchData(long localeId, AyContext ct = null)
{
    ct = ct ?? ServiceProviderProvider.DBContext;

    //Validate locale id, if not right then use default instead
    localeId = LocaleBiz.EnsuredLocaleIdStatic(localeId, ct);

    //The stop words are spread over seven locale keys
    var StopWordKeys = new List<string>
    {
        "StopWords1",
        "StopWords2",
        "StopWords3",
        "StopWords4",
        "StopWords5",
        "StopWords6",
        "StopWords7"
    };

    var WordBreakingData = new LocaleWordBreakingData();

    //NOTE: this method is synchronous so the async locale calls are blocked on with .Result
    foreach (KeyValuePair<string, string> StopWordEntry in LocaleBiz.GetSubsetStaticAsync(StopWordKeys, localeId).Result)
    {
        //Each stopwords locale key is a space delimited list of words;
        //an empty one (i.e. StopWords7) has a single question mark as its value
        if (StopWordEntry.Value == "?")
        {
            continue;
        }

        WordBreakingData.StopWords.AddRange(StopWordEntry.Value.Split(" "));
    }

    WordBreakingData.CJKIndex = LocaleBiz.GetCJKIndexAsync(localeId, ct).Result;
    return WordBreakingData;
}
|
|
|
|
/// <summary>
/// Kind of the last character seen by the word breaker.
/// </summary>
public enum TokenTypes
{
    //no character processed yet in the current string
    Nothing,
    //a character that is not part of any word
    Separator,
    //a word character handled by the CJK path of the word breaker
    CJK,
    //a word character handled by the regular (Latin) path of the word breaker
    Latin
}
|
|
|
|
/// <summary>
/// Break a list of text strings into a list of unique, lower cased keywords
/// suitable for indexing. Wildcard characters are not kept.
///
/// Use Locale setting CJKIndex=true to handle Chinese, Japanese, Korean etc
/// (languages with no easily identifiable word boundaries as in english)
/// </summary>
/// <param name="localeId">Locale whose stop words and CJKIndex setting are applied</param>
/// <param name="textStrings">Text to break</param>
/// <returns>List of strings</returns>
internal static List<string> Break(long localeId, List<string> textStrings)
    => BreakCore(localeId, false, textStrings);
|
|
|
|
/// <summary>
/// Break a single text string into keywords; wildcard characters are not kept.
/// </summary>
/// <param name="localeId">Locale whose stop words and CJKIndex setting are applied</param>
/// <param name="textString">Text to break</param>
/// <returns>List of strings</returns>
internal static List<string> Break(long localeId, string textString)
{
    var SingleItemList = new List<string>(1) { textString };
    return BreakCore(localeId, false, SingleItemList);
}
|
|
|
|
/// <summary>
/// Used to process a user's search phrase: breaks it into terms like Break does
/// but preserves the wildcard characters entered.
/// </summary>
/// <param name="localeId">Locale whose stop words and CJKIndex setting are applied</param>
/// <param name="searchPhrase">Search phrase with sql style (%) wildcards</param>
/// <returns>List of search terms</returns>
internal static List<string> BreakSearchPhrase(long localeId, string searchPhrase)
{
    var PhraseAsList = new List<string> { searchPhrase };
    return BreakCore(localeId, true, PhraseAsList);
}
|
|
|
|
/// <summary>
/// Per-locale cache of word breaking data (stop words and CJKIndex flag),
/// used for eliminating noise words from the search dictionary.
/// Filled on demand by BreakCore and cleared per locale by ClearLocaleWordBreakingDataCache.
/// </summary>
private static readonly Dictionary<long, LocaleWordBreakingData> localeWordBreakingDataCache = new Dictionary<long, LocaleWordBreakingData>();

/// <summary>
/// Drop the cached word breaking data of one locale so it is re-fetched on next use.
/// Called by Locale in the rare circumstance that a locale that is cached has changed
/// in a way that might affect word breaking (stopwords, cjkindex etc).
/// </summary>
/// <param name="localeId">Locale to forget; removing a locale that is not cached is harmless</param>
internal static void ClearLocaleWordBreakingDataCache(long localeId)
{
    //the cache is a static Dictionary shared by every caller and Dictionary is not safe
    //for concurrent modification, so guard the removal
    lock (localeWordBreakingDataCache)
    {
        localeWordBreakingDataCache.Remove(localeId);
    }
}
|
|
|
|
/// <summary>
/// Word breaker: turns text into a list of unique keywords.
/// </summary>
/// <remarks>
/// Rules, applied character by character:
/// - Letters and digits (and '%' when KeepWildCards is true) build words, anything else ends the current word.
/// - Characters handled by the regular path are lower cased.
/// - A word that reaches 255 characters is cut and a new word is started.
/// - When the locale has CJKIndex set, characters with a code of 256 or above are treated as CJK:
///   only letters count and consecutive CJK characters are emitted as overlapping 2 character n-grams,
///   while runs of characters below 256 inside CJK text are still indexed as whole words.
/// - Finally words shorter than 2 characters are dropped (unless they contain a wildcard and
///   KeepWildCards is true) and so are the locale's stop words.
/// Limitation: only the first UTF-16 code unit of each text element is examined, so the trailing
/// code units of surrogate pairs and combining sequences are ignored.
/// </remarks>
/// <param name="localeId">Locale whose stop words and CJKIndex flag apply; fetched once and cached</param>
/// <param name="KeepWildCards">true when breaking a search phrase so '%' is kept as part of a word</param>
/// <param name="textStrings">Strings to break; null or empty entries are skipped, a null list gives an empty result</param>
/// <returns>Unique keywords in order of first appearance; empty when nothing indexable was found</returns>
internal static List<string> BreakCore(long localeId, bool KeepWildCards, List<string> textStrings)
{
    const int MAXWORDLENGTH = 255;
    const int MINWORDLENGTH = 2;//A word isn't a word unless it's got at least two characters in it

    List<string> ReturnList = new List<string>();
    if (textStrings == null)
    {
        return ReturnList;
    }

    //Stopwords and CJKIndex flag for the locale, from the cache when possible.
    //The cache is a static Dictionary shared by every caller so access is guarded; the fetch itself is
    //done outside the lock and stored with the indexer so two callers racing on the same locale cannot throw
    LocaleWordBreakingData localeWordBreakData;
    bool IsCached;
    lock (localeWordBreakingDataCache)
    {
        IsCached = localeWordBreakingDataCache.TryGetValue(localeId, out localeWordBreakData);
    }
    if (!IsCached)
    {
        localeWordBreakData = GetLocaleSearchData(localeId);
        lock (localeWordBreakingDataCache)
        {
            localeWordBreakingDataCache[localeId] = localeWordBreakData;
        }
    }

    //Parsed words in order of first appearance plus a set for cheap "seen it already?" checks
    List<string> ParsedWords = new List<string>();
    HashSet<string> SeenWords = new HashSet<string>();
    StringBuilder sbWord = new StringBuilder();

    //Move the word being built into the parsed list (once only) and start over
    void FlushWord()
    {
        string Word = sbWord.ToString();
        if (SeenWords.Add(Word))
        {
            ParsedWords.Add(Word);
        }
        sbWord.Length = 0;
    }

    //Loop through each of the passed in strings
    foreach (string s in textStrings)
    {
        if (string.IsNullOrEmpty(s)) continue;

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
        TokenTypes LastToken = TokenTypes.Nothing;

        //Process each "character" (text element, glyph whatever) in the current string
        while (t.MoveNext())
        {
            char c = t.GetTextElement()[0];

            //Only in CJK mode are characters outside the first 256 code points treated specially,
            //so that regular english text within cjk text still gets indexed as whole words
            bool IsCjkChar = localeWordBreakData.CJKIndex && (int)c >= 256;

            //Is it a token we want to include? ('%' is below 256 so it can only occur on the regular path)
            bool IsWordChar = IsCjkChar
                ? char.IsLetter(c)
                : (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'));

            if (!IsWordChar)
            {
                //Word boundary
                LastToken = TokenTypes.Separator;
                if (sbWord.Length > 0)
                {
                    FlushWord();
                }
                continue;
            }

            if (!IsCjkChar)
            {
                //All latin text is converted to lower case
                c = char.ToLower(c);

                //Flush first if we're at the word length limit or going from CJK to latin
                if (sbWord.Length > 0 && (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH))
                {
                    FlushWord();
                }

                sbWord.Append(c);
                LastToken = TokenTypes.Latin;
                continue;
            }

            //CJK character
            if (sbWord.Length > 0)
            {
                if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                {
                    //going from latin TO CJK (or at the length limit): finish the pending word
                    FlushWord();
                }
                else if (LastToken == TokenTypes.CJK)
                {
                    //the previous CJK character is pending: append the current one,
                    //flush the resultant 2 character n-gram, then the current character
                    //becomes the start of the next n-gram below
                    sbWord.Append(c);
                    System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                    FlushWord();
                }
            }

            sbWord.Append(c);
            LastToken = TokenTypes.CJK;
        }

        //Flush out the last word
        if (sbWord.Length > 0)
        {
            FlushWord();
        }
    }

    //Build the return list from the parsed words
    foreach (string Word in ParsedWords)
    {
        //Filter out short words but keep them if they are part of a wildcard search phrase
        bool IsLongEnough = Word.Length >= MINWORDLENGTH || (KeepWildCards && Word.Contains('%'));

        //Add only non stopwords
        if (IsLongEnough && !localeWordBreakData.StopWords.Contains(Word))
        {
            ReturnList.Add(Word);
        }
    }

    //sometimes all the results are stop words so you end up here with nothing
    return ReturnList;
}
|
|
|
|
#endregion
|
|
|
|
#region Utility
|
|
|
|
#endregion utility
|
|
|
|
}//eoc
|
|
|
|
}//eons |