Search indexing best version to date

2020-05-21 18:52:35 +00:00
parent f47f9f1bed
commit 7ae49df200
2 changed files with 8 additions and 438 deletions


@@ -20,42 +20,6 @@ namespace AyaNova.Biz
#region Search and return results
/*
Requirements:
INPUT PARAMETERS
- Search phrase (with wildcard support)
- Can be empty if tags are specified; no tags and no phrase is an error condition
- ObjectType: only return results for objects of this type
ACTION
Find search matches, then find tag matches then intersect, then sort and return
Filter OUT results that user is not permitted to read
//TODO: proper testing of searching
- SAMPLE DATA: Need a huge amount of sample data indexed to load test it
- INDEXES: play with it and see what works best
OUTPUT FORMAT
- No translated text, up to client
- Name of object in return result
- Object Type and ID in return result
- Group results by object type, then by object ID descending, which yields a natural most-recently-created order
result:[
{
name:"blah",
type:2,
id:210
},
]
*/
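//Illustrative sketch only (not part of this commit): the intersect -> permission filter -> sort
//shape described in the requirements above, over already-fetched (Name, Type, Id) matches;
//CanRead stands in for the real per-object permission check.
private static List<(string Name, int Type, long Id)> SketchCombineMatches(
    List<(string Name, int Type, long Id)> phraseMatches,
    List<(string Name, int Type, long Id)> tagMatches,
    Func<int, long, bool> CanRead)
{
    //No tags specified: phrase matches stand alone; otherwise keep only items found by both
    var hits = (tagMatches == null || tagMatches.Count == 0)
        ? phraseMatches
        : phraseMatches.Intersect(tagMatches).ToList();
    return hits
        .Where(h => CanRead(h.Type, h.Id))   //filter OUT results the user is not permitted to read
        .OrderBy(h => h.Type)                //group by object type
        .ThenByDescending(h => h.Id)         //highest id first = most recently created
        .ToList();
}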
//Class to hold search request parameters
public class SearchRequestParameters
{
public string Phrase { get; set; }
@@ -588,6 +552,7 @@ namespace AyaNova.Biz
#endregion
#region ProcessKeywords into Database
//Class to hold process input parameters
@@ -658,19 +623,14 @@ namespace AyaNova.Biz
return this;
}
public SearchIndexProcessObjectParameters AddCustomFields(string jsonString)
{
//Extract the text from custom fields json fragment as an array of strings and add it here
AddText(JsonUtil.GetCustomFieldsAsStringArrayForSearchIndexing(jsonString));
return this;
}
}
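//Illustrative sketch only (not part of this commit): one way a custom fields json fragment could be
//flattened to an array of strings for indexing; the real logic lives in
//JsonUtil.GetCustomFieldsAsStringArrayForSearchIndexing and may differ from this.
private static string[] SketchCustomFieldText(string jsonString)
{
    if (string.IsNullOrWhiteSpace(jsonString)) return Array.Empty<string>();
    using (var doc = System.Text.Json.JsonDocument.Parse(jsonString))
    {
        if (doc.RootElement.ValueKind != System.Text.Json.JsonValueKind.Object) return Array.Empty<string>();
        //keep only the string-valued fields; numbers, booleans and nested objects are skipped in this sketch
        return doc.RootElement.EnumerateObject()
            .Where(f => f.Value.ValueKind == System.Text.Json.JsonValueKind.String)
            .Select(f => f.Value.GetString())
            .ToArray();
    }
}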
public static async Task ProcessNewObjectKeywordsAsync(SearchIndexProcessObjectParameters searchIndexObjectParameters)
{
await ProcessKeywordsAsync(searchIndexObjectParameters, true);
@@ -691,338 +651,22 @@ namespace AyaNova.Biz
}
/// <summary>
/// Process the keywords into the dictionary
/// NOTE: NAME parameter is in ADDITION to the NAME also being one of the strings passed in text parameter
/// </summary>
private static async Task ProcessKeywordsAsync(SearchIndexProcessObjectParameters p, bool newRecord)
{
#if (DEBUG)
if (p.ObjectType == AyaType.ServerJob || p.ObjectType == AyaType.Translation)
{
if (!p.ObjectType.HasAttribute(typeof(CoreBizObjectAttribute)))
throw new System.NotSupportedException($"Search::ProcessKeywords - Invalid type presented {p.ObjectType}");
}
#endif
//BREAK OBJECT TEXT STRINGS INTO KEYWORD LIST
List<string> KeyWordList = await BreakAsync(p.TranslationId, p.Words);
//EARLY EXIT IF NO KEYWORDS TO PROCESS
if (KeyWordList.Count == 0)
{
return;
}
//NOTES:
//Using slow but tried and true method:
//Highest ID in search dictionary is 830 count is 831
//Highest ID in search key is 18702 count is 18731
//OLD METHOD WITH EF CORE
// 2020-05-20 10:27:50.9343|INFO|Seeder|Seeding HUGE number of user(s)....
// 2020-05-20 10:28:46.7850|INFO|Seeder|1463 Users seeded in 55854 ms
// 2020-05-20 10:28:46.7850|INFO|Seeder|Seeding 20,000 Widgets....
// 2020-05-20 10:48:00.4773|INFO|Seeder|20k Widgets seeded in 1153691 ms
//Using v1.0 of new sp
//Highest id in search dictionary is 18728, count is 830 total
//Highest ID in search key is 18728 count is 18728
// 2020-05-20 18:23:51.4753|INFO|Seeder|Seeding HUGE number of user(s)....
// 2020-05-20 18:24:39.7168|INFO|Seeder|1463 Users seeded in 48233 ms
// 2020-05-20 18:24:39.7168|INFO|Seeder|Seeding 20,000 Widgets....
// 2020-05-20 19:34:28.0548|INFO|Seeder|20k Widgets seeded in 4188347 ms
// 2020-05-20 19:34:28.0548|INFO|Seeder|Seeding completed successfully
//Using v2.0 of new sp conflict do nothing then fetch
//Highest id in search dictionary is 18671, count is 829 total
//Highest ID in search key is 18671 count is 18671
// 2020-05-20 17:06:17.5835|INFO|Seeder|Seeding HUGE number of user(s)....
// 2020-05-20 17:07:05.8438|INFO|Seeder|1463 Users seeded in 48250 ms
// 2020-05-20 17:07:05.8438|INFO|Seeder|Seeding 20,000 Widgets....
// 2020-05-20 18:17:47.9833|INFO|Seeder|20k Widgets seeded in 4242139 ms
// 2020-05-20 18:17:47.9889|INFO|Seeder|Seeding completed successfully
//v3 - remove the automatic delete and try fetch first only then insert
//call stored procedure
if (KeyWordList.Count == 0) return;
//call stored procedure to do the work right at the server (fastest method by far)
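//The interpolated values become query parameters: KeyWordList maps to the procedure's wordlist text[]
//argument (Npgsql converts the List<string> to a text[]), followed by the object id, object type, and
//cleanfirst; passing !newRecord as cleanfirst makes an update clear the object's old asearchkey rows first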
var SomeValue = await ServiceProviderProvider.DBContext.Database.ExecuteSqlInterpolatedAsync($"call aydosearchindex({KeyWordList},{p.ObjectId},{p.ObjectType},{!newRecord})");
return;
//BUILD A LIST OF MatchingDictionaryEntry items FOR THE MATCHING WORDS
List<long> MatchingKeywordIdList = new List<long>();
//******************* REDUNDANT PAST HERE *********************************
//ITERATE ALL THE KEYWORDS, SEARCH THE SEARCHDICTIONARY TABLE AND COLLECT THE IDS OF ANY KEYWORDS ALREADY IN THE DB
var ExistingKeywordMatches = await ServiceProviderProvider.DBContext.SearchDictionary.AsNoTracking().Where(z => KeyWordList.Contains(z.Word)).ToDictionaryAsync(z => z.Id, z => z.Word);
/*example of above query, returns a list of words and ids
SELECT a.id, a.xmin, a.word
FROM asearchdictionary AS a
WHERE a.word IN ('eos', 'quia', 'voluptate', 'delectus', 'sapiente', 'omnis', 'suscipit', 'rerum', 'unbranded', 'soft', 'towels', '25', 'green', 'zone', 'red', 'sequi', 'aspernatur', 'animi', '85586490', '70907391547648')
*/
//Put the matching keyword ID's into the list
foreach (KeyValuePair<long, string> K in ExistingKeywordMatches)
{
MatchingKeywordIdList.Add(K.Key);
}
#region PERFORMANCE NOTES / EXPERIMENTS
/*
This next block is where all the slowness exists.
I've played with it and brought it down to half the original time, and could likely find more savings,
but that's not a good use of time right now; it really only affects bulk ops (currently just seeding),
so I'm keeping my notes here in case I take another whack at it
///////////////////////////////////////////////////////////////////////////////
TODO: Search indexing is painfully slow, it accounts for 16 of 22 seconds when creating 500 widgets with full paragraphs of text
- Try to see if it's just one part of the operation by timing it
- Re-code it not using EF but directly interacting with the DB
- Maybe it's a case for stored procedures or something?
SEARCH INDEXING PERFORMANCE WORK
Baseline from before doing anything seeding a medium level with full text
2020-01-21 16:49:17.4662|INFO|Seeder|75 Users seeded in 2279 ms
2020-01-21 16:49:39.4481|INFO|Seeder|500 Widgets seeded in 21968 ms
After round one of improvements (less text in seed data notes, not calling savechanges or add async)
//about 2 seconds came from the async db stuff; the rest came from using less text and therefore less indexing, which isn't really a permanent solution, just a workaround
2020-01-23 16:57:57.0422|INFO|Seeder|75 Users seeded in 2398 ms
2020-01-23 16:58:11.9983|INFO|Seeder|500 Widgets seeded in 14958 ms
TODO: Find out if this is linear time for more widgets or exponential to see if it exposes part of the issue
X widgets, ms per widget:
100=32
500=29 (27 in non debug mode)
5000=29
Stripped out all text to index except single letter a in notes and c2
500=20
Now going to try the opposite, a *lot* of text 10 paragraphs in both c2 and notes
500=59ms
So the quantity of text directly affects performance: it's not just fixed overhead from running the query, it's the amount of work the queries have to do
Cache the translation (or pass it in directly) to avoid repeatedly fetching it during bulk ops!!!
-After doing this: 500=21. That's as fast as when I stripped out all the text; what a huge overhead saving right there!:
2020-01-24 12:00:41.2547|INFO|Seeder|Seeding 500 Widgets....
2020-01-24 12:00:51.9138|INFO|Seeder|500 Widgets seeded in 10649 ms
THINGS TO TRY:
Completely alternate methods:
- https://stackoverflow.com/a/15089664/8939 Store a digest of each record with that record, then just search the digests (would possibly mean a search has to traverse all records of every table)
DB INDEX TUNING?
- Play with the indexes and see if an unnecessary index is slowing things down
Async the keyword processing
- Fire off the indexing and return immediately; there would be a window before the index comes into compliance, and maybe more clashes?
Removing use of EF entirely in search indexing processing in favor of direct sql queries
Would a transaction-locked approach be faster than the exception-handling method now used?
Make indexing a job that happens periodically behind the scenes based on last indexing date and event log?
ON CONFLICT IDEA
https://www.postgresql.org/docs/current/sql-insert.html#SQL-ON-CONFLICT
Idea: do the insert manually with the clause "on conflict do nothing";
if we detect it didn't insert (conflict), trigger a fetch instead
- like what is being done now, but without the exception to deal with!!
///////////////////////////////////////////////////////////////////////////////
NEW IDEA
todo: Search indexing performance improvement and exception avoidance (Search.cs 828)
//BASELINE b4 making any changes 2020-05-20 10:57:50
RUN1 - LARGE
=-=-
2020-05-20 09:09:32.9424|INFO|Seeder|Seeding LARGE number of user(s)....
2020-05-20 09:09:45.2050|INFO|Seeder|317 Users seeded in 12260 ms
2020-05-20 09:09:45.2050|INFO|Seeder|Seeding 5,000 Widgets....
2020-05-20 09:14:33.8176|INFO|Seeder|5k Widgets seeded in 288611 ms
RUN 2 - LARGE
=-=-=-
2020-05-20 09:39:22.0704|INFO|Seeder|Seeding LARGE number of user(s)....
2020-05-20 09:39:34.4116|INFO|Seeder|317 Users seeded in 12337 ms
2020-05-20 09:39:34.4116|INFO|Seeder|Seeding 5,000 Widgets....
2020-05-20 09:44:19.1750|INFO|Seeder|5k Widgets seeded in 284770 ms
RUN 3 - HUGE
=-=-=-=-=-=-
2020-05-20 09:49:03.5884|INFO|Seeder|Seeding HUGE number of user(s)....
2020-05-20 09:50:00.1437|INFO|Seeder|1463 Users seeded in 56546 ms
2020-05-20 09:50:00.1437|INFO|Seeder|Seeding 20,000 Widgets....
2020-05-20 10:09:12.7938|INFO|Seeder|20k Widgets seeded in 1152656 ms
RUN 4 - HUGE 2
=-=-=-=-=-=-
2020-05-20 10:27:50.9343|INFO|Seeder|Seeding HUGE number of user(s)....
2020-05-20 10:28:46.7850|INFO|Seeder|1463 Users seeded in 55854 ms
2020-05-20 10:28:46.7850|INFO|Seeder|Seeding 20,000 Widgets....
2020-05-20 10:48:00.4773|INFO|Seeder|20k Widgets seeded in 1153691 ms
RUN 5 - SMALL 1
=-=-=-=-=-=-
2020-05-20 10:55:55.4385|INFO|Seeder|Seeding SMALL number of user(s)....
2020-05-20 10:55:55.5828|INFO|Seeder|25 Users seeded in 155 ms
2020-05-20 10:56:02.4069|INFO|Seeder|100 Widgets seeded in 6823 ms
RUN 6 - SMALL 2
=-=-=-=-=-=-
2020-05-20 10:56:57.4113|INFO|Seeder|Seeding SMALL number of user(s)....
2020-05-20 10:56:57.5519|INFO|Seeder|25 Users seeded in 143 ms
2020-05-20 10:57:04.3323|INFO|Seeder|100 Widgets seeded in 6787 ms
*/
#endregion performance notes experiments
#region OLD - NEW WORD ADDITION second attempt, do it word by word and accept clashes and handle them
//-------- START CRITICAL SECTION -----------
//-------------------------------------------
#if (DEBUG)
var log = AyaNova.Util.ApplicationLogging.CreateLogger("### Search::ProcessKeywords ###");
#endif
foreach (string KeyWord in KeyWordList)
{
if (!ExistingKeywordMatches.ContainsValue(KeyWord))
{
//algorithm: Attempt to add it to the db and get the id; if it fails with the expected duplicate-word-insertion exception, immediately read back that word and handle it
//ATTEMPT TO ADD THE WORD TO THE SEARCHDICTIONARY
SearchDictionary NewWord = new SearchDictionary();
NewWord.Word = KeyWord;
try
{
//ADD WORD TO DICTIONARY, SAVE THE ID INTO THE MATCHINGKEYWORDIDLIST
var CtAdd = ServiceProviderProvider.DBContext;
await CtAdd.SearchDictionary.AddAsync(NewWord);
await CtAdd.SaveChangesAsync();
//-------
//Add to matching keywords
MatchingKeywordIdList.Add(NewWord.Id);
//-------
//It exists now
ExistingKeywordMatches.Add(NewWord.Id, NewWord.Word);
}
catch (Microsoft.EntityFrameworkCore.DbUpdateException ex)
{
#region Exceptions from word already existing (added maybe in another thread)
#if (DEBUG)
log.LogInformation($"###################### Exception caught attempting to add word: '{KeyWord}' fetching instead...");
#endif
//FAIL DUE TO OTHER CAUSE THAN WORD ALREADY ADDED?
if (ex.InnerException == null || !ex.InnerException.Message.Contains("asearchdictionary_word_idx"))
{
#if (DEBUG)
log.LogInformation($"###################### Unexpected inner exception on add word: '{KeyWord}'!?");
#endif
throw; //rethrow without resetting the stack trace
}
//FETCH THE WORD ID, PLACE IN MATCHINGKEYWORDLIST AND MOVE ON TO THE NEXT WORD
var SearchDictionaryMatchFoundInDB = await ServiceProviderProvider.DBContext.SearchDictionary.AsNoTracking().Where(z => z.Word == KeyWord).FirstOrDefaultAsync();
if (SearchDictionaryMatchFoundInDB != null)
{
MatchingKeywordIdList.Add(SearchDictionaryMatchFoundInDB.Id);
//It exists now
ExistingKeywordMatches.Add(SearchDictionaryMatchFoundInDB.Id, SearchDictionaryMatchFoundInDB.Word);
}
else
{
#if (DEBUG)
log.LogInformation($"###################### NULL when expected to find word: '{KeyWord}'!?");
#endif
}
#endregion
}
catch (Exception ex)
{
#if (DEBUG)
log.LogInformation(ex, $"###################### Unexpected exception adding word: '{KeyWord}'!?");
#endif
throw; //rethrow without resetting the stack trace
}
}
}
//-------- END CRITICAL SECTION -------------
//-------------------------------------------
#endregion second attempt
//CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
var NewSearchKeyList = new List<SearchKey>();
foreach (long E in MatchingKeywordIdList)
{
NewSearchKeyList.Add(new SearchKey() { WordId = E, ObjectId = p.ObjectId, ObjectType = p.ObjectType });
}
var CtSearchKeyAdd = ServiceProviderProvider.DBContext;
await CtSearchKeyAdd.SearchKey.AddRangeAsync(NewSearchKeyList);
await CtSearchKeyAdd.SaveChangesAsync();
//---------------------------------
}//eoc
#endregion
#region Breaker
@@ -1421,10 +1065,6 @@ RUN 6 - SMALL 2
#endregion
#region Utility
#endregion utility
}//eoc
}//eons


@@ -173,70 +173,6 @@ namespace AyaNova.Util
await ExecQueryAsync("CREATE INDEX asearchkey_wordid_otype_idx ON asearchkey (wordid, objecttype);");
//Search indexing stored procedure
////v1
// await ExecQueryAsync(@"CREATE OR REPLACE PROCEDURE public.aydosearchindex(
// wordlist text[],
// ayobjectid bigint,
// ayobjecttype integer)
// LANGUAGE 'plpgsql'
// AS $BODY$DECLARE
// s text;
// wordid bigint;
// BEGIN
// IF ayobjectid=0 THEN
// RAISE EXCEPTION 'Bad object id --> %', ayobjectid;
// END IF;
// IF ayobjecttype=0 THEN
// RAISE EXCEPTION 'Bad object type --> %', ayobjecttype;
// END IF;
// delete from asearchkey where objectid=ayobjectid and objecttype=ayobjecttype;
// FOREACH s IN ARRAY wordlist
// LOOP
// insert into asearchdictionary (word) values(s) on conflict (word) do update set word=excluded.word returning id into wordid;
// insert into asearchkey (wordid,objectid,objecttype) values(wordid,ayobjectid,ayobjecttype);
// END LOOP;
// END;
// $BODY$;");
//v2
// await ExecQueryAsync(@"CREATE OR REPLACE PROCEDURE public.aydosearchindex(
// wordlist text[],
// ayobjectid bigint,
// ayobjecttype integer)
// LANGUAGE 'plpgsql'
// AS $BODY$DECLARE
// s text;
// wordid bigint;
// BEGIN
// IF ayobjectid=0 THEN
// RAISE EXCEPTION 'Bad object id --> %', ayobjectid;
// END IF;
// IF ayobjecttype=0 THEN
// RAISE EXCEPTION 'Bad object type --> %', ayobjecttype;
// END IF;
// delete from asearchkey where objectid=ayobjectid and objecttype=ayobjecttype;
// FOREACH s IN ARRAY wordlist
// LOOP
// insert into asearchdictionary (word) values(s) on conflict (word) do nothing;
// SELECT id INTO STRICT wordid FROM asearchdictionary WHERE word = s;
// insert into asearchkey (wordid,objectid,objecttype) values(wordid,ayobjectid,ayobjecttype);
// END LOOP;
// END;
// $BODY$;");
//v3
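//v3: fetch the word id first and insert only on a miss; the unconditional asearchkey delete from v1/v2
//is now gated behind the cleanfirst flag (passed as !newRecord from Search.cs) so it only runs for updates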
await ExecQueryAsync(@"
CREATE OR REPLACE PROCEDURE public.aydosearchindex(
wordlist text[],
@@ -256,24 +192,18 @@ BEGIN
IF ayobjecttype=0 THEN
RAISE EXCEPTION 'Bad object type --> %', ayobjecttype;
END IF;
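-- when cleanfirst is requested (re-indexing an existing object) drop its previous keyword links first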
IF cleanfirst=true THEN
delete from asearchkey where objectid=ayobjectid and objecttype=ayobjecttype;
END IF;
FOREACH s IN ARRAY wordlist
LOOP
--raise info 'processing word %',s;
SELECT id INTO wordid FROM asearchdictionary WHERE word = s;
--raise info 'initial select found that word id is %', wordid;
IF wordid IS NULL THEN
--raise info 'since wordid was null inserting %...', wordid;
insert into asearchdictionary (word) values(s) on conflict (word) do update set word=excluded.word returning id into wordid;
--raise info 'After insert new word returned word id %, inserting into searchkey', wordid;
insert into asearchkey (wordid,objectid,objecttype) values(wordid,ayobjectid,ayobjecttype);
ELSE
--raise info 'since we have initial word id from select inserting into search key the Word id %', wordid;
insert into asearchkey (wordid,objectid,objecttype) values(wordid,ayobjectid,ayobjecttype);
END IF;
END LOOP;