873 lines
36 KiB
C#
873 lines
36 KiB
C#
using System;
|
|
using System.Linq;
|
|
using System.Globalization;
|
|
using System.Text;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using System.Threading.Tasks;
|
|
using Newtonsoft.Json.Linq;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using AyaNova.Util;
|
|
using AyaNova.Models;
|
|
using System.Diagnostics;
|
|
|
|
|
|
namespace AyaNova.Biz
|
|
{
|
|
|
|
//This class handles word breaking, processing keywords and searching for results
|
|
public static class Search
|
|
{
|
|
|
|
/*
|
|
ISSUES:
|
|
Search of big data a little slow, attempt to tweak indices
|
|
|
|
|
|
*/
|
|
|
|
|
|
#region Search and return results
|
|
|
|
/*
|
|
Requirements:
|
|
|
|
INPUT PARAMETERS
|
|
- Search phrase (with wildcard support)
|
|
- Can be empty if tags are specified, no tags and no phrase is an error condition
|
|
- ObjectType: only return results for objects of this type
|
|
- InName: flag that indicates only search in names
|
|
- Tag ids that are also on result objects
|
|
- Can be empty if a phrase is specified
|
|
|
|
|
|
ACTION
|
|
Find search matches, then find tag matches then intersect, then sort and return
|
|
Filter OUT results that user is not permitted to read
|
|
//TODO: proper testing of searching
|
|
- SAMPLE DATA: Need a huge amount of sample data indexed to load test it
|
|
- INDEXES: play with it and see what works best
|
|
|
|
OUTPUT FORMAT
|
|
- No localized text, up to client
|
|
- Name of object in return result
|
|
- Object Type and ID in return result
|
|
- Group results by object type, then by object ID descending which will result in natural most recently created order
|
|
|
|
result:[
|
|
{
|
|
name:"blah",
|
|
type:2,
|
|
id:210
|
|
},
|
|
]
|
|
|
|
|
|
*/
|
|
|
|
/// <summary>
/// Parameters for a search request: an optional phrase, optional tag ids and result filters.
/// A request is only usable when it carries a phrase or at least one tag (see <see cref="IsValid"/>).
/// </summary>
public class SearchRequestParameters
{
    /// <summary>Search phrase; may be null or blank when tags are supplied.</summary>
    public string Phrase { get; set; }

    /// <summary>When true only keywords flagged as being in the object's name are matched.</summary>
    public bool NameOnly { get; set; }

    /// <summary>Restrict results to this object type; <c>AyaType.NoType</c> means no restriction.</summary>
    public AyaType TypeOnly { get; set; }

    /// <summary>Tag ids that result objects must carry; may be empty when a phrase is supplied.</summary>
    public List<long> Tags { get; set; }

    /// <summary>Maximum number of results to return. Note: 0 returns all results.</summary>
    public int MaxResults { get; set; }

    /// <summary>
    /// Creates a request with defaults: not name-only, no type filter, no tags, at most 500 results.
    /// </summary>
    public SearchRequestParameters()
    {
        NameOnly = false;
        TypeOnly = AyaType.NoType;
        Tags = new List<long>();
        MaxResults = 500;
    }

    /// <summary>
    /// True when the request has a non-blank phrase or at least one tag.
    /// </summary>
    public bool IsValid
    {
        get
        {
            //has a phrase?
            if (!string.IsNullOrWhiteSpace(this.Phrase))
            {
                return true;
            }

            //has tags? Tags has a public setter so it can have been replaced with null;
            //treat that the same as "no tags" rather than throwing a NullReferenceException
            return this.Tags != null && this.Tags.Count > 0;
        }
    }
}
|
|
|
|
|
|
/// <summary>
/// A single search hit as returned to the client: the object's name plus
/// the type and id needed to open it. No localized text is included.
/// </summary>
public class SearchResult
{
    /// <summary>Name of the matching object.</summary>
    public string Name { get; set; }

    /// <summary>Type of the matching object.</summary>
    public AyaType Type { get; set; }

    /// <summary>Id of the matching object.</summary>
    public long Id { get; set; }
}
|
|
|
|
|
|
|
|
/// <summary>
/// Search the keyword index and/or tag maps and return the matching objects.
///
/// - Phrase only: objects whose indexed keywords match every term of the phrase.
/// - Tags only: objects that carry every requested tag.
/// - Phrase and tags: phrase matches that also carry every requested tag.
///
/// Unless the request is name-only, objects the user's roles may not fully read are removed.
/// The list is then truncated to MaxResults (0 = no limit) and finally ordered by
/// object type, then object id descending.
/// </summary>
/// <param name="ct">Database context used for all queries</param>
/// <param name="localeId">Locale used to break the phrase into terms (stop words, CJK handling)</param>
/// <param name="currentUserRoles">Roles used for the read-rights filter</param>
/// <param name="searchParameters">The request; it is not modified by this method</param>
/// <returns>Matching objects, possibly empty, never null</returns>
/// <exception cref="ArgumentNullException">searchParameters is null</exception>
/// <exception cref="ArgumentException">The request has neither a phrase nor tags</exception>
public static async Task<List<SearchResult>> DoSearch(AyContext ct, long localeId, AuthorizationRoles currentUserRoles, SearchRequestParameters searchParameters)
{
    if (searchParameters == null)
    {
        throw new ArgumentNullException(nameof(searchParameters));
    }

    if (!searchParameters.IsValid)
    {
        throw new System.ArgumentException("Search::DoSearch - Search request parameters must contain a phrase or tags");
    }

    List<SearchResult> ResultList = new List<SearchResult>();

    //A tags only search legitimately has a null / blank phrase so the phrase must never be dereferenced blindly
    bool HasPhrase = !string.IsNullOrWhiteSpace(searchParameters.Phrase);

    //De-duplicate the tag ids: a repeated id must not count as an additional tag the object has to carry
    List<long> SearchTags = (searchParameters.Tags ?? new List<long>()).Distinct().ToList();

    List<AyaTypeId> MatchingObjects;
    if (HasPhrase)
    {
        MatchingObjects = await FindPhraseMatchesAsync(ct, localeId, searchParameters);

        //TAGS PLUS PHRASE: phrase matches that do not carry all the tags are excluded
        if (SearchTags.Count > 0)
        {
            MatchingObjects = await KeepObjectsWithAllTagsAsync(ct, MatchingObjects, SearchTags);
        }
    }
    else
    {
        //TAGS ONLY: IsValid guarantees there is at least one tag when there is no phrase
        //NOTE(review): the TypeOnly filter is not applied on this path (it never was) - confirm whether it should be
        MatchingObjects = await FindTagOnlyMatchesAsync(ct, SearchTags);
    }

    //REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
    //A name only search is allowed to see everything; any other kind of search requires full read rights
    //to the object type, even if the hit happened to be in the name. This greatly simplifies processing.
    if (!searchParameters.NameOnly)
    {
        MatchingObjects = MatchingObjects
            .Where(m => AyaNova.Api.ControllerHelpers.Authorized.IsAuthorizedToReadFullRecord(currentUserRoles, m.ObjectType))
            .ToList();
    }

    //MAXIMUM RESULTS FILTER
    //Deliberately applied BEFORE sorting: results are not ranked so this keeps an arbitrary subset
    if (searchParameters.MaxResults > 0)//0 = all results
    {
        MatchingObjects = MatchingObjects.Take(searchParameters.MaxResults).ToList();
    }

    //Group by object type, then id descending (natural most recently created order)
    foreach (AyaTypeId i in MatchingObjects.OrderBy(x => x.ObjectType).ThenByDescending(x => x.ObjectId))
    {
        ResultList.Add(new SearchResult
        {
            Name = BizObjectNameFetcher.Name(i, ct),
            Id = i.ObjectId,
            Type = i.ObjectType
        });
    }

    return ResultList;
}

/// <summary>
/// Find the objects whose indexed keywords match ALL terms of the search phrase,
/// honouring the NameOnly and TypeOnly filters of the request.
/// </summary>
private static async Task<List<AyaTypeId>> FindPhraseMatchesAsync(AyContext ct, long localeId, SearchRequestParameters searchParameters)
{
    List<AyaTypeId> Matches = new List<AyaTypeId>();

    //The word breaker expects sql style wildcards; work on a copy so the caller's request is left untouched
    string Phrase = searchParameters.Phrase.Replace("*", "%");

    //BREAK SEARCH PHRASE INTO SEPARATE TERMS (can be empty, e.g. when the phrase is all stop words)
    List<string> Terms = BreakSearchPhrase(localeId, Phrase);
    if (Terms.Count == 0)
    {
        return Matches;
    }

    //Dictionary word id -> indexes (into Terms) of the terms that word satisfies.
    //Tracking this per term is what makes wildcards work: one wildcard term can match many words
    //on the same object, so simply counting matching rows per object is not a valid "all terms" test.
    Dictionary<long, HashSet<int>> TermsByWordId = new Dictionary<long, HashSet<int>>();

    //REGULAR TERMS: one query for all of them
    List<string> RegularSearchTerms = Terms.Where(t => !t.Contains("%")).ToList();
    if (RegularSearchTerms.Count > 0)
    {
        var ExactMatches = await ct.SearchDictionary
            .Where(m => RegularSearchTerms.Contains(m.Word))
            .Select(m => new { m.Id, m.Word })
            .ToListAsync();

        foreach (var ExactMatch in ExactMatches)
        {
            int TermIndex = Terms.FindIndex(t => string.Equals(t, ExactMatch.Word, StringComparison.OrdinalIgnoreCase));
            if (TermIndex >= 0)
            {
                RecordTermMatch(TermsByWordId, ExactMatch.Id, TermIndex);
            }
        }
    }

    //WILDCARD TERMS: one query per term
    for (int TermIndex = 0; TermIndex < Terms.Count; TermIndex++)
    {
        if (!Terms[TermIndex].Contains("%"))
        {
            continue;
        }

        foreach (long WordId in await FindWildcardDictionaryIdsAsync(ct, Terms[TermIndex]))
        {
            RecordTermMatch(TermsByWordId, WordId, TermIndex);
        }
    }

    //Every term must be matched by at least one dictionary word or nothing can match the whole phrase
    int MatchedTermCount = TermsByWordId.Values.SelectMany(v => v).Distinct().Count();
    if (MatchedTermCount < Terms.Count)
    {
        return Matches;
    }

    //SEARCH SEARCHKEY FOR MATCHING WORDS AND OPTIONALLY TYPE AND INNAME
    List<long> DictionaryMatches = TermsByWordId.Keys.ToList();
    var q = ct.SearchKey.Where(x => DictionaryMatches.Contains(x.WordId));

    if (searchParameters.NameOnly)
    {
        q = q.Where(m => m.InName == true);
    }

    AyaType TypeOnly = searchParameters.TypeOnly;
    if (TypeOnly != AyaType.NoType)
    {
        q = q.Where(m => m.ObjectType == TypeOnly);
    }

    var KeyRows = await q.Select(x => new { x.ObjectType, x.ObjectId, x.WordId }).ToListAsync();

    //Keep any object whose keywords cover *all* the search terms
    foreach (var ObjectRows in KeyRows.GroupBy(r => new { r.ObjectType, r.ObjectId }))
    {
        HashSet<int> CoveredTerms = new HashSet<int>();
        foreach (var Row in ObjectRows)
        {
            CoveredTerms.UnionWith(TermsByWordId[Row.WordId]);
        }

        if (CoveredTerms.Count == Terms.Count)
        {
            Matches.Add(new AyaTypeId(ObjectRows.Key.ObjectType, ObjectRows.Key.ObjectId));
        }
    }

    return Matches;
}

/// <summary>
/// Note that the dictionary word with the given id satisfies the search term at the given index.
/// </summary>
private static void RecordTermMatch(Dictionary<long, HashSet<int>> termsByWordId, long wordId, int termIndex)
{
    HashSet<int> TermIndexes;
    if (!termsByWordId.TryGetValue(wordId, out TermIndexes))
    {
        TermIndexes = new HashSet<int>();
        termsByWordId.Add(wordId, TermIndexes);
    }
    TermIndexes.Add(termIndex);
}

/// <summary>
/// Get the ids of the dictionary words matching a wildcard term ("%abc%" contains, "abc%" starts with, "%abc" ends with).
/// A wildcard anywhere else in the term, or a term that is nothing but wildcards, matches no words.
/// </summary>
private static async Task<List<long>> FindWildcardDictionaryIdsAsync(AyContext ct, string wildCardSearchTerm)
{
    string Fragment = wildCardSearchTerm.Replace("%", "");
    if (Fragment.Length == 0)
    {
        //Nothing but wildcards would match the entire dictionary; treat it as matching nothing
        return new List<long>();
    }

    bool LeadingWildCard = wildCardSearchTerm.StartsWith("%");
    bool TrailingWildCard = wildCardSearchTerm.EndsWith("%");

    if (LeadingWildCard && TrailingWildCard)//CONTAINS?
    {
        return await ct.SearchDictionary.Where(m => m.Word.Contains(Fragment)).Select(m => m.Id).ToListAsync();
    }

    if (TrailingWildCard)//STARTS WITH?
    {
        return await ct.SearchDictionary.Where(m => m.Word.StartsWith(Fragment)).Select(m => m.Id).ToListAsync();
    }

    if (LeadingWildCard)//ENDS WITH?
    {
        return await ct.SearchDictionary.Where(m => m.Word.EndsWith(Fragment)).Select(m => m.Id).ToListAsync();
    }

    //Wildcard only in the middle of the term: not supported, matches nothing
    return new List<long>();
}

/// <summary>
/// Tags only search: find every object that carries ALL of the (distinct) tags.
/// </summary>
private static async Task<List<AyaTypeId>> FindTagOnlyMatchesAsync(AyContext ct, List<long> tags)
{
    List<AyaTypeId> Matches = new List<AyaTypeId>();

    //1) Count the uses of each tag, remembering the least used one so we start from the shortest list.
    //   A tag that is not used at all means nothing can match and we can bail early.
    long ShortestMatchingTag = 0;
    long ShortestMatchCount = long.MaxValue;
    foreach (long SearchTagId in tags)
    {
        long MatchTagCount = await ct.TagMap.Where(m => m.TagId == SearchTagId).LongCountAsync();
        if (MatchTagCount == 0)
        {
            return Matches;
        }

        if (MatchTagCount < ShortestMatchCount)
        {
            ShortestMatchCount = MatchTagCount;
            ShortestMatchingTag = SearchTagId;
        }
    }

    //2) The objects carrying the least used tag are the only possible candidates
    var ShortList = await ct.TagMap.Where(x => x.TagId == ShortestMatchingTag).ToListAsync();

    //3) Keep each candidate that also carries every other requested tag
    foreach (TagMap Candidate in ShortList)
    {
        var CandidateId = Candidate.TagToObjectId;
        var CandidateType = Candidate.TagToObjectType;

        bool HasAllTags = true;
        foreach (long TagId in tags)
        {
            //skipping already matched shortest tag
            if (TagId == ShortestMatchingTag)
            {
                continue;
            }

            bool HasTag = await ct.TagMap.Where(x => x.TagToObjectId == CandidateId && x.TagToObjectType == CandidateType && x.TagId == TagId).AnyAsync();
            if (!HasTag)
            {
                HasAllTags = false;
                break;//one missing tag rules the object out, no need to check the rest
            }
        }

        if (HasAllTags)
        {
            Matches.Add(new AyaTypeId(CandidateType, CandidateId));
        }
    }

    return Matches;
}

/// <summary>
/// Tags plus phrase search: keep only the candidates that carry ALL of the (distinct) tags.
/// </summary>
private static async Task<List<AyaTypeId>> KeepObjectsWithAllTagsAsync(AyContext ct, List<AyaTypeId> candidates, List<long> tags)
{
    List<AyaTypeId> TagMatchingObjects = new List<AyaTypeId>();

    foreach (AyaTypeId Candidate in candidates)
    {
        var CandidateId = Candidate.ObjectId;
        var CandidateType = Candidate.ObjectType;

        long MatchCount = await ct.TagMap.Where(x => x.TagToObjectId == CandidateId && x.TagToObjectType == CandidateType && tags.Contains(x.TagId)).LongCountAsync();
        if (MatchCount == tags.Count)
        {
            TagMatchingObjects.Add(Candidate);
        }
    }

    return TagMatchingObjects;
}
|
|
|
|
|
|
#endregion dosearch
|
|
|
|
#region ProcessKeywords into Database
|
|
|
|
/// <summary>
/// Index the keywords of a newly created object.
/// Passes newRecord = true so no existing search keys are deleted first.
/// </summary>
public static void ProcessNewObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
    => ProcessKeywords(ct, localeId, objectID, objectType, true, name, text);
|
|
|
|
/// <summary>
/// Re-index the keywords of an updated object.
/// Passes newRecord = false so the object's existing search keys are deleted before the new ones are added.
/// </summary>
public static void ProcessUpdatedObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
    => ProcessKeywords(ct, localeId, objectID, objectType, false, name, text);
|
|
|
|
/// <summary>
/// Delete every search key row recorded for the given object.
/// </summary>
public static void ProcessDeletedObjectKeywords(AyContext ct, long objectID, AyaType objectType)
{
    //Keep the interpolated values numeric. Be careful in future: calling ToString on a value inside the
    //interpolation makes the npgsql driver treat it as a string and quote it, which triggers an error
    //that a string can't be compared to an int
    int objectTypeValue = (int)objectType;

    ct.Database.ExecuteSqlCommand($"delete from asearchkey where objectid={objectID} and objecttype={objectTypeValue}");
}
|
|
|
|
|
|
/// <summary>
/// Process an object's keywords into the search dictionary and record a search key
/// (word id, in-name flag, object id, object type) for each of them.
/// NOTE: the NAME parameter is in ADDITION to the name also being one of the strings passed in the text parameter;
/// it is only used to flag which keywords appear in the name.
/// </summary>
/// <param name="ct">Database context; changes are saved by this method</param>
/// <param name="localeId">Locale used for word breaking</param>
/// <param name="objectID">Id of the object being indexed</param>
/// <param name="objectType">Type of the object being indexed</param>
/// <param name="newRecord">False to delete the object's existing search keys first</param>
/// <param name="name">The object's name</param>
/// <param name="text">All searchable text of the object</param>
private static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string name, params string[] text)
{
    //IF NOT NEW, DELETE ALL EXISTING ENTRIES FOR OBJECT TYPE AND ID
    if (!newRecord)
    {
        ProcessDeletedObjectKeywords(ct, objectID, objectType);
    }

    //BREAK STRING ARRAY INTO KEYWORD LIST
    List<string> KeyWordList = Break(localeId, text);

    //BREAK NAME STRING (a set, as it is only ever used for membership tests)
    HashSet<string> NameKeyWords = new HashSet<string>(Break(localeId, name));

    //EARLY EXIT IF NO KEYWORDS OR NAME RECORD TO PROCESS
    if (KeyWordList.Count == 0 && string.IsNullOrWhiteSpace(name))
    {
        return;
    }

    //BUILD A LIST OF MatchingDictionaryEntry items FOR THE MATCHING WORDS
    List<MatchingDictionaryEntry> MatchingKeywordIdList = new List<MatchingDictionaryEntry>();

    //COLLECT THE ID'S OF THE KEYWORDS THAT ARE ALREADY IN THE SEARCHDICTIONARY TABLE
    var ExistingKeywordMatches = ct.SearchDictionary.Where(m => KeyWordList.Contains(m.Word)).ToDictionary(m => m.Id, m => m.Word);
    foreach (KeyValuePair<long, string> K in ExistingKeywordMatches)
    {
        MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = K.Key, InName = NameKeyWords.Contains(K.Value) });
    }

    //ADD THE KEYWORDS THAT ARE *NOT* YET IN THE SEARCHDICTIONARY
    HashSet<string> ExistingWords = new HashSet<string>(ExistingKeywordMatches.Values);
    List<SearchDictionary> NewSearchDictionaryWordsList = KeyWordList
        .Where(KeyWord => !ExistingWords.Contains(KeyWord))
        .Select(KeyWord => new SearchDictionary() { Word = KeyWord })
        .ToList();

    if (NewSearchDictionaryWordsList.Count > 0)
    {
        //adding in a range sped this up noticeably
        ct.SearchDictionary.AddRange(NewSearchDictionaryWordsList);
        //Save the context in order to get the id's of the new words added
        ct.SaveChanges();
    }

    //Now add the id's of the newly created words to the matching keyword id list for this object.
    //Only the words added by THIS call are used: the context's Local view holds every tracked dictionary
    //entity, which on a reused context includes words belonging to other, previously processed objects.
    foreach (SearchDictionary SD in NewSearchDictionaryWordsList)
    {
        MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = SD.Id, InName = NameKeyWords.Contains(SD.Word) });
    }

    //CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
    List<SearchKey> NewSearchKeyList = MatchingKeywordIdList
        .Select(E => new SearchKey() { WordId = E.DictionaryId, InName = E.InName, ObjectId = objectID, ObjectType = objectType })
        .ToList();

    ct.SearchKey.AddRange(NewSearchKeyList);
    ct.SaveChanges();
}
|
|
|
|
/// <summary>
/// Temporary holder pairing a search dictionary word id with a flag
/// saying whether that word appears in the object's name.
/// </summary>
public class MatchingDictionaryEntry
{
    /// <summary>True when the word is one of the name keywords; defaults to false.</summary>
    public bool InName { get; set; } = false;

    /// <summary>Id of the search dictionary word; defaults to -1 until assigned.</summary>
    public long DictionaryId { get; set; } = -1;
}
|
|
|
|
|
|
#endregion
|
|
|
|
#region Breaker
|
|
|
|
/// <summary>
/// Locale specific settings used when breaking text into keywords:
/// the CJK indexing flag and the list of stop words.
/// </summary>
public class LocaleWordBreakingData
{
    /// <summary>True when text should be tokenized with the CJK tokenizer; defaults to false.</summary>
    public bool CJKIndex { get; set; } = false;

    /// <summary>Words that are dropped from the keyword list; defaults to an empty list.</summary>
    public List<string> StopWords { get; set; } = new List<string>();
}
|
|
|
|
/// <summary>
/// Load the stop words and the CJK index flag for a locale.
/// </summary>
/// <param name="localeId">Requested locale id; the stop word lookup uses the id returned by LocaleBiz.EnsuredLocaleIdStatic</param>
/// <param name="ct">Database context; when null ServiceProviderProvider.DBContext is used</param>
/// <returns>The stop words and CJK flag for the locale</returns>
private static LocaleWordBreakingData GetLocaleSearchData(long localeId, AyContext ct = null)
{
    ct = ct ?? ServiceProviderProvider.DBContext;

    //Ask for the seven stop word locale keys, using a validated locale id
    var Request = new Api.Controllers.LocaleController.LocaleSubsetParam();
    Request.LocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, ct);
    foreach (string StopWordsKey in new[] { "StopWords1", "StopWords2", "StopWords3", "StopWords4", "StopWords5", "StopWords6", "StopWords7" })
    {
        Request.Keys.Add(StopWordsKey);
    }

    //NOTE(review): both LocaleBiz calls below block on .Result because this method is synchronous
    var StopWordSets = LocaleBiz.GetSubsetStatic(Request).Result;

    LocaleWordBreakingData BreakingData = new LocaleWordBreakingData();
    foreach (KeyValuePair<string, string> StopWordSet in StopWordSets)
    {
        //Each stop words key holds a space delimited list of words;
        //an empty one (i.e. StopWords7) has a single question mark as its value
        if (StopWordSet.Value == "?")
        {
            continue;
        }

        BreakingData.StopWords.AddRange(StopWordSet.Value.Split(" "));
    }

    //NOTE(review): this uses the locale id as passed in, not the validated one used above - confirm that is intended
    BreakingData.CJKIndex = LocaleBiz.GetCJKIndex(localeId, ct).Result;
    return BreakingData;
}
|
|
|
|
/// <summary>
/// Kind of the previously processed character, tracked by the word breaker.
/// </summary>
public enum TokenTypes
{
    /// <summary>No character processed yet.</summary>
    Nothing,

    /// <summary>A character that ends the current word.</summary>
    Separator,

    /// <summary>A character handled by the CJK branch of the tokenizer.</summary>
    CJK,

    /// <summary>A character handled by the regular (latin) branch of the tokenizer.</summary>
    Latin
}
|
|
|
|
/// <summary>
/// Break zero or more strings into a list of keywords for indexing.
/// Wildcard characters are not preserved; see <see cref="BreakSearchPhrase"/> for search phrases.
///
/// The locale setting CJKIndex=true is used to handle Chinese, Japanese, Korean etc
/// (languages with no easily identifiable word boundaries as in english)
/// </summary>
/// <param name="localeId">Locale whose stop words and CJK setting are used</param>
/// <param name="text">An array of 0 to * strings of text</param>
/// <returns>List of keyword strings as produced by BreakCore</returns>
internal static List<string> Break(long localeId, params string[] text)
    => BreakCore(localeId, false, text);
|
|
|
|
/// <summary>
/// Break a user's search phrase into search terms,
/// preserving any wildcard characters that were entered.
/// </summary>
/// <param name="localeId">Locale whose stop words and CJK setting are used</param>
/// <param name="text">The search phrase text</param>
/// <returns>List of search terms as produced by BreakCore</returns>
internal static List<string> BreakSearchPhrase(long localeId, params string[] text)
    => BreakCore(localeId, true, text);
|
|
|
|
/// <summary>
/// Break text into a list of unique keywords, in order of first appearance.
///
/// Regular mode: runs of letters and digits form a word and are lower cased; anything else is a separator.
/// CJK mode (locale CJKIndex = true): characters below code point 256 are handled as in regular mode,
/// all other letters are emitted as overlapping two character n-grams.
///
/// Words are cut at 255 characters. Words shorter than two characters are dropped unless they are part
/// of a wildcard term, and the locale's stop words are removed.
/// </summary>
/// <param name="localeId">Locale whose stop words and CJK setting are used</param>
/// <param name="KeepWildCards">True to treat '%' as part of a word (search phrases); false when indexing</param>
/// <param name="text">An array of 0 to * strings of text; null or empty entries are skipped</param>
/// <returns>The keywords; empty when nothing qualifies (e.g. everything was a stop word)</returns>
internal static List<string> BreakCore(long localeId, bool KeepWildCards, params string[] text)
{
    const int MaxWordLength = 255;
    //A word isn't a word unless it's got at least two characters in it.
    //This must admit two character words: the CJK tokenizer below emits two character n-grams.
    //NOTE(review): objects indexed while two character words were being dropped need re-indexing to be found by them
    const int MinWordLength = 2;

    List<string> ReturnList = new List<string>();
    if (text == null)
    {
        return ReturnList;
    }

    //Get stopwords and CJKIndex flag value
    LocaleWordBreakingData LocaleSearchData = GetLocaleSearchData(localeId);

    //Parsed words in order of first appearance, plus a set to keep them unique cheaply
    List<string> ParsedWords = new List<string>();
    HashSet<string> SeenWords = new HashSet<string>();
    StringBuilder sbWord = new StringBuilder();

    //Loop through each of the passed in strings
    foreach (string s in text)
    {
        if (string.IsNullOrEmpty(s))
        {
            continue;
        }

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
        TokenTypes LastToken = TokenTypes.Nothing;

        while (t.MoveNext())
        {
            //NOTE(review): only the first UTF-16 unit of each text element is examined and kept
            char c = t.GetTextElement()[0];

            //In CJK mode anything outside the first 256 code points goes through the n-gram branch;
            //everything else (and everything in regular mode) is tokenized as whole words so that
            //regular english text within cjk text gets properly indexed
            bool IsCJKCharacter = LocaleSearchData.CJKIndex && (int)c >= 256;

            bool IncludeCharacter = IsCJKCharacter
                ? char.IsLetter(c)
                : (char.IsLetterOrDigit(c) || (KeepWildCards && c == '%'));

            if (!IncludeCharacter)
            {
                //Word boundary: whatever has been collected so far is a complete word
                LastToken = TokenTypes.Separator;
                FlushWord(sbWord, ParsedWords, SeenWords);
                continue;
            }

            if (!IsCJKCharacter)
            {
                //All latin text is converted to lower case
                c = char.ToLower(c);

                //Flush first if the word is at the length limit or we are going from CJK to latin
                //(LastToken is never CJK in regular mode)
                if (LastToken == TokenTypes.CJK || sbWord.Length >= MaxWordLength)
                {
                    FlushWord(sbWord, ParsedWords, SeenWords);
                }

                sbWord.Append(c);
                LastToken = TokenTypes.Latin;
                continue;
            }

            //CJK character
            if (sbWord.Length > 0)
            {
                if (LastToken == TokenTypes.Latin || sbWord.Length >= MaxWordLength)
                {
                    //going from latin TO CJK (or over the length limit): the pending word is complete
                    FlushWord(sbWord, ParsedWords, SeenWords);
                }
                else if (LastToken == TokenTypes.CJK)
                {
                    //The previous CJK character is pending: complete and flush the two character n-gram,
                    //the current character then starts the next n-gram below
                    sbWord.Append(c);
                    System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                    FlushWord(sbWord, ParsedWords, SeenWords);
                }
            }

            sbWord.Append(c);
            LastToken = TokenTypes.CJK;
        }

        //Flush out the last word of this string
        FlushWord(sbWord, ParsedWords, SeenWords);
    }

    //bail early if there is nothing indexed
    if (ParsedWords.Count == 0)
    {
        return ReturnList;
    }

    HashSet<string> StopWords = new HashSet<string>(LocaleSearchData.StopWords);
    foreach (string Word in ParsedWords)
    {
        //Filter out short words if we are breaking for indexing
        //but keep them if they are part of a wildcard search phrase
        bool LongEnough = Word.Length >= MinWordLength;
        bool IsWildCardTerm = KeepWildCards && Word.IndexOf('%') >= 0;

        //Add only non stopwords
        if ((LongEnough || IsWildCardTerm) && !StopWords.Contains(Word))
        {
            ReturnList.Add(Word);
        }
    }

    //sometimes all the results are stop words so you end up here with nothing
    return ReturnList;
}

/// <summary>
/// Move the word collected in the builder (if any) into the word list, unless it is already there, and clear the builder.
/// </summary>
private static void FlushWord(StringBuilder sbWord, List<string> parsedWords, HashSet<string> seenWords)
{
    if (sbWord.Length == 0)
    {
        return;
    }

    string Word = sbWord.ToString();
    if (seenWords.Add(Word))
    {
        parsedWords.Add(Word);
    }
    sbWord.Length = 0;
}
|
|
|
|
#endregion
|
|
|
|
|
|
}//eoc
|
|
|
|
}//eons |