797 lines
32 KiB
C#
797 lines
32 KiB
C#
using System;
|
|
using System.Linq;
|
|
using System.Globalization;
|
|
using System.Text;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using System.Threading.Tasks;
|
|
using Newtonsoft.Json.Linq;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using AyaNova.Util;
|
|
using AyaNova.Models;
|
|
|
|
|
|
namespace AyaNova.Biz
|
|
{
|
|
|
|
//This class handles word breaking, processing keywords and searching for results
|
|
public static class Search
|
|
{
|
|
|
|
/*
|
|
ISSUES:
|
|
none at the moment
|
|
|
|
|
|
*/
|
|
|
|
|
|
#region Search and return results
|
|
|
|
/*
|
|
Requirements:
|
|
|
|
INPUT PARAMETERS
|
|
- Search phrase (with wildcard support)
|
|
- Can be empty if tags are specified, no tags and no phrase is an error condition
|
|
- ObjectType: only return results for objects of this type
|
|
- InName: flag that indicates only search in names
|
|
- Tag ids that are also on result objects
|
|
- Can be empty if a phrase is specified
|
|
|
|
|
|
ACTION
|
|
Find search matches, then find tag matches then intersect, then sort and return
|
|
Filter OUT results that user is not permitted to read
|
|
//TODO: proper testing of searching
|
|
- SAMPLE DATA: Need a huge amount of sample data indexed to load test it
|
|
- INDEXES: play with it and see what works best
|
|
|
|
OUTPUT FORMAT
|
|
- No localized text, up to client
|
|
- Name of object in return result
|
|
- Object Type and ID in return result
|
|
- Group results by object type, then by object ID descending which will result in natural most recently created order
|
|
|
|
result:[
|
|
{
|
|
name:"blah",
|
|
type:2,
|
|
id:210
|
|
},
|
|
]
|
|
|
|
|
|
*/
|
|
|
|
/// <summary>
/// Holds the parameters of a search request.
/// A request is valid when it carries a non-blank phrase, at least one tag id, or both.
/// </summary>
public class SearchRequestParameters
{
    /// <summary>Search phrase; may be null or blank when tags are supplied.</summary>
    public string Phrase { get; set; }

    /// <summary>When true only keywords flagged as being in the object name are searched.</summary>
    public bool NameOnly { get; set; }

    /// <summary>Restricts results to this object type; AyaType.NoType means no restriction.</summary>
    public AyaType TypeOnly { get; set; }

    /// <summary>Ids of tags the result objects must carry; may be empty when a phrase is supplied.</summary>
    public List<long> Tags { get; set; }

    public SearchRequestParameters()
    {
        NameOnly = false;
        TypeOnly = AyaType.NoType;
        Tags = new List<long>();
    }

    /// <summary>
    /// True when the request has a phrase or at least one tag.
    /// </summary>
    public bool IsValid
    {
        get
        {
            //has a phrase?
            if (!string.IsNullOrWhiteSpace(this.Phrase))
            {
                return true;
            }

            //has tags? Tags has a public setter so it can be replaced with null
            //(for example by a deserializer); treat that the same as "no tags" rather than throwing
            return this.Tags != null && this.Tags.Count > 0;
        }
    }
}
|
|
|
|
|
|
/// <summary>
/// A single search hit returned to the client: the object's name, type and id.
/// </summary>
public class SearchResult
{
    /// <summary>Name of the matching object.</summary>
    public string Name { get; set; }

    /// <summary>Type of the matching object.</summary>
    public AyaType Type { get; set; }

    /// <summary>Id of the matching object.</summary>
    public long Id { get; set; }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Runs a keyword and/or tag search and returns the matching objects.
/// </summary>
/// <remarks>
/// Phrase terms are matched against the search dictionary (a term matches if ANY of the
/// phrase words match). Terms containing a wildcard are matched as "contains" (%x%),
/// "starts with" (x%) or "ends with" (%x); a wildcard in the middle of a term is not supported
/// and such a term is ignored. When tags are supplied an object must carry ALL of them.
/// Unless the search is name-only, objects of a type the caller may not fully read are removed.
/// </remarks>
/// <param name="ct">Database context used for all queries.</param>
/// <param name="localeId">Locale used to break the phrase into words and strip stop words.</param>
/// <param name="currentUserRoles">Roles of the requesting user, used for the read-rights filter.</param>
/// <param name="searchParameters">The request; must contain a phrase or at least one tag. Not modified.</param>
/// <returns>One result per matching object; empty when nothing matches.</returns>
/// <exception cref="System.ArgumentException">The request is null or has neither a phrase nor tags.</exception>
public static async Task<List<SearchResult>> DoSearch(AyContext ct, long localeId, AuthorizationRoles currentUserRoles, SearchRequestParameters searchParameters)
{
    if (searchParameters == null)
    {
        throw new System.ArgumentNullException(nameof(searchParameters));
    }

    if (!searchParameters.IsValid)
    {
        throw new System.ArgumentException("Search::DoSearch - Search request parameters must contain a phrase or tags");
    }

    List<SearchResult> ResultList = new List<SearchResult>();

    //list to hold temporary search/tag hits
    List<AyaTypeId> MatchingObjects = new List<AyaTypeId>();

    //A tags-only request legitimately has no phrase, so the phrase must never be dereferenced blindly
    bool HasPhrase = !string.IsNullOrWhiteSpace(searchParameters.Phrase);

    //De-duplicate the requested tags: a repeated id can never add a second TagMap hit for the same object
    //so comparing hit counts against the raw list length would wrongly reject every object
    List<long> SearchTags = (searchParameters.Tags ?? new List<long>()).Distinct().ToList();

    //IF PHRASE SPECIFIED
    if (HasPhrase)
    {
        //Replace wildcard * with % as BreakCore expects sql style wildcards
        //(done on a local copy so the caller's request object is left untouched)
        string Phrase = searchParameters.Phrase.Replace("*", "%");

        //BREAK SEARCH PHRASE INTO SEPARATE TERMS
        var PhraseItems = BreakSearchPhrase(localeId, Phrase);

        //SPLIT OUT WILDCARDS FROM NON WILDCARDS
        List<string> WildCardSearchTerms = new List<string>();
        List<string> RegularSearchTerms = new List<string>();
        foreach (string PhraseItem in PhraseItems)
        {
            if (PhraseItem.Contains("%"))
            {
                WildCardSearchTerms.Add(PhraseItem);
            }
            else
            {
                RegularSearchTerms.Add(PhraseItem);
            }
        }

        //List holder for matching dictionary ID's
        List<long> DictionaryMatches = new List<long>();

        //GET LIST OF DICTIONARY ID'S THAT MATCH REGULAR SEARCH TERMS
        if (RegularSearchTerms.Count > 0)
        {
            DictionaryMatches = await ct.SearchDictionary.Where(m => RegularSearchTerms.Contains(m.Word)).Select(m => m.Id).ToListAsync();
        }

        //GET LIST OF DICTIONARY ID'S THAT MATCH WILDCARD SEARCH TERMS
        foreach (string WildCardSearchTerm in WildCardSearchTerms)
        {
            bool LeadingWildCard = WildCardSearchTerm.StartsWith("%", StringComparison.Ordinal);
            bool TrailingWildCard = WildCardSearchTerm.EndsWith("%", StringComparison.Ordinal);

            //The text to look for is the term with the wildcard markers stripped
            string Fragment = WildCardSearchTerm.Replace("%", "");

            if (LeadingWildCard && TrailingWildCard)
            {
                //%x% == CONTAINS
                DictionaryMatches.AddRange(await ct.SearchDictionary.Where(m => m.Word.Contains(Fragment)).Select(m => m.Id).ToListAsync());
            }
            else if (TrailingWildCard)
            {
                //x% == STARTS WITH
                DictionaryMatches.AddRange(await ct.SearchDictionary.Where(m => m.Word.StartsWith(Fragment)).Select(m => m.Id).ToListAsync());
            }
            else if (LeadingWildCard)
            {
                //%x == ENDS WITH
                DictionaryMatches.AddRange(await ct.SearchDictionary.Where(m => m.Word.EndsWith(Fragment)).Select(m => m.Id).ToListAsync());
            }
            //else: wildcard only in the middle of the term, not supported, term is ignored
        }

        //SEARCH SEARCHKEY FOR MATCHING WORDS AND OPTIONALLY TYPE AND INNAME
        //(no dictionary hits means no key can match so the query is skipped entirely)
        if (DictionaryMatches.Count > 0)
        {
            //A search key points at its dictionary word through WordId, not through its own Id
            var q = ct.SearchKey.Where(m => DictionaryMatches.Contains(m.WordId));

            //Where() returns a NEW query, it must be assigned back or the filter is silently dropped

            //In name?
            if (searchParameters.NameOnly)
            {
                q = q.Where(m => m.InName == true);
            }

            //Of type?
            if (searchParameters.TypeOnly != AyaType.NoType)
            {
                q = q.Where(m => m.ObjectType == searchParameters.TypeOnly);
            }

            //Trigger the search
            List<SearchKey> SearchKeyMatches = await q.ToListAsync();

            //PUT THE RESULTS INTO MATCHING OBJECTS LIST
            //An object that matches several words has several keys; only list it once
            HashSet<string> SeenObjects = new HashSet<string>();
            foreach (SearchKey SearchKeyMatch in SearchKeyMatches)
            {
                if (SeenObjects.Add(SearchKeyMatch.ObjectType + ":" + SearchKeyMatch.ObjectId))
                {
                    MatchingObjects.Add(new AyaTypeId(SearchKeyMatch.ObjectType, SearchKeyMatch.ObjectId));
                }
            }
        }
    }

    //IF TAGS SPECIFIED
    if (SearchTags.Count > 0)
    {
        if (!HasPhrase)
        {
            #region TAGS ONLY SEARCH (NO PHRASE) ALL FULL MATCHES ARE INCLUSIVE
            Dictionary<long, long> TagCounts = new Dictionary<long, long>();

            //algorithm:
            //1) get counts for each tag specified from tagmap, if any are zero then none match and can bail early
            foreach (long SearchTagId in SearchTags)
            {
                var MatchTagCount = await ct.TagMap.Where(m => m.TagId == SearchTagId).LongCountAsync();

                //zero tags matching here at any point means no results for the entire search and we can bail
                if (MatchTagCount == 0)
                {
                    //return empty resultlist
                    return ResultList;
                }

                //Save the matching count
                TagCounts.Add(SearchTagId, MatchTagCount);
            }

            //2) find smallest count match so we are working with the shortest list first
            var ShortestMatchingTag = TagCounts.OrderBy(x => x.Value).First().Key;

            //3) Generate the shortlist of items that match the shortest tag list
            //   honouring the optional object type restriction of the request
            var ShortListQuery = ct.TagMap.Where(x => x.TagId == ShortestMatchingTag);
            if (searchParameters.TypeOnly != AyaType.NoType)
            {
                ShortListQuery = ShortListQuery.Where(x => x.TagToObjectType == searchParameters.TypeOnly);
            }
            var ShortList = await ShortListQuery.ToListAsync();

            //4) Iterate the shortlist and keep each item that also carries every other requested tag
            foreach (TagMap t in ShortList)
            {
                bool HasAllTags = true;

                //Iterate requested tags
                foreach (long TagId in SearchTags)
                {
                    //skipping already matched shortest tag
                    if (TagId == ShortestMatchingTag)
                    {
                        continue;
                    }

                    //Ok, does this object have this tag?
                    bool HasTag = await ct.TagMap.Where(x => x.TagToObjectId == t.TagToObjectId && x.TagToObjectType == t.TagToObjectType && x.TagId == TagId).AnyAsync();
                    if (!HasTag)
                    {
                        //one miss is enough to reject the object, no need to query the remaining tags
                        HasAllTags = false;
                        break;
                    }
                }

                //does it match all tags?
                if (HasAllTags)
                {
                    //yes, add it to the results
                    MatchingObjects.Add(new AyaTypeId(t.TagToObjectType, t.TagToObjectId));
                }
            }
            #endregion
        }
        else
        {
            #region TAGS PLUS PHRASE SEARCH WITH NON MATCHING TAGS EXCLUSIVE
            //list to hold temporary matches
            List<AyaTypeId> TagMatchingObjects = new List<AyaTypeId>();

            //LOOP THROUGH MATCHING OBJECTS LIST keeping only those that carry every requested tag
            foreach (AyaTypeId i in MatchingObjects)
            {
                var matchCount = await ct.TagMap.Where(x => x.TagToObjectId == i.ObjectId && x.TagToObjectType == i.ObjectType && SearchTags.Contains(x.TagId)).LongCountAsync();
                if (matchCount == SearchTags.Count)
                {
                    TagMatchingObjects.Add(i);
                }
            }

            //The phrase matches that had all the tags are now the de facto return list
            MatchingObjects = TagMatchingObjects;
            #endregion
        }
    }

    //REMOVE ANY ITEMS THAT USER IS NOT PERMITTED TO READ
    //If it's a name only search then all is allowed
    //If it's not a name only search then rights need to be checked for full read because even if it's just a tags search that's part of the full record of the object
    if (!searchParameters.NameOnly)
    {
        //list to hold temporary matches
        List<AyaTypeId> CanReadMatchingObjects = new List<AyaTypeId>();
        foreach (AyaTypeId t in MatchingObjects)
        {
            if (AyaNova.Api.ControllerHelpers.Authorized.IsAuthorizedToReadFullRecord(currentUserRoles, t.ObjectType))
            {
                CanReadMatchingObjects.Add(t);
            }
        }

        //the list of allowable objects is now the master matching objects list
        MatchingObjects = CanReadMatchingObjects;
    }

    //Build the return list from the remaining matching objects list
    foreach (AyaTypeId i in MatchingObjects)
    {
        SearchResult SR = new SearchResult();
        SR.Name = BizObjectNameFetcher.Name(i, ct);
        SR.Id = i.ObjectId;
        SR.Type = i.ObjectType;
        ResultList.Add(SR);
    }

    return ResultList;
}
|
|
|
|
|
|
#endregion dosearch
|
|
|
|
#region ProcessKeywords into Database
|
|
|
|
/// <summary>
/// Indexes the keywords of a newly created object.
/// Forwards to ProcessKeywords with the "new record" flag set, so no existing keys are deleted first.
/// </summary>
public static void ProcessNewObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
    => ProcessKeywords(ct, localeId, objectID, objectType, true, name, text);
|
|
|
|
/// <summary>
/// Re-indexes the keywords of an updated object.
/// Forwards to ProcessKeywords with the "new record" flag cleared, so the object's existing keys are deleted first.
/// </summary>
public static void ProcessUpdatedObjectKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, string name, params string[] text)
    => ProcessKeywords(ct, localeId, objectID, objectType, false, name, text);
|
|
|
|
/// <summary>
/// Deletes every search key row belonging to the given object.
/// </summary>
/// <remarks>
/// Be careful in future: do not put ToString at the end of the values inside the string interpolation.
/// The npgsql driver would then assume a string and put quotes around it,
/// triggering an error that a string can't be compared to an int.
/// </remarks>
public static void ProcessDeletedObjectKeywords(AyContext ct, long objectID, AyaType objectType)
    => ct.Database.ExecuteSqlCommand($"delete from asearchkey where objectid={objectID} and objecttype={(int)objectType}");
|
|
|
|
|
|
/// <summary>
/// Process the keywords of an object into the search dictionary and search key tables.
/// NOTE: the NAME parameter is in ADDITION to the name also being one of the strings passed in the text parameter;
/// it is only used to flag which keywords appear in the name.
/// </summary>
/// <param name="ct">Database context; SaveChanges is called on it twice.</param>
/// <param name="localeId">Locale used for word breaking and stop words.</param>
/// <param name="objectID">Id of the object being indexed.</param>
/// <param name="objectType">Type of the object being indexed.</param>
/// <param name="newRecord">False to delete the object's existing search keys before indexing.</param>
/// <param name="name">The object's name.</param>
/// <param name="text">All searchable text of the object.</param>
private static void ProcessKeywords(AyContext ct, long localeId, long objectID, AyaType objectType, bool newRecord, string name, params string[] text)
{
    //IF NOT NEW, DELETE ALL EXISTING ENTRIES FOR OBJECT TYPE AND ID
    if (!newRecord)
    {
        ProcessDeletedObjectKeywords(ct, objectID, objectType);
    }

    //BREAK STRING ARRAY INTO KEYWORD LIST
    List<string> KeyWordList = Break(localeId, text);

    //BREAK NAME STRING (a set, as it is only ever used for membership tests)
    HashSet<string> NameKeyWords = new HashSet<string>(Break(localeId, name));

    //EARLY EXIT IF NO KEYWORDS OR NAME RECORD TO PROCESS
    if (KeyWordList.Count == 0 && string.IsNullOrWhiteSpace(name))
    {
        return;
    }

    //BUILD A LIST OF MatchingDictionaryEntry items FOR THE MATCHING WORDS
    List<MatchingDictionaryEntry> MatchingKeywordIdList = new List<MatchingDictionaryEntry>();

    //SEARCH THE SEARCHDICTIONARY TABLE AND COLLECT ID'S OF ANY PRE-EXISTING IN DB KEYWORDS
    //Projected to an anonymous type so only the two needed columns are read and no entity is tracked by the context
    var ExistingKeywordMatches = ct.SearchDictionary
        .Where(m => KeyWordList.Contains(m.Word))
        .Select(m => new { m.Id, m.Word })
        .ToList();

    //Put the matching keyword ID's into the list, remembering which words already exist
    HashSet<string> ExistingWords = new HashSet<string>();
    foreach (var Existing in ExistingKeywordMatches)
    {
        ExistingWords.Add(Existing.Word);
        MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = Existing.Id, InName = NameKeyWords.Contains(Existing.Word) });
    }

    //ADD THE KEYWORDS THAT DO *NOT* HAVE MATCHES IN THE SEARCHDICTIONARY
    //Keep hold of exactly the entities created here: the context's Local view also contains
    //every other dictionary entity it happens to be tracking, which would produce
    //duplicate or unrelated search keys for this object
    List<SearchDictionary> NewWords = new List<SearchDictionary>();
    foreach (string KeyWord in KeyWordList)
    {
        if (!ExistingWords.Contains(KeyWord))
        {
            SearchDictionary NewWord = new SearchDictionary() { Word = KeyWord };
            ct.SearchDictionary.Add(NewWord);
            NewWords.Add(NewWord);
        }
    }

    //Save the context in order to get the id's of the new words added
    ct.SaveChanges();

    //Now add the id's of the newly created words to the matching keyword id list for this object
    foreach (SearchDictionary SD in NewWords)
    {
        MatchingKeywordIdList.Add(new MatchingDictionaryEntry() { DictionaryId = SD.Id, InName = NameKeyWords.Contains(SD.Word) });
    }

    //CREATE THE SEARCHKEY RECORDS FOR ALL THE KEYWORDS
    foreach (MatchingDictionaryEntry E in MatchingKeywordIdList)
    {
        ct.SearchKey.Add(new SearchKey() { WordId = E.DictionaryId, InName = E.InName, ObjectId = objectID, ObjectType = objectType });
    }

    ct.SaveChanges();
}//eoc
|
|
|
|
/// <summary>
/// Temporary holder pairing a search dictionary word id with whether that word appears in the object's name.
/// </summary>
public class MatchingDictionaryEntry
{
    /// <summary>True when the word is part of the object's name. Defaults to false.</summary>
    public bool InName { get; set; } = false;

    /// <summary>Id of the search dictionary word. Defaults to -1 until assigned.</summary>
    public long DictionaryId { get; set; } = -1;
}
|
|
|
|
|
|
#endregion
|
|
|
|
#region Breaker
|
|
|
|
/// <summary>
/// Locale specific settings needed to break text into keywords.
/// </summary>
public class LocaleWordBreakingData
{
    /// <summary>True when the CJK tokenizer should be used for this locale. Defaults to false.</summary>
    public bool CJKIndex { get; set; } = false;

    /// <summary>Words that are excluded from the keyword list. Defaults to an empty list.</summary>
    public List<string> StopWords { get; set; } = new List<string>();
}
|
|
|
|
/// <summary>
/// Loads the stop words and the CJK index flag for a locale.
/// </summary>
/// <param name="localeId">Requested locale; the stop word lookup uses the id returned by LocaleBiz.EnsuredLocaleIdStatic.</param>
/// <param name="ct">Database context; when null ServiceProviderProvider.DBContext is used.</param>
/// <returns>The word breaking data for the locale.</returns>
private static LocaleWordBreakingData GetLocaleSearchData(long localeId, AyContext ct = null)
{
    LocaleWordBreakingData Result = new LocaleWordBreakingData();
    AyContext Context = ct ?? ServiceProviderProvider.DBContext;

    //Ask for all the stop word locale keys in a single subset request
    var Request = new Api.Controllers.LocaleController.LocaleSubsetParam();
    Request.LocaleId = LocaleBiz.EnsuredLocaleIdStatic(localeId, Context);
    string[] StopWordKeys = { "StopWords1", "StopWords2", "StopWords3", "StopWords4", "StopWords5", "StopWords6", "StopWords7" };
    foreach (string Key in StopWordKeys)
    {
        Request.Keys.Add(Key);
    }

    //NOTE(review): .Result blocks the calling thread on the async lookup
    var Stops = LocaleBiz.GetSubsetStatic(Request).Result;

    foreach (KeyValuePair<string, string> Entry in Stops)
    {
        //Each stopwords locale key is a space delimited list of words; an unused key
        //(i.e. StopWords7) has a single question mark as its value and is skipped
        if (Entry.Value == "?")
        {
            continue;
        }

        Result.StopWords.AddRange(Entry.Value.Split(" "));
    }

    Result.CJKIndex = LocaleBiz.GetCJKIndex(localeId, Context).Result;
    return Result;
}
|
|
|
|
/// <summary>
/// Kind of the most recently processed character while breaking text into words.
/// </summary>
public enum TokenTypes
{
    //Nothing processed yet in the current string
    Nothing,
    //A character that is not part of a word
    Separator,
    //A character handled by the CJK branch of the tokenizer
    CJK,
    //A character handled by the regular (latin) branch of the tokenizer
    Latin
}
|
|
|
|
/// <summary>
/// Breaks the supplied strings into a list of unique, lower case keywords
/// with the locale's stop words removed. Wildcard characters are NOT preserved.
///
/// The locale setting CJKIndex=true selects the tokenizer for Chinese, Japanese, Korean etc
/// (languages with no easily identifiable word boundaries as in english).
/// </summary>
/// <param name="localeId">Locale supplying the stop words and CJKIndex flag</param>
/// <param name="text">An array of 0 to * strings of text</param>
/// <returns>List of keywords</returns>
internal static List<string> Break(long localeId, params string[] text)
    => BreakCore(localeId, false, text);
|
|
|
|
/// <summary>
/// Breaks a user's search phrase into keywords exactly like Break does,
/// except that wildcard characters (%) entered by the user are preserved.
/// </summary>
/// <param name="localeId">Locale supplying the stop words and CJKIndex flag</param>
/// <param name="text">The search phrase text</param>
/// <returns>List of search terms</returns>
internal static List<string> BreakSearchPhrase(long localeId, params string[] text)
    => BreakCore(localeId, true, text);
|
|
|
|
/// <summary>
/// Breaks text into unique keywords in order of first appearance, with the locale's stop words removed.
/// </summary>
/// <remarks>
/// Regular tokenizer: a word is a run of letters/digits (plus % when KeepWildCards is set),
/// lower cased and split whenever it reaches the maximum word length.
/// CJK tokenizer (locale CJKIndex flag): characters below code point 256 are handled like the
/// regular tokenizer; any other letter is indexed as overlapping two character n-grams, and a
/// switch between the two kinds of character always starts a new word.
/// Only the first UTF-16 code unit of each text element is examined.
/// </remarks>
/// <param name="localeId">Locale supplying the stop words and CJKIndex flag</param>
/// <param name="KeepWildCards">True to treat % as part of a word (search phrases)</param>
/// <param name="text">An array of 0 to * strings of text; null or empty strings are skipped</param>
/// <returns>List of keywords, empty when nothing indexable remains</returns>
internal static List<string> BreakCore(long localeId, bool KeepWildCards, params string[] text)
{
    //Get stopwords and CJKIndex flag value
    LocaleWordBreakingData LocaleSearchData = GetLocaleSearchData(localeId);
    const int MAXWORDLENGTH = 255;

    List<string> ReturnList = new List<string>();
    if (text == null)
    {
        return ReturnList;
    }

    //Parsed words in order of first appearance, the set makes the uniqueness check cheap
    List<string> ParsedWords = new List<string>();
    HashSet<string> SeenWords = new HashSet<string>();

    //The word currently being assembled
    StringBuilder sbWord = new StringBuilder();

    //Loop through each of the passed in strings
    foreach (string s in text)
    {
        if (string.IsNullOrEmpty(s)) continue;

        //get all the characters in a unicode compliant manner...
        TextElementEnumerator t = StringInfo.GetTextElementEnumerator(s);
        //start at the top
        t.Reset();

        TokenTypes LastToken = TokenTypes.Nothing;

        //Process each "character" (text element, glyph whatever) in the current string
        while (t.MoveNext())
        {
            //get it as a character
            char c = t.GetTextElement()[0];

            //Only the CJK tokenizer distinguishes CJK characters: anything outside the first 256 code points
            //see: http://www.unicode.org/charts/index.html
            //this is needed so that regular english text within cjk text gets properly indexed as whole words
            bool IsCJKChar = LocaleSearchData.CJKIndex && (int)c >= 256;

            //Is it a token we want to include? (digits only count for non CJK characters)
            bool IsWildCard = KeepWildCards && c == '%';
            bool Include = IsCJKChar ? (char.IsLetter(c) || IsWildCard) : (char.IsLetterOrDigit(c) || IsWildCard);

            if (!Include)
            {
                //Word boundary: flush whatever word has been collected so far
                LastToken = TokenTypes.Separator;
                FlushWord(sbWord, SeenWords, ParsedWords);
                continue;
            }

            if (IsCJKChar)
            {
                if (sbWord.Length > 0)
                {
                    if (LastToken == TokenTypes.Latin || sbWord.Length >= MAXWORDLENGTH)
                    {
                        //going from latin TO CJK (or over the length limit) ends the current word
                        FlushWord(sbWord, SeenWords, ParsedWords);
                    }
                    else if (LastToken == TokenTypes.CJK)
                    {
                        //the previous CJK character is still stored so append the current one
                        //and flush the resultant 2 character n-gram
                        sbWord.Append(c);
                        System.Diagnostics.Debug.Assert(sbWord.Length == 2);
                        FlushWord(sbWord, SeenWords, ParsedWords);
                    }
                }

                //the current character always starts the next word / n-gram
                sbWord.Append(c);
                LastToken = TokenTypes.CJK;
            }
            else
            {
                //All latin text is converted to lower case
                c = char.ToLower(c);

                //Flush the current word first if we're over the word length limit
                //or we are going from CJK to latin
                if (sbWord.Length > 0 && (LastToken == TokenTypes.CJK || sbWord.Length >= MAXWORDLENGTH))
                {
                    FlushWord(sbWord, SeenWords, ParsedWords);
                }

                sbWord.Append(c);
                LastToken = TokenTypes.Latin;
            }
        }

        //Flush out the last word so nothing carries over into the next string
        FlushWord(sbWord, SeenWords, ParsedWords);
    }

    //bail early if there is nothing indexed
    if (ParsedWords.Count == 0) return ReturnList;

    //Add only non stopwords
    HashSet<string> StopWords = new HashSet<string>(LocaleSearchData.StopWords);
    foreach (string Word in ParsedWords)
    {
        if (!StopWords.Contains(Word))
        {
            ReturnList.Add(Word);
        }
    }

    //sometimes all the results are stop words so you end up here with nothing
    return ReturnList;
}

//Moves the word being assembled (if any) into the ordered word list unless it was already seen, then clears the builder
private static void FlushWord(StringBuilder word, HashSet<string> seenWords, List<string> orderedWords)
{
    if (word.Length == 0)
    {
        return;
    }

    string Completed = word.ToString();
    if (seenWords.Add(Completed))
    {
        orderedWords.Add(Completed);
    }
    word.Length = 0;
}
|
|
|
|
#endregion
|
|
|
|
|
|
}//eoc
|
|
|
|
}//eons |