What is a suffix array
Wikipedia definition:
In computer science, a suffix array is a sorted array of all suffixes of a string. It is a data structure used in, among others, full text indices, data compression algorithms, and the field of bibliometrics.
Suffix Array is a space efficient alternative to Suffix Tree. The construction of a suffix array can be O(n) with an advanced algorithm. It can handle extremely large input, and it makes searching substrings very efficient.
Example
The steps to build a suffix array for the word Banana are:
- find a list of all the suffixes. They are:
Suffix | index |
---|---|
banana$ | 1 |
anana$ | 2 |
nana$ | 3 |
ana$ | 4 |
na$ | 5 |
a$ | 6 |
$ | 7 |
- sort the suffixes. The array of the new sorted indexes is the suffix array for Banana: [7, 6, 4, 2, 1, 5, 3].
Suffix | index |
---|---|
$ | 7 |
a$ | 6 |
ana$ | 4 |
anana$ | 2 |
banana$ | 1 |
na$ | 5 |
nana$ | 3 |
Because we can reconstruct all suffixes with the original string and its suffix array, there is no need to store any suffixes; therefore, it is very space-efficient compared to the suffix tree.
Typically, we add a $
sign to the end of the word to separate words i.e. to prevent a suffix of the previous word from being treated as prefixes to the following word. For example, if we are tasked to build a suffix array for Love Banana, we would concatenate the two words into a single string Love$Banana$
and then build the suffix array. The $
sign prevents the program to treat ove
as a prefix of banana
. Note that we can choose any non-alphabet characters as long as it is smaller than any alphabet.
Auto-complete
- Input: An array of terms and a search text
- Output: a list of terms containing the given search text
Approach
build a suffix array(naive approach: O(n^2logn), there are ways to build a suffix array at O(n))
- concatenate all terms, with $ at the end of each term, to prevent suffix being treated as a prefix to other terms
- get all suffixes from the concatenated string
- sort the suffixes, remove suffixes starting with $
- build a suffix array from the sorted suffixes
binary search the suffix array to find suffixes matching the search text
- start in the middle, compare search text with the middle suffix
- if smaller, search in the left half
- if larger, search in the right half
- if found a match to the search text
- expand and get all adjacent matches
- return all matches
Code
Tests
Tests result from the above planning in the approach section and TDD workflow. The tests can also serve as documentation for the implementations.
public class AutocompleteTest | |
{ | |
[Test] | |
public void ShouldConcatenateToStrings() | |
{ | |
var input = new[] | |
{ | |
"Hello", | |
"allow", | |
"world", | |
"word" | |
}; | |
var result = input.ConcatenateToString(); | |
const string expected = "Hello$allow$world$word$"; | |
result.Should().Be(expected); | |
} | |
[Test] | |
public void ShouldGetAllSuffixes() | |
{ | |
const string input = "Hello$allow$"; | |
var expected = new SuffixItem[] | |
{ | |
new("Hello$allow$", 0, 0), | |
new("ello$allow$", 0, 1), | |
new("llo$allow$", 0, 2), | |
new("lo$allow$", 0, 3), | |
new("o$allow$", 0, 4), | |
new("$allow$", 1, 5), | |
new("allow$", 1, 6), | |
new("llow$", 1, 7), | |
new("low$", 1, 8), | |
new("ow$", 1, 9), | |
new("w$", 1, 10), | |
new("$", 2, 11), | |
}; | |
var result = input.GetAllSuffixes(); | |
result.Should().BeEquivalentTo(expected); | |
} | |
[Test] | |
public void ShouldRemoveSuffixesStartingWithDollar() | |
{ | |
var input = new SuffixItem[] | |
{ | |
new("Hello$allow$", 0, 0), | |
new("ello$allow$", 0, 1), | |
new("llo$allow$", 0, 2), | |
new("lo$allow$", 0, 3), | |
new("o$allow$", 0, 4), | |
new("$allow$", 1, 5), | |
new("allow$", 1, 6), | |
new("llow$", 1, 7), | |
new("low$", 1, 8), | |
new("ow$", 1, 9), | |
new("w$", 1, 10), | |
new("$", 2, 11), | |
}; | |
var expected = new SuffixItem[] | |
{ | |
new("Hello$allow$", 0, 0), | |
new("ello$allow$", 0, 1), | |
new("llo$allow$", 0, 2), | |
new("lo$allow$", 0, 3), | |
new("o$allow$", 0, 4), | |
new("allow$", 1, 6), | |
new("llow$", 1, 7), | |
new("low$", 1, 8), | |
new("ow$", 1, 9), | |
new("w$", 1, 10), | |
}; | |
var result = input.RemoveSuffixesStartingWithDollar(); | |
result.Should().BeEquivalentTo(expected); | |
} | |
[Test] | |
public void ShouldSortSuffixItems() | |
{ | |
var input = new SuffixItem[] | |
{ | |
new("Hello$allow$", 0, 0), | |
new("ello$allow$", 0, 1), | |
new("llo$allow$", 0, 2), | |
new("lo$allow$", 0, 3), | |
new("o$allow$", 0, 4), | |
new("allow$", 1, 6), | |
new("llow$", 1, 7), | |
new("low$", 1, 8), | |
new("ow$", 1, 9), | |
new("w$", 1, 10), | |
}; | |
var expected = new SuffixItem[] | |
{ | |
new("Hello$allow$", 0, 0), | |
new("allow$", 1, 6), | |
new("ello$allow$", 0, 1), | |
new("llo$allow$", 0, 2), | |
new("llow$", 1, 7), | |
new("lo$allow$", 0, 3), | |
new("low$", 1, 8), | |
new("o$allow$", 0, 4), | |
new("ow$", 1, 9), | |
new("w$", 1, 10), | |
}; | |
var result = input.Sort().ToArray(); | |
result[0].Should().BeEquivalentTo(expected[0]); | |
result[1].Should().BeEquivalentTo(expected[1]); | |
result[2].Should().BeEquivalentTo(expected[2]); | |
result[3].Should().BeEquivalentTo(expected[3]); | |
result[4].Should().BeEquivalentTo(expected[4]); | |
result[5].Should().BeEquivalentTo(expected[5]); | |
result[6].Should().BeEquivalentTo(expected[6]); | |
result[7].Should().BeEquivalentTo(expected[7]); | |
result[8].Should().BeEquivalentTo(expected[8]); | |
result[9].Should().BeEquivalentTo(expected[9]); | |
} | |
[Test] | |
public void ShouldGetSuffixArray() | |
{ | |
var input = new SuffixItem[] | |
{ | |
new("allow$", 1, 6), | |
new("ello$allow$", 0, 1), | |
new("Hello$allow$", 0, 0), | |
new("llo$allow$", 0, 2), | |
new("llow$", 1, 7), | |
new("lo$allow$", 0, 3), | |
new("low$", 1, 8), | |
new("o$allow$", 0, 4), | |
new("ow$", 1, 9), | |
new("w$", 1, 10), | |
}; | |
var expected = new SuffixArrayItem[] | |
{ | |
new(1, 6), new(0, 1), new(0, 0), | |
new(0, 2), new(1, 7), new(0, 3), new(1, 8), new(0, 4), | |
new(1, 9), new(1, 10) | |
}; | |
var result = input.GetSuffixArray().ToArray(); | |
result[0].Should().BeEquivalentTo(expected[0]); | |
result[1].Should().BeEquivalentTo(expected[1]); | |
result[2].Should().BeEquivalentTo(expected[2]); | |
result[3].Should().BeEquivalentTo(expected[3]); | |
result[4].Should().BeEquivalentTo(expected[4]); | |
result[5].Should().BeEquivalentTo(expected[5]); | |
result[6].Should().BeEquivalentTo(expected[6]); | |
result[7].Should().BeEquivalentTo(expected[7]); | |
result[8].Should().BeEquivalentTo(expected[8]); | |
result[9].Should().BeEquivalentTo(expected[9]); | |
} | |
[Test] | |
public void ShouldGetSuffixArrayFromSource() | |
{ | |
var input = new[] | |
{ | |
"hello", | |
"allow" | |
}; | |
var expected = new SuffixArrayItem[] | |
{ | |
new(1, 6), | |
new(0, 1), | |
new(0, 0), | |
new(0, 2), | |
new(1, 7), | |
new(0, 3), | |
new(1, 8), | |
new(0, 4), | |
new(1, 9), | |
new(1, 10) | |
}; | |
var suffixArrayResult = input.GetSuffixArray(); | |
suffixArrayResult.ConcatenatedString.Should().Be("hello$allow$"); | |
var result = suffixArrayResult.SuffixArray.ToArray(); | |
result[0].Should().BeEquivalentTo(expected[0]); | |
result[1].Should().BeEquivalentTo(expected[1]); | |
result[2].Should().BeEquivalentTo(expected[2]); | |
result[3].Should().BeEquivalentTo(expected[3]); | |
result[4].Should().BeEquivalentTo(expected[4]); | |
result[5].Should().BeEquivalentTo(expected[5]); | |
result[6].Should().BeEquivalentTo(expected[6]); | |
result[7].Should().BeEquivalentTo(expected[7]); | |
result[8].Should().BeEquivalentTo(expected[8]); | |
result[9].Should().BeEquivalentTo(expected[9]); | |
} | |
[Test] | |
public void ShouldFindSuffix() | |
{ | |
var suffixArray = new SuffixArrayItem[] | |
{ | |
new(1, 6), | |
new(0, 1), | |
new(0, 0), | |
new(0, 2), | |
new(1, 7), | |
new(0, 3), | |
new(1, 8), | |
new(0, 4), | |
new(1, 9), | |
new(1, 1) | |
}; | |
const string input = "Hello$allow$"; | |
const string search = "el"; | |
var result = suffixArray.FindAnyMatchingSuffixIndex(input, search); | |
result.Should().Be(1); | |
} | |
[Test] | |
public void ShouldFindMatchingSuffixes() | |
{ | |
var suffixArray = new SuffixArrayItem[] | |
{ | |
new(1, 6), | |
new(0, 1), | |
new(0, 0), | |
new(0, 2), | |
new(1, 7), | |
new(0, 3), | |
new(1, 8), | |
new(0, 4), | |
new(1, 9), | |
new(1, 10) | |
}; | |
const string input = "Hello$allow$"; | |
const string search = "lo"; | |
var suffixArrayResult = new SuffixArrayResult(suffixArray, input); | |
var result = suffixArrayResult.FindMatchingSuffixIndexes(search); | |
result.Should().BeEquivalentTo(new[] { 5, 6 }); | |
} | |
[Test] | |
public void ShouldGetAutocompleteList() | |
{ | |
var input = new[] | |
{ | |
"hello", | |
"allow", | |
"world", | |
"word" | |
}; | |
var suffixArrayResult = input.GetSuffixArray(); | |
input.FilterByKeyword("or", suffixArrayResult).Should() | |
.BeEquivalentTo("world", "word"); | |
input.FilterByKeyword("lo", suffixArrayResult).Should() | |
.BeEquivalentTo("hello", "allow"); | |
input.FilterByKeyword("hello", suffixArrayResult).Should() | |
.BeEquivalentTo("hello"); | |
input.FilterByKeyword("Hello", suffixArrayResult).Should() | |
.BeEquivalentTo("hello"); | |
input.FilterByKeyword("wo", suffixArrayResult).Should() | |
.BeEquivalentTo("word", "world"); | |
input.FilterByKeyword("o", suffixArrayResult).Should() | |
.BeEquivalentTo("hello", "allow", "world", "word"); | |
input.FilterByKeyword("w", suffixArrayResult).Should() | |
.BeEquivalentTo("allow", "world", "word"); | |
} | |
} |
Implementations
My ultimate goal is to write simple and easy to understand code i.e. require very low cognitive load to read. If you are interested, you can read this awesome blog post by David Whitney Writing good code by understanding cognitive load.
If you feel you need to work out mentally on how something works in the code, I have failed.
public static class AutoComplete | |
{ | |
/** | |
* one time suffix array construction per set of terms. | |
*/ | |
public static SuffixArrayResult GetSuffixArray(this IEnumerable<string> input) | |
{ | |
var concatenateToString = input.ConcatenateToString(); | |
var suffixArray = concatenateToString.ToLower() | |
.GetAllSuffixes() | |
.RemoveSuffixesStartingWithDollar() | |
.Sort() | |
.GetSuffixArray().ToArray(); | |
return new SuffixArrayResult(suffixArray, concatenateToString); | |
} | |
/** | |
* extension method to given terms, take keyword and saved suffix array for the terms, and return filtered terms. | |
*/ | |
public static IEnumerable<string> FilterByKeyword(this IEnumerable<string> source, | |
string keyword, | |
SuffixArrayResult suffixArrayResult) | |
{ | |
var sourceTerms = source.ToArray(); | |
var result = suffixArrayResult | |
.FindMatchingSuffixIndexes(keyword.ToLower()) | |
.ToArray(); | |
var termIndexes = result.Select(i => suffixArrayResult.SuffixArray[i].TermIndex); | |
return termIndexes.Select(termIndex => sourceTerms[termIndex]); | |
} | |
internal static IEnumerable<int> FindMatchingSuffixIndexes( | |
this SuffixArrayResult suffixArrayResult, string search) | |
{ | |
var matchingIndex = | |
suffixArrayResult.SuffixArray.FindAnyMatchingSuffixIndex(suffixArrayResult.ConcatenatedString, | |
search); | |
return matchingIndex == -1 | |
? Enumerable.Empty<int>() | |
: GetAdjacentMatchingSuffixIndexes(suffixArrayResult, matchingIndex, search); | |
} | |
private static IEnumerable<int> GetAdjacentMatchingSuffixIndexes( | |
SuffixArrayResult suffixArrayResult, int matchingIndex, | |
string search) | |
{ | |
var result = new[] { matchingIndex }; | |
var arrayItems = suffixArrayResult.SuffixArray; | |
var leftIndex = matchingIndex; | |
while (true) | |
{ | |
leftIndex -= 1; | |
if (leftIndex < 0) break; | |
var substring = | |
suffixArrayResult.ConcatenatedString[arrayItems[leftIndex].WordIndex..]; | |
if (!substring.StartsWith(search)) break; | |
result = result.Append(leftIndex).ToArray(); | |
} | |
var rightIndex = matchingIndex; | |
while (true) | |
{ | |
rightIndex += 1; | |
if (rightIndex >= arrayItems.Length) break; | |
var substring = | |
suffixArrayResult.ConcatenatedString[arrayItems[rightIndex].WordIndex..]; | |
if (!substring.StartsWith(search)) break; | |
result = result.Append(rightIndex).ToArray(); | |
} | |
return result; | |
} | |
internal static int FindAnyMatchingSuffixIndex( | |
this IEnumerable<SuffixArrayItem> suffixArray, string input, string search) | |
{ | |
var suffixArrayItems = suffixArray.ToArray(); | |
var low = 0; | |
var high = suffixArrayItems.Length - 1; | |
SuffixArrayItem? result = null; | |
while (result == null && low <= high) | |
{ | |
var middle = low + (high - low) / 2; | |
var middleSubstring = input[suffixArrayItems[middle].WordIndex..]; | |
if (middleSubstring.StartsWith(search)) | |
{ | |
return middle; | |
} | |
if (String.CompareOrdinal(middleSubstring, search) < 0) // less | |
{ | |
low = middle + 1; | |
} | |
else | |
{ | |
high = middle - 1; | |
} | |
} | |
return -1; | |
} | |
internal static IEnumerable<SuffixArrayItem> GetSuffixArray(this IEnumerable<SuffixItem> input) | |
{ | |
return input.Select(x => new SuffixArrayItem(x.TermIndex, x.WordIndex)); | |
} | |
internal static string ConcatenateToString(this IEnumerable<string> input) | |
{ | |
return input.Aggregate("", | |
(current, word) => String.IsNullOrEmpty(current) ? $"{word}$" : $"{current}{word}$"); | |
} | |
internal static IEnumerable<SuffixItem> GetAllSuffixes(this string input) | |
{ | |
var termIndex = 0; | |
return input.Select((x, index) => | |
{ | |
if (x == '$') | |
{ | |
termIndex++; | |
} | |
var suffix = input[index..]; | |
return new SuffixItem(suffix, termIndex, index); | |
}); | |
} | |
internal static IEnumerable<SuffixItem> RemoveSuffixesStartingWithDollar( | |
this IEnumerable<SuffixItem> suffixItems) | |
{ | |
return suffixItems.Where(x => !x.Suffix.StartsWith('$')); | |
} | |
internal static IEnumerable<SuffixItem> Sort(this IEnumerable<SuffixItem> suffixItems) | |
{ | |
return suffixItems.OrderBy(x => x.Suffix, StringComparer.Ordinal); | |
} | |
} |
public class SuffixArrayItem | |
{ | |
public SuffixArrayItem(int termIndex, int wordIndex) | |
{ | |
WordIndex = wordIndex; | |
TermIndex = termIndex; | |
} | |
public int WordIndex { get; private set; } | |
public int TermIndex { get; private set; } | |
} | |
public class SuffixArrayResult | |
{ | |
public SuffixArrayResult(SuffixArrayItem[] suffixArray, string concatenatedString) | |
{ | |
SuffixArray = suffixArray; | |
ConcatenatedString = concatenatedString; | |
} | |
public SuffixArrayItem[] SuffixArray { get; private set; } | |
public string ConcatenatedString { get; private set; } | |
} | |
public class SuffixItem | |
{ | |
public SuffixItem(string suffix, int termIndex, int wordIndex) | |
{ | |
Suffix = suffix; | |
TermIndex = termIndex; | |
WordIndex = wordIndex; | |
} | |
public int TermIndex { get; private set; } | |
public int WordIndex { get; private set; } | |
public string Suffix { get; private set; } | |
} |
Top comments (0)