Templatize the implementation
All checks were successful
/ Build-Stuff (push) Successful in 5s

Also introduces an optimization that uses the stack if the required buffer
size is lower than a predetermined threshold
This commit is contained in:
Frederik Hertzum 2024-02-27 22:53:04 +01:00
parent c8a1e82cc7
commit 0171411d66
4 changed files with 479899 additions and 43 deletions

View File

@ -0,0 +1,27 @@
#include <vector>
#include <string>
#include <iostream>
#include <numeric>
#include <iterator>
#include <chrono>
#include "iosifovitch.h"
/* Benchmark driver: reads whitespace-separated words from standard input,
 * computes the Levenshtein distance for every ordered pair of words and
 * prints the elapsed time in milliseconds.
 */
int main() {
    std::ios::sync_with_stdio(false);

    std::vector<std::string> words;
    words.reserve(1000);
    for (std::string token; std::cin >> token;) {
        words.push_back(token);
    }
    std::cout << "words read: " << words.size() << '\n';

    // Results within two of the largest unsigned value are reported as failures.
    auto const suspicious = std::numeric_limits<unsigned int>::max() - 2;

    auto const started = std::chrono::steady_clock::now();
    for (auto const& lhs : words) {
        for (auto const& rhs : words) {
            auto const result = levenshtein_distance(lhs, rhs);
            if (result > suspicious) {
                std::cerr << "failure : " << lhs << ", " << rhs << "/" << result << '\n';
            }
        }
    }
    auto const finished = std::chrono::steady_clock::now();
    auto const elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(finished - started);

    std::cout << "total time taken: " << elapsed.count() << std::endl;
    return EXIT_SUCCESS;
}

479826
benchmarks/words Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,40 +1,5 @@
#include "iosifovitch.h"
#include <numeric>
/* Levenshtein distance between two string views.
 *
 * The common prefix and the common suffix are stripped first, because they
 * never contribute to the distance; the remainder is solved with a single-row
 * dynamic-programming table sized after the shorter input.
 *
 * a, b:    the strings to compare (argument order does not matter).
 * returns: the minimum number of single-character insertions, deletions and
 *          substitutions that turn one string into the other.
 */
auto levenshtein_distance(std::string_view const& a, std::string_view const& b) -> unsigned int {
    // Keep the shorter string first so the DP row is as small as possible.
    if (a.size() > b.size()) return levenshtein_distance(b, a);

    std::string_view shorter = a;
    std::string_view longer = b;

    // Strip the common prefix.
    auto prefix = 0u;
    while (prefix < shorter.size() && shorter[prefix] == longer[prefix]) ++prefix;
    shorter.remove_prefix(prefix);
    longer.remove_prefix(prefix);

    // Strip the common suffix. The counter starts at 0 and indexes from the
    // back with size() - 1 - suffix, so the loop really runs and never reads
    // one past the end.
    auto suffix = 0u;
    while (suffix < shorter.size()
           && shorter[shorter.size() - 1 - suffix] == longer[longer.size() - 1 - suffix]) ++suffix;
    shorter.remove_suffix(suffix);
    longer.remove_suffix(suffix);

    // Nothing left of the shorter string: only insertions remain.
    if (shorter.empty()) return static_cast<unsigned int>(longer.size());

    auto const buffer_length = shorter.size() + 1;
    // Nothing between new[] and delete[] can throw for char data.
    auto buffer = new unsigned int[buffer_length];
    std::iota(buffer, buffer + buffer_length, 0u);
    for (auto i = 0u; i < longer.size(); ++i)
    {
        auto diagonal = buffer[0];   // D[i][0]
        buffer[0] = i + 1;           // D[i + 1][0]
        for (auto j = 0u; j < shorter.size(); ++j)
        {
            auto const above = buffer[j + 1];
            auto const substitution = diagonal + (shorter[j] == longer[i] ? 0u : 1u);
            buffer[j + 1] = std::min(substitution, std::min(above, buffer[j]) + 1u);
            diagonal = above;
        }
    }
    auto const cost = buffer[buffer_length - 1];
    delete [] buffer;
    return cost;
}
/* C-string convenience overload: wraps both arguments in std::string_view and
 * forwards to the templated implementation.
 *
 * A null pointer is treated as an empty string, because constructing a
 * std::string_view from a null pointer is undefined behaviour.
 */
auto levenshtein_distance(char * const a, char * const b) -> unsigned int {
    std::string_view const lhs{a != nullptr ? a : ""};
    std::string_view const rhs{b != nullptr ? b : ""};
    return levenshtein_distance<std::string_view>(lhs, rhs);
}

View File

@ -1,6 +1,7 @@
#pragma once
#include <algorithm>
#include <memory>
#include <numeric>
#include <string>
#include <string_view>
/* Calculate the Levenshtein distance between two strings.
*
@ -8,8 +9,45 @@
* best case, where n is the length of the shorter string.
*
*/
// Non-template overload taking two string views.
auto levenshtein_distance(std::string_view const& a, std::string_view const& b) -> unsigned int;
/* Generic Levenshtein distance between two contiguous sequences.
 *
 * S                 any type exposing data() and size() over contiguous,
 *                   equality-comparable elements (e.g. std::string,
 *                   std::string_view).
 * stack_buffer_size number of unsigned ints reserved on the stack for the
 *                   dynamic-programming row; rows that do not fit are
 *                   allocated on the heap instead.
 * a, b              the sequences to compare (argument order does not matter).
 * returns           the minimum number of single-element insertions,
 *                   deletions and substitutions turning one into the other.
 */
template<class S, unsigned int stack_buffer_size = 16>
auto levenshtein_distance(S const& a, S const& b) -> unsigned int {
    // Keep `a` the shorter sequence so the DP row is as small as possible.
    // The template arguments are spelled out so that a caller-chosen
    // stack_buffer_size survives the swap instead of reverting to the default.
    if (a.size() > b.size()) return levenshtein_distance<S, stack_buffer_size>(b, a);

    using size_type = decltype(a.size());

    // Skip the common prefix; safe because a is not longer than b.
    auto [a_begin, b_begin] = std::mismatch(a.data(), a.data() + a.size(), b.data());

    // Skip the common suffix.
    auto a_end = a.data() + a.size();
    auto b_end = b.data() + b.size();
    while (a_begin != a_end && b_begin != b_end && *(a_end - 1) == *(b_end - 1))
    {
        --a_end;
        --b_end;
    }

    auto const a_length = static_cast<size_type>(a_end - a_begin);
    auto const b_length = static_cast<size_type>(b_end - b_begin);

    // Nothing left of the shorter sequence: only insertions remain.
    if (a_length == 0) return static_cast<unsigned int>(b_length);

    // Use the stack row whenever it is large enough (including an exact fit);
    // otherwise own a heap row through unique_ptr so it is released on every
    // exit path, even if an element comparison throws.
    auto const buffer_length = a_length + 1;
    unsigned int stack_buffer[stack_buffer_size];
    std::unique_ptr<unsigned int[]> heap_buffer;
    unsigned int* buffer = stack_buffer;
    if (buffer_length > stack_buffer_size)
    {
        heap_buffer = std::make_unique<unsigned int[]>(buffer_length);
        buffer = heap_buffer.get();
    }

    std::iota(buffer, buffer + buffer_length, 0u);
    for (size_type i = 0; i < b_length; ++i)
    {
        auto diagonal = buffer[0];                       // D[i][0]
        buffer[0] = static_cast<unsigned int>(i + 1);    // D[i + 1][0]
        for (size_type j = 0; j < a_length; ++j)
        {
            auto const above = buffer[j + 1];
            auto const substitution = diagonal + (a_begin[j] == b_begin[i] ? 0u : 1u);
            buffer[j + 1] = std::min(substitution, std::min(above, buffer[j]) + 1u);
            diagonal = above;
        }
    }
    return buffer[a_length];
}
/* Overload for C strings. The definition lives in the implementation file and
 * forwards to the template with S = std::string_view, so both arguments are
 * converted to std::string_view and must point to NUL-terminated strings.
 */
auto levenshtein_distance(char * const a, char * const b) -> unsigned int ;