Also introduces an optimization that uses the stack if the required buffer size is below a predetermined threshold
This commit is contained in:
parent
c8a1e82cc7
commit
0171411d66
|
@ -0,0 +1,27 @@
|
|||
#include <vector>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <iterator>
|
||||
#include <chrono>
|
||||
#include "iosifovitch.h"
|
||||
|
||||
int main() {
|
||||
std::ios::sync_with_stdio(false);
|
||||
std::vector<std::string> words;
|
||||
words.reserve(1000);
|
||||
std::copy(std::istream_iterator<std::string>(std::cin), {}, std::back_inserter(words));
|
||||
std::cout << "words read: " << words.size() << '\n';
|
||||
auto begin = std::chrono::steady_clock::now();
|
||||
for (auto const& w : words)
|
||||
for (auto const& y : words)
|
||||
{
|
||||
auto distance = levenshtein_distance(w, y);
|
||||
if (distance > (std::numeric_limits<unsigned int>::max() - 2))
|
||||
{
|
||||
std::cerr << "failure : " << w << ", " << y << "/" << distance << '\n';
|
||||
}
|
||||
}
|
||||
std::cout << "total time taken: " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count() << std::endl;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,40 +1,5 @@
|
|||
#include "iosifovitch.h"
|
||||
#include <numeric>
|
||||
|
||||
/* Calculate the Levenshtein distance between two string views.
 *
 * a, b    the strings to compare; either may be empty.
 * returns the minimum number of single-character insertions, deletions and
 *         substitutions needed to turn one string into the other.
 *
 * The common prefix and common suffix are stripped first, then a single
 * dynamic-programming row sized by the shorter remaining string is used.
 */
auto levenshtein_distance(std::string_view const& a, std::string_view const& b) -> unsigned int {
    // Local copies that can be trimmed; `shorter` determines the row length.
    auto shorter = a;
    auto longer = b;
    if (shorter.size() > longer.size()) shorter.swap(longer);

    // Characters shared at the front never contribute to the distance.
    std::size_t prefix = 0;
    while (prefix < shorter.size() && shorter[prefix] == longer[prefix]) ++prefix;
    shorter.remove_prefix(prefix);
    longer.remove_prefix(prefix);

    // Same for characters shared at the back. Counting starts at the last
    // character (size - 1), and the bound is re-checked on every step.
    std::size_t suffix = 0;
    while (suffix < shorter.size()
           && shorter[shorter.size() - 1 - suffix] == longer[longer.size() - 1 - suffix])
    {
        ++suffix;
    }
    shorter.remove_suffix(suffix);
    longer.remove_suffix(suffix);

    // Nothing left of the shorter string: the rest of the longer one must be inserted.
    if (shorter.empty()) return static_cast<unsigned int>(longer.size());

    auto const row_length = shorter.size() + 1;
    // Owned by unique_ptr so the row is released on every exit path.
    std::unique_ptr<unsigned int[]> const row{new unsigned int[row_length]};
    std::iota(row.get(), row.get() + row_length, 0u);

    for (std::size_t i = 0; i < longer.size(); ++i)
    {
        // row[0] holds the previous row's first cell (i); it doubles as the
        // first diagonal value. Using i rather than i + 1 as the left
        // neighbour of column 1 cannot change the minimum, because the
        // diagonal term is never larger than i + 1.
        row[0] = static_cast<unsigned int>(i);
        auto diagonal = row[0];
        for (std::size_t j = 0; j < shorter.size(); ++j)
        {
            diagonal = std::min(
                diagonal + (shorter[j] == longer[i] ? 0u : 1u),
                std::min(row[j + 1], row[j]) + 1u
            );
            // Store the new cell and keep the old one as the next diagonal.
            std::swap(row[j + 1], diagonal);
        }
    }

    return row[row_length - 1];
}
|
||||
auto levenshtein_distance(char * const a, char * const b) -> unsigned int {
|
||||
return levenshtein_distance<std::string_view>(a, b);
|
||||
}
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <algorithm>
#include <cstddef>
#include <memory>
#include <numeric>
#include <string>
#include <string_view>
#include <utility>
|
||||
|
||||
/* Calculate the levenshtein distance between two strings.
|
||||
*
|
||||
|
@ -8,8 +9,45 @@
|
|||
* best case, where n is the length of the shortest string.
|
||||
*
|
||||
*/
|
||||
// Overload for string views: takes the two strings to compare and returns
// their distance as an unsigned int.
auto levenshtein_distance(std::string_view const& a, std::string_view const& b) -> unsigned int;
|
||||
/* Calculate the Levenshtein distance between two string-like objects.
 *
 * S                  any type offering data() (pointer to contiguous,
 *                    equality-comparable elements) and size().
 * stack_buffer_size  number of unsigned ints reserved on the stack for the
 *                    working row; rows that do not fit are heap-allocated.
 * a, b               the sequences to compare; either may be empty.
 * returns            the minimum number of single-element insertions,
 *                    deletions and substitutions turning one into the other.
 *
 * The common prefix and suffix are skipped, then one dynamic-programming row
 * sized by the shorter remaining sequence is used.
 */
template<class S, unsigned int stack_buffer_size = 16>
auto levenshtein_distance(S const& a, S const& b) -> unsigned int {
    // Keep `a` the shorter input. The template arguments are spelled out so
    // the caller's stack_buffer_size is preserved and no other overload is
    // selected for the swapped call.
    if (a.size() > b.size()) return levenshtein_distance<S, stack_buffer_size>(b, a);

    // Skip the common prefix; safe because b is at least as long as a.
    auto [a_begin, b_begin] = std::mismatch(a.data(), a.data() + a.size(), b.data());

    auto a_end = a.data() + a.size();
    auto b_end = b.data() + b.size();

    // Skip the common suffix without running back into the prefix.
    while (a_begin != a_end && b_begin != b_end && *(a_end - 1) == *(b_end - 1))
    {
        --a_end;
        --b_end;
    }

    auto const a_length = static_cast<std::size_t>(a_end - a_begin);
    auto const b_length = static_cast<std::size_t>(b_end - b_begin);

    // Nothing left of the shorter input: the rest of b must be inserted.
    if (a_length == 0) return static_cast<unsigned int>(b_length);

    auto const buffer_length = a_length + 1;

    // Small rows live on the stack; larger ones are owned by a unique_ptr so
    // they are released on every exit path. A row of exactly
    // stack_buffer_size entries still fits the stack array.
    unsigned int stack_buffer[stack_buffer_size];
    std::unique_ptr<unsigned int[]> heap_buffer;
    if (buffer_length > stack_buffer_size) heap_buffer.reset(new unsigned int[buffer_length]);
    unsigned int* const buffer = heap_buffer ? heap_buffer.get() : stack_buffer;

    std::iota(buffer, buffer + buffer_length, 0u);

    for (std::size_t i = 0; i < b_length; ++i)
    {
        // buffer[0] holds the previous row's first cell (i), which is also the
        // first diagonal value. Using i rather than i + 1 as the left
        // neighbour of column 1 cannot change the minimum, because the
        // diagonal term is never larger than i + 1.
        buffer[0] = static_cast<unsigned int>(i);
        auto temp = buffer[0];
        for (std::size_t j = 0; j < a_length; ++j)
        {
            temp = std::min(
                temp + (a_begin[j] == b_begin[i] ? 0u : 1u),
                std::min(buffer[j + 1], buffer[j]) + 1u
            );
            // Store the new cell and keep the old one as the next diagonal.
            std::swap(buffer[j + 1], temp);
        }
    }

    return buffer[buffer_length - 1];
}
|
||||
|
||||
/* Overload taking two `char *` arguments and returning the distance as an
 * unsigned int; declared here, defined outside this header.
 */
auto levenshtein_distance(char * const a, char * const b) -> unsigned int ;
|
Loading…
Reference in New Issue