ascii-chat 0.8.38
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
levenshtein.c
Go to the documentation of this file.
1
12#include <string.h>
13#include <stdlib.h>
14#include <stdint.h>
15#include <ascii-chat/options/levenshtein.h>
16#include <ascii-chat/util/utf8.h>
17#include <ascii-chat/common.h>
18
19// Returns a size_t, depicting the difference between `a` and `b`.
20// See <https://en.wikipedia.org/wiki/Levenshtein_distance> for more information.
21// Works with UTF-8 strings by comparing codepoints instead of bytes.
22size_t levenshtein_n(const char *a, const size_t length, const char *b, const size_t bLength) {
23 // Shortcut optimizations / degenerate cases.
24 if (a == b) {
25 return 0;
26 }
27
28 if (length == 0) {
29 return bLength;
30 }
31
32 if (bLength == 0) {
33 return length;
34 }
35
36 size_t *cache = SAFE_CALLOC(length, sizeof(size_t), size_t *);
37 if (!cache) {
38 return SIZE_MAX; // Allocation failed
39 }
40
41 size_t index = 0;
42 size_t bIndex = 0;
43 size_t distance;
44 size_t bDistance;
45 size_t result = 0;
46 char code;
47
48 // initialize the vector.
49 while (index < length) {
50 cache[index] = index + 1;
51 index++;
52 }
53
54 // Loop.
55 while (bIndex < bLength) {
56 code = b[bIndex];
57 result = distance = bIndex++;
58
59 for (index = 0; index < length; index++) {
60 bDistance = code == a[index] ? distance : distance + 1;
61 distance = cache[index];
62
63 cache[index] = result = distance > result ? bDistance > result ? result + 1 : bDistance
64 : bDistance > distance ? distance + 1
65 : bDistance;
66 }
67 }
68
69 SAFE_FREE(cache);
70
71 return result;
72}
73
74size_t levenshtein(const char *a, const char *b) {
75 if (!a || !b) {
76 return SIZE_MAX;
77 }
78
79 // Count UTF-8 characters instead of bytes
80 size_t char_count_a = utf8_char_count(a);
81 size_t char_count_b = utf8_char_count(b);
82
83 if (char_count_a == SIZE_MAX || char_count_b == SIZE_MAX) {
84 return SIZE_MAX; // Invalid UTF-8 in one of the strings
85 }
86
87 // Convert strings to codepoint arrays for UTF-8-aware comparison
88 if (char_count_a == 0) {
89 return char_count_b;
90 }
91 if (char_count_b == 0) {
92 return char_count_a;
93 }
94
95 uint32_t *codepoints_a = SAFE_MALLOC(char_count_a * sizeof(uint32_t), uint32_t *);
96 uint32_t *codepoints_b = SAFE_MALLOC(char_count_b * sizeof(uint32_t), uint32_t *);
97
98 if (!codepoints_a || !codepoints_b) {
99 SAFE_FREE(codepoints_a);
100 SAFE_FREE(codepoints_b);
101 return SIZE_MAX;
102 }
103
104 size_t decoded_a = utf8_to_codepoints(a, codepoints_a, char_count_a);
105 size_t decoded_b = utf8_to_codepoints(b, codepoints_b, char_count_b);
106
107 if (decoded_a == SIZE_MAX || decoded_b == SIZE_MAX) {
108 SAFE_FREE(codepoints_a);
109 SAFE_FREE(codepoints_b);
110 return SIZE_MAX;
111 }
112
113 // Compute Levenshtein distance on codepoint arrays
114 size_t *cache = SAFE_CALLOC(char_count_a * sizeof(size_t), 1, size_t *);
115 if (!cache) {
116 SAFE_FREE(codepoints_a);
117 SAFE_FREE(codepoints_b);
118 return SIZE_MAX;
119 }
120
121 // Initialize cache
122 for (size_t i = 0; i < char_count_a; i++) {
123 cache[i] = i + 1;
124 }
125
126 size_t distance = 0;
127 size_t result = 0;
128 for (size_t b_idx = 0; b_idx < char_count_b; b_idx++) {
129 uint32_t b_code = codepoints_b[b_idx];
130 result = distance = b_idx;
131
132 for (size_t a_idx = 0; a_idx < char_count_a; a_idx++) {
133 uint32_t a_code = codepoints_a[a_idx];
134 size_t b_distance = (a_code == b_code) ? distance : distance + 1;
135 distance = cache[a_idx];
136
137 cache[a_idx] = result = (distance > result) ? ((b_distance > result) ? result + 1 : b_distance)
138 : ((b_distance > distance) ? distance + 1 : b_distance);
139 }
140 }
141
142 SAFE_FREE(cache);
143 SAFE_FREE(codepoints_a);
144 SAFE_FREE(codepoints_b);
145
146 return result;
147}
148
149const char *levenshtein_find_similar(const char *unknown, const char *const *candidates) {
150 if (!unknown || !candidates) {
151 return NULL;
152 }
153
154 const char *best_match = NULL;
155 size_t best_distance = SIZE_MAX;
156
157 for (int i = 0; candidates[i] != NULL; i++) {
158 size_t dist = levenshtein(unknown, candidates[i]);
159 if (dist < best_distance) {
160 best_distance = dist;
161 best_match = candidates[i];
162 }
163 }
164
165 // Only suggest if the distance is within our threshold
166 if (best_distance <= LEVENSHTEIN_SUGGESTION_THRESHOLD) {
167 return best_match;
168 }
169
170 return NULL;
171}
size_t levenshtein_n(const char *a, const size_t length, const char *b, const size_t bLength)
Definition levenshtein.c:22
const char * levenshtein_find_similar(const char *unknown, const char *const *candidates)
size_t levenshtein(const char *a, const char *b)
Definition levenshtein.c:74
size_t utf8_to_codepoints(const char *str, uint32_t *out_codepoints, size_t max_codepoints)
Definition utf8.c:187
size_t utf8_char_count(const char *str)
Definition utf8.c:138