ascii-chat 0.8.38
Real-time terminal-based video chat with ASCII art conversion
Loading...
Searching...
No Matches
levenshtein.c File Reference

Levenshtein distance algorithm for fuzzy string matching. More...

Go to the source code of this file.

Functions

size_t levenshtein_n (const char *a, const size_t length, const char *b, const size_t bLength)
 
size_t levenshtein (const char *a, const char *b)
 
const char * levenshtein_find_similar (const char *unknown, const char *const *candidates)
 

Detailed Description

Levenshtein distance algorithm for fuzzy string matching.

MIT licensed. Copyright (c) 2015 Titus Wormer titus.nosp@m.worm.nosp@m.er@gm.nosp@m.ail..nosp@m.com From: https://github.com/wooorm/levenshtein.c

Modified for ascii-chat to use project memory macros and UTF-8 character support.

Definition in file levenshtein.c.

Function Documentation

◆ levenshtein()

size_t levenshtein ( const char *  a,
const char *  b 
)

Definition at line 74 of file levenshtein.c.

74 {
75 if (!a || !b) {
76 return SIZE_MAX;
77 }
78
79 // Count UTF-8 characters instead of bytes
80 size_t char_count_a = utf8_char_count(a);
81 size_t char_count_b = utf8_char_count(b);
82
83 if (char_count_a == SIZE_MAX || char_count_b == SIZE_MAX) {
84 return SIZE_MAX; // Invalid UTF-8 in one of the strings
85 }
86
87 // Convert strings to codepoint arrays for UTF-8-aware comparison
88 if (char_count_a == 0) {
89 return char_count_b;
90 }
91 if (char_count_b == 0) {
92 return char_count_a;
93 }
94
95 uint32_t *codepoints_a = SAFE_MALLOC(char_count_a * sizeof(uint32_t), uint32_t *);
96 uint32_t *codepoints_b = SAFE_MALLOC(char_count_b * sizeof(uint32_t), uint32_t *);
97
98 if (!codepoints_a || !codepoints_b) {
99 SAFE_FREE(codepoints_a);
100 SAFE_FREE(codepoints_b);
101 return SIZE_MAX;
102 }
103
104 size_t decoded_a = utf8_to_codepoints(a, codepoints_a, char_count_a);
105 size_t decoded_b = utf8_to_codepoints(b, codepoints_b, char_count_b);
106
107 if (decoded_a == SIZE_MAX || decoded_b == SIZE_MAX) {
108 SAFE_FREE(codepoints_a);
109 SAFE_FREE(codepoints_b);
110 return SIZE_MAX;
111 }
112
113 // Compute Levenshtein distance on codepoint arrays
114 size_t *cache = SAFE_CALLOC(char_count_a * sizeof(size_t), 1, size_t *);
115 if (!cache) {
116 SAFE_FREE(codepoints_a);
117 SAFE_FREE(codepoints_b);
118 return SIZE_MAX;
119 }
120
121 // Initialize cache
122 for (size_t i = 0; i < char_count_a; i++) {
123 cache[i] = i + 1;
124 }
125
126 size_t distance = 0;
127 size_t result = 0;
128 for (size_t b_idx = 0; b_idx < char_count_b; b_idx++) {
129 uint32_t b_code = codepoints_b[b_idx];
130 result = distance = b_idx;
131
132 for (size_t a_idx = 0; a_idx < char_count_a; a_idx++) {
133 uint32_t a_code = codepoints_a[a_idx];
134 size_t b_distance = (a_code == b_code) ? distance : distance + 1;
135 distance = cache[a_idx];
136
137 cache[a_idx] = result = (distance > result) ? ((b_distance > result) ? result + 1 : b_distance)
138 : ((b_distance > distance) ? distance + 1 : b_distance);
139 }
140 }
141
142 SAFE_FREE(cache);
143 SAFE_FREE(codepoints_a);
144 SAFE_FREE(codepoints_b);
145
146 return result;
147}
size_t utf8_to_codepoints(const char *str, uint32_t *out_codepoints, size_t max_codepoints)
Definition utf8.c:187
size_t utf8_char_count(const char *str)
Definition utf8.c:138

References utf8_char_count(), and utf8_to_codepoints().

Referenced by find_similar_option(), find_similar_option_with_mode(), and levenshtein_find_similar().

◆ levenshtein_find_similar()

const char * levenshtein_find_similar ( const char *  unknown,
const char *const *  candidates 
)

Definition at line 149 of file levenshtein.c.

149 {
150 if (!unknown || !candidates) {
151 return NULL;
152 }
153
154 const char *best_match = NULL;
155 size_t best_distance = SIZE_MAX;
156
157 for (int i = 0; candidates[i] != NULL; i++) {
158 size_t dist = levenshtein(unknown, candidates[i]);
159 if (dist < best_distance) {
160 best_distance = dist;
161 best_match = candidates[i];
162 }
163 }
164
165 // Only suggest if the distance is within our threshold
166 if (best_distance <= LEVENSHTEIN_SUGGESTION_THRESHOLD) {
167 return best_match;
168 }
169
170 return NULL;
171}
size_t levenshtein(const char *a, const char *b)
Definition levenshtein.c:74

References levenshtein().

Referenced by asciichat_suggest_enum_value(), and asciichat_suggest_mode().

◆ levenshtein_n()

size_t levenshtein_n ( const char *  a,
const size_t  length,
const char *  b,
const size_t  bLength 
)

Definition at line 22 of file levenshtein.c.

22 {
23 // Shortcut optimizations / degenerate cases.
24 if (a == b) {
25 return 0;
26 }
27
28 if (length == 0) {
29 return bLength;
30 }
31
32 if (bLength == 0) {
33 return length;
34 }
35
36 size_t *cache = SAFE_CALLOC(length, sizeof(size_t), size_t *);
37 if (!cache) {
38 return SIZE_MAX; // Allocation failed
39 }
40
41 size_t index = 0;
42 size_t bIndex = 0;
43 size_t distance;
44 size_t bDistance;
45 size_t result = 0;
46 char code;
47
48 // initialize the vector.
49 while (index < length) {
50 cache[index] = index + 1;
51 index++;
52 }
53
54 // Loop.
55 while (bIndex < bLength) {
56 code = b[bIndex];
57 result = distance = bIndex++;
58
59 for (index = 0; index < length; index++) {
60 bDistance = code == a[index] ? distance : distance + 1;
61 distance = cache[index];
62
63 cache[index] = result = distance > result ? bDistance > result ? result + 1 : bDistance
64 : bDistance > distance ? distance + 1
65 : bDistance;
66 }
67 }
68
69 SAFE_FREE(cache);
70
71 return result;
72}