summaryrefslogtreecommitdiffstats
path: root/src/misspell.c
blob: 6536d7557a445c52f11a6adb3fcbdf20fd984f8b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <utils.h>
#include <misspell.h>

enum string_distance_function {
	DELETION		= 0,	/* m1 */
	INSERTION,			/* m2 */
	TRANSFORMATION,			/* m3 */
};
#define DISTANCE_MAX	(TRANSFORMATION + 1)

static unsigned int min_distance(unsigned int *cost)
{
	unsigned int min = UINT_MAX;
	int k;

	for (k = 0; k < DISTANCE_MAX; k++) {
		if (cost[k] < min)
			min = cost[k];
	}

	return min;
}

/* A simple implementation of "The string-to-string correction problem (1974)"
 * by Robert A. Wagner.
 */
static unsigned int string_distance(const char *a, const char *b)
{
	unsigned int len_a = strlen(a);
	unsigned int len_b = strlen(b);
	unsigned int *distance;
	unsigned int i, j, ret;

	distance = xzalloc((len_a + 1) * (len_b + 1) * sizeof(unsigned int));

#define DISTANCE(__i, __j)	distance[(__i) * len_b + (__j)]

	for (i = 0; i <= len_a; i++)
		DISTANCE(i, 0) = i;
	for (j = 0; j <= len_b; j++)
		DISTANCE(0, j) = j;

	for (i = 1; i <= len_a; i++) {
		for (j = 1; j <= len_b; j++) {
			unsigned int subcost = (a[i] == b[j]) ? 0 : 1;
			unsigned int cost[3];

			cost[DELETION] = DISTANCE(i - 1, j) + 1;
			cost[INSERTION] = DISTANCE(i, j - 1) + 1;
			cost[TRANSFORMATION] = DISTANCE(i - 1, j - 1) + subcost;
			DISTANCE(i, j) = min_distance(cost);

			if (i > 1 && j > 1 &&
			    a[i] == b[j - 1] &&
			    a[i - 1] == b[j])
				DISTANCE(i, j) =
					min(DISTANCE(i, j),
					    DISTANCE(i - 2, j - 2) + subcost);
		}
	}

	ret = DISTANCE(len_a, len_b);

	xfree(distance);

	return ret;
}

void string_misspell_init(struct string_misspell_state *st)
{
	st->obj = NULL;
	st->min_distance = UINT_MAX;
}

int string_misspell_update(const char *a, const char *b,
			   void *obj, struct string_misspell_state *st)
{
	unsigned int len_a, len_b, max_len, min_len, distance, threshold;

	len_a = strlen(a);
	len_b = strlen(b);

	max_len = max(len_a, len_b);
	min_len = min(len_a, len_b);

	if (max_len <= 1)
		return 0;

	if (max_len - min_len <= 1)
		threshold = max(div_round_up(max_len, 3), 1);
	else
		threshold = div_round_up(max_len + 2, 3);

	distance = string_distance(a, b);
	if (distance > threshold)
		return 0;
	else if (distance < st->min_distance) {
		st->min_distance = distance;
		st->obj = obj;
		return 1;
	}
	return 0;
}