Pattern Matching Algorithms in C
📘 Introduction to String Matching
String matching algorithms are fundamental to text processing, data retrieval, and many other applications. This guide covers 6 essential algorithms with C implementations.
🔍 Algorithm Implementations
The simplest approach that checks all possible positions in the text for the pattern.
void naiveSearch(char* text, char* pattern) {
int n = strlen(text);
int m = strlen(pattern);
for (int i = 0; i <= n - m; i++) {
int j;
for (j = 0; j < m; j++) {
if (text[i + j] != pattern[j])
break;
}
if (j == m)
printf("Pattern found at index %d\n", i);
}
}
Pros: Simple to implement, no preprocessing needed
Cons: Inefficient for large texts or repetitive patterns
Uses a failure function (partial match table) to skip unnecessary comparisons.
void computeLPS(char* pattern, int m, int* lps) {
int len = 0;
lps[0] = 0;
int i = 1;
while (i < m) {
if (pattern[i] == pattern[len]) {
len++;
lps[i] = len;
i++;
} else {
if (len != 0) {
len = lps[len - 1];
} else {
lps[i] = 0;
i++;
}
}
}
}
void KMPSearch(char* text, char* pattern) {
int n = strlen(text);
int m = strlen(pattern);
int lps[m];
computeLPS(pattern, m, lps);
int i = 0, j = 0;
while (i < n) {
if (pattern[j] == text[i]) {
j++; i++;
}
if (j == m) {
printf("Pattern found at index %d\n", i - j);
j = lps[j - 1];
} else if (i < n && pattern[j] != text[i]) {
if (j != 0)
j = lps[j - 1];
else
i++;
}
}
}
Pros: Linear time complexity, efficient for repetitive patterns
Cons: Requires O(m) extra space for LPS array
Uses two heuristics: bad character rule and good suffix rule to skip sections of text.
#define NO_OF_CHARS 256
void badCharHeuristic(char* str, int size, int badchar[NO_OF_CHARS]) {
for (int i = 0; i < NO_OF_CHARS; i++)
badchar[i] = -1;
for (int i = 0; i < size; i++)
badchar[(int)str[i]] = i;
}
void BMSearch(char* text, char* pattern) {
int m = strlen(pattern);
int n = strlen(text);
int badchar[NO_OF_CHARS];
badCharHeuristic(pattern, m, badchar);
int s = 0;
while (s <= (n - m)) {
int j = m - 1;
while (j >= 0 && pattern[j] == text[s + j])
j--;
if (j < 0) {
printf("Pattern found at index %d\n", s);
s += (s + m < n) ? m - badchar[text[s + m]] : 1;
} else {
s += max(1, j - badchar[text[s + j]]);
}
}
}
Pros: Often sub-linear time in practice, good for English text
Cons: More complex implementation, preprocessing needed
Uses hashing to find any of a set of pattern strings in a text.
#define d 256
void RKSearch(char* text, char* pattern, int q) {
int m = strlen(pattern);
int n = strlen(text);
int i, j;
int p = 0; // hash value for pattern
int t = 0; // hash value for text
int h = 1;
for (i = 0; i < m - 1; i++)
h = (h * d) % q;
for (i = 0; i < m; i++) {
p = (d * p + pattern[i]) % q;
t = (d * t + text[i]) % q;
}
for (i = 0; i <= n - m; i++) {
if (p == t) {
for (j = 0; j < m; j++) {
if (text[i + j] != pattern[j])
break;
}
if (j == m)
printf("Pattern found at index %d\n", i);
}
if (i < n - m) {
t = (d * (t - text[i] * h) + text[i + m]) % q;
if (t < 0)
t = (t + q);
}
}
}
Pros: Can extend to multiple pattern search, good average case
Cons: Needs good hash function to avoid spurious hits
Constructs Z array where Z[i] is the length of the longest substring starting from S[i] which is also a prefix of S.
void getZArray(char* str, int Z[]) {
int n = strlen(str);
int L, R, k;
L = R = 0;
for (int i = 1; i < n; ++i) {
if (i > R) {
L = R = i;
while (R < n && str[R - L] == str[R])
R++;
Z[i] = R - L;
R--;
} else {
k = i - L;
if (Z[k] < R - i + 1)
Z[i] = Z[k];
else {
L = i;
while (R < n && str[R - L] == str[R])
R++;
Z[i] = R - L;
R--;
}
}
}
}
void ZSearch(char* text, char* pattern) {
char concat[strlen(pattern) + strlen(text) + 2];
strcpy(concat, pattern);
strcat(concat, "$");
strcat(concat, text);
int l = strlen(concat);
int Z[l];
getZArray(concat, Z);
for (int i = 0; i < l; ++i) {
if (Z[i] == strlen(pattern))
printf("Pattern found at index %d\n", i - strlen(pattern) - 1);
}
}
Pros: Linear time, no extra space needed (except Z array)
Cons: Complex to understand and implement
A tree data structure used for efficient retrieval of keys in a dataset of strings.
#define ALPHABET_SIZE 26
struct TrieNode {
struct TrieNode* children[ALPHABET_SIZE];
bool isEndOfWord;
};
struct TrieNode* getNode() {
struct TrieNode* pNode = (struct TrieNode*)malloc(sizeof(struct TrieNode));
pNode->isEndOfWord = false;
for (int i = 0; i < ALPHABET_SIZE; i++)
pNode->children[i] = NULL;
return pNode;
}
void insert(struct TrieNode* root, char* key) {
struct TrieNode* pCrawl = root;
for (int i = 0; i < strlen(key); i++) {
int index = key[i] - 'a';
if (!pCrawl->children[index])
pCrawl->children[index] = getNode();
pCrawl = pCrawl->children[index];
}
pCrawl->isEndOfWord = true;
}
bool search(struct TrieNode* root, char* key) {
struct TrieNode* pCrawl = root;
for (int i = 0; i < strlen(key); i++) {
int index = key[i] - 'a';
if (!pCrawl->children[index])
return false;
pCrawl = pCrawl->children[index];
}
return (pCrawl != NULL && pCrawl->isEndOfWord);
}
Pros: Excellent for dictionary search, prefix search
Cons: High memory usage, complex implementation
📊 Algorithm Comparison
| Algorithm | Best Case | Worst Case | Space | When to Use |
|---|---|---|---|---|
| Naive | O(n) | O(n*m) | O(1) | Short patterns, simple cases |
| KMP | O(n) | O(n+m) | O(m) | Repetitive patterns, DNA sequences |
| Boyer-Moore | O(n/m) | O(n*m) | O(m) | English text, large alphabets |
| Rabin-Karp | O(n+m) | O(n*m) | O(1) | Multiple pattern search |
| Z Algorithm | O(n+m) | O(n+m) | O(n+m) | General purpose linear time |
| Trie | O(m) | O(m) | O(m*k) | Dictionary, prefix search |