Pattern Matching Algorithms in C

📘 Introduction to String Matching

String matching algorithms are fundamental to text processing, data retrieval, and many other applications. This guide covers 6 essential algorithms with C implementations.

🔍 Algorithm Implementations

1. Naive (Brute Force) Algorithm

The simplest approach that checks all possible positions in the text for the pattern.

Time Complexity: O(n*m)
void naiveSearch(char* text, char* pattern) {
    int n = strlen(text);
    int m = strlen(pattern);
    
    for (int i = 0; i <= n - m; i++) {
        int j;
        for (j = 0; j < m; j++) {
            if (text[i + j] != pattern[j])
                break;
        }
        if (j == m)
            printf("Pattern found at index %d\n", i);
    }
}

Pros: Simple to implement, no preprocessing needed

Cons: Inefficient for large texts or repetitive patterns

2. Knuth-Morris-Pratt (KMP) Algorithm

Uses a failure function (partial match table) to skip unnecessary comparisons.

Time Complexity: O(n + m)
void computeLPS(char* pattern, int m, int* lps) {
    int len = 0;
    lps[0] = 0;
    
    int i = 1;
    while (i < m) {
        if (pattern[i] == pattern[len]) {
            len++;
            lps[i] = len;
            i++;
        } else {
            if (len != 0) {
                len = lps[len - 1];
            } else {
                lps[i] = 0;
                i++;
            }
        }
    }
}

void KMPSearch(char* text, char* pattern) {
    int n = strlen(text);
    int m = strlen(pattern);
    
    int lps[m];
    computeLPS(pattern, m, lps);
    
    int i = 0, j = 0;
    while (i < n) {
        if (pattern[j] == text[i]) {
            j++; i++;
        }
        if (j == m) {
            printf("Pattern found at index %d\n", i - j);
            j = lps[j - 1];
        } else if (i < n && pattern[j] != text[i]) {
            if (j != 0)
                j = lps[j - 1];
            else
                i++;
        }
    }
}

Pros: Linear time complexity, efficient for repetitive patterns

Cons: Requires O(m) extra space for LPS array

3. Boyer-Moore Algorithm

Uses two heuristics: bad character rule and good suffix rule to skip sections of text.

Best: O(n/m), Worst: O(n*m)
#define NO_OF_CHARS 256

void badCharHeuristic(char* str, int size, int badchar[NO_OF_CHARS]) {
    for (int i = 0; i < NO_OF_CHARS; i++)
        badchar[i] = -1;
    for (int i = 0; i < size; i++)
        badchar[(int)str[i]] = i;
}

void BMSearch(char* text, char* pattern) {
    int m = strlen(pattern);
    int n = strlen(text);
    
    int badchar[NO_OF_CHARS];
    badCharHeuristic(pattern, m, badchar);
    
    int s = 0;
    while (s <= (n - m)) {
        int j = m - 1;
        while (j >= 0 && pattern[j] == text[s + j])
            j--;
        if (j < 0) {
            printf("Pattern found at index %d\n", s);
            s += (s + m < n) ? m - badchar[text[s + m]] : 1;
        } else {
            s += max(1, j - badchar[text[s + j]]);
        }
    }
}

Pros: Often sub-linear time in practice, good for English text

Cons: More complex implementation, preprocessing needed

4. Rabin-Karp Algorithm

Uses hashing to find any of a set of pattern strings in a text.

Average: O(n+m), Worst: O(n*m)
#define d 256

void RKSearch(char* text, char* pattern, int q) {
    int m = strlen(pattern);
    int n = strlen(text);
    int i, j;
    int p = 0; // hash value for pattern
    int t = 0; // hash value for text
    int h = 1;
    
    for (i = 0; i < m - 1; i++)
        h = (h * d) % q;
    
    for (i = 0; i < m; i++) {
        p = (d * p + pattern[i]) % q;
        t = (d * t + text[i]) % q;
    }
    
    for (i = 0; i <= n - m; i++) {
        if (p == t) {
            for (j = 0; j < m; j++) {
                if (text[i + j] != pattern[j])
                    break;
            }
            if (j == m)
                printf("Pattern found at index %d\n", i);
        }
        if (i < n - m) {
            t = (d * (t - text[i] * h) + text[i + m]) % q;
            if (t < 0)
                t = (t + q);
        }
    }
}

Pros: Can extend to multiple pattern search, good average case

Cons: Needs good hash function to avoid spurious hits

5. Z Algorithm

Constructs Z array where Z[i] is the length of the longest substring starting from S[i] which is also a prefix of S.

Time Complexity: O(n + m)
void getZArray(char* str, int Z[]) {
    int n = strlen(str);
    int L, R, k;
    L = R = 0;
    
    for (int i = 1; i < n; ++i) {
        if (i > R) {
            L = R = i;
            while (R < n && str[R - L] == str[R])
                R++;
            Z[i] = R - L;
            R--;
        } else {
            k = i - L;
            if (Z[k] < R - i + 1)
                Z[i] = Z[k];
            else {
                L = i;
                while (R < n && str[R - L] == str[R])
                    R++;
                Z[i] = R - L;
                R--;
            }
        }
    }
}

void ZSearch(char* text, char* pattern) {
    char concat[strlen(pattern) + strlen(text) + 2];
    strcpy(concat, pattern);
    strcat(concat, "$");
    strcat(concat, text);
    int l = strlen(concat);
    
    int Z[l];
    getZArray(concat, Z);
    
    for (int i = 0; i < l; ++i) {
        if (Z[i] == strlen(pattern))
            printf("Pattern found at index %d\n", i - strlen(pattern) - 1);
    }
}

Pros: Linear time, no extra space needed (except Z array)

Cons: Complex to understand and implement

6. Trie Algorithm

A tree data structure used for efficient retrieval of keys in a dataset of strings.

Search: O(m), Insert: O(m)
#define ALPHABET_SIZE 26

struct TrieNode {
    struct TrieNode* children[ALPHABET_SIZE];
    bool isEndOfWord;
};

struct TrieNode* getNode() {
    struct TrieNode* pNode = (struct TrieNode*)malloc(sizeof(struct TrieNode));
    pNode->isEndOfWord = false;
    for (int i = 0; i < ALPHABET_SIZE; i++)
        pNode->children[i] = NULL;
    return pNode;
}

void insert(struct TrieNode* root, char* key) {
    struct TrieNode* pCrawl = root;
    for (int i = 0; i < strlen(key); i++) {
        int index = key[i] - 'a';
        if (!pCrawl->children[index])
            pCrawl->children[index] = getNode();
        pCrawl = pCrawl->children[index];
    }
    pCrawl->isEndOfWord = true;
}

bool search(struct TrieNode* root, char* key) {
    struct TrieNode* pCrawl = root;
    for (int i = 0; i < strlen(key); i++) {
        int index = key[i] - 'a';
        if (!pCrawl->children[index])
            return false;
        pCrawl = pCrawl->children[index];
    }
    return (pCrawl != NULL && pCrawl->isEndOfWord);
}

Pros: Excellent for dictionary search, prefix search

Cons: High memory usage, complex implementation

📊 Algorithm Comparison

Algorithm Best Case Worst Case Space When to Use
Naive O(n) O(n*m) O(1) Short patterns, simple cases
KMP O(n) O(n+m) O(m) Repetitive patterns, DNA sequences
Boyer-Moore O(n/m) O(n*m) O(m) English text, large alphabets
Rabin-Karp O(n+m) O(n*m) O(1) Multiple pattern search
Z Algorithm O(n+m) O(n+m) O(n+m) General purpose linear time
Trie O(m) O(m) O(m*k) Dictionary, prefix search