AhoCorasick.java

package org.egothor.methodatlas.detect.secrets.internal;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Minimal clean-room Aho-Corasick multi-pattern string matcher.
 *
 * <p>Built once from a set of literal keywords, then reused to scan many inputs.
 * A single {@link #search(String)} pass reports every occurrence of every keyword
 * in {@code O(n + matches)} time. The implementation operates on UTF-16 {@code char}
 * units, which is sufficient for anchoring on ASCII credential markers.</p>
 *
 * @since 4.1.0
 */
public final class AhoCorasick {

    /** Initial transition-map capacity hint per node; most nodes have few children. */
    private static final int CHILD_HINT = 4;

    private final int[] fail;
    private final List<Map<Character, Integer>> transitions;
    private final List<List<String>> output;

    private AhoCorasick(
            int[] fail,
            List<Map<Character, Integer>> transitions,
            List<List<String>> output) {
        this.fail = fail;
        this.transitions = transitions;
        this.output = output;
    }

    /**
     * Builds an automaton for the given keywords.
     *
     * @param keywords non-null literal anchors; null or empty entries are ignored
     * @return a ready-to-search automaton
     * @since 4.1.0
     */
    public static AhoCorasick build(List<String> keywords) {
        List<Map<Character, Integer>> transitions = new ArrayList<>();
        List<List<String>> output = new ArrayList<>();
        transitions.add(new HashMap<>(CHILD_HINT));
        output.add(new ArrayList<>(1));

        for (String kw : keywords) {
            if (kw == null || kw.isEmpty()) {
                continue;
            }
            int node = 0;
            for (int i = 0; i < kw.length(); i++) {
                char c = kw.charAt(i);
                Integer next = transitions.get(node).get(c);
                if (next == null) {
                    next = transitions.size();
                    transitions.add(new HashMap<>(CHILD_HINT));
                    output.add(new ArrayList<>(1));
                    transitions.get(node).put(c, next);
                }
                node = next;
            }
            output.get(node).add(kw);
        }

        int[] fail = new int[transitions.size()];
        Deque<Integer> queue = new ArrayDeque<>();
        for (int child : transitions.get(0).values()) {
            fail[child] = 0;
            queue.add(child);
        }
        while (!queue.isEmpty()) {
            int node = queue.poll();
            for (Map.Entry<Character, Integer> e : transitions.get(node).entrySet()) {
                char c = e.getKey();
                int child = e.getValue();
                queue.add(child);
                int f = fail[node];
                while (f != 0 && !transitions.get(f).containsKey(c)) {
                    f = fail[f];
                }
                Integer target = transitions.get(f).get(c);
                fail[child] = (target != null && target != child) ? target : 0;
                output.get(child).addAll(output.get(fail[child]));
            }
        }
        return new AhoCorasick(fail, transitions, output);
    }

    /**
     * Scans {@code text} and returns every keyword occurrence.
     *
     * @param text input to scan; never {@code null}
     * @return hits in ascending start order; never {@code null}
     * @since 4.1.0
     */
    public List<Hit> search(String text) {
        List<Hit> hits = new ArrayList<>();
        int node = 0;
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            while (node != 0 && !transitions.get(node).containsKey(c)) {
                node = fail[node];
            }
            Integer next = transitions.get(node).get(c);
            node = next != null ? next : 0;
            for (String kw : output.get(node)) {
                hits.add(new Hit(kw, i - kw.length() + 1));
            }
        }
        return hits;
    }

    /**
     * One keyword occurrence found by {@link #search(String)}.
     *
     * @param keyword the matched keyword
     * @param start   zero-based index of the first matched character in the scanned text
     * @since 4.1.0
     */
    public record Hit(String keyword, int start) {
    }
}