// CatalogCredentialDetector.java

package org.egothor.methodatlas.detect.secrets;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.egothor.methodatlas.api.CredentialCandidate;
import org.egothor.methodatlas.api.CredentialDetector;
import org.egothor.methodatlas.api.CredentialDetectorConfig;
import org.egothor.methodatlas.api.CredentialScanUnit;
import org.egothor.methodatlas.detect.secrets.internal.AhoCorasick;
import org.egothor.methodatlas.detect.secrets.internal.CredentialRule;
import org.egothor.methodatlas.detect.secrets.internal.Entropy;
import org.egothor.methodatlas.detect.secrets.internal.RuleCatalog;
import org.egothor.methodatlas.detect.secrets.internal.RuleCatalogLoader;

/**
 * Built-in {@link CredentialDetector} backed by the curated YAML catalog and a
 * clean-room Aho-Corasick prefilter. Anchored rules run via the automaton;
 * unanchored rules run a regex-plus-entropy pass over the whole source text.
 *
 * <p>Anchor matching is case-insensitive: the automaton is built from
 * lower-cased anchors and searched against a lower-cased copy of the source
 * whose character offsets line up one-to-one with the original text.
 *
 * @since 4.1.0
 */
public final class CatalogCredentialDetector implements CredentialDetector {

    private static final String DETECTOR_ID = "builtin-catalog";
    /** Characters either side of an anchor hit to feed the confirm regex. */
    private static final int CONFIRM_WINDOW = 200;
    /** Default entropy floor when neither config nor rule specifies one. */
    private static final double DEFAULT_ENTROPY = 4.0;
    /** Newline character used when building the line-start index. */
    private static final char NEWLINE = '\n';

    /**
     * Lazily filled cache of compiled rule patterns, keyed by the pattern text.
     * Compilation stays lazy so that an invalid pattern still fails at the point
     * where the rule is first applied. A concurrent map is used so the cache
     * itself adds no threading constraint to {@link #detect}.
     */
    private final ConcurrentMap<String, Pattern> compiledPatterns = new ConcurrentHashMap<>();

    private RuleCatalog catalog;
    private AhoCorasick anchorAutomaton;
    /** Lower-cased anchor to the anchored rules declaring it, in catalog order. */
    private Map<String, List<CredentialRule>> rulesByAnchor;
    private List<CredentialRule> unanchoredRules;
    private double defaultEntropy = DEFAULT_ENTROPY;
    /** Backs {@link #hadErrors()}; nothing in this class currently sets it. */
    private boolean hadErrors;

    /**
     * Creates a detector with the bundled catalog; {@link #configure} may replace it.
     */
    public CatalogCredentialDetector() {
        loadCatalog(RuleCatalogLoader.loadBundled());
    }

    /**
     * Returns the fixed identifier reported on every candidate from this detector.
     *
     * @return the detector id
     */
    @Override
    public String detectorId() {
        return DETECTOR_ID;
    }

    /**
     * Applies the entropy threshold from {@code config} and, when a custom
     * catalog path is present, loads that catalog in place of the current one.
     *
     * @param config detector configuration
     */
    @Override
    public void configure(CredentialDetectorConfig config) {
        this.defaultEntropy = config.entropyThreshold();
        config.customCatalog().ifPresent(path -> loadCatalog(RuleCatalogLoader.loadFile(path)));
    }

    /**
     * Installs {@code loaded} as the active catalog and rebuilds the derived
     * structures: the unanchored-rule list, the anchor-to-rules lookup and the
     * prefilter automaton.
     *
     * @param loaded catalog to activate
     */
    /* default */ void loadCatalog(RuleCatalog loaded) {
        List<CredentialRule> unanchored = new ArrayList<>();
        Map<String, List<CredentialRule>> byAnchor = new HashMap<>();
        // The automaton receives one entry per declared anchor (duplicates
        // included), lower-cased so the prefilter can run on a lower-cased copy
        // of the source without modifying the Aho-Corasick implementation.
        List<String> allAnchorsLower = new ArrayList<>();
        for (CredentialRule rule : loaded.rules()) {
            if (rule.unanchored()) {
                unanchored.add(rule);
                continue;
            }
            for (String anchor : rule.anchors()) {
                String key = lowerCaseAligned(anchor);
                allAnchorsLower.add(key);
                List<CredentialRule> owners = byAnchor.computeIfAbsent(key, k -> new ArrayList<>());
                // A rule may list anchors that collapse to the same key; it must
                // still be confirmed only once per hit.
                if (owners.isEmpty() || owners.get(owners.size() - 1) != rule) {
                    owners.add(rule);
                }
            }
        }
        this.catalog = loaded;
        this.unanchoredRules = unanchored;
        this.rulesByAnchor = byAnchor;
        this.anchorAutomaton = AhoCorasick.build(allAnchorsLower);
        // Patterns of the previous catalog are no longer needed.
        this.compiledPatterns.clear();
    }

    /**
     * Returns the loaded catalog (exposed for reproducibility-receipt hashing).
     *
     * @return the active catalog; never {@code null} after construction
     */
    public RuleCatalog catalog() {
        return catalog;
    }

    /**
     * Scans the unit's source text and returns the credential candidates found.
     *
     * <p>Anchored rules are tried only around positions where one of their
     * anchors occurs (case-insensitively). Unanchored rules are matched against
     * the whole source and kept only when the extracted value reaches the
     * entropy floor (the rule's own minimum when positive, otherwise the
     * configured default).
     *
     * @param unit the unit to scan
     * @return candidates in discovery order with exact duplicates removed
     */
    @Override
    public List<CredentialCandidate> detect(CredentialScanUnit unit) {
        String source = unit.source();
        List<CredentialCandidate> out = new ArrayList<>();
        LineIndex lines = new LineIndex(source);

        // Hit offsets come from the lower-cased copy and are applied to the
        // original text, so the copy must have exactly the same length.
        String sourceLower = lowerCaseAligned(source);
        for (AhoCorasick.Hit hit : anchorAutomaton.search(sourceLower)) {
            List<CredentialRule> owners = rulesByAnchor.get(hit.keyword());
            if (owners == null) {
                continue;
            }
            for (CredentialRule rule : owners) {
                confirmAnchored(source, hit.start(), rule, lines, out);
            }
        }

        for (CredentialRule rule : unanchoredRules) {
            double floor = rule.entropyMin() > 0 ? rule.entropyMin() : defaultEntropy;
            Matcher m = compiled(rule).matcher(source);
            while (m.find()) {
                String value = secretValue(m);
                if (Entropy.shannonBitsPerChar(value) >= floor) {
                    addCandidate(rule, m.start(), m.end(), value, lines, out);
                }
            }
        }
        // De-duplicate identical candidates that arise when several of a rule's
        // anchors fall within the confirm window of the same match.
        return new ArrayList<>(new LinkedHashSet<>(out));
    }

    /**
     * Runs the rule's pattern over a window of {@link #CONFIRM_WINDOW} characters
     * either side of an anchor hit and records every match that covers the
     * anchor's start offset.
     *
     * @param source      original (not lower-cased) source text
     * @param anchorStart offset of the anchor hit in {@code source}
     * @param rule        anchored rule to confirm
     * @param lines       line index for {@code source}
     * @param out         receives confirmed candidates
     */
    private void confirmAnchored(String source, int anchorStart, CredentialRule rule,
            LineIndex lines, List<CredentialCandidate> out) {
        int from = Math.max(0, anchorStart - CONFIRM_WINDOW);
        int to = Math.min(source.length(), anchorStart + CONFIRM_WINDOW);
        // A region with the default (anchoring, non-transparent) bounds matches
        // like a substring would, but reports absolute offsets and avoids a copy.
        Matcher m = compiled(rule).matcher(source).region(from, to);
        while (m.find()) {
            if (m.start() <= anchorStart && anchorStart < m.end()) {
                addCandidate(rule, m.start(), m.end(), secretValue(m), lines, out);
            }
        }
    }

    /**
     * Appends a candidate for the match {@code [start, end)} of {@code rule}.
     *
     * <p>The end line is the line of the last matched character and the end
     * column is one past that character on that same line, so the pair stays
     * consistent even when the match ends on a newline. For an empty match the
     * end position equals the begin position.
     *
     * @param rule  rule that produced the match
     * @param start inclusive start offset of the match
     * @param end   exclusive end offset of the match
     * @param value extracted secret value
     * @param lines line index for the scanned source
     * @param out   receives the candidate
     */
    private void addCandidate(CredentialRule rule, int start, int end,
            String value, LineIndex lines, List<CredentialCandidate> out) {
        int beginLine = lines.lineOf(start);
        int beginCol = lines.columnOf(start);
        int lastOffset = Math.max(start, end - 1);
        int endLine = lines.lineOf(lastOffset);
        int endCol = lines.columnOf(lastOffset) + (end - lastOffset);
        out.add(new CredentialCandidate(DETECTOR_ID, rule.id(), rule.category(),
                beginLine, beginCol, endLine, endCol, value));
    }

    /**
     * Returns the compiled form of the rule's pattern, compiling it on first use.
     *
     * @param rule rule whose pattern is needed
     * @return the compiled pattern
     * @throws java.util.regex.PatternSyntaxException if the pattern is invalid
     */
    private Pattern compiled(CredentialRule rule) {
        return compiledPatterns.computeIfAbsent(rule.pattern(), Pattern::compile);
    }

    /**
     * Extracts the secret value from the current match. Convention: a rule has
     * at most one capturing group, which is the secret value; a rule with no
     * group treats the whole match as the value. The whole match is also used
     * when the group exists but did not participate in the match, so the value
     * is never {@code null}.
     *
     * @param m matcher positioned on a successful match
     * @return the secret value
     */
    private static String secretValue(Matcher m) {
        if (m.groupCount() >= 1) {
            String captured = m.group(1);
            if (captured != null) {
                return captured;
            }
        }
        return m.group();
    }

    /**
     * Lower-cases {@code text} while guaranteeing the result has the same length,
     * so offsets in the result are valid offsets in {@code text}.
     *
     * <p>{@link String#toLowerCase(Locale)} may expand a character into several
     * (U+0130 becomes two chars); when that happens a per-char mapping is used
     * instead.
     *
     * @param text text to lower-case
     * @return lower-cased text of identical length
     */
    private static String lowerCaseAligned(String text) {
        String lowered = text.toLowerCase(Locale.ROOT);
        if (lowered.length() == text.length()) {
            return lowered;
        }
        char[] chars = text.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return new String(chars);
    }

    /**
     * Returns the value of the error flag. This class never sets the flag, so
     * failures surface as exceptions rather than through this method.
     *
     * @return the error flag
     */
    @Override
    public boolean hadErrors() {
        return hadErrors;
    }

    /** Maps absolute character offsets to one-based line/column. */
    private static final class LineIndex {
        /** Offset of the first character of each line; entry 0 is always 0. */
        private final int[] lineStarts;

        /**
         * Indexes the line starts of {@code source}; a new line begins after
         * every {@code '\n'}.
         *
         * @param source text to index
         */
        /* default */ LineIndex(String source) {
            int count = 1;
            for (int i = 0; i < source.length(); i++) {
                if (source.charAt(i) == NEWLINE) {
                    count++;
                }
            }
            int[] starts = new int[count];
            int next = 1;
            for (int i = 0; i < source.length(); i++) {
                if (source.charAt(i) == NEWLINE) {
                    starts[next++] = i + 1;
                }
            }
            this.lineStarts = starts;
        }

        /**
         * Returns the one-based line containing {@code offset}.
         *
         * @param offset absolute character offset
         * @return one-based line number
         */
        /* default */ int lineOf(int offset) {
            // Binary search for the last line start that is <= offset.
            int lo = 0;
            int hi = lineStarts.length - 1;
            while (lo < hi) {
                int mid = (lo + hi + 1) >>> 1;
                if (lineStarts[mid] <= offset) {
                    lo = mid;
                } else {
                    hi = mid - 1;
                }
            }
            return lo + 1;
        }

        /**
         * Returns the one-based column of {@code offset} within its line.
         *
         * @param offset absolute character offset
         * @return one-based column number
         */
        /* default */ int columnOf(int offset) {
            int line = lineOf(offset);
            return offset - lineStarts[line - 1] + 1;
        }
    }
}