DeltaReport.java

package org.egothor.methodatlas;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

import org.egothor.methodatlas.api.ScanRecord;

/**
 * Computes the difference between two MethodAtlas scan outputs.
 *
 * <h2>Overview</h2>
 *
 * <p>
 * {@link #compute(Path, Path)} parses two MethodAtlas CSV files produced by
 * separate scan runs and returns a {@link DeltaResult} that enumerates every
 * test method that was added, removed, or modified between the runs. Unchanged
 * methods are counted but not listed individually.
 * </p>
 *
 * <h2>Method identity</h2>
 *
 * <p>
 * Two records are considered to represent the same method when their
 * {@code fqcn} and {@code method} columns match exactly. If a class or method
 * is renamed between scans, the old name appears as {@code REMOVED} and the
 * new name appears as {@code ADDED}. MethodAtlas does not attempt to track
 * renames.
 * </p>
 *
 * <h2>Comparable fields</h2>
 *
 * <p>
 * For each method present in both scans, the following fields are compared:
 * </p>
 *
 * <ul>
 * <li>{@code loc} — lines of code; always compared</li>
 * <li>{@code tags} — JUnit {@code @Tag} set; always compared (order-independent)</li>
 * <li>{@code display_name} — {@code @DisplayName} annotation value; always compared;
 *     {@code null} (column absent from the CSV) and {@code ""} (column present but
 *     no annotation on the method) are both treated as "no annotation" and considered
 *     equal, so comparing an old-format file against a new-format file does not
 *     produce false positives for methods that have no annotation in either scan</li>
 * <li>{@code content_hash} — source fingerprint; compared only when both records
 *     have a non-{@code null} value (i.e., both scans were run with
 *     {@code -content-hash}); a hash difference indicates the enclosing class
 *     source was edited</li>
 * <li>{@code ai_security_relevant} — compared only when both records carry a
 *     non-{@code null} value (both scans used {@code -ai})</li>
 * <li>{@code ai_tags} — compared only when both records carry a non-{@code null}
 *     value; comparison is order-independent</li>
 * </ul>
 *
 * <p>
 * Fields absent from either record (i.e., produced by scans with different flag
 * sets) are skipped so that a scan with {@code -content-hash} can be meaningfully
 * compared with one that did not use that flag.
 * </p>
 *
 * <h2>CSV format compatibility</h2>
 *
 * <p>
 * The parser handles the MethodAtlas CSV dialect (RFC 4180, comma-delimited,
 * double-quote escaping). {@code #}-prefixed comment lines emitted by
 * {@code -emit-metadata} are skipped; the {@code scan_timestamp} metadata value
 * is extracted and forwarded to {@link DeltaResult} for display. Blank lines
 * are ignored. Unknown column names are ignored, making the parser
 * forward-compatible with columns added in future versions.
 * </p>
 *
 * @see DeltaEntry
 * @see org.egothor.methodatlas.emit.DeltaEmitter
 */
public final class DeltaReport {

    private static final char CSV_QUOTE = '"';
    private static final char CSV_COMMA = ',';

    private DeltaReport() {
    }

    // -------------------------------------------------------------------------
    // Public API
    // -------------------------------------------------------------------------

    /**
     * Computes the difference between two MethodAtlas scan CSV files.
     *
     * <p>
     * Both files must be readable and must contain at least a CSV header row
     * with {@code fqcn} and {@code method} columns. Empty files (no data rows)
     * are handled gracefully and produce only {@code ADDED} or {@code REMOVED}
     * entries as appropriate.
     * </p>
     *
     * @param beforeCsv path to the scan output from the earlier run
     * @param afterCsv  path to the scan output from the later run
     * @return delta result; never {@code null}
     * @throws IOException              if either file cannot be read
     * @throws IllegalArgumentException if a required column ({@code fqcn} or
     *                                  {@code method}) is absent from a file
     */
    public static DeltaResult compute(Path beforeCsv, Path afterCsv) throws IOException {
        ParsedCsv before = parseCsv(beforeCsv);
        ParsedCsv after = parseCsv(afterCsv);

        // Build key → record maps; LinkedHashMap preserves file order, which produces
        // stable output when two runs scan the same sources in the same order.
        Map<String, ScanRecord> beforeMap = buildMap(before.records());
        Map<String, ScanRecord> afterMap = buildMap(after.records());

        List<DeltaEntry> entries = new ArrayList<>();
        int unchanged = 0;

        // Pass 1: check every before record against after.
        for (Map.Entry<String, ScanRecord> e : beforeMap.entrySet()) {
            ScanRecord afterRecord = afterMap.get(e.getKey());
            if (afterRecord == null) {
                entries.add(DeltaEntry.removed(e.getValue()));
            } else {
                Set<String> changed = findChangedFields(e.getValue(), afterRecord);
                if (changed.isEmpty()) {
                    unchanged++;
                } else {
                    entries.add(DeltaEntry.modified(e.getValue(), afterRecord, changed));
                }
            }
        }

        // Pass 2: find records in after that are not in before (ADDED).
        for (Map.Entry<String, ScanRecord> e : afterMap.entrySet()) {
            if (!beforeMap.containsKey(e.getKey())) {
                entries.add(DeltaEntry.added(e.getValue()));
            }
        }

        // Sort by (fqcn, method) so all changes to a class are grouped together.
        entries.sort(Comparator
                .<DeltaEntry, String>comparing(e -> e.record().fqcn())
                .thenComparing(e -> e.record().method()));

        return new DeltaResult(
                beforeCsv, afterCsv,
                before.scanTimestamp(), after.scanTimestamp(),
                beforeMap.size(), afterMap.size(),
                countSecurityRelevant(before.records()),
                countSecurityRelevant(after.records()),
                Collections.unmodifiableList(entries),
                unchanged);
    }

    // -------------------------------------------------------------------------
    // Result type
    // -------------------------------------------------------------------------

    /**
     * The aggregate result of comparing two MethodAtlas scan outputs.
     *
     * <p>
     * The {@link #entries()} list contains only changed methods (ADDED, REMOVED,
     * MODIFIED). Unchanged methods are represented only by the
     * {@link #unchangedCount()} counter to keep the report concise.
     * </p>
     *
     * @param beforePath               path to the <em>before</em> CSV file
     * @param afterPath                path to the <em>after</em> CSV file
     * @param beforeTimestamp          {@code scan_timestamp} metadata extracted
     *                                 from the <em>before</em> file, or
     *                                 {@code null} when the file was produced
     *                                 without {@code -emit-metadata}
     * @param afterTimestamp           {@code scan_timestamp} metadata extracted
     *                                 from the <em>after</em> file, or {@code null}
     * @param totalBefore              total number of test methods in the
     *                                 <em>before</em> scan
     * @param totalAfter               total number of test methods in the
     *                                 <em>after</em> scan
     * @param securityRelevantBefore   number of security-relevant methods in the
     *                                 <em>before</em> scan; {@code 0} when no AI
     *                                 columns were present
     * @param securityRelevantAfter    number of security-relevant methods in the
     *                                 <em>after</em> scan; {@code 0} when no AI
     *                                 columns were present
     * @param entries                  unmodifiable list of changed entries in
     *                                 (fqcn, method) order
     * @param unchangedCount           number of methods present in both scans
     *                                 with no detected differences
     */
    public record DeltaResult(
            Path beforePath,
            Path afterPath,
            String beforeTimestamp,
            String afterTimestamp,
            int totalBefore,
            int totalAfter,
            int securityRelevantBefore,
            int securityRelevantAfter,
            List<DeltaEntry> entries,
            int unchangedCount) {

        /** Returns the number of {@link DeltaEntry.ChangeType#ADDED} entries. */
        public int addedCount() {
            return (int) entries.stream()
                    .filter(e -> e.changeType() == DeltaEntry.ChangeType.ADDED).count();
        }

        /** Returns the number of {@link DeltaEntry.ChangeType#REMOVED} entries. */
        public int removedCount() {
            return (int) entries.stream()
                    .filter(e -> e.changeType() == DeltaEntry.ChangeType.REMOVED).count();
        }

        /** Returns the number of {@link DeltaEntry.ChangeType#MODIFIED} entries. */
        public int modifiedCount() {
            return (int) entries.stream()
                    .filter(e -> e.changeType() == DeltaEntry.ChangeType.MODIFIED).count();
        }
    }

    // -------------------------------------------------------------------------
    // CSV parsing — package-private for testing
    // -------------------------------------------------------------------------

    /**
     * Returns all scan records from the given MethodAtlas CSV file.
     *
     * <p>Used by {@link AiResultCache} to build an in-memory lookup from a previous
     * scan output without going through the full delta-comparison path.</p>
     *
     * @param csvPath path to a MethodAtlas CSV output file
     * @return unmodifiable list of parsed records; empty when the file has no data rows
     * @throws IOException if the file cannot be read
     */
    /* default */ static List<ScanRecord> loadRecords(Path csvPath) throws IOException {
        return parseCsv(csvPath).records();
    }

    /**
     * Parses one line of a MethodAtlas CSV file according to RFC 4180.
     *
     * <p>
     * Fields may be optionally enclosed in double quotes. A double-quote
     * character within a quoted field is escaped as two consecutive
     * double-quotes ({@code ""}). The delimiter is a comma. A line ending with
     * a comma produces a trailing empty field.
     * </p>
     *
     * @param line the raw CSV line to parse (no line terminator)
     * @return list of unescaped field values; never {@code null}
     */
    /* default */ static List<String> parseCsvLine(String line) {
        List<String> result = new ArrayList<>();
        StringBuilder field = new StringBuilder();
        int pos = 0;

        while (pos < line.length()) {
            char c = line.charAt(pos);
            if (c == CSV_QUOTE) {
                pos = parseQuotedField(line, pos + 1, field);
                // pos now points one past the closing quote (or end of string)
            } else if (c == CSV_COMMA) {
                result.add(field.toString());
                field.setLength(0);
                pos++;
            } else {
                field.append(c);
                pos++;
            }
        }
        result.add(field.toString()); // always add the last (or only) field
        return result;
    }

    /**
     * Parses characters of a quoted CSV field starting just after the opening quote,
     * appending unescaped characters to {@code field}.
     *
     * @param line  the full CSV line
     * @param start position of the first character inside the quoted field
     * @param field buffer receiving unescaped field content
     * @return position of the character immediately after the closing quote,
     *         or {@code line.length()} when the line ends before a closing quote
     */
    private static int parseQuotedField(String line, int start, StringBuilder field) {
        int pos = start;
        while (pos < line.length()) {
            char c = line.charAt(pos);
            if (c == CSV_QUOTE) {
                if (pos + 1 < line.length() && line.charAt(pos + 1) == CSV_QUOTE) {
                    field.append(CSV_QUOTE);
                    pos += 2; // skip both quotes of the escape sequence
                } else {
                    return pos + 1; // closing quote consumed; return position after it
                }
            } else {
                field.append(c);
                pos++;
            }
        }
        return pos; // unterminated quoted field — treat end-of-line as closing
    }

    // -------------------------------------------------------------------------
    // Private helpers
    // -------------------------------------------------------------------------

    /** Intermediate result of reading a CSV file. */
    private record ParsedCsv(List<ScanRecord> records, String scanTimestamp) {
    }

    private static ParsedCsv parseCsv(Path path) throws IOException {
        List<String> lines = Files.readAllLines(path, StandardCharsets.UTF_8);

        String timestamp = null;
        List<String> dataLines = new ArrayList<>();

        for (String line : lines) {
            if (line.startsWith("#")) {
                if (line.startsWith("# scan_timestamp:")) {
                    timestamp = line.substring("# scan_timestamp:".length()).trim();
                }
            } else if (!line.isBlank()) {
                dataLines.add(line);
            }
        }

        if (dataLines.isEmpty()) {
            return new ParsedCsv(List.of(), timestamp);
        }

        // First non-comment, non-blank line is the CSV header.
        List<String> header = parseCsvLine(dataLines.get(0));
        Map<String, Integer> colIndex = new HashMap<>();
        for (int i = 0; i < header.size(); i++) {
            colIndex.put(header.get(i).trim(), i);
        }

        List<ScanRecord> records = new ArrayList<>(dataLines.size() - 1);
        for (int i = 1; i < dataLines.size(); i++) {
            List<String> fields = parseCsvLine(dataLines.get(i));
            records.add(toScanRecord(fields, colIndex));
        }

        return new ParsedCsv(Collections.unmodifiableList(records), timestamp);
    }

    private static ScanRecord toScanRecord(List<String> fields, Map<String, Integer> colIndex) {
        return new ScanRecord(
                requireField(fields, colIndex, "fqcn"),
                requireField(fields, colIndex, "method"),
                parseInt(getField(fields, colIndex, "loc"), 0),
                parseSemicolonList(getField(fields, colIndex, "tags")),
                getFieldPreserveEmpty(fields, colIndex, "display_name"),
                getField(fields, colIndex, "content_hash"),
                parseBoolean(getField(fields, colIndex, "ai_security_relevant")).orElse(null),
                getField(fields, colIndex, "ai_display_name"),
                parseSemicolonListOrNull(fields, colIndex, "ai_tags"),
                getField(fields, colIndex, "ai_reason"),
                parseDouble(getField(fields, colIndex, "ai_confidence")),
                parseDouble(getField(fields, colIndex, "ai_interaction_score")),
                getField(fields, colIndex, "tag_ai_drift"));
    }

    private static Map<String, ScanRecord> buildMap(List<ScanRecord> records) {
        Map<String, ScanRecord> map = new LinkedHashMap<>(records.size() * 2);
        for (ScanRecord r : records) {
            map.put(key(r), r);
        }
        return map;
    }

    private static String key(ScanRecord r) {
        return r.fqcn() + "::" + r.method();
    }

    private static int countSecurityRelevant(List<ScanRecord> records) {
        return (int) records.stream()
                .filter(r -> Boolean.TRUE.equals(r.aiSecurityRelevant())).count();
    }

    /**
     * Returns the set of field names whose values differ between {@code before} and
     * {@code after}.
     *
     * <p>
     * A field is included in the result only when it is non-{@code null} in
     * <em>both</em> records, ensuring that differences caused by different scan
     * flag sets (e.g. one run with {@code -content-hash}, one without) are not
     * falsely reported as modifications.
     * </p>
     */
    private static Set<String> findChangedFields(ScanRecord before, ScanRecord after) {
        Set<String> changed = new LinkedHashSet<>();

        if (before.loc() != after.loc()) {
            changed.add("loc");
        }
        if (!tagsEqual(before.tags(), after.tags())) {
            changed.add("tags");
        }
        if (!displayNameEqual(before.displayName(), after.displayName())) {
            changed.add("display_name");
        }
        addIfBothPresentAndChanged(changed, "source", before.contentHash(), after.contentHash());
        addIfBothPresentAndChanged(changed, "ai_security_relevant",
                before.aiSecurityRelevant(), after.aiSecurityRelevant());
        if (before.aiTags() != null && after.aiTags() != null
                && !new HashSet<>(before.aiTags()).equals(new HashSet<>(after.aiTags()))) {
            changed.add("ai_tags");
        }
        addIfBothPresentAndChanged(changed, "ai_interaction_score",
                before.aiInteractionScore(), after.aiInteractionScore());
        addIfBothPresentAndChanged(changed, "tag_ai_drift", before.tagAiDrift(), after.tagAiDrift());

        return changed;
    }

    /**
     * Adds {@code fieldName} to {@code changed} when both values are non-{@code null}
     * and not equal.  Fields absent from either record (produced by scans with different
     * flag sets) are skipped so that they are not falsely reported as modifications.
     */
    private static void addIfBothPresentAndChanged(Set<String> changed, String fieldName,
            Object before, Object after) {
        if (before != null && after != null && !before.equals(after)) {
            changed.add(fieldName);
        }
    }

    private static boolean tagsEqual(List<String> a, List<String> b) {
        return new HashSet<>(a).equals(new HashSet<>(b));
    }

    /**
     * Returns {@code true} when two {@code display_name} values are semantically
     * equal. {@code null} (column absent from the CSV) and {@code ""} (column
     * present but no {@code @DisplayName} annotation) both mean "no annotation",
     * so they are treated as equal. Any non-empty value must match exactly.
     */
    private static boolean displayNameEqual(String a, String b) {
        String normalA = a == null ? "" : a;
        String normalB = b == null ? "" : b;
        return normalA.equals(normalB);
    }

    // -------------------------------------------------------------------------
    // Field extraction helpers
    // -------------------------------------------------------------------------

    /**
     * Returns the field value at the named column, or {@code null} when the column
     * is absent or the value is empty.
     */
    private static String getField(List<String> fields, Map<String, Integer> colIndex, String col) {
        Integer idx = colIndex.get(col);
        if (idx == null || idx >= fields.size()) {
            return null;
        }
        String val = fields.get(idx);
        return (val == null || val.isEmpty()) ? null : val;
    }

    /**
     * Returns the field value at the named column preserving empty strings as
     * distinct from a missing column.
     *
     * <p>Returns {@code null} when the column is absent from the header. Returns
     * {@code ""} when the column is present but the cell value is empty. Returns
     * the raw value otherwise.
     *
     * <p>Use this instead of {@link #getField} for columns where an empty value
     * carries semantics different from a missing column — e.g. {@code display_name}
     * where empty means "remove the annotation" and absent means "leave unchanged".
     */
    private static String getFieldPreserveEmpty(List<String> fields, Map<String, Integer> colIndex, String col) {
        Integer idx = colIndex.get(col);
        if (idx == null || idx >= fields.size()) {
            return null;
        }
        String val = fields.get(idx);
        return val == null ? "" : val;
    }

    /**
     * Returns the field value at the named column; throws when the column is absent
     * from the header.
     */
    private static String requireField(List<String> fields, Map<String, Integer> colIndex, String col) {
        Integer idx = colIndex.get(col);
        if (idx == null) {
            throw new IllegalArgumentException("Required CSV column missing: " + col);
        }
        if (idx >= fields.size()) {
            return "";
        }
        String val = fields.get(idx);
        return val != null ? val : "";
    }

    private static int parseInt(String val, int defaultValue) {
        if (val == null) {
            return defaultValue;
        }
        try {
            return Integer.parseInt(val.trim());
        } catch (NumberFormatException e) {
            return defaultValue;
        }
    }

    private static Optional<Boolean> parseBoolean(String val) {
        if (val == null || val.isEmpty()) {
            return Optional.empty();
        }
        return Optional.of(Boolean.parseBoolean(val.trim()));
    }

    private static Double parseDouble(String val) {
        if (val == null) {
            return null;
        }
        try {
            return Double.parseDouble(val.trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Parses a semicolon-separated tag list. Returns an empty list for a blank
     * value; never {@code null}.
     */
    private static List<String> parseSemicolonList(String val) {
        if (val == null || val.isEmpty()) {
            return List.of();
        }
        List<String> result = new ArrayList<>();
        for (String part : val.split(";", -1)) {
            String trimmed = part.trim();
            if (!trimmed.isEmpty()) {
                result.add(trimmed);
            }
        }
        return Collections.unmodifiableList(result);
    }

    /**
     * Like {@link #parseSemicolonList(String)} but returns {@code null} when the
     * column is entirely absent from the file (versus present but empty). This
     * distinction matters for AI tag comparison: a column absent from the header
     * means AI was not run at all, while an empty column value means AI ran but
     * assigned no tags.
     */
    @SuppressWarnings("PMD.ReturnEmptyCollectionRatherThanNull")
    private static List<String> parseSemicolonListOrNull(List<String> fields,
            Map<String, Integer> colIndex, String col) {
        Integer idx = colIndex.get(col);
        if (idx == null) {
            return null; // column absent — AI not run; null is semantically distinct from empty list
        }
        String val = idx < fields.size() ? fields.get(idx) : null;
        return parseSemicolonList(val);
    }
}