PythonTestDiscovery.java

package org.egothor.methodatlas.discovery.python;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Stream;

import org.egothor.methodatlas.api.DiscoveredMethod;
import org.egothor.methodatlas.api.SourceContent;
import org.egothor.methodatlas.api.TestDiscovery;
import org.egothor.methodatlas.api.TestDiscoveryConfig;

/**
 * Discovers Python test functions and methods in pytest-convention source files.
 *
 * <p>
 * Scans a directory root for Python test files and delegates AST parsing to a
 * pool of long-lived Python worker processes running the bundled
 * {@code py-scanner.py} script.  This eliminates the Python interpreter
 * startup overhead for large codebases.
 * </p>
 *
 * <h2>File selection</h2>
 *
 * <p>
 * Two pytest file-naming conventions are supported by default:
 * </p>
 * <ul>
 *   <li>Files whose name starts with {@code "test_"} and ends with
 *       {@code ".py"} (e.g. {@code test_auth.py}).</li>
 *   <li>Files whose name ends with {@code "_test.py"}
 *       (e.g. {@code security_test.py}).</li>
 * </ul>
 *
 * <p>
 * Additional suffixes may be supplied via
 * {@link TestDiscoveryConfig#fileSuffixesFor(String) fileSuffixesFor("python")}.
 * The {@code test_} prefix check is always active regardless of configured suffixes.
 * </p>
 *
 * <h2>Parsing</h2>
 *
 * <p>
 * Parsing is performed by the bundled Python {@code py-scanner.py} script using
 * the standard-library {@code ast} module (Python 3.8+).  The script resolves
 * all Python syntax correctly and extracts:
 * </p>
 * <ul>
 *   <li>test functions ({@code def test_*}) — both sync and async</li>
 *   <li>test methods inside {@code Test*} classes</li>
 *   <li>{@code @pytest.mark.*} decorator names as tags</li>
 *   <li>exact begin/end line numbers and LOC from the AST</li>
 * </ul>
 *
 * <h2>Python availability</h2>
 *
 * <p>
 * Python is detected lazily on the first call to {@link #discover(Path)} that
 * finds matching files.  When Python 3.8+ is absent, a {@code WARNING} is
 * logged and an empty stream is returned; {@link #hadErrors()} returns
 * {@code true}.
 * </p>
 *
 * <h2>Resource management</h2>
 *
 * <p>
 * The worker pool holds live Python sub-processes.  The orchestration layer
 * calls {@link #close()} when the scan run finishes.  A JVM shutdown hook
 * registered by the pool acts as a backstop.
 * </p>
 *
 * <h2>ServiceLoader registration</h2>
 *
 * <p>
 * This class is registered in
 * {@code META-INF/services/org.egothor.methodatlas.api.TestDiscovery} so that
 * it is loaded automatically via {@link java.util.ServiceLoader}.
 * </p>
 *
 * @see PythonWorkerPool
 * @see PythonEnvironment
 * @see TestDiscovery
 * @see TestDiscoveryConfig
 */
public final class PythonTestDiscovery implements TestDiscovery {

    private static final Logger LOG =
            Logger.getLogger(PythonTestDiscovery.class.getName());

    /** Default file suffixes when no configuration is supplied. */
    private static final List<String> DEFAULT_SUFFIXES = List.of("_test.py");

    /** Default pool size: at most 2 Python processes to keep memory use modest. */
    private static final int DEFAULT_POOL_SIZE =
            Math.min(2, Runtime.getRuntime().availableProcessors());

    /** Default per-file worker timeout in seconds. */
    private static final int DEFAULT_TIMEOUT_SEC = 30;

    /** Default circuit-breaker: 5 restarts within 60 seconds trips the circuit. */
    private static final int DEFAULT_MAX_RESTARTS = 5;
    private static final int DEFAULT_RESTART_WINDOW_SEC = 60;

    // ── Configured state (set by configure()) ──────────────────────────────
    private List<String> fileSuffixes = DEFAULT_SUFFIXES;
    private int poolSize = DEFAULT_POOL_SIZE;
    private long workerTimeoutMillis = DEFAULT_TIMEOUT_SEC * 1_000L;
    private int maxRestarts = DEFAULT_MAX_RESTARTS;
    private int restartWindowSec = DEFAULT_RESTART_WINDOW_SEC;

    // ── Runtime state (initialised lazily on first discover()) ────────────
    /** Guards {@link #pythonEnv} and {@link #workerPool}. */
    private final ReentrantLock poolInitLock = new ReentrantLock();
    /** Detected environment; {@code null} until detection ran. Guarded by {@link #poolInitLock}. */
    private PythonEnvironment pythonEnv;
    /** Live worker pool; {@code null} before creation and after {@link #close()}. Guarded by {@link #poolInitLock}. */
    private PythonWorkerPool workerPool;
    private final AtomicBoolean errors = new AtomicBoolean();

    /**
     * No-arg constructor required by {@link java.util.ServiceLoader}.
     */
    public PythonTestDiscovery() {
        // Required by ServiceLoader
    }

    /**
     * Returns the unique identifier of this discovery provider: {@code "python"}.
     *
     * @return {@code "python"}
     */
    @Override
    public String pluginId() {
        return "python";
    }

    /**
     * Configures this provider from a {@link TestDiscoveryConfig}.
     *
     * <p>
     * Reads the following configuration knobs:
     * </p>
     * <ul>
     * <li><b>File suffixes</b> — via {@link TestDiscoveryConfig#fileSuffixesFor}
     *     with ID {@code "python"}.  Falls back to {@code _test.py}.</li>
     * <li><b>{@code python.poolSize}</b> — number of worker processes, at
     *     least 1; default: {@code Math.min(2, availableProcessors())}.</li>
     * <li><b>{@code python.workerTimeoutSec}</b> — per-file timeout, not
     *     negative; default: {@value #DEFAULT_TIMEOUT_SEC} s.</li>
     * <li><b>{@code python.maxConsecutiveRestarts}</b> — circuit-breaker
     *     restart limit, not negative; default: {@value #DEFAULT_MAX_RESTARTS}.</li>
     * <li><b>{@code python.restartWindowSec}</b> — circuit-breaker sliding
     *     window, not negative; default: {@value #DEFAULT_RESTART_WINDOW_SEC} s.</li>
     * </ul>
     *
     * <p>
     * A value that is not an integer, or that lies below the stated minimum,
     * is reported with a {@code WARNING} and replaced by the default.
     * </p>
     *
     * <p>
     * Python detection and worker-pool creation are deferred until the first
     * call to {@link #discover(Path)} that actually finds a matching file.
     * Pool-related settings changed after the pool has been created do not
     * affect the existing pool.
     * </p>
     *
     * @param config runtime configuration; never {@code null}
     */
    @Override
    public void configure(TestDiscoveryConfig config) {
        List<String> suffixes = config.fileSuffixesFor(pluginId());
        this.fileSuffixes = suffixes.isEmpty() ? DEFAULT_SUFFIXES : suffixes;

        // A pool needs at least one worker; the remaining knobs only reject
        // negative numbers and pass zero through unchanged.
        this.poolSize = parseIntProperty(config, "python.poolSize", DEFAULT_POOL_SIZE, 1);
        int timeoutSec = parseIntProperty(config, "python.workerTimeoutSec",
                DEFAULT_TIMEOUT_SEC, 0);
        this.workerTimeoutMillis = timeoutSec * 1_000L;
        this.maxRestarts = parseIntProperty(config, "python.maxConsecutiveRestarts",
                DEFAULT_MAX_RESTARTS, 0);
        this.restartWindowSec = parseIntProperty(config, "python.restartWindowSec",
                DEFAULT_RESTART_WINDOW_SEC, 0);
    }

    /**
     * Scans {@code root} and returns a stream of all discovered Python test
     * methods.
     *
     * <p>
     * The file tree is traversed first.  Python detection and worker-pool
     * creation are deferred until at least one matching file is found; projects
     * with no Python test files never start a Python process at all.  If
     * matching files are found but no worker pool is available (Python is
     * missing, pool creation failed, or {@link #close()} was already called),
     * a warning is logged, {@link #hadErrors()} returns {@code true}, and an
     * empty stream is returned.
     * </p>
     *
     * <p>
     * Files that cannot be scanned are logged, flagged via
     * {@link #hadErrors()}, and skipped; they do not abort the scan.
     * </p>
     *
     * @param root directory to scan; when it is not an existing directory an
     *             empty stream is returned
     * @return stream of discovered test methods; never {@code null}
     * @throws IOException if traversing the file tree fails
     */
    @Override
    public Stream<DiscoveredMethod> discover(Path root) throws IOException {
        if (!Files.isDirectory(root)) {
            return Stream.empty();
        }

        List<Path> files = collectTestFiles(root);
        if (files.isEmpty()) {
            return Stream.empty();
        }

        // Work on a snapshot taken under the lock so that a concurrent close()
        // cannot null the pool out from under the scan loop.
        PythonWorkerPool pool = ensurePoolReady();
        if (pool == null) {
            if (LOG.isLoggable(Level.WARNING)) {
                LOG.log(Level.WARNING,
                        "Python 3.8+ is unavailable — {0} Python test file(s) under {1} will not be scanned.",
                        new Object[] { files.size(), root });
            }
            errors.set(true);
            return Stream.empty();
        }

        List<DiscoveredMethod> result = new ArrayList<>();
        for (Path file : files) {
            processFile(pool, root, file, result);
        }
        return result.stream();
    }

    /** {@inheritDoc} */
    @Override
    public boolean hadErrors() {
        return errors.get();
    }

    /**
     * Shuts down the worker pool and removes the JVM shutdown hook registered
     * by the pool.  Idempotent.
     *
     * <p>
     * The pool is not recreated afterwards: a later {@link #discover(Path)}
     * that finds matching files reports an error instead of scanning.
     * </p>
     *
     * @throws IOException never thrown; declared to satisfy {@link java.io.Closeable}
     */
    @Override
    public void close() throws IOException {
        poolInitLock.lock();
        try {
            if (workerPool != null) {
                workerPool.close();
                workerPool = null;
            }
        } finally {
            poolInitLock.unlock();
        }
    }

    // ── Private helpers ────────────────────────────────────────────────────

    /**
     * Walks {@code root} and collects every regular file whose name matches
     * the Python test-file conventions.
     *
     * @param root directory to traverse
     * @return matching files in traversal order; possibly empty
     * @throws IOException if the traversal fails, including failures that the
     *                     lazy walk reports while the stream is being consumed
     */
    private List<Path> collectTestFiles(Path root) throws IOException {
        List<String> suffixes = fileSuffixes;
        try (Stream<Path> walk = Files.walk(root)) {
            return walk
                    .filter(Files::isRegularFile)
                    .filter(p -> {
                        Path fn = p.getFileName();
                        return fn != null && isPythonTestFile(fn.toString(), suffixes);
                    })
                    .toList();
        } catch (UncheckedIOException e) {
            // Files.walk is lazy: directory-access failures during iteration are
            // wrapped. Unwrap so callers see the documented checked exception.
            throw e.getCause();
        }
    }

    /**
     * Initialises Python detection and the worker pool on first use and
     * returns the pool.
     *
     * <p>
     * Once detection has completed, subsequent calls only return the current
     * pool.  If pool creation fails with an {@link IOException} the detection
     * result is discarded, so the next call tries again.
     * </p>
     *
     * @return the worker pool, or {@code null} when Python is unavailable,
     *         pool creation failed, or the pool has been closed
     */
    private PythonWorkerPool ensurePoolReady() {
        poolInitLock.lock();
        try {
            if (pythonEnv != null) {
                return workerPool;
            }
            PythonEnvironment env = new PythonEnvironment();
            pythonEnv = env;
            if (!env.isAvailable()) {
                return null;
            }
            try {
                Path scriptPath = PythonScriptExtractor.extractScript();
                PythonWorkerCircuitBreaker cb =
                        new PythonWorkerCircuitBreaker(maxRestarts, restartWindowSec);
                workerPool = new PythonWorkerPool(
                        scriptPath, env, poolSize, workerTimeoutMillis, cb);
            } catch (IOException e) {
                LOG.log(Level.SEVERE,
                        "Failed to initialise Python worker pool — Python scanning disabled", e);
                // Forget the environment so that a later discover() retries.
                pythonEnv = null;
            }
            return workerPool;
        } finally {
            poolInitLock.unlock();
        }
    }

    /**
     * Sends a single file to the worker pool, converts the response into
     * {@link DiscoveredMethod} records, and appends them to {@code result}.
     *
     * <p>
     * A scan failure is logged at {@code WARNING}, recorded for
     * {@link #hadErrors()}, and otherwise ignored.
     * </p>
     *
     * @param pool    worker pool to scan with
     * @param root    scan root (for module-path computation)
     * @param file    path to the source file, as produced by the tree walk
     * @param result  accumulator for discovered methods
     */
    private void processFile(PythonWorkerPool pool, Path root, Path file,
                             List<DiscoveredMethod> result) {
        List<PythonWorker.MethodDescriptor> descriptors;
        try {
            descriptors = pool.scan(file);
        } catch (IOException e) {
            if (LOG.isLoggable(Level.WARNING)) {
                LOG.log(Level.WARNING, "Cannot scan Python file: " + file, e);
            }
            errors.set(true);
            return;
        }

        if (descriptors.isEmpty()) {
            return;
        }

        String modulePath = buildModulePath(file, root);
        // One lazily-loaded source holder is shared by all methods of the file.
        SourceContent sourceContent = buildSourceContent(file);

        for (PythonWorker.MethodDescriptor d : descriptors) {
            // Module-level functions have no class; their owner is the module itself.
            String fqcn = d.className() != null
                    ? modulePath + "." + d.className()
                    : modulePath;
            result.add(new DiscoveredMethod(
                    fqcn,
                    d.name(),
                    d.beginLine(),
                    d.endLine(),
                    d.loc(),
                    d.tags(),
                    null,
                    file,
                    modulePath,
                    sourceContent));
        }
    }

    /**
     * Creates a lazily-loading, caching {@link SourceContent} for {@code file}.
     *
     * <p>
     * The file is read at most once, on first access.  A failed read is logged
     * at {@code FINE} and cached as "absent".  Concurrent first accesses wait
     * for the single read to finish rather than observing a half-initialised
     * cache.
     * </p>
     *
     * @param file the source file to expose
     * @return a supplier-style view yielding the file text, or empty when the
     *         file could not be read
     */
    private static SourceContent buildSourceContent(Path file) {
        ReentrantLock loadLock = new ReentrantLock();
        AtomicBoolean loaded = new AtomicBoolean(false);
        AtomicReference<String> cache = new AtomicReference<>(null);
        return () -> {
            if (!loaded.get()) {
                loadLock.lock();
                try {
                    if (!loaded.get()) {
                        try {
                            cache.set(Files.readString(file));
                        } catch (IOException e) {
                            if (LOG.isLoggable(Level.FINE)) {
                                LOG.log(Level.FINE, "Cannot read source for AI analysis: " + file, e);
                            }
                        } finally {
                            // Published only after the attempt completed, so no
                            // reader can see "loaded" with the cache still unset
                            // for a readable file.
                            loaded.set(true);
                        }
                    }
                } finally {
                    loadLock.unlock();
                }
            }
            return Optional.ofNullable(cache.get());
        };
    }

    // ── Package-private static helpers (accessible from tests) ────────────

    /**
     * Returns {@code true} when the given file name should be scanned for
     * Python test functions.
     *
     * <p>
     * Selection rules (applied in order):
     * </p>
     * <ol>
     *   <li>If the name starts with {@code "test_"} and ends with
     *       {@code ".py"} → accept (always active).</li>
     *   <li>If {@code configuredSuffixes} is non-empty: accept if the name
     *       ends with any of those suffixes.</li>
     *   <li>Otherwise (empty configured suffixes): accept if the name ends
     *       with the default suffix {@code "_test.py"}.</li>
     * </ol>
     *
     * @param fileName           the simple file name (no directory component)
     * @param configuredSuffixes suffixes from {@link TestDiscoveryConfig#fileSuffixesFor};
     *                           an empty list means "use defaults"
     * @return {@code true} if the file should be scanned
     */
    /* default */ static boolean isPythonTestFile(
            String fileName, List<String> configuredSuffixes) {
        if (fileName.startsWith("test_") && fileName.endsWith(".py")) {
            return true;
        }
        if (!configuredSuffixes.isEmpty()) {
            return configuredSuffixes.stream().anyMatch(fileName::endsWith);
        }
        return fileName.endsWith("_test.py");
    }

    /**
     * Computes the dot-separated module path for {@code file} relative to
     * {@code root}.
     *
     * <p>
     * Both paths are normalised before relativising.  Path segments are joined
     * with {@code "."} and the extension of the last segment (everything from
     * its final {@code '.'}, typically {@code ".py"}) is stripped.  A leading
     * dot in the last segment is not treated as an extension separator.
     * </p>
     *
     * <p>
     * Example: if {@code root} is {@code /project/tests} and {@code file} is
     * {@code /project/tests/auth/test_auth.py}, the result is
     * {@code "auth.test_auth"}.
     * </p>
     *
     * @param file the source file; must be inside {@code root}
     * @param root the scan root
     * @return dot-separated module path; never {@code null}, and non-empty
     *         whenever {@code file} lies strictly inside {@code root}
     */
    /* default */ static String buildModulePath(Path file, Path root) {
        Path relative = root.normalize().relativize(file.normalize());
        int count = relative.getNameCount();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < count; i++) {
            String segment = relative.getName(i).toString();
            if (i == count - 1) {
                int dot = segment.lastIndexOf('.');
                if (dot > 0) {
                    segment = segment.substring(0, dot);
                }
            }
            if (sb.length() > 0) {
                sb.append('.');
            }
            sb.append(segment);
        }
        return sb.toString();
    }

    /**
     * Reads an integer configuration property, falling back to a default.
     *
     * <p>
     * Only the first value registered for {@code key} is considered;
     * surrounding whitespace is ignored.  The default is returned, with a
     * {@code WARNING}, when the value is not an integer or is smaller than
     * {@code minValue}; it is returned silently when the key is absent.
     * </p>
     *
     * @param config       configuration to read from
     * @param key          property name
     * @param defaultValue value to use when the property is absent or invalid
     * @param minValue     smallest acceptable value (inclusive)
     * @return the parsed property value, or {@code defaultValue}
     */
    private static int parseIntProperty(TestDiscoveryConfig config, String key,
                                        int defaultValue, int minValue) {
        List<String> values = config.properties().get(key);
        if (values == null || values.isEmpty()) {
            return defaultValue;
        }
        String raw = values.get(0);
        int parsed;
        try {
            // An absent (null) entry is routed through the same "invalid" path.
            parsed = Integer.parseInt(raw == null ? "" : raw.strip());
        } catch (NumberFormatException e) {
            if (LOG.isLoggable(Level.WARNING)) {
                LOG.warning("Invalid value for property '" + key + "': " + raw
                        + " — using default " + defaultValue);
            }
            return defaultValue;
        }
        if (parsed < minValue) {
            if (LOG.isLoggable(Level.WARNING)) {
                LOG.warning("Value for property '" + key + "' must be >= " + minValue
                        + " but was " + parsed + " — using default " + defaultValue);
            }
            return defaultValue;
        }
        return parsed;
    }
}