diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..1838bc1b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +.git +.gitignore +node_modules +__pycache__ +*.pyc +.env +.env.* +*.egg-info +dist +build +.pytest_cache +.mypy_cache diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index cb9ed8e3..d5752dca 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -121,6 +121,45 @@ uv run data_formulator --dev # Run backend only (for frontend development) Open [http://localhost:5567](http://localhost:5567) to view it in the browser. +## Docker + +Docker is the easiest way to run Data Formulator without installing Python or Node.js locally. + +### Quick start + +1. **Copy the environment template and add your API keys:** + + ```bash + cp .env.template .env + # Edit .env and set your OPENAI_API_KEY, ANTHROPIC_API_KEY, etc. + ``` + +2. **Build and start the container:** + + ```bash + docker compose up --build + ``` + +3. Open [http://localhost:5567](http://localhost:5567) in your browser. + +To stop the container: `docker compose down` + +Workspace data (uploaded files, sessions) is persisted in a Docker volume (`data_formulator_home`) so it survives container restarts. + +### Build the image manually + +```bash +docker build -t data-formulator . +docker run --rm -p 5567:5567 --env-file .env data-formulator +``` + +### Docker sandbox (`SANDBOX=docker`) is not supported inside a container + +The Docker sandbox backend works by calling `docker run -v :...` to bind-mount temporary workspace directories into child containers. When Data Formulator itself runs in a Docker container those paths refer to the *container* filesystem, not the host, so Docker daemon cannot mount them and the feature does not work. + +Use `SANDBOX=docker` only when running Data Formulator **directly on the host** (e.g. with `uv run data_formulator --sandbox docker` or `python -m data_formulator --sandbox docker`). When using the Docker image, keep the default `SANDBOX=local`. + + ## Sandbox AI-generated Python code runs inside a **sandbox** to isolate it from the main server process. Two backends are available: diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..dd12e34e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,65 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# --------------------------------------------------------------------------- +# Stage 1: Build the React/TypeScript frontend +# --------------------------------------------------------------------------- +FROM node:20-slim AS frontend-builder + +WORKDIR /app + +# Install dependencies +COPY package.json yarn.lock ./ +RUN yarn install --frozen-lockfile + +# Copy source and build +COPY index.html tsconfig.json vite.config.ts eslint.config.js ./ +COPY public ./public +COPY src ./src +RUN yarn build + +# --------------------------------------------------------------------------- +# Stage 2: Python runtime with the built frontend bundled in +# --------------------------------------------------------------------------- +FROM python:3.11-slim AS runtime + +# System dependencies needed by some Python packages +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + g++ \ + libpq-dev \ + unixodbc-dev \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Create a non-root user to run the application +RUN useradd -m -s /bin/bash appuser + +# Set the home directory for workspace data to a deterministic path +ENV DATA_FORMULATOR_HOME=/home/appuser/.data_formulator + +WORKDIR /app + +# Copy Python package sources +COPY pyproject.toml MANIFEST.in README.md ./ +COPY py-src ./py-src + +# Copy the compiled frontend into the package's expected location +COPY --from=frontend-builder /app/py-src/data_formulator/dist ./py-src/data_formulator/dist + +# Install the package and its dependencies +RUN pip install --no-cache-dir . + +# Switch to non-root user and ensure workspace and app directories are owned by it +RUN mkdir -p "${DATA_FORMULATOR_HOME}" && chown -R appuser:appuser /app "${DATA_FORMULATOR_HOME}" +USER appuser + +EXPOSE 5567 + +HEALTHCHECK --interval=30s --timeout=5s --retries=3 \ + CMD curl -f http://localhost:5567/ || exit 1 + +# Run the app on all interfaces so Docker port-forwarding works. +# We do not pass --dev so Flask runs in production mode (no debugger/reloader). +# webbrowser.open() fails silently in a headless container, which is harmless. +ENTRYPOINT ["python", "-m", "data_formulator", "--port", "5567"] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..24db9753 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Docker Compose configuration for Data Formulator. +# +# Quick start: +# 1. Copy .env.template to .env and fill in your API keys. +# 2. docker compose up --build +# 3. Open http://localhost:5567 in your browser. + +services: + data-formulator: + build: + context: . + dockerfile: Dockerfile + image: data-formulator:latest + ports: + - "5567:5567" + env_file: + - .env + user: "0:0" + volumes: + # Persist workspace data (uploaded files, sessions, etc.) across container restarts. + - data_formulator_home:/home/appuser/.data_formulator + restart: unless-stopped + +volumes: + data_formulator_home: diff --git a/py-src/data_formulator/sandbox/Dockerfile.sandbox b/py-src/data_formulator/sandbox/Dockerfile.sandbox new file mode 100644 index 00000000..a6be9e1e --- /dev/null +++ b/py-src/data_formulator/sandbox/Dockerfile.sandbox @@ -0,0 +1,26 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Minimal sandbox image used by the Docker sandbox backend. +# This image is intentionally small — it only needs the packages +# required to execute user-generated data-transformation code. +# +# Build: +# docker build -t data-formulator-sandbox \ +# -f py-src/data_formulator/sandbox/Dockerfile.sandbox . + +FROM python:3.11-slim + +RUN pip install --no-cache-dir \ + pandas \ + numpy \ + duckdb \ + pyarrow \ + scikit-learn \ + scipy + +# Drop to a non-root user for extra isolation +RUN useradd -m sandbox +USER sandbox + +WORKDIR /sandbox/workdir