# OpenAPI 3.0.3 contract for the Benford public v1 HTTP API.
openapi: 3.0.3
# API metadata. The Markdown description is the landing text of the rendered
# docs and covers authentication, plan gating, key scopes, rate limiting,
# export guidance, ingestion behaviour/limits, file format and the error envelope.
info:
  title: Benford Public API
  # Quoted so the version stays a string rather than being parsed as a number.
  version: "1.0.0"
  description: |
    Workspace-scoped HTTP API for CSV/XLSX ingestion, Benford analysis, and exports.

    **Authentication:** `Authorization: Bearer <api_key>`

    **Plans:** API keys and all `/api/v1/*` routes require a **Pro** workspace (`plan_tier = pro`).

    **Scopes:** Keys include `ingestion`, `analysis`, and/or `exports`. Each operation checks a scope.

    **Rate limiting:** ~120 requests per minute per workspace; `429` with `Retry-After` when exceeded.

    **Exports:** Prefer `POST /api/v1/exports/runs/{runId}` (async job + signed URL) over synchronous `GET` CSV/PDF.

    **Benford JSON:** `GET /api/v1/analysis-runs/{runId}/benford-results` returns chart-ready distributions when the run is **`completed`** (`409` until then or if **`failed`**).

    **Ingestion:** When `EDGE_WORKER_SECRET` is configured server-side, **CSV** uploads return `202` and profiling runs in the `ingestion-worker` Edge Function; **XLSX** is processed in Next.js (same limits) because a single parse often exceeds Edge CPU caps. Poll `GET /api/v1/datasets/{datasetId}/ingestion` until `status` is `ready` or `failed` when you received `202`.

    **Ingestion limits (Pro workspace):** CSV or XLSX; max **10 MB** per file; max **100,000** data rows; unlimited datasets per workspace (see `lib/limits/workspace-plan-limits.ts`). `413` + `limit_exceeded` when the file is too large. Browser/session uploads on **Free** workspaces use stricter caps (5 datasets per workspace, CSV-only, 5 MB, 50k rows)—not applicable to v1 keys, which require Pro. Published limits are maximums; very wide or high-cardinality datasets may process more slowly and can still exceed runtime thresholds.

    **File format:** The **first row must be column headers** (names); every subsequent row is a data record. Row counts and limits exclude the header row.

    **Errors:** JSON body `{ "error": { "code", "message", "details?" } }`.
# Base URLs that the relative paths in this document are resolved against.
servers:
  - description: Production
    url: "https://www.benford.app"
  - description: Local development
    url: "http://localhost:3000"
# Tag groups; every operation under `paths` is assigned to exactly one of these.
tags:
  - name: Ingestion
    description: Upload datasets
  - name: Analysis
    description: Queue runs and poll status
  - name: Exports
    description: Download completed-run reports
paths:
  # Multipart dataset upload. Responds 200 when the file was profiled inline
  # and 202 when profiling was handed to the ingestion worker.
  /api/v1/uploads:
    post:
      tags:
        - Ingestion
      summary: Upload and profile a dataset
      description: |
        Multipart upload. Stores the raw file and either processes synchronously (`200`) or enqueues the
        `ingestion-worker` (`202`) for **CSV** when `EDGE_WORKER_SECRET` is set (XLSX returns `200` inline). Poll
        `GET /api/v1/datasets/{datasetId}/ingestion` until `status` is `ready` or `failed`.
        The workspace is inferred from the API key (do not pass `workspaceId`). All v1 callers
        have Pro workspaces, so CSV and XLSX are allowed up to the Pro caps above.

        **CSV/XLSX:** The first row must be column headers; remaining rows are data. Row limits count data rows only.
      security:
        - bearerAuth: []
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              # Only the file part is mandatory; every other form field is optional.
              required:
                - file
              properties:
                file:
                  type: string
                  format: binary
                displayName:
                  type: string
                description:
                  type: string
                # Form fields are transmitted as text, hence `string` for the flag and index.
                omitReference:
                  type: string
                  description: Send `true` or `on` to omit reference column mapping.
                referenceColumnName:
                  type: string
                referenceColumnIndex:
                  type: string
      responses:
        # Synchronous path: the profile summary is returned directly.
        "200":
          description: Dataset ready
          content:
            application/json:
              schema:
                type: object
                properties:
                  datasetId:
                    type: string
                    format: uuid
                  rowCount:
                    type: integer
                  columnCount:
                    type: integer
                  recommendedColumns:
                    type: integer
                  # NOTE(review): no `type` is declared here, so the value shape is
                  # unconstrained — confirm the intended type of referenceColumn.
                  referenceColumn:
                    nullable: true
        # Asynchronous path: the body carries the identifiers needed for polling.
        "202":
          description: Raw file accepted; ingestion queued (Edge worker)
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/UploadQueued"
        "400":
          $ref: "#/components/responses/ErrorResponse"
        "401":
          $ref: "#/components/responses/ErrorResponse"
        "403":
          $ref: "#/components/responses/ErrorResponse"
        # Declared inline (not via ErrorResponse) so it can carry its own description.
        "413":
          description: Payload too large (e.g. file over plan limit)
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ApiError"
        "429":
          $ref: "#/components/responses/ErrorResponse"
  # Polling counterpart to the 202 branch of POST /api/v1/uploads.
  /api/v1/datasets/{datasetId}/ingestion:
    get:
      tags:
        - Ingestion
      summary: Poll dataset ingestion / profiling status
      description: |
        Use after `202` from `POST /api/v1/uploads`. `status` mirrors `datasets.status` (`processing`, `ready`, `failed`).
        When `ready`, `rowCount`, `columnCount`, `recommendedColumns`, and `referenceColumn` reflect the profile.
      security:
        - bearerAuth: []
      parameters:
        - $ref: "#/components/parameters/DatasetId"
      responses:
        # Always 200 while the dataset exists; progress is conveyed by the body's `status`.
        "200":
          description: Current ingestion state
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/DatasetIngestionStatus"
        "401":
          $ref: "#/components/responses/ErrorResponse"
        "404":
          $ref: "#/components/responses/ErrorResponse"
        "429":
          $ref: "#/components/responses/ErrorResponse"
  # Queues an analysis run for a dataset; the request takes no body.
  /api/v1/datasets/{datasetId}/analysis-runs:
    post:
      tags:
        - Analysis
      summary: Queue an analysis run
      description: |
        Creates a queued `analysis_runs` row using selected columns and workspace z-value.
        At least one column must be selected (via UI or auto-selection during upload).
      security:
        - bearerAuth: []
      parameters:
        - $ref: "#/components/parameters/DatasetId"
      responses:
        "200":
          description: Run queued
          content:
            application/json:
              schema:
                type: object
                properties:
                  runId:
                    type: string
                    format: uuid
                  # A freshly created run is only ever reported as `queued`.
                  status:
                    type: string
                    enum:
                      - queued
        "400":
          $ref: "#/components/responses/ErrorResponse"
        "401":
          $ref: "#/components/responses/ErrorResponse"
        "403":
          $ref: "#/components/responses/ErrorResponse"
        "404":
          $ref: "#/components/responses/ErrorResponse"
        "429":
          $ref: "#/components/responses/ErrorResponse"
  # Triggers processing of the dataset's queued runs and returns a count summary.
  /api/v1/datasets/{datasetId}/process:
    post:
      tags: [Analysis]
      summary: Process queued runs for a dataset
      # Plain double quotes are used in the text below: inside a YAML block scalar
      # they are literal, whereas HTML entities may be rendered verbatim by
      # Markdown/OpenAPI viewers.
      description: |
        Runs the Benford engine for queued jobs in this workspace/dataset (same semantics as the app "Process" action).
      security:
        - bearerAuth: []
      parameters:
        - $ref: "#/components/parameters/DatasetId"
      responses:
        "200":
          description: Processing summary
          content:
            application/json:
              schema:
                type: object
                properties:
                  processedRuns:
                    type: integer
                  failedRuns:
                    type: integer
        "401":
          $ref: "#/components/responses/ErrorResponse"
        "404":
          $ref: "#/components/responses/ErrorResponse"
        "429":
          $ref: "#/components/responses/ErrorResponse"
  # Status lookup for a single analysis run.
  /api/v1/analysis-runs/{runId}:
    get:
      tags:
        - Analysis
      summary: Get analysis run status
      security:
        - bearerAuth: []
      parameters:
        - $ref: "#/components/parameters/RunId"
      responses:
        "200":
          description: Run record
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/AnalysisRun"
        "401":
          $ref: "#/components/responses/ErrorResponse"
        "404":
          $ref: "#/components/responses/ErrorResponse"
        "429":
          $ref: "#/components/responses/ErrorResponse"
  # Chart-ready Benford distributions for a run; results are only served once the
  # run is `completed` (otherwise 409), optionally filtered by test mode / column.
  /api/v1/analysis-runs/{runId}/benford-results:
    get:
      tags: [Analysis]
      summary: Get Benford distributions for a completed run
      # The two "Query" notes are separated by a blank line so Markdown renders
      # them as distinct paragraphs instead of folding them into one.
      description: |
        Returns persisted `benford_results` rows for charting and custom reporting. **`404`** when the run
        does not exist or is outside the API key workspace. **`409`** when the run exists but is not
        **`completed`** (`analysis_run_not_completed` for `queued` / `running`, `analysis_run_failed` when
        `failed`) — no partial results.

        **Units:** `observed_distribution`, `expected_distribution`, and confidence maps use **probability per bin
        on a 0–1 scale** (not counts). Multiply by **100** for a percentage axis. Approximate count per bin:
        `round(p * sample_size)`. **`mad`** is the mean absolute deviation in **0–1** units (average
        `|p_obs − p_exp|` across bins); **`mad * 100`** is in percentage points. **`ssd`** is
        **Σ (100·(p_obs − p_exp))²** over bins (sum of squared percentage-point gaps), not a percent in 0–100.

        **Query `testMode`:** optional filter; must match DB enum: `first_digit`, `first_two_digits`, `second_digit`.

        **Query `columnName`:** optional exact match on stored column header (trimmed); omit for all columns.
      security:
        - bearerAuth: []
      parameters:
        - $ref: "#/components/parameters/RunId"
        - $ref: "#/components/parameters/BenfordResultsTestMode"
        - $ref: "#/components/parameters/BenfordResultsColumnName"
      responses:
        "200":
          description: Benford result rows for the run
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/BenfordResultsResponse"
        "400":
          $ref: "#/components/responses/ErrorResponse"
        "401":
          $ref: "#/components/responses/ErrorResponse"
        "404":
          $ref: "#/components/responses/ErrorResponse"
        # Covers both "not finished yet" and "finished unsuccessfully"; the error
        # code in the body distinguishes the two cases.
        "409":
          description: "Run exists but is not `completed` (`queued`, `running`, or `failed`); no partial results are returned"
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ApiError"
        "429":
          $ref: "#/components/responses/ErrorResponse"
        "500":
          $ref: "#/components/responses/ErrorResponse"
  # Asynchronous export: 202 while a job is queued/running, 200 when a finished
  # artifact can be served immediately. Both statuses share one response schema.
  /api/v1/exports/runs/{runId}:
    post:
      tags:
        - Exports
      summary: Create async export job
      description: |
        Enqueues CSV or PDF generation to workspace storage. Returns `202` with `exportJobId` while the job is
        queued or running. Returns `200` with a time-limited `downloadUrl` when a completed file exists and
        retention has not expired. Poll `GET /api/v1/export-jobs/{jobId}` until `status` is `completed`.
      security:
        - bearerAuth: []
      parameters:
        - $ref: "#/components/parameters/RunId"
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required:
                - format
              properties:
                format:
                  type: string
                  enum:
                    - csv
                    - pdf
      responses:
        "200":
          description: Export already available
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ExportJobAccepted"
        "202":
          description: Job accepted
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ExportJobAccepted"
        "400":
          $ref: "#/components/responses/ErrorResponse"
        "401":
          $ref: "#/components/responses/ErrorResponse"
        "404":
          $ref: "#/components/responses/ErrorResponse"
        "429":
          $ref: "#/components/responses/ErrorResponse"
        "500":
          $ref: "#/components/responses/ErrorResponse"
  # Polling endpoint for jobs created by POST /api/v1/exports/runs/{runId}.
  /api/v1/export-jobs/{jobId}:
    get:
      tags:
        - Exports
      summary: Poll export job status
      description: |
        When `status` is `completed`, includes a signed `downloadUrl` (about one hour). Returns `410` if the
        artifact expired under the workspace retention window (30 days Free / 90 days Pro for generated exports).
      security:
        - bearerAuth: []
      parameters:
        - $ref: "#/components/parameters/ExportJobId"
      responses:
        "200":
          description: Job status (and download URL when completed)
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ExportJobStatus"
        "401":
          $ref: "#/components/responses/ErrorResponse"
        "404":
          $ref: "#/components/responses/ErrorResponse"
        # Artifact existed but has passed its retention window.
        "410":
          $ref: "#/components/responses/ErrorResponse"
        "429":
          $ref: "#/components/responses/ErrorResponse"
        "500":
          $ref: "#/components/responses/ErrorResponse"
  # Synchronous PDF download, kept for existing clients; the async POST export
  # is the recommended route.
  /api/v1/exports/runs/{runId}/pdf:
    get:
      tags:
        - Exports
      summary: Download PDF export (synchronous)
      description: |
        Only `completed` runs. Branding follows workspace plan (Free vs Pro).
        **Backward compatible:** this synchronous route remains supported for existing clients.
        Prefer `POST /api/v1/exports/runs/{runId}` with `{"format":"pdf"}` for async generation and retention-aware storage.
      security:
        - bearerAuth: []
      parameters:
        - $ref: "#/components/parameters/RunId"
      responses:
        # Raw document bytes rather than a JSON envelope.
        "200":
          description: PDF bytes
          content:
            application/pdf:
              schema:
                type: string
                format: binary
        "400":
          $ref: "#/components/responses/ErrorResponse"
        "401":
          $ref: "#/components/responses/ErrorResponse"
        "404":
          $ref: "#/components/responses/ErrorResponse"
        "429":
          $ref: "#/components/responses/ErrorResponse"
  # Synchronous CSV download, kept for existing clients; the async POST export
  # is the recommended route.
  /api/v1/exports/runs/{runId}/csv:
    get:
      tags:
        - Exports
      summary: Download CSV export (synchronous)
      description: |
        **Backward compatible:** synchronous download remains supported for existing clients.
        Prefer `POST /api/v1/exports/runs/{runId}` with `{"format":"csv"}` for async generation and retention-aware storage.
      security:
        - bearerAuth: []
      parameters:
        - $ref: "#/components/parameters/RunId"
      responses:
        # Plain CSV text rather than a JSON envelope.
        "200":
          description: CSV text
          content:
            text/csv:
              schema:
                type: string
        "400":
          $ref: "#/components/responses/ErrorResponse"
        "401":
          $ref: "#/components/responses/ErrorResponse"
        "404":
          $ref: "#/components/responses/ErrorResponse"
        "429":
          $ref: "#/components/responses/ErrorResponse"
components:
  # Single auth scheme referenced by every operation's `security` entry:
  # an HTTP bearer token carrying the workspace API key.
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
      # `bearerFormat` is a documentation hint only.
      bearerFormat: API Key
      description: Workspace API key from Settings → API access (Pro).
  # Reusable parameters. The three path identifiers are UUID strings; the two
  # query parameters are optional filters for the benford-results operation.
  parameters:
    DatasetId:
      in: path
      name: datasetId
      required: true
      schema:
        type: string
        format: uuid
    RunId:
      in: path
      name: runId
      required: true
      schema:
        type: string
        format: uuid
    # Component key differs from the wire name: the path segment is `jobId`.
    ExportJobId:
      in: path
      name: jobId
      required: true
      schema:
        type: string
        format: uuid
    BenfordResultsTestMode:
      in: query
      name: testMode
      required: false
      description: Filter to a single Benford test mode (matches `benford_results.test_mode`).
      schema:
        type: string
        enum:
          - first_digit
          - first_two_digits
          - second_digit
    BenfordResultsColumnName:
      in: query
      name: columnName
      required: false
      description: Exact match (after trim) on `column_name`; omit to return all columns for the run.
      schema:
        type: string
  schemas:
    # Body shared by the 200 and 202 responses of POST /api/v1/exports/runs/{runId}.
    ExportJobAccepted:
      type: object
      properties:
        exportJobId:
          type: string
          format: uuid
        # `failed` is not listed here; it appears only on ExportJobStatus.
        status:
          type: string
          enum:
            - queued
            - running
            - completed
        format:
          type: string
          enum:
            - csv
            - pdf
        downloadUrl:
          type: string
          description: Present when status is completed (immediate hit).
        downloadUrlExpiresAt:
          type: string
          format: date-time
        retentionUntil:
          type: string
          format: date-time
        processingMode:
          type: string
          enum:
            - edge
            - inline
          description: |
            `edge` when EDGE_WORKER_SECRET is set (POST nudges Supabase export-worker). `inline` when unset (Next.js processes the job in-process).
    # Body of the 202 response from POST /api/v1/uploads.
    UploadQueued:
      type: object
      properties:
        datasetId:
          type: string
          format: uuid
        ingestionJobId:
          type: string
          format: uuid
        # Single-value enums: this body is only produced on the queued Edge path.
        status:
          type: string
          enum:
            - queued
        processingMode:
          type: string
          enum:
            - edge
        pollUrl:
          type: string
          description: Relative URL for polling (prepend your API host).
    # Body of GET /api/v1/datasets/{datasetId}/ingestion. The profile fields are
    # nullable; per the operation description they reflect the profile once
    # `status` is `ready`.
    DatasetIngestionStatus:
      type: object
      properties:
        datasetId:
          type: string
          format: uuid
        # NOTE(review): the allowed values are listed only in prose; sibling schemas
        # use `enum` — confirm the list is exhaustive before adding one here.
        status:
          type: string
          description: Dataset row status — `processing`, `ready`, or `failed`.
        rowCount:
          type: integer
          nullable: true
        columnCount:
          type: integer
          nullable: true
        recommendedColumns:
          type: integer
          nullable: true
        # NOTE(review): no `type` is declared, so the value shape is unconstrained —
        # confirm the intended type of referenceColumn.
        referenceColumn:
          nullable: true
        errorMessage:
          type: string
          nullable: true
        ingestionJobStatus:
          type: string
          nullable: true
          description: Row in `ingestion_jobs` when Edge path was used (`queued`, `running`, `awaiting_numeric`, `completed`, `failed`).
    # Body of GET /api/v1/export-jobs/{jobId}.
    ExportJobStatus:
      type: object
      properties:
        exportJobId:
          type: string
          format: uuid
        runId:
          type: string
          format: uuid
        format:
          type: string
          enum:
            - csv
            - pdf
        status:
          type: string
          enum:
            - queued
            - running
            - completed
            - failed
        # Per the operation description, the signed URL is included once
        # `status` is `completed`.
        downloadUrl:
          type: string
        downloadUrlExpiresAt:
          type: string
          format: date-time
        errorMessage:
          type: string
          nullable: true
        retentionUntil:
          type: string
          format: date-time
        createdAt:
          type: string
          format: date-time
        completedAt:
          type: string
          format: date-time
          nullable: true
    # Envelope for the benford-results operation: run/dataset identifiers plus
    # one row per stored result.
    BenfordResultsResponse:
      type: object
      required:
        - runId
        - datasetId
        - results
      properties:
        runId:
          type: string
          format: uuid
        datasetId:
          type: string
          format: uuid
        results:
          type: array
          items:
            $ref: "#/components/schemas/BenfordResultRow"
    # One Benford result row. Every property is required. The four distribution
    # maps are objects with numeric values, keyed per digit bin.
    BenfordResultRow:
      type: object
      required:
        - columnId
        - columnName
        - testMode
        - sampleSize
        - observedDistribution
        - expectedDistribution
        - confidenceLowerDistribution
        - confidenceUpperDistribution
        - mad
        - ssd
        - createdAt
      properties:
        columnId:
          type: integer
          format: int64
        columnName:
          type: string
        testMode:
          type: string
          enum:
            - first_digit
            - first_two_digits
            - second_digit
        sampleSize:
          type: integer
        observedDistribution:
          type: object
          description: Observed probability per digit bin, 0–1 scale.
          additionalProperties:
            type: number
        expectedDistribution:
          type: object
          description: Benford expected probability per bin, 0–1 scale.
          additionalProperties:
            type: number
        confidenceLowerDistribution:
          type: object
          description: Lower bound of the z-interval around expected, 0–1 scale.
          additionalProperties:
            type: number
        confidenceUpperDistribution:
          type: object
          description: Upper bound of the z-interval around expected, 0–1 scale.
          additionalProperties:
            type: number
        mad:
          type: number
          description: Mean absolute deviation in 0–1 units across bins.
        ssd:
          type: number
          description: Sum over bins of (100·(p_obs − p_exp))².
        createdAt:
          type: string
          format: date-time
    # Body of GET /api/v1/analysis-runs/{runId}.
    AnalysisRun:
      type: object
      properties:
        id:
          type: string
          format: uuid
        datasetId:
          type: string
          format: uuid
        status:
          type: string
          enum:
            - queued
            - running
            - completed
            - failed
        testsEnabled:
          type: array
          items:
            type: string
        selectedColumnNames:
          type: array
          items:
            type: string
        zValue:
          type: number
        # Free-form object: its keys are not specified by this document.
        resultSummary:
          type: object
          additionalProperties: true
        errorMessage:
          type: string
          nullable: true
        createdAt:
          type: string
          format: date-time
        startedAt:
          type: string
          format: date-time
          nullable: true
        completedAt:
          type: string
          format: date-time
          nullable: true
    # Error envelope referenced by every error response in this document.
    # Shape per the API description: `{ "error": { "code", "message", "details?" } }`
    # — `code` and `message` are always present, `details` is optional, so the
    # `required` lists below encode exactly that.
    ApiError:
      type: object
      required:
        - error
      properties:
        error:
          type: object
          required:
            - code
            - message
          properties:
            code:
              type: string
              description: Machine-readable error code (e.g. `limit_exceeded`, `analysis_run_not_completed`).
              example: plan_upgrade_required
            message:
              type: string
              description: Human-readable explanation of the error.
            details:
              type: object
              description: Optional additional context; may be omitted.
              additionalProperties: true
  # Shared error response: operations `$ref` this for their error status codes,
  # so the status code conveys the category and the ApiError body the details.
  responses:
    ErrorResponse:
      description: Error
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/ApiError"
