name: Maintainer Reply

# When an issue is in a pre-fix triage state (`triage/reproduced` or
# `triage/by-design`) and an authorized maintainer addresses `@emdashbot` with
# a freeform directive, classify the intent via a small Flue classifier and
# act: dispatch a directed investigate run to implement the chosen approach,
# flag it as by-design, disengage, or ask for clarification.
#
# This covers the gap reporter-reply.yml does not: there, a fix already exists
# on `bot/fix-<n>` and the question is "does it work?" (confirm / reject). Here
# the bot reproduced the issue but deferred the fix (e.g. diagnose returned
# `needs-design-decision` with options), and the maintainer is making that
# call. The two workflows gate on disjoint label states, so they never both
# fire on one comment.
#
# A produced fix routes through the normal awaiting-reporter loop -- this
# workflow only gets the issue from a pre-fix state (`reproduced` or
# `by-design`) to a fix attempt; reporter-reply.yml owns everything after.

on:
  issue_comment:
    types:
      - created

# Workflow-wide baseline for GITHUB_TOKEN: read-only repository contents and
# nothing else. Jobs declare their own (still narrow) permissions below.
permissions:
  contents: read

jobs:
  # Single job: gate on the event payload, re-verify against live state,
  # classify the maintainer's comment, then run exactly one handler step.
  classify-and-act:
    name: Classify directive and act
    # Coarse `if:` -- cheap, reliable payload-only filters, matching
    # reporter-reply.yml's philosophy:
    #  - the comment is on an issue (not a PR -- issue_comment fires for both)
    #  - the commenter is not a bot (excludes emdashbot's own comments, which
    #    would otherwise re-trigger the classifier in a loop)
    #  - the issue is in a pre-fix state this workflow acts on
    #
    # Authorization (a real write/triage role) and the `@emdashbot` wake word
    # are checked in live-check. `author_association` from the payload is
    # unreliable for the role check -- a maintainer with private org membership
    # reports `NONE` -- so it is not gated on here.
    if: >-
      github.event.issue.pull_request == null
      && github.event.comment.user.type != 'Bot'
      && (contains(github.event.issue.labels.*.name, 'triage/reproduced')
          || contains(github.event.issue.labels.*.name, 'triage/by-design'))
    runs-on: ubuntu-latest
    # Ceiling for the whole job; the classifier step additionally carries its
    # own 10-minute limit.
    timeout-minutes: 15
    # One group per issue, so two replies on the same issue never run side by
    # side, and a newer comment never cancels a run that is already in
    # progress.
    # NOTE(review): GitHub keeps at most one *pending* run per concurrency
    # group, so in a burst of comments an older pending run may be replaced by
    # a newer one -- confirm that dropping it is acceptable.
    concurrency:
      group: maintainer-reply-${{ github.event.issue.number }}
      cancel-in-progress: false
    # Scopes of this job's GITHUB_TOKEN (the token the classifier step passes
    # on as AGENT_GH_TOKEN). Read-only on purpose.
    permissions:
      # All writes (labels, comment, repository_dispatch) use the app token
      # below. No PRs are opened here, so no pull-requests scope is needed.
      contents: read
      issues: read
    steps:
      # Mint a short-lived GitHub App installation token, narrowed to the one
      # repository and the two scopes this job writes with. Later steps read
      # it as `steps.app-token.outputs.token`.
      - id: app-token
        name: Generate app token
        uses: actions/create-github-app-token@bcd2ba49218906704ab6c1aa796996da409d3eb1 # v3.2.0
        with:
          # Target installation.
          owner: emdash-cms
          repositories: emdash
          # App credentials.
          app-id: ${{ secrets.APP_ID }}
          private-key: ${{ secrets.APP_PRIVATE_KEY }}
          # contents:write is what the repository_dispatch call in the
          # implement handler relies on; issues:write covers labels and
          # comments.
          permission-contents: write
          permission-issues: write

      # Re-verify live state before any expensive work. Three checks:
      #
      #   1. The issue is still in a pre-fix state this workflow acts on
      #      (`triage/reproduced` or `triage/by-design`). The job `if:` uses
      #      the dispatch-time label snapshot; a label may have moved since.
      #      Concurrency only serialises replies, it does not re-read state.
      #      Label names are compared exactly (whole name), the same way the
      #      job-level `contains()` treats the label array, so a label that
      #      merely embeds one of these names does not count.
      #
      #   2. The commenter is authorized: a real admin/write/triage role on the
      #      repo, checked against the permission API rather than the payload's
      #      `author_association`, which under-reports (a maintainer with
      #      private org membership shows up as `NONE`).
      #
      #   3. The comment opts in with an `@emdashbot` directive at the START of
      #      a line (leading whitespace only) so a directive quoted from
      #      another comment (`> @emdashbot ...`) does not count. The mention
      #      must end where a GitHub login would end, so `@emdashbot-foo` (a
      #      different account) is not a wake word. Without the wake word,
      #      ordinary maintainer chatter on a triage thread would kick off an
      #      expensive classify+investigate on every comment.
      #
      # Outputs:
      #   stale -- `true` when any check fails (every later step is skipped),
      #            `false` otherwise.
      #   state -- `reproduced` or `by-design`; only set when stale is `false`.
      - name: Re-verify live state
        id: live-check
        env:
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          COMMENTER: ${{ github.event.comment.user.login }}
          REPLY_BODY: ${{ github.event.comment.body }}
        run: |
          set -euo pipefail

          # Capture which pre-fix state the issue is in -- handlers word their
          # comments and label flips differently for reproduced vs by-design.
          #
          # Fetch one label name per line so each name can be matched as a
          # whole (`grep -x`) and literally (`-F`). A substring match would
          # also accept labels that merely contain one of the state names.
          LABEL_LINES="$(gh api "/repos/emdash-cms/emdash/issues/${ISSUE_NUMBER}" --jq '.labels[].name')"
          # Comma-joined copy, used only for the log message below.
          LABELS="${LABEL_LINES//$'\n'/,}"
          if grep -qxF 'triage/reproduced' <<<"$LABEL_LINES"; then
            STATE="reproduced"
          elif grep -qxF 'triage/by-design' <<<"$LABEL_LINES"; then
            STATE="by-design"
          else
            echo "::notice::issue #${ISSUE_NUMBER} is no longer in a pre-fix state (live labels: ${LABELS}); skipping stale reply event"
            echo "stale=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # ---- Authorization: real write-or-triage role on the repo ----
          #
          # Gate on BOTH fields the endpoint returns:
          #   * `permission` -- the legacy BASE role (admin/write/read/none),
          #     with maintain mapped to write and triage mapped to read. Custom
          #     org roles collapse to their base here, so a write-equivalent
          #     custom role is caught by `write`.
          #   * `role_name` -- needed only to recognise `triage` specifically
          #     (it maps down to `read` in `permission`).
          # A 404 (no access) leaves both empty, and so does any other API or
          # parse failure: the check fails closed. The read is authorized by
          # the token's contents:write (push-equivalent) scope.
          PERM_JSON="$(gh api "/repos/emdash-cms/emdash/collaborators/${COMMENTER}/permission" 2>/dev/null || true)"
          PERM="$(jq -r '.permission // ""' <<<"$PERM_JSON" 2>/dev/null || true)"
          ROLE="$(jq -r '.role_name // ""' <<<"$PERM_JSON" 2>/dev/null || true)"
          if [[ "$PERM" != "admin" && "$PERM" != "write" && "$ROLE" != "triage" ]]; then
            echo "::notice::commenter ${COMMENTER} has permission '${PERM:-none}' / role '${ROLE:-none}' on emdash (need write or triage); ignoring"
            echo "stale=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # ---- Wake word: an `@emdashbot` directive starting a line ----
          #
          # After the name, require end-of-line or a character that cannot be
          # part of a login (anything but alphanumerics, `_` and `-`). A plain
          # `\b` would also accept `@emdashbot-something`, because `-` is a
          # word boundary for grep but a legal character in a GitHub login.
          if ! grep -iqE '^[[:space:]]*@emdashbot([^[:alnum:]_-]|$)' <<<"$REPLY_BODY"; then
            echo "::notice::maintainer ${COMMENTER} commented without an '@emdashbot' directive; taking no action"
            echo "stale=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          echo "state=${STATE}" >> "$GITHUB_OUTPUT"
          echo "stale=false" >> "$GITHUB_OUTPUT"

      # ----- Toolchain: everything the Flue classifier needs to run -----
      #
      # Every step below is skipped when live-check marked the event stale, so
      # an ignored comment costs no checkout, install or build time.

      # Shallow checkout; the workflow token is not left behind in the git
      # config of the working tree.
      - name: Checkout
        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        if: steps.live-check.outputs.stale != 'true'
        with:
          persist-credentials: false
          fetch-depth: 1

      - name: Setup pnpm
        uses: pnpm/action-setup@0e279bb959325dab635dd2c09392533439d90093 # v6.0.8
        if: steps.live-check.outputs.stale != 'true'

      # Node version comes from package.json; the pnpm store is cached.
      - name: Setup Node.js
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        if: steps.live-check.outputs.stale != 'true'
        with:
          cache: pnpm
          node-version-file: package.json

      # Two separate installs, both pinned to their lockfile: the repository
      # root first, then the `.flue` directory.
      - name: Install root dependencies
        run: pnpm install --frozen-lockfile
        if: steps.live-check.outputs.stale != 'true'

      - name: Install Flue agent dependencies
        run: pnpm install --frozen-lockfile
        working-directory: .flue
        if: steps.live-check.outputs.stale != 'true'

      - name: Build packages
        run: pnpm build
        if: steps.live-check.outputs.stale != 'true'

      # Assemble the JSON handed to the classifier: the maintainer's comment,
      # the issue number, the bot's most recent comment on the issue, and the
      # repository coordinates. Result: /tmp/classify-payload.json.
      - name: Build classifier payload
        if: steps.live-check.outputs.stale != 'true'
        env:
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          REPLY_BODY: ${{ github.event.comment.body }}
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
        run: |
          set -euo pipefail

          BOT_CONTEXT_FILE=/tmp/bot-context.txt
          PAYLOAD_FILE=/tmp/classify-payload.json
          COMMENTS_ENDPOINT="/repos/emdash-cms/emdash/issues/${ISSUE_NUMBER}/comments"

          # Walk every page of comments and keep the body of the last one
          # authored by emdashbot[bot] (empty when there is none). That comment
          # is treated as the bot's investigation, which lets the classifier
          # resolve references such as "option A" or "the second one".
          # The text is bot-authored and only ever shown to the model, so it is
          # semi-trusted: it travels through a file and --rawfile, never
          # through an environment variable.
          # NOTE(review): the handler steps in this workflow also comment with
          # the app token, so the "latest bot comment" could be one of those
          # replies rather than the investigation -- confirm.
          gh api "$COMMENTS_ENDPOINT" --paginate --slurp \
            | jq -r '[ .[] | .[] | select(.user.login == "emdashbot[bot]") | .body ] | last // ""' \
            > "$BOT_CONTEXT_FILE"

          # jq does all the escaping; the issue number is emitted as a JSON
          # number (--argjson), the two texts as JSON strings.
          jq -nc \
            --rawfile c "$BOT_CONTEXT_FILE" \
            --arg b "$REPLY_BODY" \
            --argjson n "$ISSUE_NUMBER" \
            '{replyBody: $b, issueNumber: $n, botContext: $c, owner: "emdash-cms", repo: "emdash"}' \
            > "$PAYLOAD_FILE"

      # Run the Flue `classify-maintainer-reply` workflow on the payload and
      # reduce its result to:
      #   * step output `intent` -- one of implement / close / takeover /
      #     unclear (anything else, or any failure, becomes `unclear`);
      #   * /tmp/directive.txt and /tmp/classify-reasoning.txt -- the model's
      #     free text, left empty when the run did not produce a result.
      - name: Run classifier
        id: classify
        if: steps.live-check.outputs.stale != 'true'
        timeout-minutes: 10
        env:
          # File the Flue workflow writes its result to. It is read directly
          # because `flue run`'s stdout interleaves build-log lines and
          # pretty-prints the result; scraping it defeats parsing and silently
          # defaults to `unclear`. investigate.yml uses the same handoff
          # through INVESTIGATE_RESULT_PATH.
          CLASSIFY_RESULT_PATH: /tmp/classify-result.json
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          AGENT_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          ORCHESTRATOR_GH_TOKEN: ${{ steps.app-token.outputs.token }}
          CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CF_AI_GATEWAY_ACCOUNT_ID }}
          CLOUDFLARE_GATEWAY_ID: ${{ secrets.CF_AI_GATEWAY_NAME }}
          CLOUDFLARE_API_KEY: ${{ secrets.CF_AI_GATEWAY_TOKEN }}
        run: |
          set -o pipefail

          RESULT_FILE="${CLASSIFY_RESULT_PATH:?CLASSIFY_RESULT_PATH not set}"
          CLASSIFY_INPUT="$(cat /tmp/classify-payload.json)"
          # Drop any earlier result so a leftover file cannot be mistaken for
          # this run's output.
          rm -f "$RESULT_FILE"

          # The binary is called directly rather than via `pnpm --dir`; the
          # reason is documented on investigate.yml's "Run Flue investigate
          # agent" step. Running it as an `if` condition captures the exit
          # code without letting a failure abort the script.
          if .flue/node_modules/.bin/flue run classify-maintainer-reply \
            --target node \
            --root .flue \
            --payload "$CLASSIFY_INPUT" \
            > /tmp/classify-stdout.json 2> /tmp/classify-stderr.log
          then
            RC=0
          else
            RC=$?
          fi
          # From here on any unexpected failure aborts the step.
          set -e

          # Start both text files empty so later steps always find them.
          printf '' > /tmp/directive.txt
          printf '' > /tmp/classify-reasoning.txt

          # A finished run leaves exactly one JSON object in the result file.
          # Non-zero exit, missing/empty file, or anything that is not an
          # object means the run did not finish.
          RUN_OK=true
          if (( RC != 0 )); then
            RUN_OK=false
          elif [[ ! -s "$RESULT_FILE" ]]; then
            RUN_OK=false
          elif ! jq -e 'type == "object"' "$RESULT_FILE" >/dev/null 2>&1; then
            RUN_OK=false
          fi

          if [[ "$RUN_OK" != "true" ]]; then
            # `unclear` only re-asks the maintainer; it never acts.
            echo "::warning::classifier exit=${RC} or no result file; defaulting to unclear"
            tail -n 50 /tmp/classify-stderr.log || true
            echo "intent=unclear" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Whitelist the intent: the handler gates compare against a fixed
          # set, so an unexpected model value is folded into `unclear`.
          CANDIDATE="$(jq -r '.intent // "unclear"' "$RESULT_FILE" | tr -d '\r\n')"
          INTENT="unclear"
          for KNOWN in implement close takeover unclear; do
            if [[ "$CANDIDATE" == "$KNOWN" ]]; then
              INTENT="$KNOWN"
            fi
          done

          # Directive and reasoning are model output shaped by the
          # maintainer's comment, so they go to files and never to
          # $GITHUB_OUTPUT: a heredoc with a fixed delimiter would be a
          # step-output injection vector if either text contained that
          # delimiter on a line of its own. The directive is JSON-escaped
          # into the dispatch payload later; it is never interpolated into a
          # command or used to build an identifier.
          jq -r '.directive // ""' "$RESULT_FILE" > /tmp/directive.txt
          jq -r '.reasoning // ""' "$RESULT_FILE" > /tmp/classify-reasoning.txt

          echo "intent=${INTENT}" >> "$GITHUB_OUTPUT"

      # Turn the classifier's intent into the `action` output every handler
      # gates on. `close` and `takeover` pass straight through. `implement`
      # passes only when the classifier actually extracted a directive with
      # some non-whitespace content: an empty directive (the maintainer said
      # "go ahead" without naming an approach) cannot override the fix gate
      # and would just reproduce again, so it is routed to `unclear`, which
      # asks for specifics. Everything else is `unclear` as well.
      - name: Resolve action
        id: resolve
        if: steps.live-check.outputs.stale != 'true'
        env:
          INTENT: ${{ steps.classify.outputs.intent }}
        run: |
          set -euo pipefail

          # Default; only the recognised cases below replace it.
          ACTION="unclear"
          if [[ "$INTENT" == "close" || "$INTENT" == "takeover" ]]; then
            ACTION="$INTENT"
          elif [[ "$INTENT" == "implement" ]]; then
            if [[ -s /tmp/directive.txt ]] && grep -q '[^[:space:]]' /tmp/directive.txt; then
              ACTION="implement"
            fi
          fi

          echo "action=${ACTION}" >> "$GITHUB_OUTPUT"

      # ----- Implement: dispatch a directed investigate run -----

      - name: Handle implement
        if: steps.live-check.outputs.stale != 'true' && steps.resolve.outputs.action == 'implement'
        env:
          REPO_FULL: ${{ github.repository }}
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          COMMENTER: ${{ github.event.comment.user.login }}
          STATE: ${{ steps.live-check.outputs.state }}
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
        run: |
          set -euo pipefail

          # Step 1 -- dispatch. A `maintainer-directive` repository_dispatch is
          # used instead of `gh workflow run`: repository_dispatch needs only
          # contents:write, which the app token has, while workflow_dispatch
          # needs actions:write, which the emdashbot App is not granted.
          # investigate.yml picks issueNumber / directive out of
          # client_payload. The directive enters through --rawfile, so jq
          # JSON-escapes it and it never becomes part of the command line.
          #
          # Dispatch comes BEFORE the label flip so that a failed dispatch
          # leaves the label untouched: the maintainer can simply reply again,
          # and the issue never sits in a reproducing state with nothing
          # running. Using the pipeline as an `if` condition records its status
          # (pipefail is on) without aborting the script.
          if jq -nc \
            --rawfile d /tmp/directive.txt \
            --arg n "$ISSUE_NUMBER" \
            '{event_type: "maintainer-directive", client_payload: {issueNumber: $n, directive: $d}}' \
            | gh api --method POST "/repos/${REPO_FULL}/dispatches" --input -
          then
            DISPATCH_EXIT=0
          else
            DISPATCH_EXIT=$?
          fi

          if (( DISPATCH_EXIT != 0 )); then
            echo "::warning::repository_dispatch failed (exit ${DISPATCH_EXIT}); leaving label on triage/${STATE}"
            printf '%s\n' \
              "@${COMMENTER} I tried to start the implementation but the dispatch failed. Reply again to retry, or pick it up by hand." \
              > /tmp/comment.md
            gh issue comment "$ISSUE_NUMBER" --repo emdash-cms/emdash --body-file /tmp/comment.md
            exit 0
          fi

          # Step 2 -- claim the issue. Swap the current pre-fix label for
          # triage/reproducing so the in-flight investigation owns the issue
          # and a second directive arriving in that window is turned into a
          # no-op by the live-state check. investigate.yml re-asserts
          # reproducing idempotently at its own transition step, so a flip
          # that never lands here is only a warning. Up to three attempts,
          # pausing 2s, 4s, 6s after each failed one.
          FLIP_OK=false
          ATTEMPT=1
          while (( ATTEMPT <= 3 )); do
            if gh issue edit "$ISSUE_NUMBER" --repo emdash-cms/emdash \
              --remove-label "triage/${STATE}" --add-label "triage/reproducing"; then
              FLIP_OK=true
              break
            fi
            echo "::warning::label flip attempt ${ATTEMPT} failed, retrying"
            sleep $((ATTEMPT * 2))
            ATTEMPT=$((ATTEMPT + 1))
          done
          if [[ "$FLIP_OK" != "true" ]]; then
            echo "::warning::label flip failed 3 times; relying on investigate.yml's transition step"
          fi

          # Step 3 -- acknowledge. Best effort: the run is already dispatched,
          # so a failed comment must not fail the step.
          printf '%s\n' \
            "On it, @${COMMENTER} — implementing your directive and re-running the investigation. I'll push a candidate fix and ask for confirmation when it's ready." \
            > /tmp/comment.md
          gh issue comment "$ISSUE_NUMBER" --repo emdash-cms/emdash --body-file /tmp/comment.md || true

      # ----- Close: flag as by-design; the bot never closes the issue itself -----

      - name: Handle close
        if: steps.live-check.outputs.stale != 'true' && steps.resolve.outputs.action == 'close'
        env:
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          COMMENTER: ${{ github.event.comment.user.login }}
          STATE: ${{ steps.live-check.outputs.state }}
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
        run: |
          set -euo pipefail

          # Move the issue to triage/by-design unless it already carries that
          # label.
          [[ "$STATE" == "by-design" ]] || gh issue edit "$ISSUE_NUMBER" --repo emdash-cms/emdash \
            --remove-label "triage/${STATE}" --add-label "triage/by-design"

          # Assemble the comment piece by piece.
          COMMENT_FILE=/tmp/comment.md
          echo "Flagged as by-design per @${COMMENTER}." > "$COMMENT_FILE"

          # The classifier's reasoning is multi-line model output: it is read
          # from its file and every line gets its own `> ` prefix, so the
          # whole text renders as one block quote. Skipped when the file holds
          # nothing but whitespace.
          if grep -q '[^[:space:]]' /tmp/classify-reasoning.txt; then
            {
              echo
              sed 's/^/> /' /tmp/classify-reasoning.txt
            } >> "$COMMENT_FILE"
          fi

          printf '\n%s\n' \
            "I don't close issues automatically — close it whenever you're ready." \
            >> "$COMMENT_FILE"

          gh issue comment "$ISSUE_NUMBER" --repo emdash-cms/emdash --body-file "$COMMENT_FILE"

      # ----- Takeover: disengage so the bot stops acting on this issue -----

      - name: Handle takeover
        if: steps.live-check.outputs.stale != 'true' && steps.resolve.outputs.action == 'takeover'
        env:
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          COMMENTER: ${{ github.event.comment.user.login }}
          STATE: ${{ steps.live-check.outputs.state }}
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
        run: |
          set -euo pipefail

          # Removing the pre-fix state label is what disengages the bot: the
          # job-level `if:` requires reproduced/by-design, so without the
          # label this workflow stops firing on the issue.
          STATE_LABEL="triage/${STATE}"
          gh issue edit "$ISSUE_NUMBER" --repo emdash-cms/emdash --remove-label "$STATE_LABEL"

          # Tell the maintainer the hand-over happened and how to undo it.
          printf '%s\n' \
            "Disengaging — over to you, @${COMMENTER}. Re-apply \`bot:repro\` if you want the bot back on it." \
            > /tmp/comment.md
          gh issue comment "$ISSUE_NUMBER" --repo emdash-cms/emdash --body-file /tmp/comment.md

      # ----- Unclear: ask for a concrete directive, no state change -----

      - name: Handle unclear
        if: steps.live-check.outputs.stale != 'true' && steps.resolve.outputs.action == 'unclear'
        env:
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          COMMENTER: ${{ github.event.comment.user.login }}
          GH_TOKEN: ${{ steps.app-token.outputs.token }}
        run: |
          set -euo pipefail

          # Reply with the three directives the bot understands. Labels are
          # left alone, so the maintainer can simply answer again.
          # One printf argument per output line; the empty argument yields the
          # blank line that separates the intro from the list.
          printf '%s\n' \
            "@${COMMENTER} I couldn't tell what you'd like me to do. You can:" \
            "" \
            '- **Implement a fix** — `@emdashbot implement <which approach>` (name the option or the change you want).' \
            '- **Flag as by-design** — `@emdashbot this is by design`.' \
            "- **Take it over** — \`@emdashbot I'll handle this\`." \
            > /tmp/comment.md

          gh issue comment "$ISSUE_NUMBER" --repo emdash-cms/emdash --body-file /tmp/comment.md
