/**
 * Document ingestion, Phase 2: deterministic structural extraction that
 * turns a Markdown intermediate into RDF triples plus source-file linkage.
 *
 * `19_MARKDOWN_CONTENT_TYPE.md` calls this the "Layer 1 structural"
 * extraction. No LLM is involved; triples are derived solely from explicit
 * Markdown/YAML structure:
 *
 *   - YAML frontmatter keys              → subject properties
 *   - the `type` frontmatter key         → rdf:type
 *   - wikilinks `[[Target]]`             → schema:mentions
 *   - hashtags `#keyword`                → schema:keywords
 *   - Dataview `key:: value` fields      → properties
 *   - heading hierarchy                  → dkg:hasSection
 *
 * Ownership of the §10.1 data-graph linkage rows:
 *
 *   - Rows 1 and 3 (extractor). When `sourceFileIri` is supplied, this
 *     module emits row 1 (`dkg:sourceFile`, pointing from the document
 *     subject to the source-file IRI) and row 3 (`dkg:rootEntity`, pointing
 *     from the document subject to the resolved root entity). They are
 *     returned in `sourceFileLinkage` so the daemon can keep them apart from
 *     content triples until it merges everything into the assertion graph.
 *     The field was called `provenance` until Round 13 Bug 39 renamed it to
 *     remove the clash with its original extraction-run-metadata meaning.
 *
 *   - Row 2 (daemon). The `dkg:sourceContentType` row belongs to the daemon
 *     (Round 9 Bug 1 / Round 9 Bug 27 rulings): only the daemon knows the
 *     original upload content type that row 2 must describe. The daemon
 *     writes row 2 together with the extractor's rows 1 and 3 in the same
 *     atomic insert.
 *
 *   - Rows 4-13 (daemon). The file descriptor block and the
 *     ExtractionProvenance resource (§3.2/§10.2) are daemon-owned as well,
 *     because the daemon naturally has the UAL, the fresh provenance URI,
 *     the agent DID, and the `_meta` writes. This module deliberately stays
 *     free of `_meta` / extraction-run concerns.
 *
 * Spec: 05_PROTOCOL_EXTENSIONS.md §6.3 / §6.5, 19_MARKDOWN_CONTENT_TYPE.md §10
 */
import type { ExtractionQuad as Quad } from '@origintrail-official/dkg-core';

/** Arguments accepted by {@link extractFromMarkdown}. */
export interface MarkdownExtractInput {
    /** The Markdown text to extract from (the Phase 1 `mdIntermediate`). */
    markdown: string;

    /** DID of the extracting agent, recorded in provenance. */
    agentDid: string;

    /**
     * Ontology URI. Optional; Layer 1 does not use it yet — it is reserved
     * for Layer 2.
     */
    ontologyRef?: string;

    /**
     * Stable subject IRI for the document. Optional: when it is left out the
     * extractor derives the subject from the frontmatter `id` or from the
     * first H1 heading.
     */
    documentIri?: string;

    /**
     * IRI of the source blob the Markdown was extracted from. It begins with
     * `urn:dkg:file:keccak256:` (presumably followed by the blob's keccak256
     * digest). Supplying it makes the extractor emit the §10.1
     * `dkg:sourceFile` linkage quad (row 1), whose subject is the document
     * subject and whose object is this IRI.
     *
     * Background on the file descriptor block (rows 4-8): it is later
     * excluded from `assertionPromote`'s root-entity partition by a
     * subject-prefix filter on `urn:dkg:file:` in
     * `packages/publisher/src/dkg-publisher.ts`. That filter is what prevents
     * cross-assertion contention without resorting to blank-node subjects.
     * The normative rule is in `19_MARKDOWN_CONTENT_TYPE.md §10.2`; the
     * history is in spec-engineer's reconciled ruling on Codex Bug 8. Round 3
     * tried blank nodes. Round 4 went back to URI subjects plus a
     * promote-time filter, after an `skolemizeByEntity` audit showed that the
     * blank-node approach silently drops the ExtractionProvenance block —
     * a correctness smell.
     */
    sourceFileIri?: string;

    /**
     * Explicit override for the root-entity IRI. In V10.0 the root entity is
     * usually just the document subject IRI itself, i.e. the
     * `dkg:rootEntity` quad is reflexive. A string-valued `rootEntity` key in
     * the frontmatter wins over both this input and the subject default (see
     * §19.10.1:508). Whatever value is finally chosen is reported back on
     * `MarkdownExtractOutput.resolvedRootEntity`, which lets the daemon
     * perform the `_meta` row 14 write without resolving it a second time.
     */
    rootEntityIri?: string;
}

/** Result produced by {@link extractFromMarkdown}. */
export interface MarkdownExtractOutput {
    /** RDF triples extracted from, and describing, the document content. */
    triples: Quad[];

    /**
     * The §10.1 source-file linkage quads, all on the document subject:
     * row 1 (`dkg:sourceFile`) and row 3 (`dkg:rootEntity`). Row 2
     * (`dkg:sourceContentType`) is absent because the daemon owns it — the
     * daemon has the original upload content type and the extractor does
     * not. The array is empty when `sourceFileIri` was not supplied. Before
     * committing, the daemon merges these quads into the same data graph as
     * `triples`.
     *
     * Naming history (Round 13 Bug 39): this field used to be `provenance`.
     * At module introduction (`ff8afe3`) it meant
     * "`dkg:ExtractionProvenance` blank-identifier records for every
     * extracted triple" — extraction-run metadata such as agent, timestamp
     * and method. The PR #121 chain then reused the field for source-file
     * linkage triples, which clashed with that earlier meaning. Round 9
     * Bug 27 moved the extraction-run provenance rows (9-13, on the
     * ExtractionProvenance resource) into the daemon's route handler, so the
     * extractor no longer produces ANY extraction-run metadata, only
     * source-file linkage. The rename makes the contract honest: linkage
     * triples and nothing else.
     */
    sourceFileLinkage: Quad[];

    /** Subject IRI chosen for the document; handy for caller-side indexing. */
    subjectIri: string;

    /**
     * Root-entity IRI after applying the §19.10.1:508 precedence order:
     * frontmatter `rootEntity` key, then the explicit `rootEntityIri` input,
     * then a reflexive fallback to the document subject. The daemon uses
     * this as the object of the `_meta` row 14 quad, which keeps data-graph
     * row 3 and `_meta` row 14 in sync without the daemon repeating the
     * resolution logic.
     */
    resolvedRootEntity: string;
}

/**
 * Performs the complete Phase 2 structural extraction — deterministic, with
 * no LLM involved.
 *
 * @param input - The Markdown text plus optional IRIs that steer subject,
 *   source-file and root-entity resolution.
 * @returns `{ triples, sourceFileLinkage, subjectIri, resolvedRootEntity }`.
 *   Empty arrays are a valid result: a Markdown document with no
 *   frontmatter, no wikilinks, no tags, no dataview fields, and no headings
 *   produces zero triples.
 */
export declare function extractFromMarkdown(input: MarkdownExtractInput): MarkdownExtractOutput;
//# sourceMappingURL=markdown-extractor.d.ts.map