{"version":3,"file":"matryoshka_retriever.cjs","names":["VectorStoreRetriever"],"sources":["../../src/retrievers/matryoshka_retriever.ts"],"sourcesContent":["import { DocumentInterface } from \"@langchain/core/documents\";\nimport { Embeddings } from \"@langchain/core/embeddings\";\nimport {\n  cosineSimilarity,\n  euclideanDistance,\n  innerProduct,\n} from \"@langchain/core/utils/math\";\nimport {\n  VectorStore,\n  VectorStoreRetriever,\n  VectorStoreRetrieverInput,\n} from \"@langchain/core/vectorstores\";\n\n/**\n * Type for options when adding a document to the VectorStore.\n */\n// oxlint-disable-next-line @typescript-eslint/no-explicit-any\ntype AddDocumentOptions = Record<string, any>;\n\nexport interface MatryoshkaRetrieverFields {\n  /**\n   * The number of documents to retrieve from the small store.\n   * @default 50\n   */\n  smallK?: number;\n  /**\n   * The number of documents to retrieve from the large store.\n   * @default 8\n   */\n  largeK?: number;\n  /**\n   * The metadata key to store the larger embeddings.\n   * @default \"lc_large_embedding\"\n   */\n  largeEmbeddingKey?: string;\n  /**\n   * The embedding model to use when generating the large\n   * embeddings.\n   */\n  largeEmbeddingModel: Embeddings;\n  /**\n   * The type of search to perform using the large embeddings.\n   * @default \"cosine\"\n   */\n  searchType?: \"cosine\" | \"innerProduct\" | \"euclidean\";\n}\n\n/**\n * A retriever that uses two sets of embeddings to perform adaptive retrieval. Based\n * off of the \"Matryoshka embeddings: faster OpenAI vector search using Adaptive Retrieval\"\n * blog post {@link https://supabase.com/blog/matryoshka-embeddings}.\n *\n *\n * This class performs \"Adaptive Retrieval\" for searching text embeddings efficiently using the\n * Matryoshka Representation Learning (MRL) technique. It retrieves documents similar to a query\n * embedding in two steps:\n *\n * First-pass: Uses a lower dimensional sub-vector from the MRL embedding for an initial, fast,\n * but less accurate search.\n *\n * Second-pass: Re-ranks the top results from the first pass using the full, high-dimensional\n * embedding for higher accuracy.\n *\n *\n * This code implements MRL embeddings for efficient vector search by combining faster,\n * lower-dimensional initial search with accurate, high-dimensional re-ranking.\n */\nexport class MatryoshkaRetriever<\n  Store extends VectorStore = VectorStore,\n> extends VectorStoreRetriever<Store> {\n  smallK = 50;\n\n  largeK = 8;\n\n  largeEmbeddingKey = \"lc_large_embedding\";\n\n  largeEmbeddingModel: Embeddings;\n\n  searchType: \"cosine\" | \"innerProduct\" | \"euclidean\" = \"cosine\";\n\n  constructor(\n    fields: MatryoshkaRetrieverFields & VectorStoreRetrieverInput<Store>\n  ) {\n    super(fields);\n    this.smallK = fields.smallK ?? this.smallK;\n    this.largeK = fields.largeK ?? this.largeK;\n    this.largeEmbeddingKey = fields.largeEmbeddingKey ?? this.largeEmbeddingKey;\n    this.largeEmbeddingModel = fields.largeEmbeddingModel;\n    this.searchType = fields.searchType ?? this.searchType;\n  }\n\n  /**\n   * Ranks documents based on their similarity to a query embedding using larger embeddings.\n   *\n   * This method takes a query embedding and a list of documents (smallResults) as input. Each document\n   * in the smallResults array has previously been associated with a large embedding stored in its metadata.\n   * Depending on the `searchType` (cosine, innerProduct, or euclidean), it calculates the similarity scores\n   * between the query embedding and each document's large embedding. It then ranks the documents based on\n   * these similarity scores, from the most similar to the least similar.\n   *\n   * The method returns a promise that resolves to an array of the top `largeK` documents, where `largeK`\n   * is a class property defining the number of documents to return. This subset of documents is determined\n   * by sorting the entire list of documents based on their similarity scores and then selecting the top\n   * `largeK` documents.\n   *\n   * @param {number[]} embeddedQuery The embedding of the query, represented as an array of numbers.\n   * @param {DocumentInterface[]} smallResults An array of documents, each with metadata that includes a large embedding for similarity comparison.\n   * @returns {Promise<DocumentInterface[]>} A promise that resolves to an array of the top `largeK` ranked documents based on their similarity to the query embedding.\n   */\n  private _rankByLargeEmbeddings(\n    embeddedQuery: number[],\n    smallResults: DocumentInterface[]\n  ): DocumentInterface[] {\n    const largeEmbeddings: Array<number[]> = smallResults.map((doc) =>\n      JSON.parse(doc.metadata[this.largeEmbeddingKey])\n    );\n    let func: () => Array<number[]>;\n\n    switch (this.searchType) {\n      case \"cosine\":\n        func = () => cosineSimilarity([embeddedQuery], largeEmbeddings);\n        break;\n      case \"innerProduct\":\n        func = () => innerProduct([embeddedQuery], largeEmbeddings);\n        break;\n      case \"euclidean\":\n        func = () => euclideanDistance([embeddedQuery], largeEmbeddings);\n        break;\n      default:\n        throw new Error(`Unknown search type: ${this.searchType}`);\n    }\n\n    // Calculate the similarity scores between the query embedding and the large embeddings\n    const [similarityScores] = func();\n\n    // Create an array of indices from 0 to N-1, where N is the number of documents\n    let indices = Array.from(\n      { length: smallResults.length },\n      (_, index) => index\n    );\n\n    indices = indices\n      .map((v, i) => [similarityScores[i], v])\n      .sort(([a], [b]) => b - a)\n      .slice(0, this.largeK)\n      .map(([, i]) => i);\n\n    return indices.map((i) => smallResults[i]);\n  }\n\n  async _getRelevantDocuments(query: string): Promise<DocumentInterface[]> {\n    const [embeddedQuery, smallResults] = await Promise.all([\n      this.largeEmbeddingModel.embedQuery(query),\n      this.vectorStore.similaritySearch(query, this.smallK, this.filter),\n    ]);\n\n    return this._rankByLargeEmbeddings(embeddedQuery, smallResults);\n  }\n\n  /**\n   * Override the default `addDocuments` method to embed the documents twice,\n   * once using the larger embeddings model, and then again using the default\n   * embedding model linked to the vector store.\n   *\n   * @param {DocumentInterface[]} documents - An array of documents to add to the vector store.\n   * @param {AddDocumentOptions} options - An optional object containing additional options for adding documents.\n   * @returns {Promise<string[] | void>} A promise that resolves to an array of the document IDs that were added to the vector store.\n   */\n  override addDocuments = async (\n    documents: DocumentInterface[],\n    options?: AddDocumentOptions\n  ): Promise<string[] | void> => {\n    // Insure documents metadata does not contain the large embedding key\n    if (documents.some((doc) => this.largeEmbeddingKey in doc.metadata)) {\n      throw new Error(\n        `All documents must not contain the large embedding key: ${this.largeEmbeddingKey} in their metadata.`\n      );\n    }\n\n    const allDocPageContent = documents.map((doc) => doc.pageContent);\n    const allDocLargeEmbeddings =\n      await this.largeEmbeddingModel.embedDocuments(allDocPageContent);\n\n    const newDocuments: Array<DocumentInterface> = documents.map(\n      (doc, idx) => ({\n        ...doc,\n        metadata: {\n          ...doc.metadata,\n          [this.largeEmbeddingKey]: JSON.stringify(allDocLargeEmbeddings[idx]),\n        },\n      })\n    );\n\n    return this.vectorStore.addDocuments(newDocuments, options);\n  };\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAmEA,IAAa,sBAAb,cAEUA,6BAAAA,qBAA4B;CACpC,SAAS;CAET,SAAS;CAET,oBAAoB;CAEpB;CAEA,aAAsD;CAEtD,YACE,QACA;AACA,QAAM,OAAO;AACb,OAAK,SAAS,OAAO,UAAU,KAAK;AACpC,OAAK,SAAS,OAAO,UAAU,KAAK;AACpC,OAAK,oBAAoB,OAAO,qBAAqB,KAAK;AAC1D,OAAK,sBAAsB,OAAO;AAClC,OAAK,aAAa,OAAO,cAAc,KAAK;;;;;;;;;;;;;;;;;;;;CAqB9C,uBACE,eACA,cACqB;EACrB,MAAM,kBAAmC,aAAa,KAAK,QACzD,KAAK,MAAM,IAAI,SAAS,KAAK,mBAAmB,CACjD;EACD,IAAI;AAEJ,UAAQ,KAAK,YAAb;GACE,KAAK;AACH,kBAAA,GAAA,2BAAA,kBAA8B,CAAC,cAAc,EAAE,gBAAgB;AAC/D;GACF,KAAK;AACH,kBAAA,GAAA,2BAAA,cAA0B,CAAC,cAAc,EAAE,gBAAgB;AAC3D;GACF,KAAK;AACH,kBAAA,GAAA,2BAAA,mBAA+B,CAAC,cAAc,EAAE,gBAAgB;AAChE;GACF,QACE,OAAM,IAAI,MAAM,wBAAwB,KAAK,aAAa;;EAI9D,MAAM,CAAC,oBAAoB,MAAM;EAGjC,IAAI,UAAU,MAAM,KAClB,EAAE,QAAQ,aAAa,QAAQ,GAC9B,GAAG,UAAU,MACf;AAED,YAAU,QACP,KAAK,GAAG,MAAM,CAAC,iBAAiB,IAAI,EAAE,CAAC,CACvC,MAAM,CAAC,IAAI,CAAC,OAAO,IAAI,EAAE,CACzB,MAAM,GAAG,KAAK,OAAO,CACrB,KAAK,GAAG,OAAO,EAAE;AAEpB,SAAO,QAAQ,KAAK,MAAM,aAAa,GAAG;;CAG5C,MAAM,sBAAsB,OAA6C;EACvE,MAAM,CAAC,eAAe,gBAAgB,MAAM,QAAQ,IAAI,CACtD,KAAK,oBAAoB,WAAW,MAAM,EAC1C,KAAK,YAAY,iBAAiB,OAAO,KAAK,QAAQ,KAAK,OAAO,CACnE,CAAC;AAEF,SAAO,KAAK,uBAAuB,eAAe,aAAa;;;;;;;;;;;CAYjE,eAAwB,OACtB,WACA,YAC6B;AAE7B,MAAI,UAAU,MAAM,QAAQ,KAAK,qBAAqB,IAAI,SAAS,CACjE,OAAM,IAAI,MACR,2DAA2D,KAAK,kBAAkB,qBACnF;EAGH,MAAM,oBAAoB,UAAU,KAAK,QAAQ,IAAI,YAAY;EACjE,MAAM,wBACJ,MAAM,KAAK,oBAAoB,eAAe,kBAAkB;EAElE,MAAM,eAAyC,UAAU,KACtD,KAAK,SAAS;GACb,GAAG;GACH,UAAU;IACR,GAAG,IAAI;KACN,KAAK,oBAAoB,KAAK,UAAU,sBAAsB,KAAK;IACrE;GACF,EACF;AAED,SAAO,KAAK,YAAY,aAAa,cAAc,QAAQ"}