Skip to content

Commit

Permalink
Tweak product doc generation for 8.17 (#205189)
Browse files Browse the repository at this point in the history
## Summary

- use the default ELSER inference endpoint (`.elser-2-elasticsearch`) instead of installing a dedicated one
- adapt document cleaning for the new markdown format
  • Loading branch information
pgayvallet authored Jan 8, 2025
1 parent 9078287 commit 81a5aa9
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 82 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import {
createTargetIndex,
extractDocumentation,
indexDocuments,
installElser,
createChunkFiles,
createArtifact,
cleanupFolders,
Expand Down Expand Up @@ -68,9 +67,6 @@ export const buildArtifacts = async (config: TaskConfig) => {

await cleanupFolders({ folders: [config.buildFolder] });

log.info('Ensuring ELSER is installed on the embedding cluster');
await installElser({ client: embeddingClient });

for (const productName of config.productNames) {
await buildArtifact({
productName,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
import type { Client } from '@elastic/elasticsearch';
import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';

const DEFAULT_ELSER = '.elser-2-elasticsearch';

const mappings: MappingTypeMapping = {
dynamic: 'strict',
properties: {
content_title: { type: 'text' },
content_body: {
type: 'semantic_text',
inference_id: 'kibana-elser2',
inference_id: DEFAULT_ELSER,
},
product_name: { type: 'keyword' },
root_type: { type: 'keyword' },
Expand All @@ -24,11 +26,11 @@ const mappings: MappingTypeMapping = {
ai_subtitle: { type: 'text' },
ai_summary: {
type: 'semantic_text',
inference_id: 'kibana-elser2',
inference_id: DEFAULT_ELSER,
},
ai_questions_answered: {
type: 'semantic_text',
inference_id: 'kibana-elser2',
inference_id: DEFAULT_ELSER,
},
ai_tags: { type: 'keyword' },
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
export { extractDocumentation } from './extract_documentation';
export { indexDocuments } from './index_documents';
export { createTargetIndex } from './create_index';
export { installElser } from './install_elser';
export { createChunkFiles } from './create_chunk_files';
export { checkConnectivity } from './check_connectivity';
export { createArtifact } from './create_artifact';
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,15 @@ const removeDuplicates = (documents: ExtractedDocument[]): ExtractedDocument[] =
return uniqBy(documents, (doc) => doc.slug);
};

const EMPTY_DOC_TOKEN_LIMIT = 120;

/**
 * Filters out near-empty documents ("this content has moved" or deleted-page
 * placeholders), using their token count as the only heuristic.
 */
const filterEmptyDocs = (documents: ExtractedDocument[]): ExtractedDocument[] => {
return documents.filter((doc) => {
const tokenCount = encode(doc.content_body).length;
if (tokenCount < 100) {
if (tokenCount < EMPTY_DOC_TOKEN_LIMIT) {
return false;
}
return true;
Expand All @@ -52,8 +54,14 @@ const processDocument = (document: ExtractedDocument) => {
.replaceAll(/([a-zA-Z])edit\n/g, (match) => {
return `${match[0]}\n`;
})
// remove edit links
.replaceAll(/\[\s*edit\s*\]\(\s*[^)]+\s*\)/g, '')
// remove empty links
.replaceAll('[]()', '')
// limit to 2 consecutive carriage return
.replaceAll(/\n\n+/g, '\n\n');

document.content_title = document.content_title.split('|')[0].trim();

return document;
};

0 comments on commit 81a5aa9

Please sign in to comment.