Skip to content

Commit

Permalink
Tweak product doc generation for 8.17 (#205189)
Browse files Browse the repository at this point in the history
## Summary

- use the default ELSER inference endpoint (`.elser-2-elasticsearch`) instead of installing a dedicated one
- adapt document cleaning for the new markdown format
  • Loading branch information
pgayvallet authored Jan 8, 2025
1 parent 9078287 commit 81a5aa9
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 82 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import {
createTargetIndex,
extractDocumentation,
indexDocuments,
installElser,
createChunkFiles,
createArtifact,
cleanupFolders,
Expand Down Expand Up @@ -68,9 +67,6 @@ export const buildArtifacts = async (config: TaskConfig) => {

await cleanupFolders({ folders: [config.buildFolder] });

log.info('Ensuring ELSER is installed on the embedding cluster');
await installElser({ client: embeddingClient });

for (const productName of config.productNames) {
await buildArtifact({
productName,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
import type { Client } from '@elastic/elasticsearch';
import type { MappingTypeMapping } from '@elastic/elasticsearch/lib/api/types';

const DEFAULT_ELSER = '.elser-2-elasticsearch';

const mappings: MappingTypeMapping = {
dynamic: 'strict',
properties: {
content_title: { type: 'text' },
content_body: {
type: 'semantic_text',
inference_id: 'kibana-elser2',
inference_id: DEFAULT_ELSER,
},
product_name: { type: 'keyword' },
root_type: { type: 'keyword' },
Expand All @@ -24,11 +26,11 @@ const mappings: MappingTypeMapping = {
ai_subtitle: { type: 'text' },
ai_summary: {
type: 'semantic_text',
inference_id: 'kibana-elser2',
inference_id: DEFAULT_ELSER,
},
ai_questions_answered: {
type: 'semantic_text',
inference_id: 'kibana-elser2',
inference_id: DEFAULT_ELSER,
},
ai_tags: { type: 'keyword' },
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
export { extractDocumentation } from './extract_documentation';
export { indexDocuments } from './index_documents';
export { createTargetIndex } from './create_index';
export { installElser } from './install_elser';
export { createChunkFiles } from './create_chunk_files';
export { checkConnectivity } from './check_connectivity';
export { createArtifact } from './create_artifact';
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,15 @@ const removeDuplicates = (documents: ExtractedDocument[]): ExtractedDocument[] =
return uniqBy(documents, (doc) => doc.slug);
};

const EMPTY_DOC_TOKEN_LIMIT = 120;

/**
 * Filters out near-empty documents ("this content has moved" or deleted-page
 * placeholders), using their token count as the only heuristic.
 */
const filterEmptyDocs = (documents: ExtractedDocument[]): ExtractedDocument[] => {
return documents.filter((doc) => {
const tokenCount = encode(doc.content_body).length;
if (tokenCount < 100) {
if (tokenCount < EMPTY_DOC_TOKEN_LIMIT) {
return false;
}
return true;
Expand All @@ -52,8 +54,14 @@ const processDocument = (document: ExtractedDocument) => {
.replaceAll(/([a-zA-Z])edit\n/g, (match) => {
return `${match[0]}\n`;
})
// remove edit links
.replaceAll(/\[\s*edit\s*\]\(\s*[^)]+\s*\)/g, '')
// remove empty links
.replaceAll('[]()', '')
// limit to 2 consecutive carriage return
.replaceAll(/\n\n+/g, '\n\n');

document.content_title = document.content_title.split('|')[0].trim();

return document;
};

0 comments on commit 81a5aa9

Please sign in to comment.