Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(converter): add video converter. #193

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 144 additions & 1 deletion src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,9 @@ def convert(
"youtube_transcript_languages", ("en",)
)
# Must be a single transcript.
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore
transcript = YouTubeTranscriptApi.get_transcript(
video_id, languages=youtube_transcript_languages
) # type: ignore
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
Expand Down Expand Up @@ -1076,6 +1078,146 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None
return response.choices[0].message.content


class VideoConverter(WavConverter):
    """
    Converts videos to markdown via:
    * extraction of metadata (if `exiftool` is installed)
    * speech transcription (if `speech_recognition` AND `pydub` are installed).
    * summary via an LLM if a transcription is available and a llm_client is configured
    """

    # Exif fields excluded by default (the caller can override the list with
    # the `metadata_exclude` keyword argument of `convert`).
    _DEFAULT_METADATA_EXCLUDE = (
        "SourceFile",
        "ExifToolVersion",
        "Directory",
        "FileModifyDate",
        "FileAccessDate",
        "FileInodeChangeDate",
        "FilePermissions",
    )

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """
        Convert a video to markdown

        Args:
            local_path (str): The path to the video file
            metadata_exclude: A list of metadata fields to exclude from the extracted exif metadata
                (None falls back to the default exclusion list)
            metadata_title: The title of the metadata section (None for no title)
            transcribe: Whether to transcribe the video (default True)
            transcript_title: The title of the transcript section (None for no title)
            transcript_error: Text written in the transcript section when the audio
                could not be extracted or transcribed
            llm_summary: Whether to generate a summary via the provided LLM client (default True)
            llm_summary_title: The title of the summary section (None for no title)
            llm_client: The LLM client used for the summary
            llm_model: The LLM model used for the summary
            llm_prompt: Optional prompt overriding the default summary prompt

        Returns:
            None if the file's guessed mime type is not `video/*`, otherwise a
            DocumentConverterResult holding the generated markdown.
        """
        import sys  # local import: only needed for the diagnostics below

        mime_type = mimetypes.guess_type(local_path)[0]
        if mime_type is None or not mime_type.startswith("video/"):
            return None

        md_content = self._format_metadata(local_path, kwargs)

        # Transcribe
        transcribe = kwargs.get("transcribe", True)
        transcript = ""
        if transcribe and IS_AUDIO_TRANSCRIPTION_CAPABLE:
            section, transcript = self._transcribe_video(local_path, kwargs)
            md_content += section

        # LLM analysis (Optional) / not all LLMs are fully capable of analyzing video files yet,
        # But for now we can use the transcript to get a summary of its content
        llm_summary = kwargs.get("llm_summary", True)
        llm_client = kwargs.get("llm_client")
        llm_model = kwargs.get("llm_model")
        if llm_summary and llm_client is not None and llm_model is not None:
            # Diagnostics go to stderr so they are never mixed into stdout, which
            # a caller may be capturing as the converted document.
            if not transcribe:
                print(
                    "Error: LLM summary requires transcription to be enabled.",
                    file=sys.stderr,
                )
            elif transcript == "":
                print(
                    "Warning: No transcript found. Skipping LLM summary.",
                    file=sys.stderr,
                )
            else:
                summary = self._get_llm_video_summary_from_transcript(
                    transcript,
                    llm_client,
                    llm_model,
                    prompt=kwargs.get("llm_prompt"),
                )
                # An empty answer gets no section rather than a dangling title.
                if summary:
                    llm_summary_title = kwargs.get(
                        "llm_summary_title", "\n\n### Video Summary:\n"
                    )
                    md_content += (llm_summary_title or "") + summary

        # Return the result
        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

    def _format_metadata(self, local_path, options) -> str:
        """
        Build the markdown metadata section for the video.

        Args:
            local_path: The path to the video file
            options: The keyword arguments given to `convert`
        Returns: the metadata section, or "" when there is nothing to report
        """
        # NOTE(review): `_get_metadata` is inherited and not visible here; it
        # presumably returns None when `exiftool` is unavailable - confirm.
        metadata = self._get_metadata(local_path)
        if not metadata:
            return ""

        metadata_exclude = options.get("metadata_exclude")
        if metadata_exclude is None:
            metadata_exclude = self._DEFAULT_METADATA_EXCLUDE

        lines = [
            f"{field}: {metadata[field]}\n"
            for field in metadata
            if field not in metadata_exclude
        ]
        if not lines:
            # Every field was excluded: do not emit a title with nothing under it.
            return ""

        metadata_title = options.get("metadata_title", "### Metadata:\n")
        return (metadata_title or "") + "".join(lines)

    def _transcribe_video(self, local_path, options):
        """
        Extract the audio track to a temporary WAV file and transcribe it.

        Args:
            local_path: The path to the video file
            options: The keyword arguments given to `convert`
        Returns: a `(markdown_section, transcript)` tuple; `transcript` is "" when
            no speech was detected or when extraction/transcription failed
        """
        transcript_title = options.get("transcript_title", "\n\n### Transcript:\n") or ""

        handle, temp_path = tempfile.mkstemp(suffix=".wav")
        os.close(handle)
        try:
            # Extraction is inside the best-effort block on purpose: a video
            # whose audio cannot be decoded must not abort the whole conversion.
            sound = pydub.AudioSegment.from_file(local_path)
            with open(temp_path, "wb") as wav_file:
                sound.export(wav_file, format="wav")
            transcript = self._transcribe_audio(temp_path).strip()
        except Exception:
            transcript_error = options.get(
                "transcript_error", "Error. Could not transcribe."
            )
            return f"{transcript_title}{transcript_error}", ""
        finally:
            os.unlink(temp_path)

        section = transcript_title + (
            "[No speech detected]" if transcript == "" else transcript
        )
        return section, transcript

    def _get_llm_video_summary_from_transcript(
        self, transcript, client, model, prompt=None
    ) -> str:
        """
        helper function to get a summary of the video content from the transcript

        Args:
            transcript: the transcript of the video
            client: the llm client
            model: the llm model
            prompt: the prompt to use (None or blank selects the default prompt)
        Returns: the summary ("" when the model returns no text content)
        """
        if prompt is None or prompt.strip() == "":
            prompt = "The following is video transcript, based on it, write a summary of the video content:\n"

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "text", "text": transcript},
                ],
            }
        ]

        response = client.chat.completions.create(model=model, messages=messages)
        # The message content may be None; honour the declared `str` return type.
        return response.choices[0].message.content or ""


class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files.

Expand Down Expand Up @@ -1282,6 +1424,7 @@ def __init__(
self.register_page_converter(WavConverter())
self.register_page_converter(Mp3Converter())
self.register_page_converter(ImageConverter())
self.register_page_converter(VideoConverter())
self.register_page_converter(IpynbConverter())
self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter())
Expand Down
Binary file added tests/test_files/test.mp4
Binary file not shown.
14 changes: 14 additions & 0 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@
"5bda1dd6",
]

# Exif fields (and their values) expected in the converted output of
# tests/test_files/test.mp4.
VIDEO_TEST_EXIFTOOL = {
    "Title": "Sample video test for MarkItDown",
    "Comment": (
        "This is a sample video created using FFmpeg, "
        "with the voice-over generated by the Parler-TTS model."
    ),
    "ImageSize": "1280x720",
}


# --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None):
Expand Down Expand Up @@ -246,6 +252,14 @@ def test_markitdown_exiftool() -> None:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content

# Test Video metadata
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test.mp4"), transcribe=False, llm_summary=False
)
for key in VIDEO_TEST_EXIFTOOL:
target = f"{key}: {VIDEO_TEST_EXIFTOOL[key]}"
assert target in result.text_content


def test_markitdown_deprecation() -> None:
try:
Expand Down