You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
#!/usr/bin/env bash
set -euo pipefail

# ---------------------------------------------------------------------------
# Runtime options; the command-line parser overrides these.
# ---------------------------------------------------------------------------
VERBOSE=0
DRY_RUN=0
FORCE=0
MAX_PARALLEL=4
OUTPUT_DIR="_converted"

# Debug log lives inside the output directory.
DEBUG_LOG="${OUTPUT_DIR}/_debug_log.txt"

# ---------------------------------------------------------------------------
# Failure bookkeeping: a running total, the list of failed inputs, and one
# counter per error category (all categories start at zero).
# ---------------------------------------------------------------------------
ERROR_COUNT=0
declare -a FAILED_FILES=()
ERROR_TYPES=("Empty or unreadable file" "Conversion failed" "Invalid format")
declare -A ERROR_COUNTS
for type in "${ERROR_TYPES[@]}"; do ERROR_COUNTS[$type]=0; done
#######################################
# Print usage information and exit.
# Arguments:
#   $1 - exit status to terminate with (default: 1)
# Outputs:
#   The help text: on stdout when the status is 0 (help was requested),
#   on stderr otherwise so an error exit never pollutes piped stdout.
# Returns:
#   Never returns; exits the current shell with the given status.
#######################################
usage() {
  local status="${1:-1}"
  local out_fd=1
  if [[ "$status" != "0" ]]; then
    out_fd=2
  fi
  {
    cat <<EOF
Usage: $(basename "$0") [OPTIONS] [DIRECTORY]
Convert files to markdown format recursively and save them to the _converted directory.
Supports HTML, PDF, CSV, Excel, Word and other formats via markitdown.
Options:
--verbose Show detailed progress
--dry-run Show what would be done without making changes
--help Display this help message
--parallel=N Process N files in parallel (default: 4)
--force Overwrite existing markdown files
Examples:
$(basename "$0") # Convert all supported files in current directory
$(basename "$0") --verbose # Show detailed progress
$(basename "$0") /path/to/dir # Convert files in specific directory
$(basename "$0") --parallel=8 . # Use 8 parallel processes
EOF
  } >&"$out_fd"
  exit "$status"
}
#######################################
# Append a timestamped message to the debug log and, depending on the level
# and the verbosity settings, echo it to the console.
# Globals:
#   OUTPUT_DIR, DEBUG_LOG, VERBOSE, DRY_RUN (read)
# Arguments:
#   $1   - level: ERROR, WARNING, INFO or DRY
#   $2.. - message words (joined with single spaces)
# Outputs:
#   ERROR and WARNING go to stderr; INFO goes to stdout when VERBOSE is
#   non-zero; DRY goes to stdout when DRY_RUN is non-zero. Every message is
#   appended to DEBUG_LOG regardless of level.
# Returns:
#   0, including when the message is filtered out of the console. A filtered
#   message must not look like a failure, otherwise a plain `log INFO ...`
#   call would abort the script under `set -e`.
#######################################
log() {
  local level="$1"
  shift

  # Declare and assign separately so `local` does not mask date's status.
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S') || timestamp="unknown-time"
  local log_entry="[$timestamp] $level: $*"

  # The log lives inside the output directory, which may not exist yet.
  mkdir -p "$OUTPUT_DIR"
  printf '%s\n' "$log_entry" >> "$DEBUG_LOG"

  case "$level" in
    ERROR | WARNING)
      printf '%s\n' "$log_entry" >&2
      ;;
    INFO)
      if (( VERBOSE )); then
        printf '%s\n' "$log_entry"
      fi
      ;;
    DRY)
      if (( DRY_RUN )); then
        printf '%s\n' "$log_entry"
      fi
      ;;
  esac
  return 0
}
#######################################
# Verify that the markitdown converter can be found on PATH.
# Outputs:
#   Logs an ERROR message through log() when the tool is missing.
# Returns:
#   0 when markitdown is available; otherwise exits the shell with status 1.
#######################################
check_dependencies() {
  command -v markitdown >/dev/null 2>&1 && return 0
  log "ERROR" "markitdown command not found"
  exit 1
}
#######################################
# Convert a single file to Markdown with markitdown.
#
# The output name is the input path with any leading "./" removed, the
# extension of the final path component dropped, and every "/" replaced by
# "_", written as "<OUTPUT_DIR>/<name>.md".
#
# Globals:
#   OUTPUT_DIR, FORCE, DRY_RUN (read)
#   FAILED_FILES, ERROR_COUNTS, ERROR_COUNT (updated on failure)
# Arguments:
#   $1 - path of the file to convert
# Returns:
#   0 on success, when an existing output is skipped, or in dry-run mode
#   1 if the input is missing or empty
#   2 if markitdown (or moving its output into place) failed
#   3 if file(1) does not report a supported type
#######################################
convert_file() {
  local file="$1"
  local relative_path="${file#./}"

  # Strip the extension from the final path component only, so a dot in a
  # directory name (e.g. "v1.2/README") cannot truncate the output name.
  local dir_part=""
  local base_name="$relative_path"
  if [[ "$relative_path" == */* ]]; then
    dir_part="${relative_path%/*}/"
    base_name="${relative_path##*/}"
  fi
  local stem="${base_name%.*}"
  if [[ -z "$stem" ]]; then
    # A name such as ".env" has nothing before the dot; keep it whole.
    stem="$base_name"
  fi

  # Flatten the directory structure: "a/b/c" becomes "a_b_c".
  local encoded_name="${dir_part}${stem}"
  encoded_name="${encoded_name//\//_}"

  local output_file="${OUTPUT_DIR}/${encoded_name}.md"
  local temp_file="${output_file}.tmp"
  local backup_file="${output_file}.bak"
  local error_type previous

  # Counters are bumped with plain assignments rather than (( x++ )): the
  # post-increment form returns status 1 when the old value is 0, which
  # aborts the script under `set -e`.

  # Reject missing or zero-length inputs.
  if [[ ! -s "$file" ]]; then
    log "ERROR" "Skipping empty or unreadable file: $file"
    error_type="Empty or unreadable file"
    previous="${ERROR_COUNTS[$error_type]:-0}"
    FAILED_FILES+=("$file")
    ERROR_COUNTS[$error_type]=$(( previous + 1 ))
    ERROR_COUNT=$(( ERROR_COUNT + 1 ))
    return 1
  fi

  # Basic format check. `file -b` omits the file name from the report, so a
  # name like "context.bin" cannot match "text" by accident.
  local detected_type=""
  detected_type=$(file -b -- "$file") || detected_type=""
  local supported_types='HTML|PDF|CSV|Excel|Word|text'
  if [[ ! "$detected_type" =~ $supported_types ]]; then
    log "ERROR" "File does not appear to be in a supported format: $file"
    error_type="Invalid format"
    previous="${ERROR_COUNTS[$error_type]:-0}"
    FAILED_FILES+=("$file")
    ERROR_COUNTS[$error_type]=$(( previous + 1 ))
    ERROR_COUNT=$(( ERROR_COUNT + 1 ))
    return 3
  fi

  # The `|| true` guards below keep a non-zero status from log (for example
  # when the message is filtered by verbosity) from being taken as a failure.

  # Never overwrite an existing result unless --force was given.
  if [[ -f "$output_file" ]] && (( ! FORCE )); then
    log "INFO" "Skipping existing file: $output_file" || true
    return 0
  fi

  # Dry run: report and stop before touching the file system (in particular
  # before creating a backup).
  if (( DRY_RUN )); then
    log "DRY" "Would convert $file -> $output_file" || true
    return 0
  fi

  mkdir -p "$OUTPUT_DIR"

  # Reaching this point with an existing output means --force is set.
  if [[ -f "$output_file" ]]; then
    cp -- "$output_file" "$backup_file"
    log "INFO" "Created backup: $backup_file" || true
  fi

  # Write to a temporary file and move it into place so a failed conversion
  # never leaves a partial .md behind.
  if markitdown "$file" > "$temp_file" && mv -- "$temp_file" "$output_file"; then
    log "INFO" "Converted $file -> $output_file" || true
    rm -f -- "$backup_file"
    return 0
  fi

  rm -f -- "$temp_file"
  if [[ -f "$backup_file" ]]; then
    mv -- "$backup_file" "$output_file"
    log "INFO" "Restored from backup after failed conversion" || true
  fi
  log "ERROR" "Failed to convert $file"
  error_type="Conversion failed"
  previous="${ERROR_COUNTS[$error_type]:-0}"
  FAILED_FILES+=("$file")
  ERROR_COUNTS[$error_type]=$(( previous + 1 ))
  ERROR_COUNT=$(( ERROR_COUNT + 1 ))
  return 2
}
#######################################
# Convert every eligible file below a directory.
#
# Eligible files are non-empty regular files that are not hidden, not
# .DS_Store, not *.md or *.tmp, and not located inside a directory named
# OUTPUT_DIR (so earlier results and the debug log are never fed back in as
# inputs). Files are handed to convert_file, through GNU parallel when it is
# installed and one at a time otherwise.
#
# Globals:
#   OUTPUT_DIR, DEBUG_LOG, VERBOSE, DRY_RUN, FORCE, MAX_PARALLEL (read;
#   the first five are exported for the parallel jobs)
#   FAILED_FILES, ERROR_COUNTS, ERROR_COUNT (updated)
# Arguments:
#   $1 - directory to scan (default: ".")
# Returns:
#   0 if every file was converted or skipped cleanly, 1 if the directory
#   does not exist or at least one file failed.
#######################################
convert_directory() {
  local input_dir="${1:-.}"
  local conversion_failed=0

  if [[ ! -d "$input_dir" ]]; then
    log "ERROR" "Directory not found: $input_dir"
    return 1
  fi

  local -a find_cmd=(
    find "$input_dir" -type f ! -empty
    ! -name ".*" ! -name ".DS_Store" ! -name "*.md" ! -name "*.tmp"
    ! -path "*/${OUTPUT_DIR}/*"
  )

  if command -v parallel >/dev/null 2>&1; then
    local results_file
    if ! results_file=$(mktemp); then
      log "ERROR" "Could not create a temporary results file"
      return 1
    fi

    # GNU parallel runs each job in a fresh shell: the functions and the
    # scalar settings they read must be exported. Arrays cannot be exported,
    # so each job declares throwaway copies and reports a failure as a
    # NUL-terminated "<status> <path>" record in the results file.
    export -f convert_file log
    export OUTPUT_DIR DEBUG_LOG VERBOSE DRY_RUN FORCE
    local job='declare -A ERROR_COUNTS=(); declare -a FAILED_FILES=(); ERROR_COUNT=0; '
    job+='convert_file {} || printf "%s %s\0" "$?" {} >> "$_CONVERT_RESULTS"'

    "${find_cmd[@]}" -print0 \
      | _CONVERT_RESULTS="$results_file" \
        parallel -0 --will-cite -j "$MAX_PARALLEL" "$job" \
      || conversion_failed=1

    # Fold the per-job records back into this shell's counters.
    local record status failed_path error_type previous
    while IFS= read -r -d '' record; do
      status="${record%% *}"
      failed_path="${record#* }"
      case "$status" in
        1) error_type="Empty or unreadable file" ;;
        3) error_type="Invalid format" ;;
        *) error_type="Conversion failed" ;;
      esac
      previous="${ERROR_COUNTS[$error_type]:-0}"
      ERROR_COUNTS[$error_type]=$(( previous + 1 ))
      ERROR_COUNT=$(( ERROR_COUNT + 1 ))
      FAILED_FILES+=("$failed_path")
      conversion_failed=1
    done < "$results_file"
    rm -f -- "$results_file"
  else
    local f status errors_before
    while IFS= read -r -d '' f; do
      errors_before=$ERROR_COUNT
      status=0
      convert_file "$f" || status=$?
      if (( status != 0 )); then
        conversion_failed=1
        # convert_file records its own failures; only record here when it
        # returned non-zero without doing so, to avoid counting twice.
        if (( ERROR_COUNT == errors_before )); then
          FAILED_FILES+=("$f")
          ERROR_COUNT=$(( ERROR_COUNT + 1 ))
        fi
      fi
    done < <("${find_cmd[@]}" -print0)
  fi

  return "$conversion_failed"
}
#######################################
# Print the end-of-run report to stdout and append the per-type totals to
# the debug log. Installed by main as the EXIT trap, so it runs after the
# conversion and therefore reports the real counts.
# Globals:
#   ERROR_COUNT, FAILED_FILES, ERROR_TYPES, ERROR_COUNTS, DEBUG_LOG (read)
#######################################
_print_summary() {
  local error_type

  echo "Conversion completed with $ERROR_COUNT errors."
  if (( ${#FAILED_FILES[@]} > 0 )); then
    echo "Failed files:"
    printf "%s\n" "${FAILED_FILES[@]}"
  fi
  echo "Error breakdown:"
  for error_type in "${ERROR_TYPES[@]}"; do
    echo " $error_type: ${ERROR_COUNTS[$error_type]:-0}"
  done

  # Best effort: a log that cannot be written must not change the exit path.
  {
    echo "Error Summary:"
    for error_type in "${ERROR_TYPES[@]}"; do
      echo " $error_type: ${ERROR_COUNTS[$error_type]:-0}"
    done
    echo ""
  } >> "$DEBUG_LOG" || true
}

#######################################
# Script entry point: parse options, verify dependencies, start a fresh
# debug log, and convert the requested directory.
# Globals:
#   VERBOSE, DRY_RUN, FORCE, MAX_PARALLEL, ARGS (written)
#   OUTPUT_DIR, DEBUG_LOG (read)
# Arguments:
#   "$@" - command-line options followed by an optional directory
# Returns:
#   The status of convert_directory; usage errors exit with 1 and --help
#   exits with 0.
#######################################
main() {
  ARGS=()
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --help | -h)
        usage 0
        ;;
      --verbose)
        VERBOSE=1
        ;;
      --dry-run)
        DRY_RUN=1
        ;;
      --force)
        FORCE=1
        ;;
      --parallel=*)
        MAX_PARALLEL="${1#*=}"
        if ! [[ "$MAX_PARALLEL" =~ ^[1-9][0-9]*$ ]]; then
          log "ERROR" "Invalid value for --parallel: $MAX_PARALLEL"
          usage 1
        fi
        ;;
      -*)
        log "ERROR" "Unknown option: $1"
        usage 1
        ;;
      *)
        ARGS+=("$1")
        ;;
    esac
    shift
  done

  # Checked only after parsing so that --help and usage errors still work
  # on a machine where markitdown is not installed.
  check_dependencies

  # Start each run with an empty debug log.
  mkdir -p "$OUTPUT_DIR"
  : > "$DEBUG_LOG"

  if (( ${#ARGS[@]} > 1 )); then
    log "WARNING" "Ignoring extra arguments: ${ARGS[*]:1}"
  fi

  trap _print_summary EXIT
  convert_directory "${ARGS[0]:-.}"
}
# Entry point: pass every command-line argument through to main.
main "$@"
Explanation of BASH Script for MarkItDown App
Hi Team,
I’ve written a BASH script to automate the conversion of various file formats into markdown format using the markitdown tool. This script is designed to be robust, efficient, and user-friendly. The script recursively processes files in a specified directory (or the current directory by default), converts them to Markdown, and saves the output in a _converted directory. It supports parallel processing, error tracking, and logging for debugging purposes. This is especially useful for dealing with scraped sites.
Key Features
File Conversion:
Converts supported file formats (HTML, PDF, CSV, Excel, Word, etc.) to Markdown.
Skips empty, unreadable, or unsupported files.
Handles file paths with special characters by encoding them.
Parallel Processing:
Processes multiple files in parallel (default: 4) using GNU parallel if available.
Error Handling:
Logs errors to a _debug_log.txt file in the output directory.
Provides a summary of errors at the end of the script.
Dry Run Mode:
Simulates the conversion process without making any changes.
Useful for testing and debugging.
Force Mode:
Overwrites existing Markdown files in the output directory.
Creates backups of overwritten files for safety.
Verbose Mode:
Provides detailed progress information during execution.
Logging:
Logs all actions (info, warnings, errors) to a debug log file.
Outputs logs to the console based on verbosity settings.
Dependency Check:
Ensures the markitdown tool is installed before proceeding.
Usage
The script supports the following options:
--verbose: Show detailed progress.
--dry-run: Simulate the conversion process without making changes.
--parallel=N: Process N files in parallel (default: 4).
--force: Overwrite existing Markdown files.
--help: Display the help message.
Examples:
./convert_to_markdown.sh # Convert all supported files in the current directory
./convert_to_markdown.sh --verbose # Show detailed progress
./convert_to_markdown.sh /path/to/dir # Convert files in a specific directory
./convert_to_markdown.sh --parallel=8 . # Use 8 parallel processes
Error Tracking
The script categorizes errors into the following types:
Empty or unreadable file
Conversion failed
Invalid format
It maintains a count of each error type and lists all failed files at the end of the execution.
Output
Converted Markdown files are saved in the _converted directory.
A debug log (_debug_log.txt) is created in the output directory, containing detailed information about the conversion process and errors.
Dependencies
markitdown: The core tool used for file conversion.
parallel (optional): For parallel processing.
Exit Behavior
The script exits with a summary of errors, including the number of errors, a list of failed files, and a breakdown of error types.
Let me know if you have any questions or suggestions!
The text was updated successfully, but these errors were encountered:
Explanation of BASH Script for MarkItDown App
Hi Team,
I’ve written a BASH script to automate the conversion of various file formats into markdown format using the markitdown tool. This script is designed to be robust, efficient, and user-friendly. The script recursively processes files in a specified directory (or the current directory by default), converts them to Markdown, and saves the output in a _converted directory. It supports parallel processing, error tracking, and logging for debugging purposes. This is especially useful for dealing with scraped sites.
Key Features
File Conversion:
Parallel Processing:
Processes multiple files in parallel (default: 4) using GNU parallel if available.
Error Handling:
Logs errors to a _debug_log.txt file in the output directory.
Dry Run Mode:
Force Mode:
Verbose Mode:
Logging:
Dependency Check:
Ensures the markitdown tool is installed before proceeding.
Usage
The script supports the following options:
--verbose: Show detailed progress.
--dry-run: Simulate the conversion process without making changes.
--parallel=N: Process N files in parallel (default: 4).
--force: Overwrite existing Markdown files.
--help: Display the help message.
Examples:
Error Tracking
The script categorizes errors into the following types:
It maintains a count of each error type and lists all failed files at the end of the execution.
Output
Converted Markdown files are saved in the _converted directory.
A debug log (_debug_log.txt) is created in the output directory, containing detailed information about the conversion process and errors.
Dependencies
markitdown: The core tool used for file conversion.
parallel (optional): For parallel processing.
Exit Behavior
Let me know if you have any questions or suggestions!
The text was updated successfully, but these errors were encountered: