[Contribution] BASH Script Addon #249

spencerthayer · 2025-01-03T00:29:10Z

#!/usr/bin/env bash
set -euo pipefail

# --- Defaults (several are overridden by the flags parsed in main) ---
OUTPUT_DIR="_converted"   # directory that receives the generated Markdown
MAX_PARALLEL=4            # job count passed to parallel via -j
VERBOSE=0                 # non-zero: INFO log entries are echoed to the console
DRY_RUN=0                 # non-zero: planned conversions are reported, not run
FORCE=0                   # non-zero: existing outputs are regenerated
ERROR_COUNT=0             # running total of recorded failures

# --- Failure bookkeeping ---
# FAILED_FILES collects the paths that failed; ERROR_COUNTS keeps one counter
# per category listed in ERROR_TYPES, each starting at zero.
declare -a FAILED_FILES=()
declare -a ERROR_TYPES=(
    "Empty or unreadable file"
    "Conversion failed"
    "Invalid format"
)
declare -A ERROR_COUNTS=()
for type in "${ERROR_TYPES[@]}"; do
    ERROR_COUNTS["$type"]=0
done

# Every log entry is appended to this file inside the output directory.
DEBUG_LOG="${OUTPUT_DIR}/_debug_log.txt"

# Show the help text on stdout and terminate the script.
# Arguments:
#   $1 - exit status to terminate with (defaults to 1)
usage() {
    local script_name
    # The name is cosmetic: a failing basename must not abort the help output.
    script_name=$(basename "$0") || true

    cat <<EOF
Usage: ${script_name} [OPTIONS] [DIRECTORY]
Convert files to markdown format recursively and save them to the _converted directory.
Supports HTML, PDF, CSV, Excel, Word and other formats via markitdown.

Options:
    --verbose           Show detailed progress
    --dry-run           Show what would be done without making changes
    --help              Display this help message
    --parallel=N        Process N files in parallel (default: 4)
    --force             Overwrite existing markdown files
    
Examples:
    ${script_name}                    # Convert all supported files in current directory
    ${script_name} --verbose          # Show detailed progress
    ${script_name} /path/to/dir       # Convert files in specific directory
    ${script_name} --parallel=8 .     # Use 8 parallel processes
EOF
    exit "${1:-1}"
}

# Record a timestamped message in the debug log and, depending on its level,
# on the console.
# Globals:
#   OUTPUT_DIR, DEBUG_LOG, VERBOSE, DRY_RUN (read)
# Arguments:
#   $1   - level: ERROR, WARNING, INFO or DRY
#   $2.. - message text (joined with spaces)
# Outputs:
#   ERROR/WARNING go to stderr; INFO goes to stdout only when VERBOSE is
#   non-zero; DRY goes to stdout only when DRY_RUN is non-zero.
# Returns:
#   0 even when the console output is suppressed, so a quiet INFO/DRY entry
#   neither aborts a caller running under `set -e` nor becomes the return
#   status of a function that ends with a log call.
log() {
    local level="$1"
    shift
    # Declared separately so `local` does not mask the exit status of date.
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    local log_entry="[$timestamp] $level: $*"

    # Write to the debug log file (the directory may not exist yet).
    mkdir -p "$OUTPUT_DIR"
    printf '%s\n' "$log_entry" >> "$DEBUG_LOG"

    # Write to the console based on verbosity and level.
    case "$level" in
        ERROR | WARNING)
            printf '%s\n' "$log_entry" >&2
            ;;
        INFO)
            if (( VERBOSE )); then
                printf '%s\n' "$log_entry"
            fi
            ;;
        DRY)
            if (( DRY_RUN )); then
                printf '%s\n' "$log_entry"
            fi
            ;;
    esac
    return 0
}

# Abort the script with status 1 unless a markitdown command can be found.
check_dependencies() {
    # Nothing to do when the converter is available.
    command -v markitdown >/dev/null 2>&1 && return 0

    log "ERROR" "markitdown command not found"
    exit 1
}

# Convert a single file to Markdown inside OUTPUT_DIR.
# The output name is the input path with a leading "./" removed, every "/"
# replaced by "_" and the last extension swapped for ".md".
# Globals:
#   OUTPUT_DIR, FORCE, DRY_RUN (read)
#   FAILED_FILES, ERROR_COUNTS, ERROR_COUNT (updated on failure)
# Arguments:
#   $1 - path of the file to convert
# Returns:
#   0 - converted, skipped (output exists and FORCE is off) or dry run
#   1 - input is empty or unreadable
#   2 - markitdown failed
#   3 - file(1) does not report a supported format
convert_file() {
    local file="$1"
    local relative_path="${file#./}"
    local encoded_name="${relative_path//\//_}"
    local encoded_name_without_ext="${encoded_name%.*}"
    local output_file="${OUTPUT_DIR}/${encoded_name_without_ext}.md"
    local temp_file="${output_file}.tmp"
    local backup_file="${output_file}.bak"

    # Check that the input exists, is not empty and can be read.
    if [[ ! -s "$file" || ! -r "$file" ]]; then
        log "ERROR" "Skipping empty or unreadable file: $file"
        FAILED_FILES+=("$file")
        ERROR_COUNTS["Empty or unreadable file"]=$(( ${ERROR_COUNTS["Empty or unreadable file"]:-0} + 1 ))
        ERROR_COUNT=$(( ERROR_COUNT + 1 ))
        return 1
    fi

    # Basic format check. -b drops the "name:" prefix so that only the detected
    # type is matched, never the file name (e.g. "text.png").
    if ! file -b -- "$file" | grep -qE '(HTML|PDF|CSV|Excel|Word|text)'; then
        log "ERROR" "File does not appear to be in a supported format: $file"
        FAILED_FILES+=("$file")
        ERROR_COUNTS["Invalid format"]=$(( ${ERROR_COUNTS["Invalid format"]:-0} + 1 ))
        ERROR_COUNT=$(( ERROR_COUNT + 1 ))
        return 3
    fi

    # Existing output is left alone unless --force was given.
    if [[ -f "$output_file" ]] && (( ! FORCE )); then
        log "INFO" "Skipping existing file: $output_file"
        return 0
    fi

    # Dry run: report and stop before anything (including a backup) is written.
    if (( DRY_RUN )); then
        log "DRY" "Would convert $file -> $output_file"
        return 0
    fi

    mkdir -p "$OUTPUT_DIR"

    # Overwriting with --force: keep a copy until the new output is in place.
    if [[ -f "$output_file" ]]; then
        cp -- "$output_file" "$backup_file"
        log "INFO" "Created backup: $backup_file"
    fi

    # Use temporary file for atomic writes
    if markitdown "$file" > "$temp_file"; then
        mv -- "$temp_file" "$output_file"
        log "INFO" "Converted $file -> $output_file"
        # The backup is no longer needed; -f keeps this quiet when none exists.
        rm -f -- "$backup_file"
        # Explicit success: the status of the last helper must not leak out.
        return 0
    fi

    rm -f -- "$temp_file"
    # Restore from backup if conversion failed
    if [[ -f "$backup_file" ]]; then
        mv -- "$backup_file" "$output_file"
        log "INFO" "Restored from backup after failed conversion"
    fi
    log "ERROR" "Failed to convert $file"
    FAILED_FILES+=("$file")
    ERROR_COUNTS["Conversion failed"]=$(( ${ERROR_COUNTS["Conversion failed"]:-0} + 1 ))
    ERROR_COUNT=$(( ERROR_COUNT + 1 ))
    return 2
}

# Job wrapper executed by GNU parallel in a separate bash process.
# The child cannot modify the parent's counters, so it gives convert_file
# private copies of the bookkeeping variables (visible to convert_file through
# dynamic scoping) and reports every failure convert_file recorded as a
# NUL-terminated "category, path" pair appended to CONVERT_TALLY_FILE.
# Arguments:
#   $1 - path of the file to convert
_convert_file_job() {
    local ERROR_COUNT=0
    local -a FAILED_FILES=()
    local -A ERROR_COUNTS=()
    local label

    convert_file "$1" || true
    for label in "${!ERROR_COUNTS[@]}"; do
        if (( ${ERROR_COUNTS[$label]} > 0 )); then
            # One printf call per record keeps concurrent appends intact.
            printf '%s\0%s\0' "$label" "$1" >> "$CONVERT_TALLY_FILE"
        fi
    done
    return 0
}

# Process files in a directory recursively.
# Uses GNU parallel when a parallel command is found, otherwise converts the
# files one at a time.
# Globals:
#   OUTPUT_DIR, MAX_PARALLEL, VERBOSE, DRY_RUN, FORCE, DEBUG_LOG (read)
#   ERROR_COUNT, FAILED_FILES, ERROR_COUNTS (updated)
# Arguments:
#   $1 - directory to scan (defaults to ".")
# Returns:
#   0 when no failure was recorded, 1 when the directory is missing or at
#   least one failure was recorded during this call.
convert_directory() {
    local input_dir="${1:-.}"
    local conversion_failed=0
    local errors_before="$ERROR_COUNT"

    if [[ ! -d "$input_dir" ]]; then
        log "ERROR" "Directory not found: $input_dir"
        return 1
    fi

    # Find all non-empty files except hidden files, .DS_Store, .md and .tmp
    # files. Anything below an OUTPUT_DIR directory is skipped as well, so the
    # debug log and backups written by this script are never fed back in.
    local -a find_cmd=(
        find "$input_dir" -type f ! -empty
        ! -name ".*" ! -name ".DS_Store" ! -name "*.md" ! -name "*.tmp"
        ! -path "*/${OUTPUT_DIR}/*"
    )

    if command -v parallel >/dev/null 2>&1; then
        local tally_file
        if ! tally_file=$(mktemp); then
            log "ERROR" "Could not create a temporary file"
            return 1
        fi

        # The jobs run in fresh bash processes: functions and settings reach
        # them only when exported.
        export -f _convert_file_job convert_file log
        export VERBOSE DRY_RUN FORCE OUTPUT_DIR DEBUG_LOG
        export CONVERT_TALLY_FILE="$tally_file"

        if ! "${find_cmd[@]}" -print0 \
            | parallel -0 --will-cite -j "$MAX_PARALLEL" _convert_file_job {}; then
            conversion_failed=1
            log "WARNING" "parallel exited with a non-zero status"
        fi

        # Fold the failures reported by the jobs into the parent's counters.
        local label failed_file
        while IFS= read -r -d '' label && IFS= read -r -d '' failed_file; do
            ERROR_COUNT=$(( ERROR_COUNT + 1 ))
            FAILED_FILES+=("$failed_file")
            ERROR_COUNTS[$label]=$(( ${ERROR_COUNTS[$label]:-0} + 1 ))
        done < "$tally_file"
        rm -f -- "$tally_file"
    else
        local f
        while IFS= read -r -d '' f; do
            # convert_file records its own failures in ERROR_COUNT,
            # FAILED_FILES and ERROR_COUNTS; counting again here would report
            # every failure twice, so only its bookkeeping is used.
            convert_file "$f" || true
        done < <("${find_cmd[@]}" -print0)
    fi

    if (( ERROR_COUNT > errors_before )); then
        conversion_failed=1
    fi
    return "$conversion_failed"
}

# Print the end-of-run report on stdout and append the per-category error
# summary to the debug log. Installed by main as the EXIT handler.
# Globals:
#   ERROR_COUNT, FAILED_FILES, ERROR_TYPES, ERROR_COUNTS, DEBUG_LOG (read)
_report_results() {
    local type

    echo "Conversion completed with $ERROR_COUNT errors."
    if (( ${#FAILED_FILES[@]} > 0 )); then
        echo "Failed files:"
        printf "%s\n" "${FAILED_FILES[@]}"
    fi
    echo "Error breakdown:"
    for type in "${ERROR_TYPES[@]}"; do
        echo "  $type: ${ERROR_COUNTS[$type]}"
    done

    # Written at exit so the log carries the real counts, not the zeros the
    # counters hold before any file has been processed. Best effort only: a
    # log that cannot be written must not change the exit status.
    {
        echo "Error Summary:"
        for type in "${ERROR_TYPES[@]}"; do
            echo "  $type: ${ERROR_COUNTS[$type]}"
        done
        echo ""
    } >> "$DEBUG_LOG" 2>/dev/null || true
}

# Main script: parse the command line, prepare the output directory and the
# debug log, then convert the requested directory.
# Arguments:
#   options (--verbose, --dry-run, --force, --parallel=N, --help/-h) followed
#   by an optional directory; the first positional argument is used and "."
#   is the default.
main() {
    local -a ARGS=()

    # Parse arguments first so that --help and option errors are reported
    # even on a machine where markitdown is not installed.
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --help | -h)
                usage 0
                ;;
            --verbose)
                VERBOSE=1
                ;;
            --dry-run)
                DRY_RUN=1
                ;;
            --force)
                FORCE=1
                ;;
            --parallel=*)
                MAX_PARALLEL="${1#*=}"
                if ! [[ "$MAX_PARALLEL" =~ ^[1-9][0-9]*$ ]]; then
                    log "ERROR" "Invalid value for --parallel: $MAX_PARALLEL"
                    usage 1
                fi
                ;;
            -*)
                log "ERROR" "Unknown option: $1"
                usage 1
                ;;
            *)
                ARGS+=("$1")
                ;;
        esac
        shift
    done

    check_dependencies

    mkdir -p "$OUTPUT_DIR"

    # Start every run with an empty debug log.
    : > "$DEBUG_LOG"

    trap _report_results EXIT

    convert_directory "${ARGS[0]:-.}"
}

# Entry point: forward all command-line arguments to main.
main "$@"

Explanation of BASH Script for MarkItDown App

Hi Team,

I’ve written a BASH script to automate the conversion of various file formats into markdown format using the markitdown tool. This script is designed to be robust, efficient, and user-friendly. The script recursively processes files in a specified directory (or the current directory by default), converts them to Markdown, and saves the output in a _converted directory. It supports parallel processing, error tracking, and logging for debugging purposes. This is especially useful for dealing with scraped sites.

Key Features

File Conversion:
- Converts supported file formats (HTML, PDF, CSV, Excel, Word, etc.) to Markdown.
- Skips empty, unreadable, or unsupported files.
- Flattens each file's relative path into a single output name by replacing every "/" with "_" (for example, docs/a/page.html becomes docs_a_page.md).
Parallel Processing:
- Processes multiple files in parallel (default: 4) using GNU parallel if it is installed; otherwise files are converted one at a time.
- Improves performance for large directories.
Error Handling:
- Tracks and categorizes errors (e.g., empty files, conversion failures, invalid formats).
- Logs errors to a _debug_log.txt file in the output directory.
- Provides a summary of errors at the end of the script.
Dry Run Mode:
- Simulates the conversion process without converting any files (the output directory and the debug log are still created).
- Useful for testing and debugging.
Force Mode:
- Overwrites existing Markdown files in the output directory.
- Creates a temporary .bak backup of each overwritten file; the backup is restored if the conversion fails and deleted once it succeeds.
Verbose Mode:
- Provides detailed progress information during execution.
Logging:
- Logs all actions (info, warnings, errors) to a debug log file.
- Outputs logs to the console based on verbosity settings.
Dependency Check:
- Ensures the markitdown tool is installed before proceeding.

Usage

The script supports the following options:

--verbose: Show detailed progress.
--dry-run: Simulate the conversion process without making changes.
--parallel=N: Process N files in parallel (default: 4).
--force: Overwrite existing Markdown files.
--help: Display the help message.

Examples:

./convert_to_markdown.sh                    # Convert all supported files in the current directory
./convert_to_markdown.sh --verbose          # Show detailed progress
./convert_to_markdown.sh /path/to/dir       # Convert files in a specific directory
./convert_to_markdown.sh --parallel=8 .     # Use 8 parallel processes

Error Tracking

The script categorizes errors into the following types:

Empty or unreadable file
Conversion failed
Invalid format

It maintains a count of each error type and lists all failed files at the end of the execution.

Output

Converted Markdown files are saved in the _converted directory.
A debug log (_debug_log.txt) is created in the output directory, containing detailed information about the conversion process and errors.

Dependencies

markitdown: The core tool used for file conversion.
parallel (optional): For parallel processing.

Exit Behavior

When the script exits, it prints a summary that includes the total number of errors, the list of failed files, and a breakdown of errors by type.

Let me know if you have any questions or suggestions!

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Contribution] BASH Script Addon #249

[Contribution] BASH Script Addon #249

spencerthayer commented Jan 3, 2025 •

edited

Loading

[Contribution] BASH Script Addon #249

[Contribution] BASH Script Addon #249

Comments

spencerthayer commented Jan 3, 2025 • edited Loading

Explanation of BASH Script for MarkItDown App

Key Features

Usage

Error Tracking

Output

Dependencies

Exit Behavior

spencerthayer commented Jan 3, 2025 •

edited

Loading