#!/usr/bin/env python3
"""
MonkeyOCR Command Line Interface
Process documents using MonkeyOCR with MLX-VLM optimization
"""

import sys
import os
import argparse
import time
from pathlib import Path

from loguru import logger

# Input file suffixes (lower-case, leading dot included) accepted by the CLI.
SUPPORTED_EXTENSIONS = frozenset({'.pdf', '.png', '.jpg', '.jpeg'})

# Maximum number of leading markdown lines echoed to the log as a preview.
PREVIEW_LINE_LIMIT = 10


def _parse_args() -> argparse.Namespace:
    """Build the argument parser and parse ``sys.argv``."""
    parser = argparse.ArgumentParser(
        description="MonkeyOCR: Advanced OCR with MLX-VLM optimization for Apple Silicon"
    )
    parser.add_argument("input_path", help="Path to PDF or image file to process")
    parser.add_argument(
        "-o", "--output",
        help="Output directory (default: same as input file)",
        default=None,
    )
    parser.add_argument(
        "-c", "--config",
        help="Config file path",
        default="model_configs_mps.yaml",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser.parse_args()


def _configure_logging(verbose: bool) -> None:
    """Route log output to stderr at DEBUG (verbose) or INFO level."""
    # Loguru ships with a pre-installed stderr sink. Without removing it first,
    # every record would be emitted twice and DEBUG records would still be
    # shown when --verbose is not given.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if verbose else "INFO")


def _log_preview(markdown_content: str, max_lines: int = PREVIEW_LINE_LIMIT) -> None:
    """Log the non-blank lines among the first ``max_lines`` lines of the markdown."""
    # Splitting at most ``max_lines`` times leaves everything past the preview
    # window in one trailing element, so truncation can be detected without
    # splitting the whole document.
    parts = markdown_content.split('\n', max_lines)
    has_more = len(parts) > max_lines and bool(parts[max_lines].strip())

    logger.info("📋 Preview:")
    for line in parts[:max_lines]:
        if line.strip():
            logger.info(f" {line}")
    # Only show the marker when content was actually cut off.
    if has_more:
        logger.info(" ...")


def main() -> None:
    """Run the MonkeyOCR command line interface.

    Parses the command line, validates the input file, initializes the model,
    processes the document and writes ``<input stem>.md`` into the output
    directory. Exits with status 1 when the input is missing or unsupported,
    when processing is interrupted, or when processing raises an exception.
    """
    args = _parse_args()
    _configure_logging(args.verbose)

    # Check that the input is an existing regular file (a directory is rejected).
    input_path = Path(args.input_path)
    if not input_path.is_file():
        logger.error(f"Input file not found: {input_path}")
        sys.exit(1)

    # Check file extension
    if input_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
        logger.error(f"Unsupported file type: {input_path.suffix}")
        # Sorted so the message is stable across runs (set order is not).
        logger.info(f"Supported formats: {', '.join(sorted(SUPPORTED_EXTENSIONS))}")
        sys.exit(1)

    # Set output directory
    output_dir = Path(args.output) if args.output else input_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("🚀 Starting MonkeyOCR processing...")
    logger.info(f"📄 Input: {input_path}")
    logger.info(f"📁 Output: {output_dir}")
    logger.info(f"⚙️ Config: {args.config}")

    try:
        # Imported here so that import failures are reported through the
        # handler below instead of surfacing before logging is configured.
        from app import process_document, initialize_model

        # Initialize model
        logger.info("🔧 Initializing MonkeyOCR model...")
        start_time = time.perf_counter()
        # NOTE(review): the returned model is never passed to process_document;
        # presumably initialize_model also registers it inside `app` — confirm.
        # The reference is kept so the object's lifetime is unchanged.
        model = initialize_model(args.config)  # noqa: F841
        init_time = time.perf_counter() - start_time
        logger.info(f"✅ Model initialized in {init_time:.2f}s")

        # Process document
        logger.info("📊 Processing document...")
        process_start = time.perf_counter()
        markdown_content, layout_pdf_path = process_document(str(input_path))
        process_time = time.perf_counter() - process_start
        logger.info(f"⚡ Document processed in {process_time:.2f}s")

        # Save results
        markdown_file = output_dir / f"{input_path.stem}.md"
        markdown_file.write_text(markdown_content, encoding='utf-8')
        logger.info(f"📝 Markdown saved: {markdown_file}")

        if layout_pdf_path and os.path.exists(layout_pdf_path):
            logger.info(f"🎨 Layout PDF: {layout_pdf_path}")

        # Summary (total time covers model initialization and processing)
        logger.info("🎉 Processing completed successfully!")
        logger.info(f"⏱️ Total time: {time.perf_counter() - start_time:.2f}s")

        # Print first few lines of markdown for preview
        _log_preview(markdown_content)

    except KeyboardInterrupt:
        logger.warning("⚠️ Processing interrupted by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        logger.error(f"❌ Processing failed: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()