setup.sh · Jimmi42/MonkeyOCR-Apple-Silicon at main

Your Name

Initial commit with working MLX-VLM configuration

18352e1 5 months ago

11.7 kB

	#!/bin/bash
	set -e

	echo "🐵 MonkeyOCR MLX-VLM Setup Script for Apple Silicon"
	echo "===================================================="

	# Check if we're on macOS
	if [[ "$OSTYPE" != "darwin"* ]]; then
	echo "❌ This script is designed for macOS (Apple Silicon). For other platforms, use the standard setup."
	exit 1
	fi

	# Check if uv is installed
	if ! command -v uv &> /dev/null; then
	echo "❌ uv is not installed. Installing it now..."
	curl -LsSf https://astral.sh/uv/install.sh \| sh
	source $HOME/.cargo/env
	fi

	echo "✅ uv found"

	# Download MonkeyOCR from official GitHub if not present
	if [ ! -d "MonkeyOCR" ]; then
	echo "📥 Downloading MonkeyOCR from official GitHub repository..."
	git clone https://github.com/Yuliang-Liu/MonkeyOCR.git MonkeyOCR
	echo "✅ MonkeyOCR downloaded successfully"
	else
	echo "✅ MonkeyOCR directory already exists"
	echo "🔄 Updating MonkeyOCR to latest version..."
	cd MonkeyOCR
	git pull origin main
	cd ..
	fi

	# Apply MLX-VLM optimizations patch
	echo "🔧 Applying MLX-VLM optimizations for Apple Silicon..."
	apply_mlx_patches() {
	local custom_model_file="MonkeyOCR/magic_pdf/model/custom_model.py"

	# Check if patches are already applied
	if grep -q "class MonkeyChat_MLX:" "$custom_model_file"; then
	echo "✅ MLX-VLM patches already applied"
	return 0
	fi

	echo "📝 Patching custom_model.py with MLX-VLM backend..."

	# Create backup
	cp "$custom_model_file" "$custom_model_file.backup"

	# Apply the MLX-VLM class patch
	cat >> "$custom_model_file" << 'EOF'

	class MonkeyChat_MLX:
	"""MLX-VLM backend for Apple Silicon optimization"""

	def __init__(self, model_path: str):
	try:
	import mlx_vlm
	from mlx_vlm import load, generate
	from mlx_vlm.utils import load_config
	except ImportError:
	raise ImportError(
	"MLX-VLM is not installed. Please install it with: "
	"pip install mlx-vlm"
	)

	self.model_path = model_path
	self.model_name = os.path.basename(model_path)

	logger.info(f"Loading MLX-VLM model from {model_path}")

	# Load model and processor with MLX-VLM
	self.model, self.processor = load(model_path)

	# Load configuration
	self.config = load_config(model_path)

	logger.info("MLX-VLM model loaded successfully")

	def batch_inference(self, images: List[Union[str, Image.Image]], questions: List[str]) -> List[str]:
	"""Process multiple images with questions using MLX-VLM"""
	if len(images) != len(questions):
	raise ValueError("Images and questions must have the same length")

	results = []

	import concurrent.futures
	with concurrent.futures.ThreadPoolExecutor() as executor:
	results = list(executor.map(self._process_single, images, questions))

	return results

	def _process_single(self, image: Union[str, Image.Image], question: str) -> str:
	"""Process a single image with question using MLX-VLM"""
	try:
	from mlx_vlm import generate
	from mlx_vlm.prompt_utils import apply_chat_template

	# Load image if it's a path
	if isinstance(image, str):
	if os.path.exists(image):
	image = Image.open(image)
	else:
	# Assume it's base64 or URL
	image = self._load_image_from_source(image)

	# Use the correct MLX-VLM format with chat template
	formatted_prompt = apply_chat_template(
	self.processor,
	self.config,
	question,
	num_images=1
	)

	response = generate(
	self.model,
	self.processor,
	formatted_prompt,
	[image], # MLX-VLM expects a list of images
	max_tokens=1024,
	temperature=0.1,
	verbose=False
	)

	# Handle different return types from MLX-VLM
	if isinstance(response, tuple):
	# MLX-VLM sometimes returns (text, metadata) tuple
	response = response[0] if response else ""
	elif isinstance(response, list):
	# Sometimes returns a list
	response = response[0] if response else ""

	# Ensure we have a string
	response = str(response) if response is not None else ""

	return response.strip()

	except Exception as e:
	logger.error(f"MLX-VLM single processing error: {e}")
	raise

	def _load_image_from_source(self, image_source: str) -> Image.Image:
	"""Load image from various sources (file path, URL, base64)"""
	import io
	try:
	if os.path.exists(image_source):
	return Image.open(image_source)
	elif image_source.startswith(('http://', 'https://')):
	import requests
	response = requests.get(image_source)
	return Image.open(io.BytesIO(response.content))
	elif image_source.startswith('data:image'):
	# Base64 encoded image
	import base64
	header, data = image_source.split(',', 1)
	image_data = base64.b64decode(data)
	return Image.open(io.BytesIO(image_data))
	else:
	raise ValueError(f"Unsupported image source: {image_source}")
	except Exception as e:
	logger.error(f"Failed to load image from source {image_source}: {e}")
	raise

	def single_inference(self, image: Union[str, Image.Image], question: str) -> str:
	"""Single image inference for compatibility"""
	return self._process_single(image, question)
	EOF

	# Now patch the backend selection logic in the MonkeyOCR class
	echo "📝 Patching backend selection logic..."

	# Find and replace the backend selection logic
	python3 << 'PYTHON_PATCH'
	import re

	# Read the file
	with open('MonkeyOCR/magic_pdf/model/custom_model.py', 'r') as f:
	content = f.read()

	# Find the backend selection section and replace it
	old_pattern = r"backend = chat_config\.get$'backend', 'lmdeploy'$"
	new_pattern = "backend = chat_config.get('backend', 'auto')"

	content = re.sub(old_pattern, new_pattern, content)

	# Add smart backend selection logic
	backend_selection_code = '''
	# Smart backend selection for optimal performance
	if backend == 'auto':
	try:
	import torch
	if torch.backends.mps.is_available():
	# Apple Silicon - prefer MLX
	try:
	import mlx_vlm
	backend = 'mlx'
	logger.info("Auto-selected MLX backend for Apple Silicon")
	except ImportError:
	backend = 'transformers'
	logger.info("MLX not available, using transformers backend")
	elif torch.cuda.is_available():
	# CUDA available - prefer lmdeploy
	try:
	import lmdeploy
	backend = 'lmdeploy'
	logger.info("Auto-selected lmdeploy backend for CUDA")
	except ImportError:
	backend = 'transformers'
	logger.info("lmdeploy not available, using transformers backend")
	else:
	# CPU fallback
	backend = 'transformers'
	logger.info("Auto-selected transformers backend for CPU")
	except Exception as e:
	logger.warning(f"Auto-detection failed: {e}, using transformers backend")
	backend = 'transformers'
	'''

	# Insert the smart selection code after the backend assignment
	pattern = r"(backend = chat_config\.get$'backend', 'auto'$)"
	replacement = pattern + backend_selection_code

	content = re.sub(pattern, replacement, content)

	# Add MLX backend handling
	mlx_backend_code = ''' elif backend == 'mlx':
	try:
	self.chat_model = MonkeyChat_MLX(model_path)
	logger.info("Successfully initialized MLX-VLM backend")
	except ImportError as e:
	logger.error(f"MLX-VLM not available: {e}")
	logger.info("Falling back to transformers backend")
	self.chat_model = MonkeyChat_transformers(model_path, device=device)
	except Exception as e:
	logger.error(f"Failed to initialize MLX backend: {e}")
	logger.info("Falling back to transformers backend")
	self.chat_model = MonkeyChat_transformers(model_path, device=device)
	'''

	# Find the backend initialization section and add MLX support
	pattern = r"(elif backend == 'transformers':)"
	replacement = mlx_backend_code + "\n " + pattern

	content = re.sub(pattern, replacement, content)

	# Write the patched content back
	with open('MonkeyOCR/magic_pdf/model/custom_model.py', 'w') as f:
	f.write(content)

	print("✅ Backend selection logic patched successfully")
	PYTHON_PATCH

	echo "✅ MLX-VLM patches applied successfully"
	}

	# Apply the patches
	apply_mlx_patches

	# Create virtual environment
	echo "🔧 Creating virtual environment..."
	uv venv --python 3.11

	# Activate virtual environment and install dependencies
	echo "📦 Installing dependencies..."
	source .venv/bin/activate
	uv pip install -r requirements.txt

	# Install MonkeyOCR package
	echo "📦 Installing MonkeyOCR package..."
	cd MonkeyOCR
	source ../.venv/bin/activate
	# Install MonkeyOCR dependencies
	uv pip install -r requirements.txt
	# Install the package in development mode
	uv pip install -e . --no-deps
	cd ..

	# Download model weights
	echo "📥 Downloading model weights..."
	cd MonkeyOCR
	source ../.venv/bin/activate
	python tools/download_model.py
	cd ..

	# Check if LaTeX is available (optional for table rendering)
	if command -v pdflatex &> /dev/null; then
	echo "✅ LaTeX found - table rendering will work"
	else
	echo "⚠️ LaTeX not found - table rendering will be limited"
	echo " To install LaTeX: brew install --cask mactex"
	fi

	# Create sample documents directory
	mkdir -p sample_docs
	echo "📁 Created sample_docs directory"

	echo ""
	echo "🎉 Setup completed successfully!"
	echo ""
	echo "MonkeyOCR is now optimized with MLX-VLM for Apple Silicon!"
	echo ""
	echo "✨ Applied Optimizations:"
	echo "- 🚀 MLX-VLM backend for 3x faster processing"
	echo "- 🧠 Smart backend auto-selection (MLX/LMDeploy/transformers)"
	echo "- 🔧 Fixed prompt formatting for optimal OCR output"
	echo "- 🍎 Native Apple Silicon acceleration"
	echo ""
	echo "To run the app:"
	echo " source .venv/bin/activate"
	echo " python app.py"
	echo ""
	echo "The app will be available at: http://localhost:7860"
	echo ""
	echo "Features:"
	echo "- MLX-VLM backend for 3x faster processing on Apple Silicon"
	echo "- Smart backend selection (MLX/LMDeploy/transformers)"
	echo "- Advanced table extraction and OCR"
	echo "- Web interface and command-line tools"
	echo ""
	echo "Tips:"
	echo "- Place sample documents in the 'sample_docs' directory"
	echo "- The first run may take longer as models are loaded"
	echo "- Monitor Activity Monitor to see MPS GPU usage"