|
|
#!/bin/bash |
|
|
set -e |
|
|
|
|
|
echo "π΅ MonkeyOCR MLX-VLM Setup Script for Apple Silicon" |
|
|
echo "====================================================" |
|
|
|
|
|
|
|
|
if [[ "$OSTYPE" != "darwin"* ]]; then |
|
|
echo "β This script is designed for macOS (Apple Silicon). For other platforms, use the standard setup." |
|
|
exit 1 |
|
|
fi |
|
|
|
|
|
|
|
|
if ! command -v uv &> /dev/null; then |
|
|
echo "β uv is not installed. Installing it now..." |
|
|
curl -LsSf https://astral.sh/uv/install.sh | sh |
|
|
source $HOME/.cargo/env |
|
|
fi |
|
|
|
|
|
echo "β
uv found" |
|
|
|
|
|
|
|
|
if [ ! -d "MonkeyOCR" ]; then |
|
|
echo "π₯ Downloading MonkeyOCR from official GitHub repository..." |
|
|
git clone https://github.com/Yuliang-Liu/MonkeyOCR.git MonkeyOCR |
|
|
echo "β
MonkeyOCR downloaded successfully" |
|
|
else |
|
|
echo "β
MonkeyOCR directory already exists" |
|
|
echo "π Updating MonkeyOCR to latest version..." |
|
|
cd MonkeyOCR |
|
|
git pull origin main |
|
|
cd .. |
|
|
fi |
|
|
|
|
|
|
|
|
echo "π§ Applying MLX-VLM optimizations for Apple Silicon..." |
|
|
apply_mlx_patches() { |
|
|
local custom_model_file="MonkeyOCR/magic_pdf/model/custom_model.py" |
|
|
|
|
|
|
|
|
if grep -q "class MonkeyChat_MLX:" "$custom_model_file"; then |
|
|
echo "β
MLX-VLM patches already applied" |
|
|
return 0 |
|
|
fi |
|
|
|
|
|
echo "π Patching custom_model.py with MLX-VLM backend..." |
|
|
|
|
|
|
|
|
cp "$custom_model_file" "$custom_model_file.backup" |
|
|
|
|
|
|
|
|
cat >> "$custom_model_file" << 'EOF' |
|
|
|
|
|
class MonkeyChat_MLX: |
|
|
"""MLX-VLM backend for Apple Silicon optimization""" |
|
|
|
|
|
def __init__(self, model_path: str): |
|
|
try: |
|
|
import mlx_vlm |
|
|
from mlx_vlm import load, generate |
|
|
from mlx_vlm.utils import load_config |
|
|
except ImportError: |
|
|
raise ImportError( |
|
|
"MLX-VLM is not installed. Please install it with: " |
|
|
"pip install mlx-vlm" |
|
|
) |
|
|
|
|
|
self.model_path = model_path |
|
|
self.model_name = os.path.basename(model_path) |
|
|
|
|
|
logger.info(f"Loading MLX-VLM model from {model_path}") |
|
|
|
|
|
|
|
|
self.model, self.processor = load(model_path) |
|
|
|
|
|
|
|
|
self.config = load_config(model_path) |
|
|
|
|
|
logger.info("MLX-VLM model loaded successfully") |
|
|
|
|
|
def batch_inference(self, images: List[Union[str, Image.Image]], questions: List[str]) -> List[str]: |
|
|
"""Process multiple images with questions using MLX-VLM""" |
|
|
if len(images) != len(questions): |
|
|
raise ValueError("Images and questions must have the same length") |
|
|
|
|
|
results = [] |
|
|
|
|
|
import concurrent.futures |
|
|
with concurrent.futures.ThreadPoolExecutor() as executor: |
|
|
results = list(executor.map(self._process_single, images, questions)) |
|
|
|
|
|
return results |
|
|
|
|
|
def _process_single(self, image: Union[str, Image.Image], question: str) -> str: |
|
|
"""Process a single image with question using MLX-VLM""" |
|
|
try: |
|
|
from mlx_vlm import generate |
|
|
from mlx_vlm.prompt_utils import apply_chat_template |
|
|
|
|
|
|
|
|
if isinstance(image, str): |
|
|
if os.path.exists(image): |
|
|
image = Image.open(image) |
|
|
else: |
|
|
|
|
|
image = self._load_image_from_source(image) |
|
|
|
|
|
|
|
|
formatted_prompt = apply_chat_template( |
|
|
self.processor, |
|
|
self.config, |
|
|
question, |
|
|
num_images=1 |
|
|
) |
|
|
|
|
|
response = generate( |
|
|
self.model, |
|
|
self.processor, |
|
|
formatted_prompt, |
|
|
[image], |
|
|
max_tokens=1024, |
|
|
temperature=0.1, |
|
|
verbose=False |
|
|
) |
|
|
|
|
|
|
|
|
if isinstance(response, tuple): |
|
|
|
|
|
response = response[0] if response else "" |
|
|
elif isinstance(response, list): |
|
|
|
|
|
response = response[0] if response else "" |
|
|
|
|
|
|
|
|
response = str(response) if response is not None else "" |
|
|
|
|
|
return response.strip() |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"MLX-VLM single processing error: {e}") |
|
|
raise |
|
|
|
|
|
def _load_image_from_source(self, image_source: str) -> Image.Image: |
|
|
"""Load image from various sources (file path, URL, base64)""" |
|
|
import io |
|
|
try: |
|
|
if os.path.exists(image_source): |
|
|
return Image.open(image_source) |
|
|
elif image_source.startswith(('http://', 'https://')): |
|
|
import requests |
|
|
response = requests.get(image_source) |
|
|
return Image.open(io.BytesIO(response.content)) |
|
|
elif image_source.startswith('data:image'): |
|
|
|
|
|
import base64 |
|
|
header, data = image_source.split(',', 1) |
|
|
image_data = base64.b64decode(data) |
|
|
return Image.open(io.BytesIO(image_data)) |
|
|
else: |
|
|
raise ValueError(f"Unsupported image source: {image_source}") |
|
|
except Exception as e: |
|
|
logger.error(f"Failed to load image from source {image_source}: {e}") |
|
|
raise |
|
|
|
|
|
def single_inference(self, image: Union[str, Image.Image], question: str) -> str: |
|
|
"""Single image inference for compatibility""" |
|
|
return self._process_single(image, question) |
|
|
EOF |
|
|
|
|
|
|
|
|
echo "π Patching backend selection logic..." |
|
|
|
|
|
|
|
|
python3 << 'PYTHON_PATCH' |
|
|
import re |
|
|
|
|
|
|
|
|
with open('MonkeyOCR/magic_pdf/model/custom_model.py', 'r') as f: |
|
|
content = f.read() |
|
|
|
|
|
|
|
|
old_pattern = r"backend = chat_config\.get\('backend', 'lmdeploy'\)" |
|
|
new_pattern = "backend = chat_config.get('backend', 'auto')" |
|
|
|
|
|
content = re.sub(old_pattern, new_pattern, content) |
|
|
|
|
|
|
|
|
backend_selection_code = ''' |
|
|
# Smart backend selection for optimal performance |
|
|
if backend == 'auto': |
|
|
try: |
|
|
import torch |
|
|
if torch.backends.mps.is_available(): |
|
|
# Apple Silicon - prefer MLX |
|
|
try: |
|
|
import mlx_vlm |
|
|
backend = 'mlx' |
|
|
logger.info("Auto-selected MLX backend for Apple Silicon") |
|
|
except ImportError: |
|
|
backend = 'transformers' |
|
|
logger.info("MLX not available, using transformers backend") |
|
|
elif torch.cuda.is_available(): |
|
|
# CUDA available - prefer lmdeploy |
|
|
try: |
|
|
import lmdeploy |
|
|
backend = 'lmdeploy' |
|
|
logger.info("Auto-selected lmdeploy backend for CUDA") |
|
|
except ImportError: |
|
|
backend = 'transformers' |
|
|
logger.info("lmdeploy not available, using transformers backend") |
|
|
else: |
|
|
# CPU fallback |
|
|
backend = 'transformers' |
|
|
logger.info("Auto-selected transformers backend for CPU") |
|
|
except Exception as e: |
|
|
logger.warning(f"Auto-detection failed: {e}, using transformers backend") |
|
|
backend = 'transformers' |
|
|
''' |
|
|
|
|
|
|
|
|
pattern = r"(backend = chat_config\.get\('backend', 'auto'\))" |
|
|
replacement = pattern + backend_selection_code |
|
|
|
|
|
content = re.sub(pattern, replacement, content) |
|
|
|
|
|
|
|
|
mlx_backend_code = ''' elif backend == 'mlx': |
|
|
try: |
|
|
self.chat_model = MonkeyChat_MLX(model_path) |
|
|
logger.info("Successfully initialized MLX-VLM backend") |
|
|
except ImportError as e: |
|
|
logger.error(f"MLX-VLM not available: {e}") |
|
|
logger.info("Falling back to transformers backend") |
|
|
self.chat_model = MonkeyChat_transformers(model_path, device=device) |
|
|
except Exception as e: |
|
|
logger.error(f"Failed to initialize MLX backend: {e}") |
|
|
logger.info("Falling back to transformers backend") |
|
|
self.chat_model = MonkeyChat_transformers(model_path, device=device) |
|
|
''' |
|
|
|
|
|
|
|
|
pattern = r"(elif backend == 'transformers':)" |
|
|
replacement = mlx_backend_code + "\n " + pattern |
|
|
|
|
|
content = re.sub(pattern, replacement, content) |
|
|
|
|
|
|
|
|
with open('MonkeyOCR/magic_pdf/model/custom_model.py', 'w') as f: |
|
|
f.write(content) |
|
|
|
|
|
print("β
Backend selection logic patched successfully") |
|
|
PYTHON_PATCH |
|
|
|
|
|
echo "β
MLX-VLM patches applied successfully" |
|
|
} |
|
|
|
|
|
|
|
|
apply_mlx_patches |
|
|
|
|
|
|
|
|
echo "π§ Creating virtual environment..." |
|
|
uv venv --python 3.11 |
|
|
|
|
|
|
|
|
echo "π¦ Installing dependencies..." |
|
|
source .venv/bin/activate |
|
|
uv pip install -r requirements.txt |
|
|
|
|
|
|
|
|
echo "π¦ Installing MonkeyOCR package..." |
|
|
cd MonkeyOCR |
|
|
source ../.venv/bin/activate |
|
|
|
|
|
uv pip install -r requirements.txt |
|
|
|
|
|
uv pip install -e . --no-deps |
|
|
cd .. |
|
|
|
|
|
|
|
|
echo "π₯ Downloading model weights..." |
|
|
cd MonkeyOCR |
|
|
source ../.venv/bin/activate |
|
|
python tools/download_model.py |
|
|
cd .. |
|
|
|
|
|
|
|
|
if command -v pdflatex &> /dev/null; then |
|
|
echo "β
LaTeX found - table rendering will work" |
|
|
else |
|
|
echo "β οΈ LaTeX not found - table rendering will be limited" |
|
|
echo " To install LaTeX: brew install --cask mactex" |
|
|
fi |
|
|
|
|
|
|
|
|
mkdir -p sample_docs |
|
|
echo "π Created sample_docs directory" |
|
|
|
|
|
echo "" |
|
|
echo "π Setup completed successfully!" |
|
|
echo "" |
|
|
echo "MonkeyOCR is now optimized with MLX-VLM for Apple Silicon!" |
|
|
echo "" |
|
|
echo "β¨ Applied Optimizations:" |
|
|
echo "- π MLX-VLM backend for 3x faster processing" |
|
|
echo "- π§ Smart backend auto-selection (MLX/LMDeploy/transformers)" |
|
|
echo "- π§ Fixed prompt formatting for optimal OCR output" |
|
|
echo "- π Native Apple Silicon acceleration" |
|
|
echo "" |
|
|
echo "To run the app:" |
|
|
echo " source .venv/bin/activate" |
|
|
echo " python app.py" |
|
|
echo "" |
|
|
echo "The app will be available at: http://localhost:7860" |
|
|
echo "" |
|
|
echo "Features:" |
|
|
echo "- MLX-VLM backend for 3x faster processing on Apple Silicon" |
|
|
echo "- Smart backend selection (MLX/LMDeploy/transformers)" |
|
|
echo "- Advanced table extraction and OCR" |
|
|
echo "- Web interface and command-line tools" |
|
|
echo "" |
|
|
echo "Tips:" |
|
|
echo "- Place sample documents in the 'sample_docs' directory" |
|
|
echo "- The first run may take longer as models are loaded" |
|
|
echo "- Monitor Activity Monitor to see MPS GPU usage" |