Spaces:

kusatmer
/

image-text-extractor

Running

App Files Files Community

kusatmer committed 19 days ago

Commit

b8b55ff

0 Parent(s):

feat: Implement initial image text extraction application with Streamlit UI, OCR service, and tests.

Browse files

Files changed (8) hide show

.gitignore +141 -0
.vscode/settings.json +4 -0
README.md +61 -0
requirements.txt +10 -0
service/__init__.py +3 -0
service/text_extraction_service.py +213 -0
streamlit_app.py +145 -0
tests/test_text_extraction_service.py +112 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,141 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   with no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# macOS
+.DS_Store
+# VS Code

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "python.defaultInterpreterPath": "/opt/homebrew/bin/python3",
+  "python.analysis.typeCheckingMode": "off"
+}

README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+# Image Text Extractor
+This project is a Streamlit application that uses the `olmOCR` model (based on Qwen2.5-VL) to extract text from images. It provides a user-friendly interface to upload images and view the extracted text along with metadata.
+## Features
+-   **Image Upload**: Support for PNG, JPG, and JPEG formats.
+-   **Text Extraction**: Uses state-of-the-art Vision-Language Models for accurate OCR.
+-   **Metadata Extraction**: Extracts additional information like primary language, rotation, and content type (table, diagram).
+-   **JSON Export**: Download extraction results as JSON files.
+-   **Configurable**: Adjust the maximum number of generated tokens to handle longer documents.
+## Installation
+1.  **Clone the repository**:
+    ```bash
+    git clone <repository-url>
+    cd image-text-extractor
+    ```
+2.  **Create a virtual environment** (recommended):
+    ```bash
+    python -m venv venv
+    source venv/bin/activate  # On Windows: venv\Scripts\activate
+    ```
+3.  **Install dependencies**:
+    ```bash
+    pip install -r requirements.txt
+    ```
+## Usage
+1.  **Run the Streamlit app**:
+    ```bash
+    streamlit run streamlit_app.py
+    ```
+2.  **Open your browser**:
+    The app should automatically open in your default browser at `http://localhost:8501`.
+## Testing
+This project uses `pytest` for unit testing.
+1.  **Run tests**:
+    ```bash
+    pytest tests/
+    ```
+## Project Structure
+-   `streamlit_app.py`: The main entry point for the Streamlit application.
+-   `service/`: Contains the backend logic for text extraction.
+    -   `text_extraction_service.py`: The core service class handling model interaction.
+-   `tests/`: Unit tests for the application.
+-   `requirements.txt`: Python dependencies.
+## License
+[Add License Here]

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+streamlit>=1.28.0
+torch>=2.0.0
+torchvision>=0.15.0
+transformers>=4.55.2
+pillow>=10.0.0
+olmocr>=0.4.6
+pytest>=7.0.0
+pytest-mock>=3.10.0

service/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # Services package for text extraction functionality
2	+
3	+

service/text_extraction_service.py ADDED Viewed

	@@ -0,0 +1,213 @@

+"""
+Text Extraction Service
+Handles OCR text extraction from images using olmOCR model.
+Separated from UI concerns for better maintainability.
+"""
+import base64
+import json
+import os
+import re
+from io import BytesIO
+from typing import Dict, Tuple, Optional
+import torch
+from PIL import Image
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from olmocr.prompts import build_no_anchoring_v4_yaml_prompt
+class TextExtractionService:
+    """
+    Service class for extracting text from images using olmOCR model.
+    Handles model initialization, image processing, and result formatting.
+    """
+    def __init__(self, model_name: str = "allenai/olmOCR-2-7B-1025",
+                 processor_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"):
+        """
+        Initialize the text extraction service with model and processor.
+        Args:
+            model_name: Name of the olmOCR model to use
+            processor_name: Name of the processor to use
+        """
+        self.model_name = model_name
+        self.processor_name = processor_name
+        self.model = None
+        self.processor = None
+        self.device = None
+        self._initialize_model()
+    def _initialize_model(self):
+        """Initialize the model and processor, set up device."""
+        # Initialize model
+        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            self.model_name,
+            torch_dtype=torch.bfloat16
+        ).eval()
+        # Initialize processor
+        self.processor = AutoProcessor.from_pretrained(self.processor_name)
+        # Determine device (CUDA, MPS for Mac, or CPU)
+        if torch.cuda.is_available():
+            self.device = torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            self.device = torch.device("mps")
+        else:
+            self.device = torch.device("cpu")
+        # Move model to device
+        self.model.to(self.device)
+    def _parse_ocr_output(self, raw_text: str) -> Tuple[Dict, str]:
+        """
+        Parse OCR output that contains YAML frontmatter and extract metadata and text separately.
+        Args:
+            raw_text: Raw output from OCR model
+        Returns:
+            Tuple of (metadata_dict, extracted_text)
+        """
+        # Split by YAML delimiters
+        parts = raw_text.split("---")
+        metadata = {}
+        extracted_text = ""
+        if len(parts) >= 3:
+            # Extract metadata from between first two --- markers
+            yaml_content = parts[1].strip()
+            # Extract text after second --- marker
+            extracted_text = parts[2].strip()
+            # Parse YAML-like key-value pairs
+            for line in yaml_content.split("\n"):
+                line = line.strip()
+                if ":" in line:
+                    key, value = line.split(":", 1)
+                    key = key.strip()
+                    value = value.strip()
+                    # Convert string booleans and numbers
+                    if value.lower() == "true":
+                        value = True
+                    elif value.lower() == "false":
+                        value = False
+                    elif value.isdigit():
+                        value = int(value)
+                    elif re.match(r"^-?\d+\.\d+$", value):
+                        value = float(value)
+                    metadata[key] = value
+        else:
+            # No YAML frontmatter found, use entire text
+            extracted_text = raw_text.strip()
+        return metadata, extracted_text
+    def extract_text_from_image(self, image: Image.Image,
+                                max_new_tokens: int = 2048) -> Dict:
+        """
+        Extract text from a PIL Image object.
+        Args:
+            image: PIL Image object to extract text from
+            max_new_tokens: Maximum number of tokens to generate
+        Returns:
+            Dictionary containing extracted text and metadata
+        """
+        # Convert image to base64
+        buffered = BytesIO()
+        image.save(buffered, format="PNG")
+        image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+        # Build the full prompt
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": build_no_anchoring_v4_yaml_prompt()},
+                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
+                ],
+            }
+        ]
+        # Apply the chat template and processor
+        text = self.processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        # Process inputs
+        inputs = self.processor(
+            text=[text],
+            images=[image],
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = {key: value.to(self.device) for (key, value) in inputs.items()}
+        # Generate the output
+        output = self.model.generate(
+            **inputs,
+            temperature=0.1,
+            max_new_tokens=max_new_tokens,
+            num_return_sequences=1,
+            do_sample=True,
+        )
+        # Decode the output
+        prompt_length = inputs["input_ids"].shape[1]
+        new_tokens = output[:, prompt_length:]
+        text_output = self.processor.tokenizer.batch_decode(
+            new_tokens,
+            skip_special_tokens=True
+        )
+        # Extract the text content
+        raw_output = text_output[0] if text_output else ""
+        # Parse the output
+        metadata, extracted_text = self._parse_ocr_output(raw_output)
+        # Prepare result data structure
+        result_data = {
+            "extracted_text": extracted_text,
+            "primary_language": metadata.get("primary_language", None),
+            "is_rotation_valid": metadata.get("is_rotation_valid", None),
+            "rotation_correction": metadata.get("rotation_correction", None),
+            "is_table": metadata.get("is_table", None),
+            "is_diagram": metadata.get("is_diagram", None),
+            "model": self.model_name,
+            "processor": self.processor_name
+        }
+        return result_data
+    def save_result_to_json(self, result_data: Dict, output_path: str,
+                           source_image_name: Optional[str] = None):
+        """
+        Save extraction result to JSON file.
+        Args:
+            result_data: Dictionary containing extraction results
+            output_path: Path where to save the JSON file
+            source_image_name: Optional name of the source image
+        """
+        # Add source image name if provided
+        if source_image_name:
+            result_data["source_image"] = source_image_name
+        # Ensure output directory exists
+        output_dir = os.path.dirname(output_path)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+        # Save to JSON file
+        with open(output_path, "w", encoding="utf-8") as json_file:
+            json.dump(result_data, json_file, ensure_ascii=False, indent=2)

streamlit_app.py ADDED Viewed

	@@ -0,0 +1,145 @@

+"""
+Streamlit App for Text Extraction from Images
+UI layer for the text extraction service.
+"""
+import html
+import json
+from pathlib import Path
+import streamlit as st
+from PIL import Image
+from service.text_extraction_service import TextExtractionService
+# Page configuration
+st.set_page_config(
+    page_title="Text Extraction from Images",
+    page_icon="📄",
+    layout="wide"
+)
+# Initialize session state
+if "extraction_service" not in st.session_state:
+    st.session_state.extraction_service = None
+if "extraction_result" not in st.session_state:
+    st.session_state.extraction_result = None
+@st.cache_resource
+def get_extraction_service():
+    """
+    Get or create the text extraction service instance.
+    Cached to avoid reloading the model on every interaction.
+    """
+    if st.session_state.extraction_service is None:
+        with st.spinner("Loading OCR model... This may take a moment."):
+            service = TextExtractionService()
+            st.session_state.extraction_service = service
+    return st.session_state.extraction_service
+def main():
+    """Main application function."""
+    st.title("📄 Text Extraction from Images")
+    st.markdown("Upload an image to extract text using olmOCR model.")
+    # Sidebar for settings
+    with st.sidebar:
+        st.header("⚙️ Settings")
+        max_tokens = st.slider(
+            "Max Tokens",
+            min_value=512,
+            max_value=4096,
+            value=2048,
+            step=256,
+            help="Maximum number of tokens to generate. Higher values allow longer text extraction."
+        )
+    # File uploader
+    uploaded_file = st.file_uploader(
+        "Choose an image file",
+        type=["png", "jpg", "jpeg"],
+        help="Upload an image file (PNG, JPG, JPEG)"
+    )
+    if uploaded_file is not None:
+        # Display uploaded image
+        st.subheader("📷 Uploaded Image")
+        image = Image.open(uploaded_file)
+        st.image(image)
+        st.caption(f"File: {uploaded_file.name}")
+        st.divider()
+        # Extract button
+        st.subheader("📝 Text Extraction")
+        if st.button("🔍 Extract Text", type="primary"):
+            try:
+                # Get extraction service
+                service = get_extraction_service()
+                # Extract text
+                with st.spinner("Extracting text from image... This may take a while."):
+                    result = service.extract_text_from_image(
+                        image,
+                        max_new_tokens=max_tokens
+                    )
+                # Store result in session state
+                st.session_state.extraction_result = result
+                st.session_state.extraction_result["source_image"] = uploaded_file.name
+            except Exception as e:
+                st.error(f"❌ Error during extraction: {str(e)}")
+                st.exception(e)
+        # Display results if available
+        if st.session_state.extraction_result:
+            st.divider()
+            result = st.session_state.extraction_result
+            st.subheader("📄 Extracted Text")
+            # Display extracted text with black color
+            extracted_text = result.get("extracted_text", "")
+            # Escape HTML to prevent injection and ensure proper display
+            escaped_text = html.escape(extracted_text)
+            st.markdown(
+                f'<div style="background-color: #f0f2f6; padding: 15px; border-radius: 5px; max-height: 300px; overflow-y: auto; color: #000000; white-space: pre-wrap; font-family: monospace;">{escaped_text}</div>',
+                unsafe_allow_html=True
+            )
+            # Display metadata (full JSON)
+            with st.expander("📊 Full JSON Metadata"):
+                st.json(result)
+            # Download JSON button
+            json_str = json.dumps(result, ensure_ascii=False, indent=2)
+            st.download_button(
+                label="💾 Download JSON",
+                data=json_str,
+                file_name=f"{Path(uploaded_file.name).stem}.json",
+                mime="application/json"
+            )
+    else:
+        # Show instructions when no file is uploaded
+        st.info("👆 Please upload an image file to get started.")
+        # Example section
+        with st.expander("ℹ️ How to use"):
+            st.markdown("""
+            1. **Upload an image**: Click on the file uploader and select an image file (PNG, JPG, JPEG)
+            2. **Adjust settings** (optional): Use the sidebar to adjust max tokens if needed
+            3. **Extract text**: Click the "Extract Text" button
+            4. **View results**: The extracted text and metadata will be displayed
+            5. **Download**: Download the results as JSON if needed
+            **Note**: The first extraction may take longer as the model needs to be loaded.
+            Subsequent extractions will be faster.
+            """)
+if __name__ == "__main__":
+    main()

tests/test_text_extraction_service.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import pytest
+from unittest.mock import MagicMock, patch
+from PIL import Image
+from service.text_extraction_service import TextExtractionService
+@pytest.fixture
+def mock_service(mocker):
+    """Fixture to create a TextExtractionService with mocked model and processor."""
+    with patch("service.text_extraction_service.Qwen2_5_VLForConditionalGeneration") as mock_model_cls, \
+         patch("service.text_extraction_service.AutoProcessor") as mock_processor_cls, \
+         patch("torch.cuda.is_available", return_value=False), \
+         patch("torch.backends.mps.is_available", return_value=False):
+        mock_model = MagicMock()
+        mock_model_cls.from_pretrained.return_value = mock_model
+        mock_processor = MagicMock()
+        mock_processor_cls.from_pretrained.return_value = mock_processor
+        service = TextExtractionService()
+        return service, mock_model, mock_processor
+def test_parse_ocr_output_with_yaml(mock_service):
+    service, _, _ = mock_service
+    raw_text = """Some prefix text
+---
+primary_language: English
+is_rotation_valid: true
+rotation_correction: 0
+is_table: false
+---
+This is the extracted text content.
+It has multiple lines.
+"""
+    metadata, text = service._parse_ocr_output(raw_text)
+    assert metadata["primary_language"] == "English"
+    assert metadata["is_rotation_valid"] is True
+    assert metadata["rotation_correction"] == 0
+    assert metadata["is_table"] is False
+    assert text == "This is the extracted text content.\nIt has multiple lines."
+def test_parse_ocr_output_without_yaml(mock_service):
+    service, _, _ = mock_service
+    raw_text = "Just some plain text without any YAML frontmatter."
+    metadata, text = service._parse_ocr_output(raw_text)
+    assert metadata == {}
+    assert text == "Just some plain text without any YAML frontmatter."
+def test_parse_ocr_output_malformed_yaml(mock_service):
+    service, _, _ = mock_service
+    # Missing the second separator
+    raw_text = """---
+key: value
+This should probably fail to parse as YAML but return text.
+"""
+    metadata, text = service._parse_ocr_output(raw_text)
+    # Based on current implementation logic:
+    # split('---') will return ['', '\nkey: value\nThis should...', ''] if it ends with ---
+    # or just 2 parts if it starts with --- but doesn't end.
+    # The implementation checks if len(parts) >= 3.
+    # If there are only 2 parts (one separator), it falls back to returning everything as text.
+    assert metadata == {}
+    assert "key: value" in text
+def test_extract_text_from_image(mock_service):
+    service, mock_model, mock_processor = mock_service
+    # Mock image
+    image = Image.new('RGB', (100, 100), color='red')
+    # Mock processor output
+    mock_processor.apply_chat_template.return_value = "mock_prompt"
+    mock_processor.return_value = {"input_ids": MagicMock(), "pixel_values": MagicMock()}
+    mock_processor.return_value["input_ids"].shape = [1, 10] # Mock shape
+    # Mock tokenizer decode
+    mock_processor.tokenizer.batch_decode.return_value = ["""---
+primary_language: English
+---
+Extracted Text"""]
+    # Mock model generate
+    mock_model.generate.return_value = MagicMock() # Return value doesn't matter much as we mock batch_decode
+    result = service.extract_text_from_image(image)
+    assert result["extracted_text"] == "Extracted Text"
+    assert result["primary_language"] == "English"
+    assert result["model"] == service.model_name
+def test_initialization_device_selection():
+    """Test that the correct device is selected based on availability."""
+    with patch("service.text_extraction_service.Qwen2_5_VLForConditionalGeneration"), \
+         patch("service.text_extraction_service.AutoProcessor"):
+        # Test CPU
+        with patch("torch.cuda.is_available", return_value=False), \
+             patch("torch.backends.mps.is_available", return_value=False):
+            service = TextExtractionService()
+            assert service.device.type == "cpu"
+        # Test CUDA
+        with patch("torch.cuda.is_available", return_value=True):
+            service = TextExtractionService()
+            assert service.device.type == "cuda"