scrape-to-markdown-expert-mode-cpu

Sleeping

File size: 8,418 Bytes

fd8df2d

import gradio as gr
import requests
from markdownify import markdownify
import traceback # To help format potential errors
from readability import Document # <<< NOVO IMPORT
from bs4 import BeautifulSoup # <<< NOVO/USADO PARA FALLBACK OU ANÁLISE

# Network settings applied to every outbound fetch.
DEFAULT_TIMEOUT = 15  # seconds before a request is abandoned
HEADERS = {
    # Identify the tool honestly so site operators can see who is calling.
    "User-Agent": "GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)",
}

def html_to_markdown_converter(url: str, html_input: str) -> str:
    """
    Convert HTML (fetched from a URL or pasted directly) to Markdown.

    The URL takes priority when both inputs are supplied. The HTML is first
    passed through readability to isolate the main article content; if that
    fails or yields no text, the full HTML is converted instead. When
    readability's summary is used and it reports a real title, the title is
    prepended as a level-1 heading.

    Args:
        url: Address to fetch. ``https://`` is prepended when no http/https
            scheme is present. May be empty or None.
        html_input: Raw HTML used only when ``url`` is empty. May be empty
            or None.

    Returns:
        The Markdown text on success. On any failure or empty input, a short
        Markdown-formatted message (prefixed with an emoji such as ❌ or ❓)
        is returned instead; exceptions are caught and reported in the
        returned string rather than raised.
    """
    # Literal that readability's Document.title() yields for a document that
    # has no usable <title>. It is a placeholder, not a real heading, so it
    # must never be emitted as "# [no-title]".
    no_title_placeholder = "[no-title]"

    html_content = ""
    source = ""
    use_readability = True  # Flag to control if readability is used (useful if we add an option later)

    # Clean up inputs; tolerate None and whitespace-only values.
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Step 1: Get HTML Content ---
        if url:
            print(f"Attempting to fetch HTML from URL: {url}")
            try:
                # Ensure URL has a scheme (http/https)
                if not url.startswith(('http://', 'https://')):
                    url = 'https://' + url
                    print(f"Scheme missing, prepended https://. New URL: {url}")

                # Label the source only after normalization so later messages
                # show the URL that was actually requested.
                source = f"URL ({url})"

                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
                response.raise_for_status()
                response.encoding = response.apparent_encoding or 'utf-8'
                html_content = response.text
                print(f"Successfully fetched {len(html_content)} bytes from URL.")
            except requests.exceptions.MissingSchema:
                # Safety net: the scheme is normalized above, so this is not
                # expected to fire, but the message is kept for completeness.
                return f"❌ **Error:** Invalid URL: `{url}`. Please include `http://` or `https://`."
            except requests.exceptions.Timeout:
                return f"❌ **Error:** Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}")
                return f"❌ **Error:** Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception as e:
                print(f"An unexpected error occurred during fetch: {e}")
                return f"❌ **Error:** An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"

        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} bytes).")
            html_content = html_input
            # Readability stays enabled for pasted input: it may be a full
            # page, and it can also tidy up smaller snippets.
        else:
            return "❓ Please provide a URL or paste HTML content in the fields above."

        # --- Step 2: Extract Main Content (Optional but Recommended) ---
        if not html_content:
            return f"❓ No HTML content found from {source}."

        processed_html = html_content
        # Invariant: article_title is non-empty only while processed_html holds
        # readability's summary; every fallback path below resets it to "".
        article_title = ""
        if use_readability:
            print("Attempting to extract main content using Readability...")
            try:
                doc = Document(html_content)
                extracted_title = (doc.title() or "").strip()
                # Discard the library's "no title" placeholder.
                if extracted_title != no_title_placeholder:
                    article_title = extracted_title
                processed_html = doc.summary()  # Cleaned main-content HTML
                print(f"Readability extracted title: '{article_title}'")
                print(f"Readability summary HTML length: {len(processed_html)}")

                # Check if readability returned anything substantial
                soup = BeautifulSoup(processed_html, 'html.parser')
                if not soup.text.strip():
                    print("Readability summary was empty or non-substantial. Falling back to full HTML.")
                    processed_html = html_content  # Fallback to original HTML
                    article_title = ""  # Title belongs to the discarded summary
                else:
                    print("Using Readability summary for conversion.")

            except Exception as e:
                print(f"Readability processing failed: {e}\n{traceback.format_exc()}")
                print("Falling back to using full HTML content due to Readability error.")
                processed_html = html_content  # Fallback on error
                article_title = ""

        else:
            print("Skipping Readability step. Using full HTML content.")
            processed_html = html_content

        # --- Step 3: Convert to Markdown ---
        if not processed_html.strip():
            # This can happen if original HTML was just whitespace, or if readability failed AND original was empty
            return "❓ The HTML content (after potential processing) appears to be empty."

        print(f"Attempting to convert processed HTML (length: {len(processed_html)}) to Markdown...")
        try:
            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
            print(f"Conversion successful. Markdown length: {len(markdown_output)}")

            # Prepend the title only when readability's summary was used and it
            # reported a real title (see the invariant on article_title above).
            if article_title:
                final_output = f"# {article_title}\n\n{markdown_output}"
            else:
                final_output = markdown_output

            if not final_output.strip():
                return "ℹ️ The conversion resulted in empty Markdown (the processed HTML might have contained only unsupported tags or whitespace)."

            return final_output

        except Exception as e:
            print(f"Markdown conversion failed: {e}")
            return f"❌ **Error:** Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"

    except Exception as e:
        # Last-resort boundary: report the failure in the UI instead of raising.
        print(f"An unexpected error occurred: {e}")
        return f"❌ **Error:** An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"

# --- Gradio Interface ---
# Static page copy: heading, introductory blurb, and the explanatory footer.
# The long texts are assembled line by line; the empty first and last items
# reproduce the leading and trailing newline of the rendered text.
title = "HTML to Markdown Converter (Smart Extraction)"

description = "\n".join((
    "",
    "Enter a URL **or** paste HTML code directly into the text box below.",
    "The tool attempts to **extract the main article content** using Mozilla's Readability library, removing clutter like menus and sidebars (this works best with URLs).",
    "The extracted HTML is then converted into Markdown. Priority is given to the URL input.",
    "",
))

article = "\n".join((
    "",
    "**How it works:**",
    "1.  Uses `requests` to fetch content from URLs.",
    "2.  Uses `readability-lxml` to attempt extracting the main article content and title from the fetched/pasted HTML.",
    "3.  If Readability fails or returns empty content, it falls back to using the full HTML.",
    "4.  Uses `markdownify` to convert the processed HTML into Markdown (ATX headings, '*' bullets).",
    "5.  If a title was found by Readability, it's prepended to the Markdown output.",
    "",
))

# Input widgets: a single-line field for the URL and a taller box for raw HTML.
url_input = gr.Textbox(
    placeholder="e.g., en.wikipedia.org/wiki/Markdown",
    label="Enter URL (gets priority)",
)
html_input_area = gr.Textbox(
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
    label="Or Paste HTML Code Here",
    lines=10,
)

# Output widget: displays the string returned by the converter as Markdown.
markdown_output_display = gr.Markdown(label="Converted Markdown Output")

# Wire the converter to the widgets. Each example row is [url, html], matching
# the order of `inputs`.
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_display,
    title=title,
    description=description,
    article=article,
    examples=[
        # Documentation page.
        ["https://gradio.app/quickstart/", ""],
        # Long encyclopedic article; a good exercise for content extraction.
        ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
        # News front page with plenty of navigation clutter.
        ["https://www.bbc.com/news", ""],
        # Pasted HTML only (no URL).
        ["", "<body><header>Menu</header><main><h1>Main Title</h1><p>Article content here.</p></main><footer>Copyright</footer></body>"],
        # Deliberately slow endpoint.
        ["https://httpbin.org/delay/5", ""],
        # Input that is not a usable address; exercises the error path.
        ["invalid-url", ""],
    ],
    cache_examples=False,
    allow_flagging="never",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()