import traceback

import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify
from readability import Document


# Timeout, in seconds, passed to requests.get() when fetching a URL.
DEFAULT_TIMEOUT = 15

# Headers sent with every outgoing fetch; identifies this tool via User-Agent.
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}


def html_to_markdown_converter(url: str, html_input: str, *, use_readability: bool = True) -> str:
    """Convert HTML, fetched from a URL or pasted directly, into Markdown.

    The URL takes priority: ``html_input`` is only used when ``url`` is empty
    (after stripping whitespace). Unless ``use_readability`` is false, the
    HTML is first reduced to its main article content; if that extraction
    fails or yields no visible text, the full HTML is converted instead.

    Args:
        url: Address to fetch. ``https://`` is prepended when no http(s)
            scheme is present. ``None`` or blank means "not provided".
        html_input: Raw HTML to convert when no URL is given. ``None`` or
            blank means "not provided".
        use_readability: Keyword-only. When true (the default) try to extract
            the main content and title before converting.

    Returns:
        The Markdown text on success. Every failure is also reported as a
        string (prefixed with an emoji marker) rather than raised, so the
        result can always be displayed directly.
    """
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        if url:
            source = f"URL ({url})"
            html_content, fetch_error = _fetch_html(url)
            if fetch_error is not None:
                return fetch_error
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} characters).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content in the fields above."

        if not html_content:
            return f"❓ No HTML content found from {source}."

        if use_readability:
            processed_html, article_title = _extract_main_content(html_content)
        else:
            print("Skipping Readability step. Using full HTML content.")
            processed_html, article_title = html_content, ""

        if not processed_html.strip():
            return "❓ The HTML content (after potential processing) appears to be empty."

        print(f"Attempting to convert processed HTML (length: {len(processed_html)}) to Markdown...")
        try:
            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
        except Exception as e:
            print(f"Markdown conversion failed: {e}")
            return f"❌ **Error:** Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"
        print(f"Conversion successful. Markdown length: {len(markdown_output)}")

        # Only prepend the extracted title when the extracted summary (not the
        # full-page fallback) is what actually got converted.
        if article_title and processed_html != html_content:
            final_output = f"# {article_title}\n\n{markdown_output}"
        else:
            final_output = markdown_output

        if not final_output.strip():
            return "ℹ️ The conversion resulted in empty Markdown (the processed HTML might have contained only unsupported tags or whitespace)."

        return final_output

    except Exception as e:
        # Last-resort boundary: this function reports problems as text, never raises.
        print(f"An unexpected error occurred: {e}")
        return f"❌ **Error:** An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"


def _fetch_html(url: str):
    """Download ``url`` and return ``(html, error)``; ``error`` is None on success.

    On failure ``html`` is ``""`` and ``error`` is the user-facing message.
    """
    # NOTE(review): the address is requested exactly as supplied by the caller;
    # if callers are untrusted, consider restricting internal/private hosts.
    print(f"Attempting to fetch HTML from URL: {url}")
    try:
        # URL schemes are case-insensitive, so test a lowered copy; otherwise
        # "HTTP://host" would be turned into "https://HTTP://host".
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url
            print(f"Scheme missing, prepended https://. New URL: {url}")

        response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
        response.raise_for_status()
        # Decode with the encoding guessed from the body, defaulting to UTF-8
        # when no guess is available.
        response.encoding = response.apparent_encoding or 'utf-8'
        html = response.text
        print(f"Successfully fetched {len(html)} characters from URL.")
        return html, None
    except requests.exceptions.MissingSchema:
        return "", f"❌ **Error:** Invalid URL: `{url}`. Please include `http://` or `https://`."
    except requests.exceptions.Timeout:
        return "", f"❌ **Error:** Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return "", f"❌ **Error:** Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
    except Exception as e:
        print(f"An unexpected error occurred during fetch: {e}")
        return "", f"❌ **Error:** An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"


def _extract_main_content(html: str):
    """Return ``(processed_html, title)`` using Readability, or ``(html, "")`` on fallback.

    Falls back to the unmodified input when extraction raises or when the
    extracted summary contains no visible text.
    """
    print("Attempting to extract main content using Readability...")
    try:
        doc = Document(html)
        title = doc.title()
        summary = doc.summary()
        print(f"Readability extracted title: '{title}'")
        print(f"Readability summary HTML length: {len(summary)}")

        # The summary may be markup with no visible text; judge it by its text.
        if BeautifulSoup(summary, 'html.parser').text.strip():
            print("Using Readability summary for conversion.")
            return summary, title
        print("Readability summary was empty or non-substantial. Falling back to full HTML.")
    except Exception as e:
        # Extraction is best-effort: log the failure and convert the full page.
        print(f"Readability processing failed: {e}\n{traceback.format_exc()}")
        print("Falling back to using full HTML content due to Readability error.")
    return html, ""


# Static text handed to gr.Interface as title=, description= and article=.
title = "HTML to Markdown Converter (Smart Extraction)"

description = """
Enter a URL **or** paste HTML code directly into the text box below.
The tool attempts to **extract the main article content** using Mozilla's Readability library, removing clutter like menus and sidebars (this works best with URLs).
The extracted HTML is then converted into Markdown. Priority is given to the URL input.
"""

article = """
**How it works:**
1.  Uses `requests` to fetch content from URLs.
2.  Uses `readability-lxml` to attempt extracting the main article content and title from the fetched/pasted HTML.
3.  If Readability fails or returns empty content, it falls back to using the full HTML.
4.  Uses `markdownify` to convert the processed HTML into Markdown (ATX headings, '*' bullets).
5.  If a title was found by Readability, it's prepended to the Markdown output.
"""


# Input widgets; their order matches the (url, html_input) parameters of
# html_to_markdown_converter because they are passed positionally below.
url_input = gr.Textbox(
    label="Enter URL (gets priority)",
    placeholder="e.g., en.wikipedia.org/wiki/Markdown",
)
html_input_area = gr.Textbox(
    label="Or Paste HTML Code Here",
    lines=10,
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
)

# Output widget that receives the string returned by the converter.
markdown_output_display = gr.Markdown(label="Converted Markdown Output")

iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_display,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    # Each example is a [url, html_input] pair; the last two entries exercise
    # a slow response and a malformed address.
    examples=[
        ["https://gradio.app/quickstart/", ""],
        ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
        ["https://www.bbc.com/news", ""],
        ["", "<body><header>Menu</header><main><h1>Main Title</h1><p>Article content here.</p></main><footer>Copyright</footer></body>"],
        ["https://httpbin.org/delay/5", ""],
        ["invalid-url", ""],
    ],
    cache_examples=False,
)


# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()