|
|
import gradio as gr |
|
|
import requests |
|
|
from markdownify import markdownify |
|
|
import traceback |
|
|
from readability import Document |
|
|
from bs4 import BeautifulSoup |
|
|
import logging |
|
|
import socket |
|
|
import ipaddress |
|
|
from urllib.parse import urlparse |
|
|
|
|
|
|
|
|
# --- Module-level configuration ---

# Seconds to wait for the HTTP response before aborting a URL fetch.
DEFAULT_TIMEOUT = 20

# Identify this tool to remote servers; some sites reject UA-less requests.
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.2 (+https://hf.space)'}

# Hard cap (10 MB) on fetched/pasted HTML to bound memory use.
MAX_CONTENT_SIZE_BYTES = 10 * 1024 * 1024

# Candidate titles shorter than this are treated as noise and discarded.
MIN_TITLE_LENGTH = 4

# Non-content tags stripped from the HTML before extraction/conversion.
PRECLEAN_TAGS_TO_REMOVE = [
    'script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'input', 'textarea', 'select', 'option', 'label'
]

# Opaque message returned to the user for unexpected failures (details go to logs).
GENERIC_ERROR_MESSAGE = "❌ Error: An unexpected internal error occurred. Please check logs or try again later."

# Markers describing where the HTML came from; URL sources get Readability treatment.
SOURCE_URL_PREFIX = "URL"

SOURCE_DIRECT_INPUT = "Direct HTML Input"

# Root-logger configuration for the whole app.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
|
|
|
|
|
|
|
def _is_ip_allowed(hostname: str) -> bool: |
|
|
"""Verifica se o IP resolvido do hostname é permitido (não privado/local).""" |
|
|
try: |
|
|
addr_info = socket.getaddrinfo(hostname, None) |
|
|
ip_addr_str = addr_info[0][4][0] |
|
|
ip_addr = ipaddress.ip_address(ip_addr_str) |
|
|
if ip_addr.is_private or ip_addr.is_loopback or ip_addr.is_link_local: |
|
|
logging.warning(f"Blocked attempt to access internal/private IP: {ip_addr_str} for hostname {hostname}") |
|
|
return False |
|
|
logging.info(f"Hostname {hostname} resolved to allowed public IP {ip_addr_str}.") |
|
|
return True |
|
|
except socket.gaierror as e: |
|
|
logging.error(f"Could not resolve hostname: {hostname} - {e}") |
|
|
return False |
|
|
except Exception as e: |
|
|
logging.error(f"Unexpected error during IP validation for {hostname}: {e}", exc_info=True) |
|
|
return False |
|
|
|
|
|
def _fetch_and_clean_html(url: str, html_input: str) -> tuple[str | None, str | None, str | None]:
    """
    Fetch HTML from the URL or take the direct input, then pre-clean it.

    Returns a tuple (cleaned_html, source_description, error_message):
    - (cleaned_html, source, None) on success.
    - (None, source, error_message) when fetching/cleaning fails.
    - (None, None, error_message) when neither input was provided.
    """
    html_content = ""
    source = None

    if url:
        source = f"{SOURCE_URL_PREFIX} ({url})"
        logging.info(f"Attempting to fetch HTML from URL: {url}")
        try:
            # Be lenient with scheme-less input: assume HTTPS.
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
                logging.info(f"Scheme missing, prepended https://. New URL: {url}")

            parsed_url = urlparse(url)
            if not parsed_url.scheme or not parsed_url.netloc:
                raise ValueError("Invalid URL structure.")
            # SSRF guard: refuse URLs that resolve to internal/private addresses.
            if not _is_ip_allowed(parsed_url.hostname):
                return None, source, f"❌ Error: Access to this URL's IP address is not allowed for security reasons."

            # stream=True defers the body download until .text is accessed.
            # NOTE(review): the first size gate below trusts the Content-Length
            # header; a server that omits it can still transmit an oversized
            # body before the post-decode check fires — consider enforcing the
            # cap while streaming (iter_content).
            response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True, stream=True)
            response.raise_for_status()

            # First size gate: declared Content-Length, when present.
            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > MAX_CONTENT_SIZE_BYTES:
                logging.warning(f"Content-Length {content_length} exceeds limit for URL: {url}")
                return None, source, f"❌ Error: Content exceeds maximum allowed size ({MAX_CONTENT_SIZE_BYTES // 1024 // 1024}MB)."

            # Prefer the encoding sniffed from the body over the header value.
            response.encoding = response.apparent_encoding or 'utf-8'
            html_content = response.text
            # Second size gate: re-encode to estimate the real byte size
            # (10% slack for encoding overhead).
            if len(html_content.encode(response.encoding, errors='ignore')) > MAX_CONTENT_SIZE_BYTES * 1.1:
                logging.warning(f"Decoded content size exceeds limit for URL: {url}")
                return None, source, f"❌ Error: Decoded content exceeds estimated maximum size."
            logging.info(f"Successfully fetched {len(html_content)} bytes from {url}.")

        except ValueError as e:
            # Raised above for a malformed URL structure.
            logging.error(f"Invalid URL provided: {url} - {e}")
            return None, source, f"❌ Error: Invalid URL format: `{url}`."
        except requests.exceptions.MissingSchema:
            # Unlikely to trigger (a scheme is prepended above); kept as a safety net.
            logging.error(f"Invalid URL (Missing Schema): {url}")
            return None, source, f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
        except requests.exceptions.Timeout:
            logging.warning(f"Request timed out for URL: {url}")
            return None, source, f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
        except requests.exceptions.RequestException as e:
            # Any other transport/HTTP error (DNS, connection, 4xx/5xx...).
            logging.error(f"Failed to fetch URL: {url} - {e}")
            return None, source, f"❌ Error: Failed to fetch content from URL: `{url}`\nDetails: {e}"
        except Exception as e:
            logging.error(f"Unexpected error fetching URL {url}: {traceback.format_exc()}")
            return None, source, GENERIC_ERROR_MESSAGE

    elif html_input:
        source = SOURCE_DIRECT_INPUT
        logging.info(f"Using {source} ({len(html_input)} bytes).")
        # 20% slack versus the URL limit since pasted input is already in memory.
        if len(html_input) > MAX_CONTENT_SIZE_BYTES * 1.2:
            logging.warning(f"Direct HTML input size {len(html_input)} exceeds limit.")
            return None, source, f"❌ Error: Pasted HTML exceeds maximum allowed size."
        html_content = html_input
    else:
        # Neither input field was filled in.
        return None, None, "❓ Please provide a URL or paste HTML content in the fields above."

    if not html_content:
        # Defensive: both branches above should have populated html_content.
        logging.error("Reached pre-cleaning stage with no HTML content.")
        return None, source, f"❓ No HTML content found from {source}."

    logging.info("Pre-cleaning HTML...")
    try:
        # Strip scripts, styles, navigation chrome, form controls, etc. before
        # any content extraction or Markdown conversion.
        soup_pre = BeautifulSoup(html_content, 'lxml')
        for tag in soup_pre(PRECLEAN_TAGS_TO_REMOVE):
            tag.decompose()
        cleaned_html = str(soup_pre)
        logging.info(f"HTML pre-cleaned. Size reduced to {len(cleaned_html)} bytes.")

        return cleaned_html, source, None
    except Exception as e:
        logging.error(f"Error during HTML pre-cleaning: {traceback.format_exc()}")
        return None, source, "❌ Error: Failed during HTML pre-cleaning step."
|
|
|
|
|
|
|
|
|
|
|
def _extract_content_and_title(cleaned_html: str, source: str) -> tuple[str | None, str | None]:
    """
    Extract the main content (Readability, URL sources only) and pick a title.

    For URL sources the Readability summary replaces the full HTML when it is
    non-empty; pasted HTML is used as-is. The title comes from Readability when
    acceptable, otherwise from the first <h1> of the cleaned HTML.

    Args:
        cleaned_html: Pre-cleaned HTML produced by _fetch_and_clean_html.
        source: Source description (starts with SOURCE_URL_PREFIX for URLs,
            or equals SOURCE_DIRECT_INPUT for pasted HTML).

    Returns:
        (processed_html, final_title); final_title is None when no suitable
        title was found.
    """
    # Removed the dead `use_readability = True` flag that previously (and
    # unconditionally) guarded the URL branch.
    processed_html = cleaned_html
    readability_title = None

    if source and source.startswith(SOURCE_URL_PREFIX):
        logging.info("Source is URL. Attempting to extract main content using Readability...")
        try:
            doc = Document(cleaned_html)
            readability_title = doc.title()
            summary_html = doc.summary()
            # Guard against Readability returning an empty shell document.
            if BeautifulSoup(summary_html, 'lxml').text.strip():
                processed_html = summary_html
                logging.info(f"Readability extracted title: '{readability_title}'. Using summary content for URL.")
            else:
                logging.warning("Readability summary was empty for URL. Falling back to cleaned full HTML.")
                readability_title = None
        except Exception as e:
            logging.warning(f"Readability processing failed for URL: {e}. Falling back to cleaned full HTML.")
            readability_title = None
    elif source == SOURCE_DIRECT_INPUT:
        logging.info("Source is Direct HTML Input. Skipping Readability content extraction.")
    else:
        logging.warning(f"Source type '{source}' unknown or missing, skipping Readability.")

    final_title = None
    # Accept the Readability title only if long enough and not a placeholder
    # such as '[no-title]'.
    if readability_title and len(readability_title) >= MIN_TITLE_LENGTH and not readability_title.strip().startswith('['):
        final_title = readability_title.strip()
        logging.info(f"Using Readability title: '{final_title}'")

    if not final_title:
        if source and source.startswith(SOURCE_URL_PREFIX):
            logging.info("Readability title not suitable or not found for URL. Looking for H1 fallback in cleaned HTML...")
        else:
            logging.info("Looking for H1 title in cleaned HTML...")
        try:
            # Fallback: first non-empty <h1> of the *cleaned* (pre-Readability) HTML.
            h1_tag = BeautifulSoup(cleaned_html, 'lxml').find('h1')
            if h1_tag:
                h1_text = h1_tag.get_text(strip=True)
                if h1_text:
                    final_title = h1_text
                    logging.info(f"Using H1 fallback title: '{final_title}'")
                else:
                    logging.info("Found H1 tag but it was empty.")
            else:
                logging.info("No H1 tag found in cleaned HTML for fallback title.")
        except Exception:
            # Title is optional; log and continue without one.
            logging.error(f"Error searching for H1 fallback title: {traceback.format_exc()}")

    return processed_html, final_title
|
|
|
|
|
|
|
|
def _convert_to_markdown(processed_html: str, final_title: str | None) -> tuple[str | None, str | None]:
    """
    Convert the processed HTML to Markdown, first dropping a leading H1 that
    merely repeats the chosen title (the title is re-added as a Markdown H1).

    Returns:
        (final_markdown, None) on success, or (None, error_message) on failure.
    """
    html_to_convert = processed_html

    # The title gets prepended manually below, so an identical first <h1>
    # inside the content would appear twice; remove it up front.
    if final_title:
        logging.info(f"Checking for title duplication (first H1 in processed content)...")
        try:
            parsed_doc = BeautifulSoup(processed_html, 'lxml')
            leading_h1 = parsed_doc.find('h1')
            if leading_h1 is None:
                logging.info("No H1 found in processed content to check for duplication.")
            else:
                h1_proc_text = leading_h1.get_text(strip=True)
                if h1_proc_text != final_title:
                    logging.info(f"First H1 content ('{h1_proc_text}') does not match final title ('{final_title}'). Keeping H1.")
                else:
                    logging.info(f"Found matching H1 ('{h1_proc_text}') in content. Removing it to prevent duplication.")
                    leading_h1.decompose()
                    html_to_convert = str(parsed_doc)
        except Exception as e:
            # Deduplication is best-effort; conversion proceeds regardless.
            logging.error(f"Error during title duplication check: {traceback.format_exc()}")

    if not html_to_convert.strip():
        logging.warning("HTML content (after processing) is empty. Cannot convert.")
        return None, f"❓ The HTML content (after processing) appears to be empty."

    logging.info(f"Attempting to convert final processed HTML (length: {len(html_to_convert)}) to Markdown...")
    try:
        markdown_output = markdownify(html_to_convert, heading_style="ATX", bullets='*').strip()
        final_markdown = f"# {final_title}\n\n{markdown_output}" if final_title else markdown_output

        if not final_markdown.strip():
            logging.warning("Markdown conversion resulted in empty output.")
            return None, f"ℹ️ The conversion resulted in empty Markdown."

        logging.info(f"Successfully converted to Markdown (length: {len(final_markdown)}).")
        return final_markdown.strip(), None
    except Exception as e:
        logging.error(f"Failed to convert HTML to Markdown: {traceback.format_exc()}")
        return None, "❌ Error: Failed during the final Markdown conversion step."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def html_to_markdown_converter(url: str, html_input: str) -> str:
    """
    Convert HTML (fetched from a URL or pasted directly) to Markdown.

    Orchestrates the fetch/clean -> extract -> convert pipeline and maps every
    failure onto a user-facing error string, so the Gradio layer never sees an
    exception.

    Returns:
        The Markdown text, or an error/info message starting with an emoji.
    """
    url = (url or "").strip()
    html_input = (html_input or "").strip()

    try:
        # Stage 1: obtain and pre-clean the HTML.
        cleaned_html, source, error_msg = _fetch_and_clean_html(url, html_input)
        if error_msg:
            return error_msg
        if cleaned_html is None or source is None:
            # Helpers should always pair a None result with an error message.
            logging.error("Fetching/cleaning returned None HTML/source without error message.")
            return GENERIC_ERROR_MESSAGE

        # Stage 2: main-content extraction (URL sources) and title selection.
        processed_html, final_title = _extract_content_and_title(cleaned_html, source)
        if processed_html is None:
            logging.error("Processed HTML became None unexpectedly after extraction step.")
            return GENERIC_ERROR_MESSAGE

        # Stage 3: final Markdown conversion.
        final_markdown, convert_error_msg = _convert_to_markdown(processed_html, final_title)
        return convert_error_msg if convert_error_msg else final_markdown

    except Exception as e:
        # Last-resort guard: surface a generic message, keep details in the log.
        logging.error(f"FATAL: Unexpected error in main converter function: {traceback.format_exc()}")
        return GENERIC_ERROR_MESSAGE
|
|
|
|
|
|
|
|
|
|
|
# --- UI copy (rendered verbatim by the Gradio Interface below) ---

title = "Smart Scrape Any URL or Website to Markdown [Expert CPU Mode]"

# Markdown shown above the input fields.
description = """
Enter a URL **or** paste HTML code directly into the text box below.
- For **URLs**, the tool attempts to extract the main article content using `readability` before converting.
- For **Pasted HTML**, the tool converts the *entire* provided HTML (after basic cleaning) without using `readability`'s content extraction.
It identifies a title (page title or first H1 fallback) and converts to Markdown. Includes security checks and size limits.
Use the **copy icon** (📋) in the output box to copy the code.
"""

# Markdown shown below the interface, documenting the pipeline.
article = """
**How it works (v1.2):**
1. **Input:** Accepts URL or direct HTML.
2. **Fetch/Clean:** Gets HTML, performs security checks (IP block, size limit), removes basic tags (`<script>`, `<nav>`, etc.). Determines if source is URL or Direct Input.
3. **Content Processing:**
* **If Source is URL:** Attempts `readability-lxml` extraction (`doc.summary()`). Falls back to cleaned HTML if extraction fails/is empty.
* **If Source is Direct Input:** **Skips** `readability-lxml` extraction. Uses the cleaned HTML directly.
4. **Title Logic:** Tries Readability title (if URL source). Falls back to first `<h1>` in *cleaned* HTML otherwise.
5. **Deduplication:** Removes the first `<h1>` from the *processed content* if it matches the determined title.
6. **Conversion:** Uses `markdownify` to convert the final processed HTML to Markdown.
7. **Output:** Prepends title (if found) and returns Markdown or error message.
8. **Logging:** Uses Python's `logging`.
"""
|
|
|
|
|
|
|
|
# --- Gradio UI definition ---
# The source had `gr.Textbox(...)` with literal Ellipsis arguments — placeholder
# or truncated code that would pass `Ellipsis` as the component value. Concrete,
# sensible configurations are provided instead.

url_input = gr.Textbox(
    label="URL",
    placeholder="https://example.com/article",
    lines=1,
)

html_input_area = gr.Textbox(
    label="Or paste HTML code here",
    placeholder="<html>...</html>",
    lines=10,
)

markdown_output_textbox = gr.Textbox(
    label="Markdown Output",
    lines=20,
    show_copy_button=True,  # the copy icon referenced in the description
)

iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_textbox,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    examples=[
        # URL examples (fetched live when clicked; example caching disabled below).
        ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
        ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
        # Direct-HTML examples.
        ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
        ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"]
    ],
    cache_examples=False
)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Launch the Gradio app only when executed as a script (not on import).
    iface.launch()