import gradio as gr
import requests
from markdownify import markdownify
import traceback
from readability import Document
from bs4 import BeautifulSoup
import logging
import socket
import ipaddress
from urllib.parse import urlparse

# --- Configuration Constants ---
DEFAULT_TIMEOUT = 20  # seconds
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.2 (+https://hf.space)'}  # Updated version
MAX_CONTENT_SIZE_BYTES = 10 * 1024 * 1024
MIN_TITLE_LENGTH = 4
PRECLEAN_TAGS_TO_REMOVE = [
    'script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer',
    'nav', 'aside', 'form', 'button', 'input', 'textarea', 'select',
    'option', 'label'
]
GENERIC_ERROR_MESSAGE = "❌ Error: An unexpected internal error occurred. Please check logs or try again later."
SOURCE_URL_PREFIX = "URL"  # Identifier for URL source
SOURCE_DIRECT_INPUT = "Direct HTML Input"  # Identifier for direct input

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Helper Functions ---

def _is_ip_allowed(hostname: str) -> bool:
    """Checks whether the hostname resolves to an allowed (non-private/local) IP."""
    try:
        addr_info = socket.getaddrinfo(hostname, None)
        ip_addr_str = addr_info[0][4][0]
        ip_addr = ipaddress.ip_address(ip_addr_str)
        if ip_addr.is_private or ip_addr.is_loopback or ip_addr.is_link_local:
            logging.warning(f"Blocked attempt to access internal/private IP: {ip_addr_str} for hostname {hostname}")
            return False
        logging.info(f"Hostname {hostname} resolved to allowed public IP {ip_addr_str}.")
        return True
    except socket.gaierror as e:
        logging.error(f"Could not resolve hostname: {hostname} - {e}")
        return False
    except Exception as e:
        logging.error(f"Unexpected error during IP validation for {hostname}: {e}", exc_info=True)
        return False


def _fetch_and_clean_html(url: str, html_input: str) -> tuple[str | None, str | None, str | None]:
    """
    Fetches HTML from the URL or uses the direct input, then pre-cleans it.
    Returns a tuple: (cleaned_html, source_description, error_message).
    Returns (None, source, error_message) on error.
    Returns (None, None, error_message) if no input was provided.
    """
    html_content = ""
    source = None  # Initialize source

    if url:
        source = f"{SOURCE_URL_PREFIX} ({url})"  # Use constant prefix
        logging.info(f"Attempting to fetch HTML from URL: {url}")
        try:
            # 1. Prepend scheme if missing
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
                logging.info(f"Scheme missing, prepended https://. New URL: {url}")

            # 2. Validate URL structure and check for forbidden IPs
            parsed_url = urlparse(url)
            if not parsed_url.scheme or not parsed_url.netloc:
                raise ValueError("Invalid URL structure.")
            if not _is_ip_allowed(parsed_url.hostname):
                # Pass source back even on error
                return None, source, "❌ Error: Access to this URL's IP address is not allowed for security reasons."

            # 3. Fetch content
            response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True, stream=True)
            response.raise_for_status()

            # 4. Check Content-Length
            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > MAX_CONTENT_SIZE_BYTES:
                logging.warning(f"Content-Length {content_length} exceeds limit for URL: {url}")
                return None, source, f"❌ Error: Content exceeds maximum allowed size ({MAX_CONTENT_SIZE_BYTES // 1024 // 1024}MB)."

            # 5. Read content
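            # Content-Length can be missing or inaccurate (e.g. chunked
            # transfer), so the decoded size is re-checked after reading.
            # apparent_encoding is charset detection (charset_normalizer/chardet),
            # used as a fallback when the server declares no charset; the 1.1
            # factor presumably leaves ~10% slack in the re-encoded size estimate.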
            response.encoding = response.apparent_encoding or 'utf-8'
            html_content = response.text
            if len(html_content.encode(response.encoding, errors='ignore')) > MAX_CONTENT_SIZE_BYTES * 1.1:
                logging.warning(f"Decoded content size exceeds limit for URL: {url}")
                return None, source, "❌ Error: Decoded content exceeds estimated maximum size."
            logging.info(f"Successfully fetched {len(html_content)} bytes from {url}.")

        except ValueError as e:
            logging.error(f"Invalid URL provided: {url} - {e}")
            return None, source, f"❌ Error: Invalid URL format: `{url}`."
        except requests.exceptions.MissingSchema:
            logging.error(f"Invalid URL (Missing Schema): {url}")
            return None, source, f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
        except requests.exceptions.Timeout:
            logging.warning(f"Request timed out for URL: {url}")
            return None, source, f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to fetch URL: {url} - {e}")
            return None, source, f"❌ Error: Failed to fetch content from URL: `{url}`\nDetails: {e}"
        except Exception:
            logging.error(f"Unexpected error fetching URL {url}: {traceback.format_exc()}")
            return None, source, GENERIC_ERROR_MESSAGE

    elif html_input:
        source = SOURCE_DIRECT_INPUT  # Use constant
        logging.info(f"Using {source} ({len(html_input)} bytes).")
        if len(html_input) > MAX_CONTENT_SIZE_BYTES * 1.2:
            logging.warning(f"Direct HTML input size {len(html_input)} exceeds limit.")
            # Pass source back even on error
            return None, source, "❌ Error: Pasted HTML exceeds maximum allowed size."
        html_content = html_input
    else:
        # No input provided
        return None, None, "❓ Please provide a URL or paste HTML content in the fields above."

    # --- Pre-cleaning ---
    if not html_content:
        # Should only happen if the logic above fails unexpectedly
        logging.error("Reached pre-cleaning stage with no HTML content.")
        return None, source, f"❓ No HTML content found from {source}."

    logging.info("Pre-cleaning HTML...")
    try:
        soup_pre = BeautifulSoup(html_content, 'lxml')
        for tag in soup_pre(PRECLEAN_TAGS_TO_REMOVE):
            tag.decompose()
        cleaned_html = str(soup_pre)
        logging.info(f"HTML pre-cleaned. Size reduced to {len(cleaned_html)} bytes.")
        # Return cleaned_html, source, and None for error message
        return cleaned_html, source, None
    except Exception:
        logging.error(f"Error during HTML pre-cleaning: {traceback.format_exc()}")
        # Pass source back even on error
        return None, source, "❌ Error: Failed during HTML pre-cleaning step."


# **MODIFIED**
def _extract_content_and_title(cleaned_html: str, source: str) -> tuple[str | None, str | None]:
    """
    Extracts the main content with Readability (URLs ONLY) and determines the title.
    Returns (processed_html, final_title).
    """
    processed_html = cleaned_html  # Default to cleaned HTML (important for Direct Input)
    readability_title = None
    final_title = None
    use_readability = True  # Internal flag, could become a user option later

    # **Run Readability ONLY if requested AND the source is a URL**
    if use_readability and source and source.startswith(SOURCE_URL_PREFIX):
        logging.info("Source is URL. Attempting to extract main content using Readability...")
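        # readability-lxml's Document exposes .title() (best-guess page title)
        # and .summary() (main-article HTML). An empty summary is treated as an
        # extraction failure and triggers a fallback to the cleaned full HTML.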
        try:
            doc = Document(cleaned_html)
            readability_title = doc.title()
            processed_html_summary = doc.summary()
            soup_summary_check = BeautifulSoup(processed_html_summary, 'lxml')
            if soup_summary_check.text.strip():
                processed_html = processed_html_summary  # Use summary ONLY IF valid AND source is URL
                logging.info(f"Readability extracted title: '{readability_title}'. Using summary content for URL.")
            else:
                logging.warning("Readability summary was empty for URL. Falling back to cleaned full HTML.")
                readability_title = None  # Discard title if summary failed
                # processed_html remains cleaned_html
        except Exception as e:
            logging.warning(f"Readability processing failed for URL: {e}. Falling back to cleaned full HTML.")
            readability_title = None
            # processed_html remains cleaned_html
    elif source == SOURCE_DIRECT_INPUT:
        logging.info("Source is Direct HTML Input. Skipping Readability content extraction.")
        # processed_html is already set to cleaned_html, which is correct.
        readability_title = None  # Ensure no accidental title carry-over
    else:
        logging.warning(f"Source type '{source}' unknown or missing, skipping Readability.")
        readability_title = None

    # --- Title Decision Logic ---
    # Priority 1: Readability title (only possible if source was URL and Readability ran)
    if readability_title and len(readability_title) >= MIN_TITLE_LENGTH and not readability_title.strip().startswith('['):
        final_title = readability_title.strip()
        logging.info(f"Using Readability title: '{final_title}'")

    # Priority 2: Fall back to the first H1 in the CLEANED HTML (runs for BOTH URL and Direct Input if no Readability title)
    if not final_title:
        # Log differently based on source
        if source and source.startswith(SOURCE_URL_PREFIX):
            logging.info("Readability title not suitable or not found for URL. Looking for H1 fallback in cleaned HTML...")
        else:  # Includes Direct Input and unknowns
            logging.info("Looking for H1 title in cleaned HTML...")
        try:
            soup_for_h1 = BeautifulSoup(cleaned_html, 'lxml')
            h1_tag = soup_for_h1.find('h1')
            if h1_tag:
                h1_text = h1_tag.get_text(strip=True)
                if h1_text:
                    final_title = h1_text
                    logging.info(f"Using H1 fallback title: '{final_title}'")
                else:
                    logging.info("Found H1 tag but it was empty.")
            else:
                logging.info("No H1 tag found in cleaned HTML for fallback title.")
        except Exception:
            logging.error(f"Error searching for H1 fallback title: {traceback.format_exc()}")

    # Return the HTML to convert (either the Readability summary or cleaned_html) and the determined title
    return processed_html, final_title


def _convert_to_markdown(processed_html: str, final_title: str | None) -> tuple[str | None, str | None]:
    """
    Removes a duplicated title from the processed HTML (if necessary) and converts to Markdown.
    Returns (final_markdown, None) or (None, error_message).
    """
    html_to_convert = processed_html
    if final_title:
        logging.info("Checking for title duplication (first H1 in processed content)...")
        try:
            soup_proc = BeautifulSoup(processed_html, 'lxml')
            first_h1_in_proc = soup_proc.find('h1')
            if first_h1_in_proc:
                h1_proc_text = first_h1_in_proc.get_text(strip=True)
                if h1_proc_text == final_title:
                    logging.info(f"Found matching H1 ('{h1_proc_text}') in content. Removing it to prevent duplication.")
                    first_h1_in_proc.decompose()
                    html_to_convert = str(soup_proc)
                else:
                    logging.info(f"First H1 content ('{h1_proc_text}') does not match final title ('{final_title}'). Keeping H1.")
Keeping H1.") else: logging.info("No H1 found in processed content to check for duplication.") except Exception as e: logging.error(f"Error during title duplication check: {traceback.format_exc()}") if not html_to_convert.strip(): logging.warning("HTML content (after processing) is empty. Cannot convert.") return None, f"❓ The HTML content (after processing) appears to be empty." logging.info(f"Attempting to convert final processed HTML (length: {len(html_to_convert)}) to Markdown...") try: markdown_output = markdownify( html_to_convert, heading_style="ATX", bullets='*' ).strip() if final_title: final_markdown = f"# {final_title}\n\n{markdown_output}" else: final_markdown = markdown_output if not final_markdown.strip(): logging.warning("Markdown conversion resulted in empty output.") return None, f"ℹ️ The conversion resulted in empty Markdown." logging.info(f"Successfully converted to Markdown (length: {len(final_markdown)}).") return final_markdown.strip(), None except Exception as e: logging.error(f"Failed to convert HTML to Markdown: {traceback.format_exc()}") return None, "❌ Error: Failed during the final Markdown conversion step." # --- Main Gradio Function (Orchestrator) --- # **MODIFIED** def html_to_markdown_converter(url: str, html_input: str) -> str: """ Converts HTML (from URL or direct input) to Markdown using helper functions. Handles overall workflow and top-level errors. """ url = url.strip() if url else "" html_input = html_input.strip() if html_input else "" try: # 1. Fetch and Clean HTML # Now returns: cleaned_html, source, error_message cleaned_html, source, error_msg = _fetch_and_clean_html(url, html_input) if error_msg: # Check if fetch/clean returned an error message return error_msg if cleaned_html is None or source is None: # Should not happen if error_msg is None, but check anyway logging.error("Fetching/cleaning returned None HTML/source without error message.") return GENERIC_ERROR_MESSAGE # 2. Extract Content and Title (pass source) # Now takes cleaned_html and source processed_html, final_title = _extract_content_and_title(cleaned_html, source) if processed_html is None: logging.error("Processed HTML became None unexpectedly after extraction step.") return GENERIC_ERROR_MESSAGE # 3. Convert to Markdown final_markdown, convert_error_msg = _convert_to_markdown(processed_html, final_title) if convert_error_msg: return convert_error_msg else: return final_markdown # Success except Exception as e: logging.error(f"FATAL: Unexpected error in main converter function: {traceback.format_exc()}") return GENERIC_ERROR_MESSAGE # --- Gradio Interface Definition (Adjust description slightly) --- title = "Smart Scrape Any URL or Website to Markdown [Expert CPU Mode]" description = """ Enter a URL **or** paste HTML code directly into the text box below. - For **URLs**, the tool attempts to extract the main article content using `readability` before converting. - For **Pasted HTML**, the tool converts the *entire* provided HTML (after basic cleaning) without using `readability`'s content extraction. It identifies a title (page title or first H1 fallback) and converts to Markdown. Includes security checks and size limits. Use the **copy icon** (📋) in the output box to copy the code. """ article = """ **How it works (v1.2):** 1. **Input:** Accepts URL or direct HTML. 2. **Fetch/Clean:** Gets HTML, performs security checks (IP block, size limit), removes basic tags (`