import gradio as gr
import requests
from markdownify import markdownify
import traceback
from readability import Document
from bs4 import BeautifulSoup
import logging
import socket
import ipaddress
from urllib.parse import urlparse
# --- Configuration Constants ---
DEFAULT_TIMEOUT = 20 # seconds
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.2 (+https://hf.space)'}
MAX_CONTENT_SIZE_BYTES = 10 * 1024 * 1024
MIN_TITLE_LENGTH = 4
PRECLEAN_TAGS_TO_REMOVE = [
'script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'input', 'textarea', 'select', 'option', 'label'
]
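# For example, pre-cleaning (see _fetch_and_clean_html below) turns
#   "<div><script>x()</script><nav>menu</nav><p>Body</p></div>"
# into "<div><p>Body</p></div>": each listed tag is removed together with
# its contents via tag.decompose().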
GENERIC_ERROR_MESSAGE = "❌ Error: An unexpected internal error occurred. Please check logs or try again later."
SOURCE_URL_PREFIX = "URL" # Identifier for URL source
SOURCE_DIRECT_INPUT = "Direct HTML Input" # Identifier for direct input
# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# --- Helper Functions ---
def _is_ip_allowed(hostname: str) -> bool:
"""Verifica se o IP resolvido do hostname é permitido (não privado/local)."""
try:
        addr_info = socket.getaddrinfo(hostname, None)
        # Check every resolved address, not just the first: a hostname can resolve
        # to several records, and one public record must not mask a private one.
        for _family, _type, _proto, _canonname, sockaddr in addr_info:
            ip_addr_str = sockaddr[0]
            ip_addr = ipaddress.ip_address(ip_addr_str)
            if ip_addr.is_private or ip_addr.is_loopback or ip_addr.is_link_local:
                logging.warning(f"Blocked attempt to access internal/private IP: {ip_addr_str} for hostname {hostname}")
                return False
        logging.info(f"Hostname {hostname} resolved only to allowed public IPs.")
        return True
except socket.gaierror as e:
logging.error(f"Could not resolve hostname: {hostname} - {e}")
return False
except Exception as e:
logging.error(f"Unexpected error during IP validation for {hostname}: {e}", exc_info=True)
return False
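# Illustrative behaviour of the guard above (actual results depend on the local resolver):
#   _is_ip_allowed("localhost")   -> False  (loopback)
#   _is_ip_allowed("127.0.0.1")   -> False  (loopback)
#   _is_ip_allowed("example.com") -> True   (public address)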
def _fetch_and_clean_html(url: str, html_input: str) -> tuple[str | None, str | None, str | None]:
"""
    Fetch HTML from the URL or use the direct input, then pre-clean it.
    Returns a tuple: (cleaned_html, source_description, error_message).
    Returns (None, source, error_message) on error.
    Returns (None, None, error_message) if no input was provided.
"""
html_content = ""
source = None # Initialize source
if url:
source = f"{SOURCE_URL_PREFIX} ({url})" # Use constant prefix
logging.info(f"Attempting to fetch HTML from URL: {url}")
try:
# 1. Prepend Scheme
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
logging.info(f"Scheme missing, prepended https://. New URL: {url}")
# 2. Validate URL structure and check for forbidden IPs
parsed_url = urlparse(url)
if not parsed_url.scheme or not parsed_url.netloc:
raise ValueError("Invalid URL structure.")
if not _is_ip_allowed(parsed_url.hostname):
# Pass source back even on error
return None, source, f"❌ Error: Access to this URL's IP address is not allowed for security reasons."
# 3. Fetch content
response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True, stream=True)
response.raise_for_status()
# 4. Check Content-Length
content_length = response.headers.get('Content-Length')
if content_length and int(content_length) > MAX_CONTENT_SIZE_BYTES:
logging.warning(f"Content-Length {content_length} exceeds limit for URL: {url}")
return None, source, f"❌ Error: Content exceeds maximum allowed size ({MAX_CONTENT_SIZE_BYTES // 1024 // 1024}MB)."
# 5. Read content
response.encoding = response.apparent_encoding or 'utf-8'
html_content = response.text
            if len(html_content.encode(response.encoding, errors='ignore')) > MAX_CONTENT_SIZE_BYTES * 1.1:
                logging.warning(f"Decoded content size exceeds limit for URL: {url}")
                return None, source, "❌ Error: Decoded content exceeds the estimated maximum size."
            logging.info(f"Successfully fetched {len(html_content)} characters from {url}.")
except ValueError as e:
logging.error(f"Invalid URL provided: {url} - {e}")
return None, source, f"❌ Error: Invalid URL format: `{url}`."
except requests.exceptions.MissingSchema:
logging.error(f"Invalid URL (Missing Schema): {url}")
return None, source, f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
except requests.exceptions.Timeout:
logging.warning(f"Request timed out for URL: {url}")
return None, source, f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
except requests.exceptions.RequestException as e:
logging.error(f"Failed to fetch URL: {url} - {e}")
return None, source, f"❌ Error: Failed to fetch content from URL: `{url}`\nDetails: {e}"
except Exception as e:
logging.error(f"Unexpected error fetching URL {url}: {traceback.format_exc()}")
return None, source, GENERIC_ERROR_MESSAGE
elif html_input:
source = SOURCE_DIRECT_INPUT # Use constant
logging.info(f"Using {source} ({len(html_input)} bytes).")
if len(html_input) > MAX_CONTENT_SIZE_BYTES * 1.2:
logging.warning(f"Direct HTML input size {len(html_input)} exceeds limit.")
# Pass source back even on error
return None, source, f"❌ Error: Pasted HTML exceeds maximum allowed size."
html_content = html_input
else:
# No input provided
return None, None, "❓ Please provide a URL or paste HTML content in the fields above."
# --- Pre-cleaning ---
if not html_content: # Should only happen if logic above fails unexpectedly
logging.error("Reached pre-cleaning stage with no HTML content.")
return None, source, f"❓ No HTML content found from {source}."
logging.info("Pre-cleaning HTML...")
try:
soup_pre = BeautifulSoup(html_content, 'lxml')
for tag in soup_pre(PRECLEAN_TAGS_TO_REMOVE):
tag.decompose()
cleaned_html = str(soup_pre)
logging.info(f"HTML pre-cleaned. Size reduced to {len(cleaned_html)} bytes.")
# Return cleaned_html, source, and None for error message
return cleaned_html, source, None
except Exception as e:
logging.error(f"Error during HTML pre-cleaning: {traceback.format_exc()}")
# Pass source back even on error
return None, source, "❌ Error: Failed during HTML pre-cleaning step."
def _extract_content_and_title(cleaned_html: str, source: str) -> tuple[str | None, str | None]:
"""
    Extract the main content with Readability (for URL sources ONLY) and determine the title.
    Returns (processed_html, final_title).
"""
    processed_html = cleaned_html # Default to cleaned HTML (important for direct input)
readability_title = None
final_title = None
use_readability = True # Internal flag, could be user option later
    # Run Readability ONLY when requested AND the source is a URL
if use_readability and source and source.startswith(SOURCE_URL_PREFIX):
logging.info("Source is URL. Attempting to extract main content using Readability...")
try:
doc = Document(cleaned_html)
readability_title = doc.title()
processed_html_summary = doc.summary()
soup_summary_check = BeautifulSoup(processed_html_summary, 'lxml')
if soup_summary_check.text.strip():
processed_html = processed_html_summary # Use summary ONLY IF valid AND source is URL
logging.info(f"Readability extracted title: '{readability_title}'. Using summary content for URL.")
else:
logging.warning("Readability summary was empty for URL. Falling back to cleaned full HTML.")
readability_title = None # Discard title if summary failed
# processed_html remains cleaned_html
except Exception as e:
logging.warning(f"Readability processing failed for URL: {e}. Falling back to cleaned full HTML.")
readability_title = None
# processed_html remains cleaned_html
elif source == SOURCE_DIRECT_INPUT:
logging.info("Source is Direct HTML Input. Skipping Readability content extraction.")
# processed_html is already set to cleaned_html, which is correct.
readability_title = None # Ensure no accidental title carry-over
else:
logging.warning(f"Source type '{source}' unknown or missing, skipping Readability.")
readability_title = None
# --- Title Decision Logic ---
    # Priority 1: Readability title (only possible if the source was a URL and Readability ran).
    # The startswith('[') check filters readability-lxml placeholder titles such as '[no-title]'.
    if readability_title and len(readability_title) >= MIN_TITLE_LENGTH and not readability_title.strip().startswith('['):
final_title = readability_title.strip()
logging.info(f"Using Readability title: '{final_title}'")
# Priority 2: Fallback to first H1 from CLEANED HTML (runs for BOTH URL and Direct Input if no Readability title)
if not final_title:
# Log difference based on source
if source and source.startswith(SOURCE_URL_PREFIX):
logging.info("Readability title not suitable or not found for URL. Looking for H1 fallback in cleaned HTML...")
else: # Includes Direct Input and unknowns
logging.info("Looking for H1 title in cleaned HTML...")
try:
soup_for_h1 = BeautifulSoup(cleaned_html, 'lxml')
h1_tag = soup_for_h1.find('h1')
if h1_tag:
h1_text = h1_tag.get_text(strip=True)
if h1_text:
final_title = h1_text
logging.info(f"Using H1 fallback title: '{final_title}'")
else:
logging.info("Found H1 tag but it was empty.")
else:
logging.info("No H1 tag found in cleaned HTML for fallback title.")
except Exception as e:
logging.error(f"Error searching for H1 fallback title: {traceback.format_exc()}")
# Return the HTML to be converted (either Readability summary or cleaned_html) and the determined title
return processed_html, final_title
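# Illustrative Readability behaviour assumed above: readability-lxml's
# Document(html).title() returns the best-guess page title (or a placeholder
# such as '[no-title]' when none is found), and .summary() returns an HTML
# fragment with the detected main content, which can be empty for thin or
# script-heavy pages; hence the fallback to the cleaned full HTML.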
def _convert_to_markdown(processed_html: str, final_title: str | None) -> tuple[str | None, str | None]:
"""
    Remove a duplicated title from the processed HTML (if necessary) and convert to Markdown.
    Returns (final_markdown, None) or (None, error_message).
"""
html_to_convert = processed_html
if final_title:
logging.info(f"Checking for title duplication (first H1 in processed content)...")
try:
soup_proc = BeautifulSoup(processed_html, 'lxml')
first_h1_in_proc = soup_proc.find('h1')
if first_h1_in_proc:
h1_proc_text = first_h1_in_proc.get_text(strip=True)
if h1_proc_text == final_title:
logging.info(f"Found matching H1 ('{h1_proc_text}') in content. Removing it to prevent duplication.")
first_h1_in_proc.decompose()
html_to_convert = str(soup_proc)
else:
logging.info(f"First H1 content ('{h1_proc_text}') does not match final title ('{final_title}'). Keeping H1.")
else:
logging.info("No H1 found in processed content to check for duplication.")
except Exception as e:
logging.error(f"Error during title duplication check: {traceback.format_exc()}")
if not html_to_convert.strip():
logging.warning("HTML content (after processing) is empty. Cannot convert.")
return None, f"❓ The HTML content (after processing) appears to be empty."
logging.info(f"Attempting to convert final processed HTML (length: {len(html_to_convert)}) to Markdown...")
try:
markdown_output = markdownify(
html_to_convert,
heading_style="ATX",
bullets='*'
).strip()
if final_title:
final_markdown = f"# {final_title}\n\n{markdown_output}"
else:
final_markdown = markdown_output
if not final_markdown.strip():
logging.warning("Markdown conversion resulted in empty output.")
return None, f"ℹ️ The conversion resulted in empty Markdown."
logging.info(f"Successfully converted to Markdown (length: {len(final_markdown)}).")
return final_markdown.strip(), None
except Exception as e:
logging.error(f"Failed to convert HTML to Markdown: {traceback.format_exc()}")
return None, "❌ Error: Failed during the final Markdown conversion step."
# --- Main Gradio Function (Orchestrator) ---
def html_to_markdown_converter(url: str, html_input: str) -> str:
"""
Converts HTML (from URL or direct input) to Markdown using helper functions.
Handles overall workflow and top-level errors.
"""
url = url.strip() if url else ""
html_input = html_input.strip() if html_input else ""
try:
# 1. Fetch and Clean HTML
        # Returns: cleaned_html, source, error_message
cleaned_html, source, error_msg = _fetch_and_clean_html(url, html_input)
if error_msg: # Check if fetch/clean returned an error message
return error_msg
if cleaned_html is None or source is None: # Should not happen if error_msg is None, but check anyway
logging.error("Fetching/cleaning returned None HTML/source without error message.")
return GENERIC_ERROR_MESSAGE
# 2. Extract Content and Title (pass source)
        # Takes cleaned_html and source
processed_html, final_title = _extract_content_and_title(cleaned_html, source)
if processed_html is None:
logging.error("Processed HTML became None unexpectedly after extraction step.")
return GENERIC_ERROR_MESSAGE
# 3. Convert to Markdown
final_markdown, convert_error_msg = _convert_to_markdown(processed_html, final_title)
if convert_error_msg:
return convert_error_msg
else:
return final_markdown # Success
except Exception as e:
logging.error(f"FATAL: Unexpected error in main converter function: {traceback.format_exc()}")
return GENERIC_ERROR_MESSAGE
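# A quick offline sanity check of the full pipeline (expected output assumes the
# markdownify settings above):
#   html_to_markdown_converter("", "<h1>T</h1><p>x</p>")
# returns "# T\n\nx": the H1 is promoted to the document title and removed from
# the body to avoid duplication.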
# --- Gradio Interface Definition ---
title = "Smart Scrape Any URL or Website to Markdown [Expert CPU Mode]"
description = """
Enter a URL **or** paste HTML code directly into the text box below.
- For **URLs**, the tool attempts to extract the main article content using `readability` before converting.
- For **Pasted HTML**, the tool converts the *entire* provided HTML (after basic cleaning) without using `readability`'s content extraction.
It identifies a title (page title or first H1 fallback) and converts to Markdown. Includes security checks and size limits.
Use the **copy icon** (📋) in the output box to copy the code.
"""
article = """
**How it works (v1.2):**
1. **Input:** Accepts URL or direct HTML.
2. **Fetch/Clean:** Gets HTML, performs security checks (IP block, size limit), removes basic tags (`<script>`, `<nav>`, etc.). Determines if source is URL or Direct Input.
3. **Content Processing:**
* **If Source is URL:** Attempts `readability-lxml` extraction (`doc.summary()`). Falls back to cleaned HTML if extraction fails/is empty.
* **If Source is Direct Input:** **Skips** `readability-lxml` extraction. Uses the cleaned HTML directly.
4. **Title Logic:** Tries Readability title (if URL source). Falls back to first `<h1>` in *cleaned* HTML otherwise.
5. **Deduplication:** Removes the first `<h1>` from the *processed content* if it matches the determined title.
6. **Conversion:** Uses `markdownify` to convert the final processed HTML to Markdown.
7. **Output:** Prepends title (if found) and returns Markdown or error message.
8. **Logging:** Uses Python's `logging`.
"""
# Define input/output components.
# NOTE: the original component arguments were elided here ("..."); the labels,
# placeholders, and sizes below are illustrative stand-ins, not the original values.
url_input = gr.Textbox(label="URL", placeholder="https://example.com/article")
html_input_area = gr.Textbox(label="Or paste HTML here", lines=10)
markdown_output_textbox = gr.Textbox(label="Markdown Output", lines=20, show_copy_button=True)
# Create the Gradio interface
iface = gr.Interface(
fn=html_to_markdown_converter,
inputs=[url_input, html_input_area],
outputs=markdown_output_textbox,
title=title,
description=description,
article=article,
allow_flagging='never',
examples=[
# Examples using URLs (should use Readability)
["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
        # Example with direct HTML including a list (Readability extraction is skipped for direct input)
["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
# Example direct HTML without H1
["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"]
],
cache_examples=False
)
# Launch the app
if __name__ == "__main__":
# Reminder: requirements: gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml
iface.launch()