scrape-to-markdown-expert-mode-cpu

Sleeping

App Files Files Community

scrape-to-markdown-expert-mode-cpu / app.py

13ze

Create app.py

fd8df2d verified 8 months ago

raw

history blame

8.42 kB

	import gradio as gr
	import requests
	from markdownify import markdownify
	import traceback # To help format potential errors
	from readability import Document # <<< NOVO IMPORT
	from bs4 import BeautifulSoup # <<< NOVO/USADO PARA FALLBACK OU ANÁLISE

	# Settings applied to the outbound HTTP request made by the converter.
	DEFAULT_TIMEOUT = 15  # Seconds to wait for a response before giving up.
	# Custom User-Agent header sent with the request so the tool identifies itself.
	HEADERS = {
	    "User-Agent": "GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)",
	}

	def html_to_markdown_converter(url: str, html_input: str) -> str:
	    """
	    Convert HTML (fetched from a URL or pasted directly) to Markdown.

	    The URL takes priority over pasted HTML. The HTML is first passed through
	    readability to isolate the main article content; if that fails or yields
	    no text, the full HTML is converted instead.

	    Args:
	        url: Address of the page to fetch. A missing scheme is replaced by
	            ``https://``. May be empty or ``None``.
	        html_input: Raw HTML used only when ``url`` is empty. May be empty
	            or ``None``.

	    Returns:
	        The Markdown text on success (prefixed with ``# <title>`` when
	        readability found a real title and its summary was used), or a
	        human-readable message starting with ``❌``, ``❓`` or ``ℹ️`` when
	        something went wrong. The function never raises; every failure is
	        reported through the returned string so the UI can display it.
	    """
	    # readability-lxml returns this literal when the document has no <title>;
	    # it must not be rendered as a heading.
	    no_title_placeholder = "[no-title]"

	    html_content = ""
	    source = ""
	    use_readability = True  # Flag to control if readability is used (useful if we add an option later)

	    # Clean up inputs (also tolerates None coming from the UI)
	    url = url.strip() if url else ""
	    html_input = html_input.strip() if html_input else ""

	    try:
	        # --- Step 1: Get HTML Content ---
	        if url:
	            print(f"Attempting to fetch HTML from URL: {url}")
	            try:
	                # Ensure URL has a scheme (http/https). Compare case-insensitively
	                # so that e.g. "HTTP://host" is not turned into "https://HTTP://host".
	                if not url.lower().startswith(('http://', 'https://')):
	                    url = 'https://' + url
	                    print(f"Scheme missing, prepended https://. New URL: {url}")
	                # Label the source with the URL that is actually requested.
	                source = f"URL ({url})"

	                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
	                response.raise_for_status()
	                response.encoding = response.apparent_encoding or 'utf-8'
	                html_content = response.text
	                print(f"Successfully fetched {len(html_content)} bytes from URL.")
	            except requests.exceptions.MissingSchema:
	                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
	            except requests.exceptions.Timeout:
	                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
	            except requests.exceptions.RequestException as e:
	                print(f"Request failed: {e}")
	                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
	            except Exception as e:
	                print(f"An unexpected error occurred during fetch: {e}")
	                return f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"

	        elif html_input:
	            source = "Direct HTML Input"
	            print(f"Using direct HTML input ({len(html_input)} bytes).")
	            html_content = html_input
	            # Readability stays enabled for pasted input: it may be a full
	            # page paste, and it can clean up snippets too.
	        else:
	            return "❓ Please provide a URL or paste HTML content in the fields above."

	        # --- Step 2: Extract Main Content (Optional but Recommended) ---
	        if not html_content:
	            return f"❓ No HTML content found from {source}."

	        processed_html = html_content
	        article_title = ""
	        if use_readability:
	            print("Attempting to extract main content using Readability...")
	            try:
	                doc = Document(html_content)
	                article_title = doc.title()  # Title extracted by readability
	                processed_html = doc.summary()  # Cleaned main content HTML
	                print(f"Readability extracted title: '{article_title}'")
	                print(f"Readability summary HTML length: {len(processed_html)}")

	                # Drop the "no title" placeholder so it is never emitted as "# [no-title]".
	                if article_title.strip() == no_title_placeholder:
	                    article_title = ""

	                # Check if readability returned anything substantial
	                soup = BeautifulSoup(processed_html, 'html.parser')
	                if not soup.text.strip():
	                    print("Readability summary was empty or non-substantial. Falling back to full HTML.")
	                    processed_html = html_content  # Fallback to original HTML
	                    article_title = ""  # Reset title if we fall back
	                else:
	                    print("Using Readability summary for conversion.")

	            except Exception as e:
	                # Best effort: extraction problems must not block the conversion.
	                print(f"Readability processing failed: {e}\n{traceback.format_exc()}")
	                print("Falling back to using full HTML content due to Readability error.")
	                processed_html = html_content  # Fallback on error
	                article_title = ""

	        else:
	            print("Skipping Readability step. Using full HTML content.")
	            processed_html = html_content

	        # --- Step 3: Convert to Markdown ---
	        if not processed_html.strip():
	            # This can happen if original HTML was just whitespace, or if readability failed AND original was empty
	            return "❓ The HTML content (after potential processing) appears to be empty."

	        print(f"Attempting to convert processed HTML (length: {len(processed_html)}) to Markdown...")
	        try:
	            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
	            print(f"Conversion successful. Markdown length: {len(markdown_output)}")

	            # Prepend the title only if readability found one and its summary
	            # (not the fallback full HTML) was what got converted.
	            if article_title and processed_html != html_content:
	                final_output = f"# {article_title}\n\n{markdown_output}"
	            else:
	                final_output = markdown_output

	            if not final_output.strip():
	                return "ℹ️ The conversion resulted in empty Markdown (the processed HTML might have contained only unsupported tags or whitespace)."

	            return final_output

	        except Exception as e:
	            print(f"Markdown conversion failed: {e}")
	            return f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"

	    except Exception as e:
	        # Top-level boundary: report anything unforeseen to the user instead of crashing the UI.
	        print(f"An unexpected error occurred: {e}")
	        return f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"

	# --- Gradio Interface ---
	# Text passed to gr.Interface below as its title, description and article.
	title = "HTML to Markdown Converter (Smart Extraction)"
	description = """
	Enter a URL or paste HTML code directly into the text box below.
	The tool attempts to extract the main article content using Mozilla's Readability library, removing clutter like menus and sidebars (this works best with URLs).
	The extracted HTML is then converted into Markdown. Priority is given to the URL input.
	"""
	article = """
	How it works:
	1. Uses `requests` to fetch content from URLs.
	2. Uses `readability-lxml` to attempt extracting the main article content and title from the fetched/pasted HTML.
	3. If Readability fails or returns empty content, it falls back to using the full HTML.
	4. Uses `markdownify` to convert the processed HTML into Markdown (ATX headings, '*' bullets).
	5. If a title was found by Readability, it's prepended to the Markdown output.
	"""

	# Input components, in the same order as the parameters of
	# html_to_markdown_converter(url, html_input).
	url_input = gr.Textbox(
	label="Enter URL (gets priority)",
	placeholder="e.g., en.wikipedia.org/wiki/Markdown"
	)
	html_input_area = gr.Textbox(
	label="Or Paste HTML Code Here",
	lines=10,
	placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>"
	)

	# Output component: receives the string returned by the converter
	# (Markdown on success, or an error/info message).
	markdown_output_display = gr.Markdown(label="Converted Markdown Output")

	# Wire the converter, the two inputs and the output into a single interface.
	# Each example row is [url, html_input]; example caching is turned off.
	iface = gr.Interface(
	fn=html_to_markdown_converter,
	inputs=[url_input, html_input_area],
	outputs=markdown_output_display,
	title=title,
	description=description,
	article=article,
	allow_flagging='never',
	examples=[
	["https://gradio.app/quickstart/", ""],
	["https://en.wikipedia.org/wiki/Python_(programming_language)", ""], # Long article page; good test for readability extraction
	["https://www.bbc.com/news", ""], # News front page; exercises clutter removal
	["", "<body><header>Menu</header><main><h1>Main Title</h1><p>Article content here.</p></main><footer>Copyright</footer></body>"], # Direct HTML input (no URL)
	["https://httpbin.org/delay/5", ""], # Slow response; presumably ~5 s, i.e. below the 15 s DEFAULT_TIMEOUT, so it should not time out
	["invalid-url", ""] # No scheme and not a real host; exercises the fetch error path
	],
	cache_examples=False
	)

	# Start the Gradio app only when this file is executed as a script.
	if __name__ == '__main__':
	    iface.launch()