import gradio as gr
import requests
from markdownify import markdownify
import traceback
from readability import Document
from bs4 import BeautifulSoup
import logging
import socket
import ipaddress
from urllib.parse import urlparse

# --- Configuration Constants ---
DEFAULT_TIMEOUT = 20 # seconds
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.2 (+https://hf.space)'} # Updated version
MAX_CONTENT_SIZE_BYTES = 10 * 1024 * 1024
MIN_TITLE_LENGTH = 4
PRECLEAN_TAGS_TO_REMOVE = [
    'script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'input', 'textarea', 'select', 'option', 'label'
]
GENERIC_ERROR_MESSAGE = "❌ Error: An unexpected internal error occurred. Please check logs or try again later."
SOURCE_URL_PREFIX = "URL" # Identifier for URL source
SOURCE_DIRECT_INPUT = "Direct HTML Input" # Identifier for direct input

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Helper Functions ---

def _is_ip_allowed(hostname: str) -> bool:
    """Verifica se o IP resolvido do hostname é permitido (não privado/local)."""
    try:
        addr_info = socket.getaddrinfo(hostname, None)
        ip_addr_str = addr_info[0][4][0]
        ip_addr = ipaddress.ip_address(ip_addr_str)
        if ip_addr.is_private or ip_addr.is_loopback or ip_addr.is_link_local:
            logging.warning(f"Blocked attempt to access internal/private IP: {ip_addr_str} for hostname {hostname}")
            return False
        logging.info(f"Hostname {hostname} resolved to allowed public IP {ip_addr_str}.")
        return True
    except socket.gaierror as e:
        logging.error(f"Could not resolve hostname: {hostname} - {e}")
        return False
    except Exception as e:
        logging.error(f"Unexpected error during IP validation for {hostname}: {e}", exc_info=True)
        return False

def _fetch_and_clean_html(url: str, html_input: str) -> tuple[str | None, str | None, str | None]:
    """
    Busca HTML da URL ou usa input direto, faz pré-limpeza.
    Retorna uma tupla: (cleaned_html, source_description, error_message)
    Retorna (None, source, error_message) em caso de erro.
    Retorna (None, None, error_message) se nenhuma entrada foi fornecida.
    """
    html_content = ""
    source = None # Initialize source

    if url:
        source = f"{SOURCE_URL_PREFIX} ({url})" # Use constant prefix
        logging.info(f"Attempting to fetch HTML from URL: {url}")
        try:
            # ... (mesma lógica de fetch, validação de IP, tamanho, etc.)...
            # 1. Prepend Scheme
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
                logging.info(f"Scheme missing, prepended https://. New URL: {url}")
            # 2. Validate URL structure and check for forbidden IPs
            parsed_url = urlparse(url)
            if not parsed_url.scheme or not parsed_url.netloc:
                 raise ValueError("Invalid URL structure.")
            if not _is_ip_allowed(parsed_url.hostname):
                 # Pass source back even on error
                 return None, source, f"❌ Error: Access to this URL's IP address is not allowed for security reasons."
            # 3. Fetch content
            response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True, stream=True)
            response.raise_for_status()
            # 4. Check Content-Length
            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > MAX_CONTENT_SIZE_BYTES:
                logging.warning(f"Content-Length {content_length} exceeds limit for URL: {url}")
                return None, source, f"❌ Error: Content exceeds maximum allowed size ({MAX_CONTENT_SIZE_BYTES // 1024 // 1024}MB)."
            # 5. Read content
            response.encoding = response.apparent_encoding or 'utf-8'
            html_content = response.text
            if len(html_content.encode(response.encoding, errors='ignore')) > MAX_CONTENT_SIZE_BYTES * 1.1:
                logging.warning(f"Decoded content size exceeds limit for URL: {url}")
                return None, source, f"❌ Error: Decoded content exceeds estimated maximum size."
            logging.info(f"Successfully fetched {len(html_content)} bytes from {url}.")

        except ValueError as e:
             logging.error(f"Invalid URL provided: {url} - {e}")
             return None, source, f"❌ Error: Invalid URL format: `{url}`."
        except requests.exceptions.MissingSchema:
            logging.error(f"Invalid URL (Missing Schema): {url}")
            return None, source, f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
        except requests.exceptions.Timeout:
            logging.warning(f"Request timed out for URL: {url}")
            return None, source, f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to fetch URL: {url} - {e}")
            return None, source, f"❌ Error: Failed to fetch content from URL: `{url}`\nDetails: {e}"
        except Exception as e:
            logging.error(f"Unexpected error fetching URL {url}: {traceback.format_exc()}")
            return None, source, GENERIC_ERROR_MESSAGE

    elif html_input:
        source = SOURCE_DIRECT_INPUT # Use constant
        logging.info(f"Using {source} ({len(html_input)} bytes).")
        if len(html_input) > MAX_CONTENT_SIZE_BYTES * 1.2:
             logging.warning(f"Direct HTML input size {len(html_input)} exceeds limit.")
             # Pass source back even on error
             return None, source, f"❌ Error: Pasted HTML exceeds maximum allowed size."
        html_content = html_input
    else:
        # No input provided
        return None, None, "❓ Please provide a URL or paste HTML content in the fields above."

    # --- Pre-cleaning ---
    if not html_content: # Should only happen if logic above fails unexpectedly
        logging.error("Reached pre-cleaning stage with no HTML content.")
        return None, source, f"❓ No HTML content found from {source}."

    logging.info("Pre-cleaning HTML...")
    try:
        soup_pre = BeautifulSoup(html_content, 'lxml')
        for tag in soup_pre(PRECLEAN_TAGS_TO_REMOVE):
            tag.decompose()
        cleaned_html = str(soup_pre)
        logging.info(f"HTML pre-cleaned. Size reduced to {len(cleaned_html)} bytes.")
        # Return cleaned_html, source, and None for error message
        return cleaned_html, source, None
    except Exception as e:
        logging.error(f"Error during HTML pre-cleaning: {traceback.format_exc()}")
        # Pass source back even on error
        return None, source, "❌ Error: Failed during HTML pre-cleaning step."


# **MODIFIED**
def _extract_content_and_title(cleaned_html: str, source: str) -> tuple[str | None, str | None]:
    """
    Extrai conteúdo principal com Readability (APENAS para URLs) e determina o título.
    Retorna (processed_html, final_title).
    """
    processed_html = cleaned_html # Default to cleaned HTML (importante para Direct Input)
    readability_title = None
    final_title = None
    use_readability = True # Internal flag, could be user option later

    # **Execute Readability ONLY if requested AND the source is a URL**
    if use_readability and source and source.startswith(SOURCE_URL_PREFIX):
        logging.info("Source is URL. Attempting to extract main content using Readability...")
        try:
            doc = Document(cleaned_html)
            readability_title = doc.title()
            processed_html_summary = doc.summary()
            soup_summary_check = BeautifulSoup(processed_html_summary, 'lxml')
            if soup_summary_check.text.strip():
                processed_html = processed_html_summary # Use summary ONLY IF valid AND source is URL
                logging.info(f"Readability extracted title: '{readability_title}'. Using summary content for URL.")
            else:
                logging.warning("Readability summary was empty for URL. Falling back to cleaned full HTML.")
                readability_title = None # Discard title if summary failed
                # processed_html remains cleaned_html

        except Exception as e:
            logging.warning(f"Readability processing failed for URL: {e}. Falling back to cleaned full HTML.")
            readability_title = None
            # processed_html remains cleaned_html
    elif source == SOURCE_DIRECT_INPUT:
        logging.info("Source is Direct HTML Input. Skipping Readability content extraction.")
        # processed_html is already set to cleaned_html, which is correct.
        readability_title = None # Ensure no accidental title carry-over
    else:
         logging.warning(f"Source type '{source}' unknown or missing, skipping Readability.")
         readability_title = None

    # --- Title Decision Logic ---
    # Priority 1: Readability title (only possible if source was URL and Readability ran)
    if readability_title and len(readability_title) >= MIN_TITLE_LENGTH and not readability_title.strip().startswith('['):
        final_title = readability_title.strip()
        logging.info(f"Using Readability title: '{final_title}'")

    # Priority 2: Fallback to first H1 from CLEANED HTML (runs for BOTH URL and Direct Input if no Readability title)
    if not final_title:
        # Log difference based on source
        if source and source.startswith(SOURCE_URL_PREFIX):
            logging.info("Readability title not suitable or not found for URL. Looking for H1 fallback in cleaned HTML...")
        else: # Includes Direct Input and unknowns
             logging.info("Looking for H1 title in cleaned HTML...")

        try:
            soup_for_h1 = BeautifulSoup(cleaned_html, 'lxml')
            h1_tag = soup_for_h1.find('h1')
            if h1_tag:
                h1_text = h1_tag.get_text(strip=True)
                if h1_text:
                    final_title = h1_text
                    logging.info(f"Using H1 fallback title: '{final_title}'")
                else:
                    logging.info("Found H1 tag but it was empty.")
            else:
                 logging.info("No H1 tag found in cleaned HTML for fallback title.")
        except Exception as e:
             logging.error(f"Error searching for H1 fallback title: {traceback.format_exc()}")

    # Return the HTML to be converted (either Readability summary or cleaned_html) and the determined title
    return processed_html, final_title


def _convert_to_markdown(processed_html: str, final_title: str | None) -> tuple[str | None, str | None]:
    """
    Remove título duplicado do HTML processado (se necessário) e converte para Markdown.
    Retorna (final_markdown, None) ou (None, error_message).
    """
    # ... (mesma lógica de verificação de H1 duplicado e conversão com markdownify) ...
    html_to_convert = processed_html

    if final_title:
        logging.info(f"Checking for title duplication (first H1 in processed content)...")
        try:
            soup_proc = BeautifulSoup(processed_html, 'lxml')
            first_h1_in_proc = soup_proc.find('h1')
            if first_h1_in_proc:
                h1_proc_text = first_h1_in_proc.get_text(strip=True)
                if h1_proc_text == final_title:
                    logging.info(f"Found matching H1 ('{h1_proc_text}') in content. Removing it to prevent duplication.")
                    first_h1_in_proc.decompose()
                    html_to_convert = str(soup_proc)
                else:
                    logging.info(f"First H1 content ('{h1_proc_text}') does not match final title ('{final_title}'). Keeping H1.")
            else:
                logging.info("No H1 found in processed content to check for duplication.")
        except Exception as e:
            logging.error(f"Error during title duplication check: {traceback.format_exc()}")

    if not html_to_convert.strip():
        logging.warning("HTML content (after processing) is empty. Cannot convert.")
        return None, f"❓ The HTML content (after processing) appears to be empty."

    logging.info(f"Attempting to convert final processed HTML (length: {len(html_to_convert)}) to Markdown...")
    try:
        markdown_output = markdownify(
            html_to_convert,
            heading_style="ATX",
            bullets='*'
        ).strip()

        if final_title:
            final_markdown = f"# {final_title}\n\n{markdown_output}"
        else:
            final_markdown = markdown_output

        if not final_markdown.strip():
            logging.warning("Markdown conversion resulted in empty output.")
            return None, f"ℹ️ The conversion resulted in empty Markdown."

        logging.info(f"Successfully converted to Markdown (length: {len(final_markdown)}).")
        return final_markdown.strip(), None

    except Exception as e:
        logging.error(f"Failed to convert HTML to Markdown: {traceback.format_exc()}")
        return None, "❌ Error: Failed during the final Markdown conversion step."


# --- Main Gradio Function (Orchestrator) ---
# **MODIFIED**
def html_to_markdown_converter(url: str, html_input: str) -> str:
    """
    Converts HTML (from URL or direct input) to Markdown using helper functions.
    Handles overall workflow and top-level errors.
    """
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # 1. Fetch and Clean HTML
        # Now returns: cleaned_html, source, error_message
        cleaned_html, source, error_msg = _fetch_and_clean_html(url, html_input)
        if error_msg: # Check if fetch/clean returned an error message
            return error_msg
        if cleaned_html is None or source is None: # Should not happen if error_msg is None, but check anyway
             logging.error("Fetching/cleaning returned None HTML/source without error message.")
             return GENERIC_ERROR_MESSAGE

        # 2. Extract Content and Title (pass source)
        # Now takes cleaned_html and source
        processed_html, final_title = _extract_content_and_title(cleaned_html, source)

        if processed_html is None:
             logging.error("Processed HTML became None unexpectedly after extraction step.")
             return GENERIC_ERROR_MESSAGE

        # 3. Convert to Markdown
        final_markdown, convert_error_msg = _convert_to_markdown(processed_html, final_title)
        if convert_error_msg:
            return convert_error_msg
        else:
            return final_markdown # Success

    except Exception as e:
        logging.error(f"FATAL: Unexpected error in main converter function: {traceback.format_exc()}")
        return GENERIC_ERROR_MESSAGE


# --- Gradio Interface Definition (Adjust description slightly) ---
title = "Smart Scrape Any URL or Website to Markdown [Expert CPU Mode]"
description = """
Enter a URL **or** paste HTML code directly into the text box below.
- For **URLs**, the tool attempts to extract the main article content using `readability` before converting.
- For **Pasted HTML**, the tool converts the *entire* provided HTML (after basic cleaning) without using `readability`'s content extraction.
It identifies a title (page title or first H1 fallback) and converts to Markdown. Includes security checks and size limits.
Use the **copy icon** (📋) in the output box to copy the code.
"""
article = """
**How it works (v1.2):**
1.  **Input:** Accepts URL or direct HTML.
2.  **Fetch/Clean:** Gets HTML, performs security checks (IP block, size limit), removes basic tags (`<script>`, `<nav>`, etc.). Determines if source is URL or Direct Input.
3.  **Content Processing:**
    *   **If Source is URL:** Attempts `readability-lxml` extraction (`doc.summary()`). Falls back to cleaned HTML if extraction fails/is empty.
    *   **If Source is Direct Input:** **Skips** `readability-lxml` extraction. Uses the cleaned HTML directly.
4.  **Title Logic:** Tries Readability title (if URL source). Falls back to first `<h1>` in *cleaned* HTML otherwise.
5.  **Deduplication:** Removes the first `<h1>` from the *processed content* if it matches the determined title.
6.  **Conversion:** Uses `markdownify` to convert the final processed HTML to Markdown.
7.  **Output:** Prepends title (if found) and returns Markdown or error message.
8.  **Logging:** Uses Python's `logging`.
"""

# Define input/output components (No changes needed)
url_input = gr.Textbox(...)
html_input_area = gr.Textbox(...)
markdown_output_textbox = gr.Textbox(...)

# Create the Gradio interface (No changes needed in the call)
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_textbox,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    examples=[
        # Examples using URLs (should use Readability)
        ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
        ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
        # Example with direct HTML INCLUDING list (should now work)
        ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
        # Example direct HTML without H1
        ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"]
    ],
    cache_examples=False
)

# Launch the app
if __name__ == "__main__":
    # Reminder: requirements: gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml
    iface.launch()