Spaces:

daneigh
/

memesensex-backend

Runtime error

File size: 15,045 Bytes

import torch
import torch.nn as nn
from torchvision import models, transforms
import torch.nn.functional as F
import math
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import matplotlib.pyplot as plt
import easyocr
import numpy as np
import re
import os
import io
import cv2


BASE_DIR = os.path.dirname(os.path.abspath(__file__))  
MODEL_PATH = os.path.join(BASE_DIR, "best_multimodal_v3.pth")

# =========================
# 1. Text Preprocessing
# =========================
def preprocess_text(text):
    emoji_pattern = re.compile(
        "["  
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "\U00002700-\U000027BF"  # dingbats
        "\U0001F900-\U0001F9FF"  # supplemental symbols
        "\U00002600-\U000026FF"  # misc symbols
        "\U00002B00-\U00002BFF"  # arrows, etc.
        "\U0001FA70-\U0001FAFF"  # extended symbols
        "]+",
        flags=re.UNICODE
    )
    # Remove emojis
    text = emoji_pattern.sub(r'', text)
    # Lowercase and strip
    text = text.lower().strip()
    # Keep letters (including accented), and spaces
    text = re.sub(r'[^a-zñáéíóúü\s]', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    return text

# =========================
# 2. OCR Extraction
# =========================
def ocr_extract_text(image_path, confidence_threshold=0.6):
    reader = easyocr.Reader(['en', 'tl'], gpu=torch.cuda.is_available())
    # # preprocess image for ocr
    # image = cv2.cvtColor(image_path, cv2.COLOR_RGB2GRAY)
    # # image = cv2.GaussianBlur(image,(5,5),0)
    
    # result = reader.readtext(image, detail=1, paragraph=False, width_ths=0.7, height_ths=0.7)

    # # Extract text and confidence scores
    # texts = []
    # confidences = []
    
    # for detection in result:
    #     bbox, text, confidence = detection
    #     texts.append(text)
    #     confidences.append(confidence)
    # final_text = " ".join(texts)
    # preprocess_txt = preprocess_text(final_text)
    # avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
    # return final_text, preprocess_txt, avg_confidence

    # Convert to grayscale
    gray = cv2.cvtColor(image_path, cv2.COLOR_RGB2GRAY)

    # First pass: OCR on raw grayscale
    result = reader.readtext(gray, detail=1, paragraph=False, width_ths=0.7, height_ths=0.7)
    texts, confidences = [], []

    for detection in result:
        if len(detection) == 3:
            _, text, conf = detection
        else:
            text, conf = detection

        if isinstance(text, list):
            text = " ".join([str(t) for t in text if isinstance(t, str)])

        texts.append(text)
        try:
            confidences.append(float(conf))
        except (ValueError, TypeError):
            confidences.append(0.0)

    final_text = " ".join(texts)
    avg_conf = sum(confidences)/len(confidences) if confidences else 0.0

    # If confidence is low, retry with Gaussian blur
    if avg_conf < confidence_threshold:
        texts, confidences = [], []
        gauss_img = cv2.GaussianBlur(gray, (5,5), 0)
        result = reader.readtext(gauss_img, detail=1, paragraph=False, width_ths=0.7, height_ths=0.7)

        for detection in result:
            if len(detection) == 3:
                _, text, conf = detection
            else:
                text, conf = detection

            if isinstance(text, list):
                text = " ".join([str(t) for t in text if isinstance(t, str)])

            texts.append(text)
            try:
                confidences.append(float(conf))
            except (ValueError, TypeError):
                confidences.append(0.0)

        final_text_gauss = " ".join(texts)
        avg_conf_gauss = sum(confidences)/len(confidences) if confidences else 0.0

        # Keep the version with higher confidence
        if avg_conf_gauss > avg_conf:
            final_text, avg_conf = final_text_gauss, avg_conf_gauss
    
    if not final_text:
        return "", "", 0.0

    preprocess_txt = preprocess_text(final_text)
    return final_text, preprocess_txt, avg_conf


# =========================
# 3. Image Preprocessing
# =========================
def resize_normalize_image(image_path, target_size=(224, 224)):

    preprocess_image = transforms.Compose([
        transforms.Resize(target_size, interpolation=transforms.InterpolationMode.BILINEAR),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    image_tensor = preprocess_image(image_path).unsqueeze(0)  # Add batch dimension
    return image_tensor


# =========================
# 4. Model Definitions
# =========================
class CrossAttentionModule(nn.Module):
    def __init__(self, query_dim, key_value_dim, hidden_dim=256, num_heads=8, dropout=0.1):
        super(CrossAttentionModule, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        self.scale = math.sqrt(self.head_dim)  # √dk

        assert hidden_dim % num_heads == 0, "hidden_dim must be divisible by num_heads"

        # Query projection for H (image features)
        self.query_proj = nn.Linear(query_dim, hidden_dim)

        # Key and Value projections for G (text features)
        self.key_proj = nn.Linear(key_value_dim, hidden_dim)
        self.value_proj = nn.Linear(key_value_dim, hidden_dim)

        # Output projection WO
        self.out_proj = nn.Linear(hidden_dim, query_dim)

        # Layer normalization
        self.norm1 = nn.LayerNorm(query_dim)
        self.norm2 = nn.LayerNorm(query_dim)

        # MLP for final transformation
        self.mlp = nn.Sequential(
            nn.Linear(query_dim, query_dim * 4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(query_dim * 4, query_dim),
            nn.Dropout(dropout)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, H, G):
        """
        Args:
            H: Query features [batch_size, seq_len_h, query_dim] (e.g., image patches)
            G: Key/Value features [batch_size, seq_len_g, key_value_dim] (e.g., text tokens)
        """
        batch_size, seq_len_h, _ = H.shape
        seq_len_g = G.shape[1]

        # Step 1: Project to Q, K, V
        Q = self.query_proj(H)  # WiQ H
        K = self.key_proj(G)    # WiK G
        V = self.value_proj(G)  # WiV G

        # Step 2: Reshape for multi-head attention
        Q = Q.view(batch_size, seq_len_h, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, seq_len_g, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, seq_len_g, self.num_heads, self.head_dim).transpose(1, 2)

        # Step 3: Compute attention ATTi(H,G) = softmax((WiQ H)T(WiK G)/√dk)(WiV G)T
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        attention_weights = F.softmax(attention_scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        attention_output = torch.matmul(attention_weights, V)

        # Step 4: Concatenate heads and apply output projection
        attention_output = attention_output.transpose(1, 2).contiguous().view(
            batch_size, seq_len_h, self.hidden_dim
        )

        # MATT(H,G) = [ATT1...ATTh]WO
        matt_output = self.out_proj(attention_output)

        # Step 5: Z = LN(H + MATT(H,G))
        Z = self.norm1(H + matt_output)

        # Step 6: TIM(H,G) = LN(Z + MLP(Z))
        mlp_output = self.mlp(Z)
        tim_output = self.norm2(Z + mlp_output)

        return tim_output

class MultimodalClassifier(nn.Module):
    def __init__(self, num_classes=2, model_name='jcblaise/roberta-tagalog-base'):
        super(MultimodalClassifier, self).__init__()

        # Image encoder (ResNet-18, keep spatial features)
        resnet = models.resnet18(pretrained=True)
        modules = list(resnet.children())[:-2]  # keep until last conv (before avgpool)
        self.image_encoder = nn.Sequential(*modules)  # output: (B, 512, 7, 7)

        # Text encoder
        self.text_encoder = AutoModel.from_pretrained(model_name)

        # Cross-attention using paper formula
        # Image attends to text
        self.img_to_text_attention = CrossAttentionModule(
            query_dim=512,
            key_value_dim=self.text_encoder.config.hidden_size,
            hidden_dim=256,
            num_heads=8
        )

        # Text attends to image
        self.text_to_img_attention = CrossAttentionModule(
            query_dim=self.text_encoder.config.hidden_size,
            key_value_dim=512,
            hidden_dim=256,
            num_heads=8
        )

        # Fusion & classifier
        self.fusion = nn.Sequential(
            nn.Linear(512 + self.text_encoder.config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # Extract image features
        batch_size = images.size(0)
        img_feats = self.image_encoder(images)           # (B, 512, 7, 7)
        img_feats = img_feats.flatten(2).permute(0, 2, 1)  # (B, 49, 512)

        # Extract text features
        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        txt_feats = text_outputs.last_hidden_state  # (B, seq_len, H)

        # Cross-attention following paper formula
        # TIM(img_feats, txt_feats) and TIM(txt_feats, img_feats)
        attended_img = self.img_to_text_attention(img_feats, txt_feats)
        attended_txt = self.text_to_img_attention(txt_feats, img_feats)

        # Pool attended outputs
        img_repr = attended_img.mean(dim=1)  # (B, 512)
        txt_repr = attended_txt[:, 0, :]     # CLS token (B, hidden_size)

        # Fusion
        fused = torch.cat([img_repr, txt_repr], dim=1)
        return self.fusion(fused)

# =========================
# 5. Load Model & Tokenizer
# =========================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MultimodalClassifier(num_classes=2)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.to(device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained("jcblaise/roberta-tagalog-base")

# =========================
# 6. Inference Function
# =========================
def run_inference(image_path):
    # Convert bytes → PIL image
    if isinstance(image_path, (bytes, bytearray)):
        pil_img = Image.open(io.BytesIO(image_path)).convert("RGB")
    elif isinstance(image_path, str):
        pil_img = Image.open(image_path).convert("RGB")
    elif isinstance(image_path, Image.Image):
        pil_img = image_path.convert("RGB")
    else:
        raise TypeError(f"Unsupported input type: {type(image_path)}")

    # OCR
    np_image= np.array(pil_img)
    raw_text, clean_text, confidence= ocr_extract_text(np_image)

    if clean_text == "":
        return {
            "error": "This is not a meme. Upload a valid meme image with text.",
        }
    
    elif len(clean_text.split()) < 3:
        return {
            "error": "Insufficient text detected in the meme. Please upload a meme with more text. Minimum is 3 words.",
            "clean_text": clean_text,
            "raw_text": raw_text,
            "confidence": confidence
        }

    # Image
    img_tensor = resize_normalize_image(pil_img).to(device)

    # Tokenize text
    encoding = tokenizer(
        clean_text, return_tensors='pt',
        padding=True, truncation=True, max_length=128
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    

    # Forward pass
    with torch.no_grad():
        logits = model(img_tensor, input_ids, attention_mask)
        probs = torch.softmax(logits, dim=1)
        pred_class = torch.argmax(probs, dim=1).item()
        pred_class = 'sexual' if pred_class == 1 else 'non-sexual'

    return {
        'original_size': pil_img.size,
        'prediction': pred_class,
        'probabilities': probs.cpu().numpy().tolist(),
        'raw_text': raw_text,
        'clean_text': clean_text,
        'confidence': confidence
    }


# =========================
# 7. Run as main
# =========================
# if __name__ == "__main__":
#     # Example: load image from path
#     IMAGE_PATH = "backend/OIP (1).jfif"

#     # test_dimension_sensitivity(IMAGE_PATH)

#     result = run_inference(IMAGE_PATH)

#     # Print results
#     print("Original Image Size:", result['original_size'])
#     print("Prediction:", result['prediction'])
#     print("Probabilities:", result['probabilities'])
#     print("Raw OCR Text:", result['raw_text'])
#     print("Clean OCR Text:", result['clean_text'])
#     print("OCR Confidence:", result['confidence'])


#     # Preprocess image
#     pil_img = Image.open(IMAGE_PATH).convert("RGB")
#     img_tensor = resize_normalize_image(pil_img).to(device)

#     # -----------------------------
#     # Generate ResNet heatmap
#     # -----------------------------
#     features = {}
#     def hook_fn(module, input, output):
#         features['value'] = output.detach()

#     last_conv = model.image_encoder[-1]
#     hook_handle = last_conv.register_forward_hook(hook_fn)

#     with torch.no_grad():
#         _ = model(img_tensor, 
#                 input_ids=torch.zeros(1,1, dtype=torch.long, device=device), 
#                 attention_mask=torch.ones(1,1, dtype=torch.long, device=device))

#     hook_handle.remove()

#     feat_tensor = features['value']
#     heatmap_grid = feat_tensor[0].mean(dim=0).cpu().numpy()
#     heatmap_grid = (heatmap_grid - heatmap_grid.min()) / (heatmap_grid.max() - heatmap_grid.min())
#     heatmap_resized = np.uint8(255 * heatmap_grid)
#     heatmap_resized = Image.fromarray(heatmap_resized).resize(pil_img.size, Image.NEAREST)
#     heatmap_resized = np.array(heatmap_resized)

#     probs = result['probabilities'][0]
#     prob_text = f"non-sexual: {probs[0]:.2f}, sexual: {probs[1]:.2f}"

#     # -----------------------------
#     # Plot everything in one figure
#     # -----------------------------
#     fig, ax = plt.subplots(figsize=(6,6))

#     ax.imshow(pil_img)  # original image
#     ax.imshow(heatmap_resized, cmap='jet', alpha=0.4, interpolation='nearest')  # overlay heatmap
#     ax.axis('off')
#     ax.set_title(f"{result['prediction']} ({prob_text})", fontsize=14, color='blue')

#     # Add colorbar
#     sm = plt.cm.ScalarMappable(cmap='jet', norm=plt.Normalize(vmin=0, vmax=1))
#     sm.set_array([])
#     cbar = fig.colorbar(sm, ax=ax, fraction=0.046, pad=0.04)
#     cbar.set_label('Feature Intensity')

#     plt.show()