Upload 4 files
- app.py +53 -0
- main.py +41 -0
- model.py +371 -0
- switch_transformer.pt +3 -0
app.py
ADDED
@@ -0,0 +1,53 @@
import gradio as gr
from main import tokenizer, model, device
import torch

def qa_pipeline(text, question):
    inputs = tokenizer(question, text, return_tensors="pt")
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    batch = {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    }
    # SwitchTransformer.forward returns a (start_logits, end_logits, loss) tuple
    # rather than a Hugging Face output object, so unpack it directly.
    with torch.no_grad():
        start_logits, end_logits, _ = model(batch)

    start_index = torch.argmax(start_logits, dim=-1).item()
    end_index = torch.argmax(end_logits, dim=-1).item()

    predict_answer_tokens = inputs.input_ids[0, start_index : end_index + 1]
    return tokenizer.decode(predict_answer_tokens)

def answer_question(context, question):
    result = qa_pipeline(context, question)
    return result

example_contexts = [
    "Қазақстанның ұлттық құрамы алуан түрлі. Халықтың басым бөлігін тұрғылықты қазақ халқы құрайды, пайыздық үлесі — 70,18%[10], орыстар — 18,42%, өзбектер — 3,29%, украиндар — 1,36%, ұйғырлар — 1,48%, татарлар — 1,06%, басқа халықтар 5,38%.[11] Халықтың 75% астамын мұсылмандар құрайды, православты христиандар — 21%, қалғаны басқа да дін өкілдері.[12]",
    "Қазақстан бес мемлекетпен шекаралас, соның ішінде әлемдегі құрлықтағы ең ұзын шекара, солтүстігінде және батысында Ресеймен — 7591 км құрайды. Оңтүстігінде: Түрікменстан — 426 км, Өзбекстан — 2354 км және Қырғызстан — 1241 км, ал шығысында: Қытаймен — 1782 км шектеседі. Жалпы құрлық шекарасының ұзындығы — 13394 км. Батыста Каспий көлімен (2000 км), оңтүстік батыста Арал теңізімен шайылады.[9] 2024 жылдың 1 наурыздағы елдегі тұрғындар саны — 20 075 271[4], бұл әлем бойынша 64-орын. Жер көлемі жағынан әлем елдерінің ішінде 9-орын алады (2 724 902 км²).",
    "Қазақстан — 1995 жылғы 30 тамыздағы республикалық референдумда қабылданған Конституция бойынша — өзін демократиялы, зайырлы, құқықты және әлеуметті мемлекет ретінде орнықтырды. Қазақстан Республикасы – президенттік басқару формасындағы біртұтас мемлекет. Республиканың ең жоғарғы өкілді органы — Парламент. Ол республиканың заң шығару құзіретін жүзеге асырады."
]
example_questions = [
    "Қазақстанның халқы неше пайызды қазақтар құрайды?",
    "Қазақстан нешеу мемлекетпен шекаралас?",
    "Қазақстандағы басқару формасы қандай?",
]


examples = [[context, question] for context, question in zip(example_contexts, example_questions)]

# Create the interface
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Textbox(lines=10, label="Context"),
        gr.Textbox(lines=2, label="Question")
    ],
    outputs="text",
    title="Question Answering Model",
    description="Enter a context and ask a question to get an answer.",
    examples=examples
)

# Launch the interface
iface.launch()
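A quick way to sanity-check the pipeline without the web UI is to call qa_pipeline directly before iface.launch() (a minimal sketch; the exact span depends on the checkpoint, but the first example asks for the Kazakh population share, so a span like "70,18%" would be expected):

answer = qa_pipeline(example_contexts[0], example_questions[0])
print(answer)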
main.py
ADDED
@@ -0,0 +1,41 @@
import torch
from model import (
    SwitchTransformer,
    SwitchTransformerLayer,
    MultiHeadAttention,
    SwitchFeedForward,
    FeedForward,
)
from transformers import AutoTokenizer

device = 'cpu'

ff = FeedForward(768, 768 * 4)
attn = MultiHeadAttention(8, 768, 0.2)
st_ff = SwitchFeedForward(
    capacity_factor=1.25,
    drop_tokens=False,
    n_experts=4,
    expert=ff,
    d_model=768,
    is_scale_prob=True,
)
st_layer = SwitchTransformerLayer(
    d_model=768,
    attn=attn,
    feed_forward=st_ff,
    dropout_prob=0.2
)
model = SwitchTransformer(
    layer=st_layer,
    n_layers=4,
    n_experts=4,
    device=device,
    load_balancing_loss_coef=0.05,
).to(device)

# map_location keeps a checkpoint saved on GPU loadable on the CPU-only Space.
model.load_state_dict(torch.load("switch_transformer.pt", map_location=device))
model.eval()
tokenizer = AutoTokenizer.from_pretrained("Kyrmasch/kaz-roberta-squad2-kaz")
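Importing main builds the network and loads the checkpoint as a side effect, so a short smoke test is enough to confirm the wiring (a sketch with placeholder Kazakh strings; with no answer positions in the batch, the returned loss is just the load-balancing term):

from main import tokenizer, model
enc = tokenizer("Сұрақ?", "Контекст мәтіні.", return_tensors="pt")
start_logits, end_logits, loss = model(
    {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"]}
)
print(start_logits.shape, end_logits.shape)  # both torch.Size([1, seq_len])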
model.py
ADDED
@@ -0,0 +1,371 @@
import copy
import math

import torch
import torch.nn as nn
from transformers import AutoModelForQuestionAnswering


class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, dim, dropout_prob):
        super().__init__()

        self.n_heads = n_heads
        self.dim = dim
        self.dropout = nn.Dropout(p=dropout_prob)

        assert self.dim % self.n_heads == 0
        self.q_lin = nn.Linear(in_features=self.dim, out_features=self.dim)
        self.k_lin = nn.Linear(in_features=self.dim, out_features=self.dim)
        self.v_lin = nn.Linear(in_features=self.dim, out_features=self.dim)
        self.out_lin = nn.Linear(in_features=self.dim, out_features=self.dim)

    def forward(self, query, key, value, mask, head_mask=None, output_attentions=False):
        """
        Parameters:
            query: torch.tensor(bs, seq_length, dim)
            key: torch.tensor(bs, seq_length, dim)
            value: torch.tensor(bs, seq_length, dim)
            mask: torch.tensor(bs, seq_length)
        Returns:
            context: torch.tensor(bs, seq_length, dim) Contextualized layer.
            weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention
                weights. Optional: only returned if `output_attentions=True`.
        """
        bs, q_length, dim = query.size()
        k_length = key.size(1)

        dim_per_head = self.dim // self.n_heads

        mask_reshp = (bs, 1, 1, k_length)

        def shape(x):
            """Separate heads."""
            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)

        def unshape(x):
            """Group heads."""
            return (
                x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
            )

        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)

        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, q_length, k_length)
        mask = (
            (mask == 0).view(mask_reshp).expand_as(scores)
        )  # (bs, n_heads, q_length, k_length)
        scores = scores.masked_fill(
            mask, -float("inf")
        )  # (bs, n_heads, q_length, k_length)

        weights = nn.functional.softmax(
            scores, dim=-1
        )  # (bs, n_heads, q_length, k_length)
        weights = self.dropout(weights)  # (bs, n_heads, q_length, k_length)

        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask

        context = torch.matmul(weights, v)  # (bs, n_heads, q_length, dim_per_head)
        context = unshape(context)  # (bs, q_length, dim)
        context = self.out_lin(context)  # (bs, q_length, dim)

        if output_attentions:
            return (context, weights)
        else:
            return context

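# Shape walkthrough (illustrative numbers, not from the code): with bs=1,
# seq_length=384, dim=768 and n_heads=8, dim_per_head is 96; shape() turns each
# projection into (1, 8, 384, 96), the score matrix q @ k^T is (1, 8, 384, 384),
# and after softmax, dropout and weights @ v, unshape() regroups the context to
# (1, 384, 768) before the final out_lin projection.
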
class FeedForward(nn.Module):
    def __init__(self, dim_input: int = 768, dim_feedforward: int = 4 * 768):
        super().__init__()

        self.linear1 = nn.Linear(dim_input, dim_feedforward)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(dim_feedforward, dim_input)

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))


class SwitchFeedForward(nn.Module):
    """
    ## Routing among multiple FFNs
    """

    def __init__(
        self,
        *,
        capacity_factor: float,
        drop_tokens: bool,
        is_scale_prob: bool,
        n_experts: int,
        expert: FeedForward,
        d_model: int
    ):
        """
        * `capacity_factor` is the capacity of each expert as a factor relative to ideally balanced load
        * `drop_tokens` specifies whether to drop tokens if more tokens are routed to an expert than the capacity
        * `is_scale_prob` specifies whether to multiply the input to the FFN by the routing probability
        * `n_experts` is the number of experts
        * `expert` is the expert layer, a [FFN module](../feed_forward.html)
        * `d_model` is the number of features in a token embedding
        """
        super().__init__()

        self.capacity_factor = capacity_factor
        self.is_scale_prob = is_scale_prob
        self.n_experts = n_experts
        self.drop_tokens = drop_tokens

        # Make copies of the FFNs
        self.experts = nn.ModuleList([copy.deepcopy(expert) for _ in range(n_experts)])
        # Routing layer and softmax
        self.switch = nn.Linear(d_model, n_experts)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x: torch.Tensor):
        """
        * `x` is the input to the switching module with shape `[seq_len, batch_size, d_model]`
          (a `[batch_size, seq_len, d_model]` layout works the same way, since the routing
          below only depends on the flattened token count)
        """

        # Capture the shape to change shapes later
        seq_len, batch_size, d_model = x.shape
        # Flatten the sequence and batch dimensions
        x = x.view(-1, d_model)

        # Get routing probabilities for each of the tokens.
        # $$p_i(x) = \frac{e^{h(x)_i}}{\sum^N_j e^{h(x)_j}}$$
        # where $N$ is the number of experts `n_experts` and
        # $h(\cdot)$ is the linear transformation of token embeddings.
        route_prob = self.softmax(self.switch(x))

        # Get the maximum routing probabilities and the routes.
        # We route to the expert with the highest probability
        route_prob_max, routes = torch.max(route_prob, dim=-1)

        # Get indexes of tokens going to each expert
        indexes_list = [
            torch.eq(routes, i).nonzero(as_tuple=True)[0] for i in range(self.n_experts)
        ]

        # Initialize an empty tensor to store outputs
        final_output = x.new_zeros(x.shape)

        # Capacity of each expert.
        # $$\mathrm{expert\;capacity} =
        # \frac{\mathrm{tokens\;per\;batch}}{\mathrm{number\;of\;experts}}
        # \times \mathrm{capacity\;factor}$$
        capacity = int(self.capacity_factor * len(x) / self.n_experts)
        # Number of tokens routed to each expert.
        counts = x.new_tensor([len(indexes_list[i]) for i in range(self.n_experts)])

        # Initialize an empty list of dropped tokens
        dropped = []
        # Only drop tokens if `drop_tokens` is `True`.
        if self.drop_tokens:
            # Drop tokens in each of the experts
            for i in range(self.n_experts):
                # Ignore if the expert is not over capacity
                if len(indexes_list[i]) <= capacity:
                    continue
                # Shuffle indexes before dropping
                indexes_list[i] = indexes_list[i][torch.randperm(len(indexes_list[i]))]
                # Collect the tokens over capacity as dropped tokens
                dropped.append(indexes_list[i][capacity:])
                # Keep only the tokens up to the capacity of the expert
                indexes_list[i] = indexes_list[i][:capacity]

        # Get outputs of the expert FFNs
        expert_output = [
            self.experts[i](x[indexes_list[i], :]) for i in range(self.n_experts)
        ]

        # Assign to final output
        for i in range(self.n_experts):
            final_output[indexes_list[i], :] = expert_output[i]

        # Pass the dropped tokens through unchanged
        if dropped:
            dropped = torch.cat(dropped)
            final_output[dropped, :] = x[dropped, :]

        if self.is_scale_prob:
            # Scale the expert outputs by the routing probabilities $y = p_i(x) E_i(x)$
            final_output = final_output * route_prob_max.view(-1, 1)
        else:
            # Don't scale the values but multiply by $\frac{p}{\hat{p}} = 1$ so that the gradients flow
            # (this is something we experimented with).
            final_output = final_output * (
                route_prob_max / route_prob_max.detach()
            ).view(-1, 1)

        # Change the shape of the final output back to `[seq_len, batch_size, d_model]`
        final_output = final_output.view(seq_len, batch_size, d_model)

        # Return
        #
        # * the final output
        # * number of tokens routed to each expert
        # * sum of probabilities for each expert
        # * number of tokens dropped
        # * routing probabilities of the selected experts
        #
        # These are used for the load balancing loss and logging
        return final_output, counts, route_prob.sum(0), len(dropped), route_prob_max


class SwitchTransformerLayer(nn.Module):
    """
    # Switch Transformer Block
    This is the same as a [normal transformer block](../models.html#TransformerLayer)
    with handling of the extra outputs of the switch feed-forward module.
    """

    def __init__(
        self,
        *,
        d_model: int,
        attn: MultiHeadAttention,
        feed_forward: SwitchFeedForward,
        dropout_prob: float
    ):
        """
        * `d_model` is the token embedding size
        * `attn` is the attention module
        * `feed_forward` is the feed-forward module (which is the switching module in this case)
        * `dropout_prob` is the probability of dropping out after self-attention and the FFN
        """
        super().__init__()
        self.size = d_model
        self.attn = attn
        self.feed_forward = feed_forward
        self.dropout = nn.Dropout(dropout_prob)
        self.norm_self_attn = nn.LayerNorm([d_model])
        self.norm_ff = nn.LayerNorm([d_model])

    def forward(self, *, x: torch.Tensor, mask: torch.Tensor):
        # Normalize the vectors before doing self-attention
        z = self.norm_self_attn(x)
        # Run through self-attention, i.e. keys and values are from self
        self_attn = self.attn(query=z, key=z, value=z, mask=mask)
        # Add the self-attention results
        x = x + self.dropout(self_attn)

        # Normalize for the feed-forward network
        z = self.norm_ff(x)
        # Pass through the switching feed-forward network
        ff, counts, route_prob, n_dropped, route_prob_max = self.feed_forward(z)
        # Add the feed-forward results back
        x = x + self.dropout(ff)

        return x, counts, route_prob, n_dropped, route_prob_max


class SwitchTransformer(nn.Module):
    """
    ## Switch Transformer
    """

    def __init__(self, layer, n_layers, n_experts, device, load_balancing_loss_coef):
        super().__init__()
        # Make copies of the transformer layer
        self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(n_layers)])
        # Final normalization layer
        self.norm = nn.LayerNorm([layer.size])
        self.qa_outputs = nn.Linear(768, 2)
        self.base_model = AutoModelForQuestionAnswering.from_pretrained(
            "Kyrmasch/kaz-roberta-squad2-kaz"
        ).to(device)
        self.device = device
        self.load_balancing_loss_coef = load_balancing_loss_coef
        self.n_experts = n_experts  # used to calculate the load balancing loss

    def freeze_base_model(self):
        for param in self.base_model.parameters():
            param.requires_grad = False

    def freeze_experts(self):
        # TODO: find out how to freeze the experts in the SwitchTransformer
        pass

    def forward(self, batch):
        input_ids = batch["input_ids"].to(self.device)
        attention_mask = batch["attention_mask"].to(self.device)
        start_positions = (
            batch["start_positions"].to(self.device)
            if "start_positions" in batch
            else None
        )
        end_positions = (
            batch["end_positions"].to(self.device)
            if "end_positions" in batch
            else None
        )

        # Encode with the base QA model; only its last hidden state is used.
        outputs = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            start_positions=None,
            end_positions=None,
            output_hidden_states=True,
        )
        x = outputs.hidden_states[-1]
        # Run through each transformer layer
        counts, route_prob, n_dropped, route_prob_max = [], [], [], []
        for layer in self.layers:
            x, f, p, n_d, p_max = layer(x=x, mask=attention_mask)
            counts.append(f)
            route_prob.append(p)
            n_dropped.append(n_d)
            route_prob_max.append(p_max)
        # Finally, normalize the vectors
        output = self.norm(x)

        logits = self.qa_outputs(output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()  # (bs, max_query_len)
        end_logits = end_logits.squeeze(-1).contiguous()  # (bs, max_query_len)

        loss = None
        if start_positions is not None and end_positions is not None:
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            loss = (start_loss + end_loss) / 2

        # Load balancing loss, accumulated across all layers
        counts = torch.stack(counts)
        route_prob = torch.stack(route_prob)
        route_prob_max = torch.stack(route_prob_max)
        total = counts.sum(dim=-1, keepdim=True)
        # Fraction of tokens and mean routing probability per expert
        route_frac = counts / total
        route_prob = route_prob / total
        load_balancing_loss = self.n_experts * (route_frac * route_prob).sum()
        loss = (
            load_balancing_loss
            if loss is None
            else loss + self.load_balancing_loss_coef * load_balancing_loss
        )
        return start_logits, end_logits, loss
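To make the routing arithmetic concrete, a small illustrative calculation (assumed numbers, using the configuration from main.py):

# Illustrative only: a batch of 2 sequences of 256 tokens, main.py settings.
n_tokens, n_experts, n_layers = 2 * 256, 4, 4
capacity = int(1.25 * n_tokens / n_experts)  # 160 tokens per expert
# (not enforced in this app, since drop_tokens=False)
# With a perfectly uniform router, route_frac and the normalized route_prob are
# both 1/4 for every (layer, expert) entry, so the load-balancing loss bottoms
# out at n_experts times the sum of 16 entries of 1/16:
lb_floor = n_experts * (n_layers * n_experts) * (1 / n_experts) ** 2  # 4.0
weighted = 0.05 * lb_floor  # contributes 0.2 to the training loss at perfect balance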
switch_transformer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:18db93cbc33e8aab35f5583010b67d2ca0c44cd93445e0bfd5d886382708d9ba
size 671685785