import numpy as np
import torch
import torch.nn as nn

from utils import Config
from transformers import AutoTokenizer, AutoModel

class AutoModelForSequenceClassification(nn.Module):
    """Base model for sequence classification"""

    def __init__(self, args, Model, config, num_labels=2):
        """Initialize the model"""
        super(AutoModelForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.encoder = Model
        self.config = config
        self.dropout = nn.Dropout(args.drop_ratio)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self._init_weights(self.classifier)

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(
        self,
        input_ids,
        target_mask=None,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        head_mask=None,
    ):
        """
        Inputs:
            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary
            `target_mask`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for the target word:
                1 for target word tokens and 0 otherwise (not used by this head)
            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token type indices
                selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see the BERT paper for details).
            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1].
                It's a mask to be used if the input sequence length is smaller than the max input sequence length in the current batch.
                It's the mask that we typically use for attention when a batch has varying-length sentences.
            `labels`: optional labels for the classification output: torch.LongTensor of shape [batch_size]
                with indices selected in [0, ..., num_labels - 1].
            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1.
                It's a mask used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked.
        """
        outputs = self.encoder(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
        )
        pooled_output = outputs[1]  # [batch, hidden]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        logits = self.logsoftmax(logits)

        if labels is not None:
            loss_fct = nn.NLLLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        return logits
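
# Usage sketch (not part of the original file): how this head is typically wired to a
# Hugging Face encoder. The checkpoint name and the `drop_ratio` value are illustrative
# assumptions, not values mandated by this repo.
def _example_sequence_classification():
    from types import SimpleNamespace
    from transformers import AutoConfig, AutoTokenizer

    args = SimpleNamespace(drop_ratio=0.2)  # assumed config field
    name = "bert-base-uncased"  # placeholder checkpoint
    config = AutoConfig.from_pretrained(name)
    encoder = AutoModel.from_pretrained(name)
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification(args, encoder, config, num_labels=2)

    batch = tokenizer(["He kicked the bucket."], return_tensors="pt", padding=True)
    log_probs = model(
        batch["input_ids"],
        token_type_ids=batch.get("token_type_ids"),
        attention_mask=batch["attention_mask"],
    )
    return log_probs  # [1, 2] log-probabilities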

class AutoModelForTokenClassification(nn.Module):
    """Base model for token classification"""

    def __init__(self, args, Model, config, num_labels=2):
        """Initialize the model"""
        super(AutoModelForTokenClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = Model
        self.config = config
        self.dropout = nn.Dropout(args.drop_ratio)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self._init_weights(self.classifier)

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(
        self,
        input_ids,
        target_mask,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        head_mask=None,
    ):
        """
        Inputs:
            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary
            `target_mask`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for the target word:
                1 for target word tokens and 0 otherwise.
            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token type indices
                selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see the BERT paper for details).
            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1].
                It's a mask to be used if the input sequence length is smaller than the max input sequence length in the current batch.
                It's the mask that we typically use for attention when a batch has varying-length sentences.
            `labels`: optional labels for the classification output: torch.LongTensor of shape [batch_size]
                with indices selected in [0, ..., num_labels - 1].
            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1.
                It's a mask used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked.
        """
        outputs = self.bert(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
        )
        sequence_output = outputs[0]  # [batch, max_len, hidden]

        # Zero out every position except the target word, then average.
        # Note: target_mask.sum() counts target tokens over the whole batch;
        # a per-example mean would use target_mask.sum(1, keepdim=True).
        target_output = sequence_output * target_mask.unsqueeze(2)
        target_output = self.dropout(target_output)
        target_output = target_output.sum(1) / target_mask.sum()  # [batch, hidden]

        logits = self.classifier(target_output)
        logits = self.logsoftmax(logits)

        if labels is not None:
            loss_fct = nn.NLLLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        return logits
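
# Usage sketch (not in the original file): building a `target_mask` that marks the
# sub-tokens of the target word. The token offset below is chosen by hand for this
# particular sentence/tokenizer pair and is purely illustrative.
def _example_target_mask():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    enc = tokenizer("He kicked the bucket.", return_tensors="pt")
    # Tokens: [CLS] he kicked the bucket . [SEP]  ->  "kicked" sits at index 2.
    target_mask = torch.zeros_like(enc["input_ids"])
    target_mask[0, 2] = 1
    return enc, target_mask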

class AutoModelForSequenceClassification_SPV(nn.Module):
    """MelBERT with only SPV"""

    def __init__(self, args, Model, config, num_labels=2):
        """Initialize the model"""
        super(AutoModelForSequenceClassification_SPV, self).__init__()
        self.num_labels = num_labels
        self.encoder = Model
        self.config = config
        self.dropout = nn.Dropout(args.drop_ratio)
        self.classifier = nn.Linear(config.hidden_size * 2, num_labels)
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self._init_weights(self.classifier)

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(
        self,
        input_ids,
        target_mask,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        head_mask=None,
    ):
        """
        Inputs:
            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary
            `target_mask`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for the target word:
                1 for target word tokens and 0 otherwise.
            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token type indices
                selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see the BERT paper for details).
            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1].
            `labels`: optional labels for the classification output: torch.LongTensor of shape [batch_size]
                with indices selected in [0, ..., num_labels - 1].
            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1.
                It's a mask used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked.
        """
        outputs = self.encoder(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
        )
        sequence_output = outputs[0]  # [batch, max_len, hidden]
        pooled_output = outputs[1]  # [batch, hidden]

        # Get the target output with the target mask
        target_output = sequence_output * target_mask.unsqueeze(2)  # [batch, max_len, hidden]
        # dropout
        target_output = self.dropout(target_output)
        pooled_output = self.dropout(pooled_output)
        # Take the mean of the target output, since the target may consist of more than one token
        target_output = target_output.mean(1)  # [batch, hidden]

        logits = self.classifier(torch.cat([target_output, pooled_output], dim=1))
        logits = self.logsoftmax(logits)

        if labels is not None:
            loss_fct = nn.NLLLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        return logits
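
# Usage sketch (not in the original file): the SPV head concatenates the sentence-level
# [CLS] vector with the averaged target-word vector; a training step passes `labels` and
# backpropagates the returned NLL loss. `drop_ratio`, the checkpoint name, and the label
# scheme are assumptions for illustration.
def _example_spv_training_step():
    from types import SimpleNamespace
    from transformers import AutoConfig, AutoTokenizer

    args = SimpleNamespace(drop_ratio=0.2)
    name = "bert-base-uncased"
    config = AutoConfig.from_pretrained(name)
    encoder = AutoModel.from_pretrained(name)
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification_SPV(args, encoder, config)

    enc = tokenizer("He kicked the bucket.", return_tensors="pt")
    target_mask = torch.zeros_like(enc["input_ids"])
    target_mask[0, 2] = 1  # hand-picked position of "kicked" for this tokenizer
    labels = torch.tensor([1])  # assumed label scheme: 1 = metaphorical, 0 = literal

    loss = model(enc["input_ids"], target_mask,
                 attention_mask=enc["attention_mask"], labels=labels)
    loss.backward()
    return loss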

class AutoModelForSequenceClassification_MIP(nn.Module):
    """MelBERT with only MIP"""

    def __init__(self, args, Model, config, num_labels=2):
        """Initialize the model"""
        super(AutoModelForSequenceClassification_MIP, self).__init__()
        self.num_labels = num_labels
        self.encoder = Model
        self.config = config
        self.dropout = nn.Dropout(args.drop_ratio)
        self.args = args
        self.classifier = nn.Linear(config.hidden_size * 2, num_labels)
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self._init_weights(self.classifier)

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(
        self,
        input_ids,
        input_ids_2,
        target_mask,
        target_mask_2,
        attention_mask_2,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        head_mask=None,
    ):
        """
        Inputs:
            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the first input token indices in the vocabulary
            `input_ids_2`: a torch.LongTensor of shape [batch_size, sequence_length] with the second input token indices
            `target_mask`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for the target word in the first input:
                1 for target word tokens and 0 otherwise.
            `target_mask_2`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for the target word in the second input:
                1 for target word tokens and 0 otherwise.
            `attention_mask_2`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1] for the second input.
            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token type indices
                selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see the BERT paper for details).
            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1] for the first input.
            `labels`: optional labels for the classification output: torch.LongTensor of shape [batch_size]
                with indices selected in [0, ..., num_labels - 1].
            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1.
                It's a mask used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked.
        """
        # First encoder pass over the full sentence
        outputs = self.encoder(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
        )
        sequence_output = outputs[0]  # [batch, max_len, hidden]

        # Get the target output with the target mask
        target_output = sequence_output * target_mask.unsqueeze(2)
        target_output = self.dropout(target_output)
        target_output = target_output.sum(1) / target_mask.sum()  # [batch, hidden]

        # Second encoder pass over the target word only
        outputs_2 = self.encoder(input_ids_2, attention_mask=attention_mask_2, head_mask=head_mask)
        sequence_output_2 = outputs_2[0]  # [batch, max_len, hidden]

        # Get the target output with the target mask
        target_output_2 = sequence_output_2 * target_mask_2.unsqueeze(2)
        target_output_2 = self.dropout(target_output_2)
        target_output_2 = target_output_2.sum(1) / target_mask_2.sum()

        logits = self.classifier(torch.cat([target_output_2, target_output], dim=1))
        logits = self.logsoftmax(logits)

        if labels is not None:
            loss_fct = nn.NLLLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        return logits
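
# Usage sketch (not in the original file): the MIP head takes two inputs, the full
# sentence and the target word in isolation, and compares the two target representations.
# Token positions below are hand-picked for this sentence/tokenizer and are illustrative.
def _example_mip_forward():
    from types import SimpleNamespace
    from transformers import AutoConfig, AutoTokenizer

    args = SimpleNamespace(drop_ratio=0.2)
    name = "bert-base-uncased"
    config = AutoConfig.from_pretrained(name)
    encoder = AutoModel.from_pretrained(name)
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification_MIP(args, encoder, config)

    sent = tokenizer("He kicked the bucket.", return_tensors="pt")
    word = tokenizer("kicked", return_tensors="pt")
    target_mask = torch.zeros_like(sent["input_ids"])
    target_mask[0, 2] = 1    # "kicked" inside the sentence
    target_mask_2 = torch.zeros_like(word["input_ids"])
    target_mask_2[0, 1] = 1  # "kicked" in the isolated input ([CLS] kicked [SEP])

    log_probs = model(sent["input_ids"], word["input_ids"],
                      target_mask, target_mask_2, word["attention_mask"],
                      attention_mask=sent["attention_mask"])
    return log_probs  # [1, 2]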

class AutoModelForSequenceClassification_SPV_MIP(nn.Module):
    """MelBERT"""

    def __init__(self, args, Model, config, num_labels=2):
        """Initialize the model"""
        super(AutoModelForSequenceClassification_SPV_MIP, self).__init__()
        self.num_labels = num_labels
        self.encoder = Model
        self.config = config
        self.dropout = nn.Dropout(args.drop_ratio)
        self.args = args
        self.SPV_linear = nn.Linear(config.hidden_size * 2, args.classifier_hidden)
        self.MIP_linear = nn.Linear(config.hidden_size * 2, args.classifier_hidden)
        self.classifier = nn.Linear(args.classifier_hidden * 2, num_labels)
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self._init_weights(self.SPV_linear)
        self._init_weights(self.MIP_linear)
        self._init_weights(self.classifier)

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(
        self,
        input_ids,
        input_ids_2,
        target_mask,
        target_mask_2,
        attention_mask_2,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        head_mask=None,
    ):
        """
        Inputs:
            `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the first input token indices in the vocabulary
            `input_ids_2`: a torch.LongTensor of shape [batch_size, sequence_length] with the second input token indices
            `target_mask`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for the target word in the first input:
                1 for target word tokens and 0 otherwise.
            `target_mask_2`: a torch.LongTensor of shape [batch_size, sequence_length] with the mask for the target word in the second input:
                1 for target word tokens and 0 otherwise.
            `attention_mask_2`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1] for the second input.
            `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token type indices
                selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see the BERT paper for details).
            `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1] for the first input.
            `labels`: optional labels for the classification output: torch.LongTensor of shape [batch_size]
                with indices selected in [0, ..., num_labels - 1].
            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1.
                It's a mask used to nullify some heads of the transformer. 1.0 => head is not masked, 0.0 => head is masked.
        """
        # First encoder pass over the full sentence
        outputs = self.encoder(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
        )
        sequence_output = outputs[0]  # [batch, max_len, hidden]
        pooled_output = outputs[1]  # [batch, hidden]

        # Get the target output with the target mask
        target_output = sequence_output * target_mask.unsqueeze(2)
        # dropout
        target_output = self.dropout(target_output)
        pooled_output = self.dropout(pooled_output)
        target_output = target_output.mean(1)  # [batch, hidden]

        # Second encoder pass over the target word only
        outputs_2 = self.encoder(input_ids_2, attention_mask=attention_mask_2, head_mask=head_mask)
        sequence_output_2 = outputs_2[0]  # [batch, max_len, hidden]

        # Get the target output with the target mask
        target_output_2 = sequence_output_2 * target_mask_2.unsqueeze(2)
        target_output_2 = self.dropout(target_output_2)
        target_output_2 = target_output_2.mean(1)

        # Hidden vectors from the SPV and MIP linear layers
        SPV_hidden = self.SPV_linear(torch.cat([pooled_output, target_output], dim=1))
        MIP_hidden = self.MIP_linear(torch.cat([target_output_2, target_output], dim=1))

        logits = self.classifier(self.dropout(torch.cat([SPV_hidden, MIP_hidden], dim=1)))
        logits = self.logsoftmax(logits)

        if labels is not None:
            loss_fct = nn.NLLLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        return logits
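
# Usage sketch (not in the original file): the full MelBERT head combines SPV (sentence
# vs. target) and MIP (contextual vs. isolated target) and is trained with the NLL loss
# that forward returns when `labels` is given. The `drop_ratio`/`classifier_hidden`
# values, checkpoint name, learning rate, and label scheme are illustrative assumptions.
def _example_melbert_training_step():
    from types import SimpleNamespace
    from transformers import AutoConfig, AutoTokenizer

    args = SimpleNamespace(drop_ratio=0.2, classifier_hidden=768)
    name = "bert-base-uncased"
    config = AutoConfig.from_pretrained(name)
    encoder = AutoModel.from_pretrained(name)
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification_SPV_MIP(args, encoder, config)

    sent = tokenizer("He kicked the bucket.", return_tensors="pt")
    word = tokenizer("kicked", return_tensors="pt")
    target_mask = torch.zeros_like(sent["input_ids"])
    target_mask[0, 2] = 1    # hand-picked position of "kicked" in the sentence
    target_mask_2 = torch.zeros_like(word["input_ids"])
    target_mask_2[0, 1] = 1  # hand-picked position in the isolated target input
    labels = torch.tensor([1])  # assumed label scheme: 1 = metaphorical, 0 = literal

    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
    loss = model(sent["input_ids"], word["input_ids"],
                 target_mask, target_mask_2, word["attention_mask"],
                 attention_mask=sent["attention_mask"], labels=labels)
    loss.backward()
    optimizer.step()
    return loss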