Sophia Tang committed
Commit · 92f7053
Parent(s): 6612621
update

Files changed:
- config.yaml +168 -0
- diffusion.py +1 -101
- scoring/{hemolysis.py → functions/hemolysis.py} +0 -0
config.yaml ADDED
@@ -0,0 +1,168 @@
+noise:
+  type: loglinear
+  sigma_min: 1e-4
+  sigma_max: 20
+  state_dependent: True
+
+mode: ppl_eval # train / ppl_eval / sample_eval
+diffusion: absorbing_state
+vocab: old_smiles # old_smiles / new_smiles / selfies / helm
+backbone: roformer # peptideclm / helmgpt / dit / roformer / finetune_roformer
+parameterization: subs # subs
+time_conditioning: False
+T: 0 # 0 (continuous time) / 1000
+subs_masking: False
+
+seed: 42
+
+mcts:
+  num_children: 50
+  num_objectives: 5
+  topk: 100
+  mask_token: 4
+  num_iter: 128
+  sampling: 0 # 0 is gumbel sampling / > 0 samples children from top k probs
+  invalid_penalty: 0.5
+  sample_prob: 1.0
+  perm: True
+  dual: False
+  single: False
+  time_dependent: True
+
+lr_scheduler:
+  _target_: transformers.get_constant_schedule_with_warmup
+  num_warmup_steps: 2500
+
+data:
+  train: /home/st512/peptune/scripts/peptide-mdlm-mcts/data/finetune2/30K-train.csv
+  valid: /home/st512/peptune/scripts/peptide-mdlm-mcts/data/finetune2/30K-val.csv
+  batching: wrapping # padding / wrapping
+
+loader:
+  global_batch_size: 64
+  eval_global_batch_size: ${.global_batch_size}
+  # Note: batch_size and eval_batch_size are **per machine**
+  batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
+  eval_batch_size: ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
+  num_workers: ${eval:"len(__import__('os').sched_getaffinity(0))"}
+  pin_memory: True
+
+sampling:
+  predictor: ddpm_cache # analytic, ddpm, ddpm_cache
+  num_sequences: 100
+  sampling_eps: 1e-3
+  steps: 128
+  seq_length: 100
+  noise_removal: True
+  num_sample_batches: 2 # Total samples: `num_gpus` * `loader.eval_batch_size` * num_sample_batches
+  num_sample_log: 2
+  stride_length: 1
+  num_strides: 1
+
+training:
+  antithetic_sampling: True
+  sampling_eps: 1e-3
+  focus_mask: False
+  #dynamic_batching: True
+  accumulator: False
+
+eval:
+  checkpoint_path: /home/st512/peptune/scripts/peptide-mdlm-mcts/checkpoints/11M-old-tokenizer/epoch=10-step=156276.ckpt
+  disable_ema: False
+  compute_generative_perplexity: False
+  perplexity_batch_size: 8
+  compute_perplexity_on_sanity: False
+  gen_ppl_eval_model_name_or_path: gpt2-large # gpt2-large, meta-llama/Llama-2-7b-hf
+  generate_samples: True
+  generation_model: /home/st512/peptune/scripts/peptide-mdlm-mcts/checkpoints/11M-old-tokenizer/
+
+optim:
+  weight_decay: 0.075
+  lr: 3e-4
+  beta1: 0.9
+  beta2: 0.999
+  eps: 1e-8
+
+pepclm:
+  hidden_size: 768
+  cond_dim: 256
+  n_heads: 20
+  n_blocks: 4
+  dropout: 0.5
+  length: 512
+  #scale_by_sigma: True
+
+model:
+  type: ddit
+  hidden_size: 768
+  cond_dim: 128
+  length: 512
+  n_blocks: 12
+  n_heads: 12
+  scale_by_sigma: True
+  dropout: 0.1
+
+roformer:
+  hidden_size: 768
+  n_layers: 8
+  n_heads: 8
+  max_position_embeddings: 1035
+
+helmgpt:
+  hidden_size: 256
+  embd_pdrop: 0.1
+  resid_pdrop: 0.1
+  attn_pdrop: 0.1
+  ff_dropout: 0.
+  block_size: 140
+  n_layer: 8
+  n_heads: 8
+
+
+trainer:
+  _target_: lightning.Trainer
+  accelerator: cuda
+  num_nodes: 1
+  devices: ${device_count:}
+  accumulate_grad_batches: ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
+  gradient_clip_val: 1.0
+  precision: 64-true
+  num_sanity_val_steps: 2
+  max_epochs: 100
+  max_steps: 1_000_000
+  log_every_n_steps: 10
+  limit_train_batches: 1.0 # train on full dataset, can be used to toggle quick run
+  limit_val_batches: 1.0 # validate on full dataset, can be used to toggle quick run
+  #val_check_interval: 40 #954
+  check_val_every_n_epoch: 1
+
+
+wandb:
+  project: peptune
+  notes: null
+  group: null
+  job_type: null
+  name: sophia-tang
+  id: ${.name}_nov12_set2
+
+hydra:
+  run:
+    dir: ./${now:%Y.%m.%d}/
+  job:
+    chdir: True
+
+checkpointing:
+  # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
+  save_dir: ${cwd:}
+  # Note: `checkpoints` path should correspond to `checkpoint_every_n_steps.dirpath`
+  resume_from_ckpt: True
+  resume_ckpt_path: /home/st512/peptune/scripts/peptide-mdlm-mcts/checkpoints/11M-old-tokenizer/epoch=7-step=108225.ckpt
+
+callbacks:
+  model_checkpoint:
+    _target_: pytorch_lightning.callbacks.ModelCheckpoint
+    every_n_epochs: 1
+    monitor: "val/nll"
+    save_top_k: 10
+    mode: "min"
+    dirpath: '/home/st512/peptune/scripts/peptide-mdlm-mcts/checkpoints/11M-old-tokenizer'
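The interpolations above (`${div_up:…}`, `${eval:…}`, `${device_count:}`, `${cwd:}`) are custom OmegaConf resolvers, so they only resolve if the training entry point registers them before composing the config. The snippet below is a minimal sketch of such a registration, not code from this commit; the function name `register_config_resolvers` and the exact lambdas are assumptions.

# Hypothetical helper: registers the resolvers that config.yaml's interpolations rely on.
import os
import torch
from omegaconf import OmegaConf

def register_config_resolvers():
    # ${div_up:a, b} -> ceil(a / b), used to split the global batch size across devices
    OmegaConf.register_new_resolver("div_up", lambda a, b: (int(a) + int(b) - 1) // int(b))
    # ${eval:"expr"} -> evaluate an embedded Python expression (e.g. devices * num_nodes)
    OmegaConf.register_new_resolver("eval", eval)
    # ${device_count:} -> number of visible CUDA devices (at least 1)
    OmegaConf.register_new_resolver("device_count", lambda: max(1, torch.cuda.device_count()))
    # ${cwd:} -> launch-time working directory, used as the default checkpoint save_dir
    OmegaConf.register_new_resolver("cwd", os.getcwd)

With resolvers like these in place, `loader.batch_size` resolves to `global_batch_size` divided (rounding up) by `devices * num_nodes`, e.g. 64 / (4 * 1) = 16 per device on a hypothetical 4-GPU node, and `trainer.accumulate_grad_batches` then works out to 1.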
diffusion.py CHANGED
@@ -116,8 +116,6 @@ class Diffusion(L.LightningModule):
         self.test_metrics = metrics.clone(prefix='test/')
 
 
-    """LOSS"""
-
     """LOSS FOR INVALID PEPTIDES"""
 
     @torch.no_grad()
@@ -248,18 +246,6 @@ class Diffusion(L.LightningModule):
         t = (1 - self.config.training.sampling_eps) * eps_t + self.config.training.sampling_eps
 
         return t
-
-    """def mask_samples(self, x0, mask_prob):
-
-        # generate array of values in range [0, 1] uniformly at random
-        # will be used to determine which tokens are masked
-        mask_indices = torch.rand(* x0.shape, device=x0.device) # (batch_size, L)
-
-        # select tokens to mask if the random value in mask_indices is less than mask_prob
-        # this will mask approximately the fraction of tokens indicated by mask_prob
-        zt = torch.where(mask_indices < mask_prob, self.mask_token_id, x0)
-
-        return zt"""
 
     def q_xt(self, x, mask_prob):
         """Computes the noisy sample xt.
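The removed comment block above spells out the absorbing-state corruption that the retained `q_xt` implements: draw a uniform value per token and replace the token with the mask id wherever that value falls below the masking probability. A standalone sketch of that step (the function name `absorbing_mask` is ours, not the repo's):

import torch

def absorbing_mask(x0: torch.Tensor, mask_prob: torch.Tensor, mask_token_id: int) -> torch.Tensor:
    # x0: (batch_size, L) clean token ids; mask_prob: broadcastable to (batch_size, L)
    # each token is independently absorbed into the mask state with probability mask_prob
    u = torch.rand(*x0.shape, device=x0.device)
    return torch.where(u < mask_prob, torch.full_like(x0, mask_token_id), x0)

Because masking is applied independently per position, the expected fraction of masked tokens equals `mask_prob`, which is exactly what the deleted comments describe.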
@@ -349,48 +335,6 @@ class Diffusion(L.LightningModule):
         # scale by T and return
         return self.T * L_vb
 
-    """def _forward_pass_diffusion(self, x0, attn_mask, mask=None):
-
-        print(x0)
-        # randomly sample time steps to start the denoising process for each x0 in batch
-        t = self.sample_t(x0.shape[0], x0.device)
-
-        # if we are training the intermediate transition blocks
-        if self.T > 0:
-            # scale by total timesteps T and cast to integer
-            t = (t * self.T).to(torch.int)
-            # scale down by T to get a multiple of 1/T
-            t = t / self.T
-            # add 1/T to ensure no 0 values
-            t += (1 / self.T)
-
-        # get noise and rate of noise at timestep t
-        sigma, dsigma = self.noise(t)
-        time_conditioning = sigma[:, None]
-        # get masking probabilities for all tokens for each batch
-        mask_prob = 1 - torch.exp(-sigma[:, None]) # (batch_size, L)
-
-        # get masked samples at different timesteps
-        if mask is None: zt = self.q_xt(x0, mask_prob)
-        else: zt = x0.where(mask==1, torch.full_like(x0, self.mask_token_id))
-
-        model_output = self.forward(zt, attn_mask, time_conditioning)
-
-        utils.print_nans(model_output, 'model_output')
-
-        if self.T > 0:
-            # compute diffusion loss
-            diffusion_loss = self.compute_diffusion_loss(model_output, zt, x0, t)
-            return diffusion_loss
-
-        # compute loss for the final that converts from z0 to x0
-        # -log(p_theta)
-        # get (batch_size, L) array of log-probabilities
-        log_p_theta = torch.gather(input=model_output, dim=-1, index=x0[:, :, None]).squeeze(-1) # (B, L)
-
-
-        return -log_p_theta * (dsigma / torch.expm1(sigma))[:, None]"""
-
     def _forward_pass_diffusion(self, x0, attn_mask, bond_mask=None, mask=None):
         """
         Training reverse diffusion model x_theta to reconstruct samples x0
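The deleted draft of `_forward_pass_diffusion` documents the continuous-time objective: sample t, get (sigma, dsigma) from the noise schedule, mask each token with probability 1 - exp(-sigma), and weight the reconstruction log-likelihood by dsigma / (exp(sigma) - 1). A compact sketch of that final loss term, mirroring the deleted lines (the standalone function name is ours; in the repo this is computed inside the method):

import torch

def continuous_time_loss(model_log_probs, x0, sigma, dsigma):
    # model_log_probs: (B, L, V) log p_theta(. | zt); x0: (B, L) clean tokens
    # sigma, dsigma: (B,) noise level and its rate from self.noise(t)
    # log-probability assigned to the true token at each position -> (B, L)
    log_p_theta = torch.gather(model_log_probs, dim=-1, index=x0[:, :, None]).squeeze(-1)
    # ELBO weight for the absorbing-state schedule: dsigma / (exp(sigma) - 1)
    return -log_p_theta * (dsigma / torch.expm1(sigma))[:, None]

This is the continuous-time (`T: 0` in config.yaml) counterpart of the discrete bound computed by `compute_diffusion_loss` in the `T > 0` branch of the deleted code.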
@@ -634,21 +578,6 @@ class Diffusion(L.LightningModule):
 
     # first step in expansion
     def batch_cached_reverse_step(self, token_array, t, dt, batch_size, p_x0=None, attn_mask=None):
-        """
-        Generates batch_size different samples from the same starting point for the
-        first expansion step of MCTS
-
-        Args:
-            x (_type_): _description_
-            t (_type_): _description_
-            dt (_type_): _description_
-            batch_size (_type_): _description_
-            p_x0 (_type_, optional): _description_. Defaults to None.
-            attn_mask (_type_, optional): _description_. Defaults to None.
-
-        Returns:
-            _type_: _description_
-        """
 
         assert self.config.noise.type == 'loglinear'
         sigma_t, _ = self.noise(t)
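The docstring removed here said that `batch_cached_reverse_step` draws `batch_size` different next-step samples from a single starting sequence, i.e. the first expansion of an MCTS node. A hypothetical wrapper illustrating such a call with the values from config.yaml; everything except `batch_cached_reverse_step` and the config keys is an assumption:

import torch

def expand_node(diffusion, token_array, attn_mask, t_node, config):
    # step size of the reverse process: 1 / sampling.steps = 1/128 with this config
    dt = 1.0 / config.sampling.steps
    # noise level of the current (partially unmasked) node
    t = torch.full((1,), t_node, device=token_array.device)
    # draw mcts.num_children = 50 one-step denoisings of the same parent sequence
    return diffusion.batch_cached_reverse_step(
        token_array, t, dt,
        batch_size=config.mcts.num_children,
        attn_mask=attn_mask)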
@@ -880,9 +809,7 @@
                            0)[..., None]
         return edge
 
-
-    """TRAINING from https://github.com/Dao-AILab/flash-attention/blob/main/training/src/tasks/seq.py"""
-
+
     def on_train_epoch_start(self):
         torch.cuda.empty_cache()
         self.backbone.train()
@@ -1049,19 +976,6 @@ def sample_categorical(categorical_probs):
     return (categorical_probs / gumbel_norm).argmax(dim=-1)
 
 def sample_batched_categorical(categorical_probs, batch_size):
-    """
-    Generates `m` distinct sequences sampled from categorical probabilities
-    using the Gumbel distribution to ensure randomness while following probabilities
-
-    Args:
-        categorical_probs (torch.Tensor): tensor of shape (sequence_length, vocab_length)
-            representing categorical probabilities
-        m (int): number of distinct sequences to sample
-
-    Returns:
-        torch.Tensor: tensor of shape (m, sequence_length), where each row is a
-            distinct sequence of sampled category indices.
-    """
     _, sequence_length, vocab_size = categorical_probs.shape
 
     # add Gumbel noise and sample m sequences
@@ -1074,20 +988,6 @@ def sample_batched_categorical(categorical_probs, batch_size):
     return sampled_sequences
 
 def sample_batched_top_k(categorical_probs, batch_size, k):
-    """
-    Generates `m` sequences sampled from the top-k probabilities of each token
-    using Gumbel noise to ensure randomness and reduce bias towards the most likely options.
-
-    Args:
-        categorical_probs (torch.Tensor): A tensor of shape (sequence_length, vocab_length)
-            representing categorical probabilities.
-        m (int): Number of sequences to sample.
-        k (int): Number of top probabilities to consider for sampling.
-
-    Returns:
-        torch.Tensor: A tensor of shape (m, sequence_length), where each row is a
-            sampled sequence of category indices.
-    """
     _, sequence_length, vocab_length = categorical_probs.shape
 
     # Add Gumbel noise to the log probabilities
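`sample_categorical` draws one token per position via the Gumbel-max trick (dividing probabilities by Exponential(1) noise and taking the argmax); the two batched variants whose docstrings are removed above extend this to `batch_size` sequences and to top-k-restricted sampling. A minimal sketch of the batched case under that reading (hypothetical name `gumbel_sample_sequences`; not the repo's exact implementation):

import torch

def gumbel_sample_sequences(categorical_probs: torch.Tensor, batch_size: int) -> torch.Tensor:
    # categorical_probs: (1, sequence_length, vocab_size) per-position probabilities
    probs = categorical_probs.expand(batch_size, -1, -1)
    # -log(U) is Exponential(1); argmax(p / E) is an exact categorical draw,
    # the same form used by sample_categorical above
    exp_noise = -torch.log(torch.rand(probs.shape, device=probs.device) + 1e-10)
    return (probs / exp_noise).argmax(dim=-1)  # (batch_size, sequence_length)

Each position is sampled independently, so the `batch_size` sequences differ wherever the per-token distribution is not peaked; a top-k variant would zero out all but the k largest probabilities per position before adding the noise.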
scoring/{hemolysis.py → functions/hemolysis.py} RENAMED
File without changes