GonzaloMG
/

stable-diffusion-e2e-ft-depth

@@ -27,7 +27,7 @@
 # Modifications from the original code are marked with '# add' comments.
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
@@ -82,7 +82,7 @@ class E2EMarigoldDepthOutput(BaseOutput):
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
             The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
     """
     prediction: Union[np.ndarray, torch.Tensor]
     latent: Union[None, torch.Tensor]
@@ -124,7 +124,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
         scheduler: Union[DDIMScheduler],
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
-        default_processing_resolution: Optional[int] = 768,
     ):
         super().__init__()
@@ -265,8 +265,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
             batch_size (`int`, *optional*, defaults to `1`):
                 Batch size; only matters when passing a tensor of images.
             output_type (`str`, *optional*, defaults to `"np"`):
-                Preferred format of the output's `prediction` and the optional `uncertainty` fields. The accepted
-                values are: `"np"` (numpy array) or `"pt"` (torch tensor).
             output_latent (`bool`, *optional*, defaults to `False`):
                 When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
                 within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
@@ -339,9 +338,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
         # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
         # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
-        # outputs noise for the predicted modality's latent space. The number of denoising diffusion steps is defined by
-        # `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
-        # model.
         # Model invocation: self.unet.
         pred_latents = []
@@ -396,7 +393,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
         # 7. Remove padding. The output shape is (PH, PW).
         prediction = self.image_processor.unpad_image(prediction, padding)  # [N*E,1,PH,PW]
-        # 9. If `match_input_resolution` is set, the output prediction and the uncertainty are upsampled to match the
         # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
         # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
         # setting the `resample_method_output` parameter (e.g., to `"nearest"`).

 # Modifications from the original code are marked with '# add' comments.
 from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
 import numpy as np
 import torch
             Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
             The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
     """
     prediction: Union[np.ndarray, torch.Tensor]
     latent: Union[None, torch.Tensor]
         scheduler: Union[DDIMScheduler],
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
+        default_processing_resolution: Optional[int] = 768, # add
     ):
         super().__init__()
             batch_size (`int`, *optional*, defaults to `1`):
                 Batch size; only matters when passing a tensor of images.
             output_type (`str`, *optional*, defaults to `"np"`):
+                Preferred format of the output's `prediction`. The accepted values are: `"np"` (numpy array) or `"pt"` (torch tensor).
             output_latent (`bool`, *optional*, defaults to `False`):
                 When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
                 within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
         # 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
         # The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
+        # outputs noise for the predicted modality's latent space.
         # Model invocation: self.unet.
         pred_latents = []
         # 7. Remove padding. The output shape is (PH, PW).
         prediction = self.image_processor.unpad_image(prediction, padding)  # [N*E,1,PH,PW]
+        # 9. If `match_input_resolution` is set, the output prediction is upsampled to match the
         # input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
         # Depending on the downstream use-case, upsampling can be also chosen based on the tolerated artifacts by
         # setting the `resample_method_output` parameter (e.g., to `"nearest"`).