Update pipeline.py
Browse files- pipeline.py +6 -9
pipeline.py
CHANGED
|
@@ -27,7 +27,7 @@
|
|
| 27 |
# Modifications from the original code are marked with '# add' comments.
|
| 28 |
|
| 29 |
from dataclasses import dataclass
|
| 30 |
-
from typing import
|
| 31 |
|
| 32 |
import numpy as np
|
| 33 |
import torch
|
|
@@ -82,7 +82,7 @@ class E2EMarigoldDepthOutput(BaseOutput):
|
|
| 82 |
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
|
| 83 |
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
|
| 84 |
"""
|
| 85 |
-
|
| 86 |
prediction: Union[np.ndarray, torch.Tensor]
|
| 87 |
latent: Union[None, torch.Tensor]
|
| 88 |
|
|
@@ -124,7 +124,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
|
|
| 124 |
scheduler: Union[DDIMScheduler],
|
| 125 |
text_encoder: CLIPTextModel,
|
| 126 |
tokenizer: CLIPTokenizer,
|
| 127 |
-
default_processing_resolution: Optional[int] = 768,
|
| 128 |
):
|
| 129 |
super().__init__()
|
| 130 |
|
|
@@ -265,8 +265,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
|
|
| 265 |
batch_size (`int`, *optional*, defaults to `1`):
|
| 266 |
Batch size; only matters when passing a tensor of images.
|
| 267 |
output_type (`str`, *optional*, defaults to `"np"`):
|
| 268 |
-
Preferred format of the output's `prediction`
|
| 269 |
-
values are: `"np"` (numpy array) or `"pt"` (torch tensor).
|
| 270 |
output_latent (`bool`, *optional*, defaults to `False`):
|
| 271 |
When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
|
| 272 |
within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
|
|
@@ -339,9 +338,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
|
|
| 339 |
|
| 340 |
# 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
|
| 341 |
# The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
|
| 342 |
-
# outputs noise for the predicted modality's latent space.
|
| 343 |
-
# `num_inference_steps`. It is either set directly, or resolves to the optimal value specific to the loaded
|
| 344 |
-
# model.
|
| 345 |
# Model invocation: self.unet.
|
| 346 |
pred_latents = []
|
| 347 |
|
|
@@ -396,7 +393,7 @@ class E2EMarigoldDepthPipeline(DiffusionPipeline):
|
|
| 396 |
# 7. Remove padding. The output shape is (PH, PW).
|
| 397 |
prediction = self.image_processor.unpad_image(prediction, padding) # [N*E,1,PH,PW]
|
| 398 |
|
| 399 |
-
# 9. If `match_input_resolution` is set, the output prediction
|
| 400 |
# input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
|
| 401 |
# Depending on the downstream use-case, upsampling can also be chosen based on the tolerated artifacts by
|
| 402 |
# setting the `resample_method_output` parameter (e.g., to `"nearest"`).
|
|
|
|
| 27 |
# Modifications from the original code are marked with '# add' comments.
|
| 28 |
|
| 29 |
from dataclasses import dataclass
|
| 30 |
+
from typing import List, Optional, Tuple, Union
|
| 31 |
|
| 32 |
import numpy as np
|
| 33 |
import torch
|
|
|
|
| 82 |
Latent features corresponding to the predictions, compatible with the `latents` argument of the pipeline.
|
| 83 |
The shape is $numimages * numensemble \times 4 \times latentheight \times latentwidth$.
|
| 84 |
"""
|
| 85 |
+
|
| 86 |
prediction: Union[np.ndarray, torch.Tensor]
|
| 87 |
latent: Union[None, torch.Tensor]
|
| 88 |
|
|
|
|
| 124 |
scheduler: Union[DDIMScheduler],
|
| 125 |
text_encoder: CLIPTextModel,
|
| 126 |
tokenizer: CLIPTokenizer,
|
| 127 |
+
default_processing_resolution: Optional[int] = 768, # add
|
| 128 |
):
|
| 129 |
super().__init__()
|
| 130 |
|
|
|
|
| 265 |
batch_size (`int`, *optional*, defaults to `1`):
|
| 266 |
Batch size; only matters when passing a tensor of images.
|
| 267 |
output_type (`str`, *optional*, defaults to `"np"`):
|
| 268 |
+
Preferred format of the output's `prediction`. The accepted values are: `"np"` (numpy array) or `"pt"` (torch tensor).
|
|
|
|
| 269 |
output_latent (`bool`, *optional*, defaults to `False`):
|
| 270 |
When enabled, the output's `latent` field contains the latent codes corresponding to the predictions
|
| 271 |
within the ensemble. These codes can be saved, modified, and used for subsequent calls with the
|
|
|
|
| 338 |
|
| 339 |
# 5. Process the denoising loop. All `N * E` latents are processed sequentially in batches of size `batch_size`.
|
| 340 |
# The unet model takes concatenated latent spaces of the input image and the predicted modality as an input, and
|
| 341 |
+
# outputs noise for the predicted modality's latent space.
|
|
|
|
|
|
|
| 342 |
# Model invocation: self.unet.
|
| 343 |
pred_latents = []
|
| 344 |
|
|
|
|
| 393 |
# 7. Remove padding. The output shape is (PH, PW).
|
| 394 |
prediction = self.image_processor.unpad_image(prediction, padding) # [N*E,1,PH,PW]
|
| 395 |
|
| 396 |
+
# 9. If `match_input_resolution` is set, the output prediction is upsampled to match the
|
| 397 |
# input resolution `(H, W)`. This step may introduce upsampling artifacts, and therefore can be disabled.
|
| 398 |
# Depending on the downstream use-case, upsampling can also be chosen based on the tolerated artifacts by
|
| 399 |
# setting the `resample_method_output` parameter (e.g., to `"nearest"`).
|