Skip to content
Open
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c750514
feat: Kornia GPU augmentation backend for detection training
Borda Mar 25, 2026
17043fe
fix(pre-commit): 🎨 auto format pre-commit hooks
pre-commit-ci[bot] Mar 25, 2026
eee9c01
Apply suggestions from code review
Borda Mar 25, 2026
c982914
fix: wire gpu_postprocess through build_o365_raw to prevent double no…
Borda Mar 26, 2026
2b6a7d0
fix: add _kornia_setup_done sentinel; promote auto fallback log to WA…
Borda Mar 26, 2026
c12bdda
fix: blur_limit min 3; docstrings for gpu_postprocess and unpack_boxes
Borda Mar 26, 2026
51754ff
test: add coverage for blur_limit edge, o365 guard, sentinel re-run, …
Borda Mar 26, 2026
55d4bc6
fix: GPU path falls back to AUG_CONFIG not {} when aug_config=None
Borda Mar 26, 2026
e969361
Apply suggestions from code review
Borda Mar 26, 2026
de9017c
fix(pre-commit): 🎨 auto format pre-commit hooks
pre-commit-ci[bot] Mar 26, 2026
b517771
Merge branch 'develop' into aug/kornia
Borda Mar 26, 2026
c31f834
fix: square gaussian kernel; device-move Kornia modules; pre-resolve …
Borda Mar 26, 2026
2aa18f5
fix: add missing kornia import in o365.py has_kornia check
Borda Mar 26, 2026
5ab9bab
Merge branch 'develop' into aug/kornia
Borda Apr 9, 2026
26c7c80
Merge branch 'develop' into aug/kornia
Borda Apr 9, 2026
212df68
fix: add `# type: ignore` to kornia imports; update mypy ignores for …
Borda Apr 9, 2026
b8e80e1
refactor: replace `torch.cuda.is_available` with `_has_cuda_device` a…
Borda Apr 9, 2026
ea5d30d
refactor: standardize variable naming for clarity; update kornia impo…
Borda Apr 9, 2026
d504dca
refactor: consolidate backend resolution; fix fork-unsafe CUDA detection
Borda Apr 9, 2026
38d719d
refactor: remove mutable args mutation in build_coco()
Borda Apr 9, 2026
c6cbd0c
fix: add type: ignore[import-not-found] to kornia imports in module_d…
Borda Apr 9, 2026
2e658a7
docs: document gpu_postprocess in make_coco_transforms* top-level doc…
Borda Apr 9, 2026
c5ec7dc
lint: modernise typing and fix F401 noqa annotations
Borda Apr 9, 2026
300e270
fix(tests): update torch.cuda.is_available patch target in test_coco.py
Borda Apr 9, 2026
19a6ee2
fix: build_coco now validates 'gpu' backend via _resolve_runtime_augm…
Borda Apr 9, 2026
77bea4f
Update src/rfdetr/training/module_data.py
Borda Apr 9, 2026
fb5a618
fix: validate augmentation_backend and fix _make_affine translate
Borda Apr 9, 2026
57925d2
fix(test): correct auto+no-CUDA patch target in test_yolo
Borda Apr 9, 2026
956b71e
Merge branch 'develop' into aug/kornia
Borda Apr 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- `augmentation_backend` field on `TrainConfig` (`"cpu"` / `"auto"` / `"gpu"`): opt-in GPU-side augmentation via [Kornia](https://kornia.readthedocs.io) applied in `RFDETRDataModule.on_after_batch_transfer` after the batch is resident on the GPU. CPU path is unchanged and remains the default. Install with `pip install 'rfdetr[kornia]'`. Phase 1 supports detection only; segmentation mask support is planned for Phase 2.
- `BuilderArgs` — a `@runtime_checkable` `typing.Protocol` documenting the minimum attribute set consumed by `build_model()`, `build_backbone()`, `build_transformer()`, and `build_criterion_and_postprocessors()`. Enables static type-checker support for custom builder integrations. Exported from `rfdetr.models`. (#841)
- `build_model_from_config(model_config, train_config=None, defaults=MODEL_DEFAULTS)` — config-native alternative to `build_model(build_namespace(mc, tc))`; accepts Pydantic config objects directly and constructs the internal namespace automatically. Exported from `rfdetr.models`. (#845)
- `build_criterion_from_config(model_config, train_config, defaults=MODEL_DEFAULTS)` — config-native alternative to `build_criterion_and_postprocessors(build_namespace(mc, tc))`; returns a `(SetCriterion, PostProcess)` tuple. Exported from `rfdetr.models`. (#845)
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ trt = [
"tensorrt>=8.6.1",
"polygraphy",
]
kornia = [
"kornia>=0.7,<1", # GPU-side augmentation via on_after_batch_transfer
]
loggers = [
"tensorboard>=2.13.0",
"protobuf>=3.20.0,<4.0.0", # Pins protobuf below 4.x to avoid TensorBoard descriptor crash with protobuf>=4 (see #844)
Expand Down Expand Up @@ -214,6 +217,7 @@ overrides = [
"requests", "requests.*",
"seaborn", "seaborn.*",
"tqdm", "tqdm.*",
"kornia", "kornia.*",
], ignore_missing_imports = true },
# Modules with pre-existing type errors — ignored until incrementally fixed.
{ module = [
Expand Down
1 change: 1 addition & 0 deletions src/rfdetr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ class TrainConfig(BaseModel):
eval_interval: int = 1
log_per_class_metrics: bool = True
aug_config: Optional[Dict[str, Any]] = None
augmentation_backend: Literal["cpu", "auto", "gpu"] = "cpu"

@model_validator(mode="after")
def _warn_deprecated_train_config_fields(self) -> "TrainConfig":
Expand Down
21 changes: 21 additions & 0 deletions src/rfdetr/datasets/aug_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,27 @@
"YourCustomTransform", # Add here
}
```
## Kornia GPU Backend
When ``augmentation_backend="auto"`` or ``"gpu"`` is set in ``TrainConfig``, augmentations
run on the GPU via Kornia instead of Albumentations.
**Supported transforms** (all presets):
| Preset key | Kornia equivalent | Notes |
|---|---|---|
| ``HorizontalFlip`` | ``K.RandomHorizontalFlip`` | Direct |
| ``VerticalFlip`` | ``K.RandomVerticalFlip`` | Direct |
| ``Rotate`` | ``K.RandomRotation`` | ``limit`` may be scalar or tuple |
| ``Affine`` | ``K.RandomAffine`` | ``translate_percent`` treated as fraction |
| ``ColorJitter`` | ``K.ColorJiggle`` | Same multiplicative semantics |
| ``RandomBrightnessContrast`` | ``K.ColorJiggle`` | ``brightness_limit`` / ``contrast_limit`` direct |
| ``GaussianBlur`` | ``K.RandomGaussianBlur`` | ``blur_limit`` rounded up to odd; ``sigma=(0.1, 2.0)`` |
| ``GaussNoise`` | ``K.RandomGaussianNoise`` | Upper bound of ``std_range`` used as fixed std |
**Phase 1 limitation**: Segmentation models (``segmentation_head=True``) skip GPU augmentation;
CPU Albumentations are used instead. Mask support is planned for Phase 2.
"""

# ---------------------------------------------------------------------------
Expand Down
120 changes: 94 additions & 26 deletions src/rfdetr/datasets/coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"""

from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any

import torch
import torch.utils.data
Expand All @@ -44,7 +44,7 @@ def compute_multi_scale_scales(
expanded_scales: bool = False,
patch_size: int = 16,
num_windows: int = 4,
) -> List[int]:
) -> list[int]:
# round to the nearest multiple of 4*patch_size to enable both patching and windowing
base_num_patches_per_window = resolution // (patch_size * num_windows)
offsets = [-3, -2, -1, 0, 1, 2, 3, 4] if not expanded_scales else [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
Expand Down Expand Up @@ -74,7 +74,7 @@ def _is_rle(segmentation: Any) -> bool:
return isinstance(segmentation, dict) and "counts" in segmentation and "size" in segmentation


def convert_coco_poly_to_mask(segmentations: List[Any], height: int, width: int) -> torch.Tensor:
def convert_coco_poly_to_mask(segmentations: list[Any], height: int, width: int) -> torch.Tensor:
"""Convert COCO segmentation annotations to a binary mask tensor of shape ``[N, H, W]``.

Supports both polygon and RLE (Run-Length Encoding) annotation formats.
Expand Down Expand Up @@ -174,13 +174,13 @@ class CocoDetection(torchvision.datasets.CocoDetection):

def __init__(
self,
img_folder: Union[str, Path],
ann_file: Union[str, Path],
transforms: Optional[Any],
img_folder: str | Path,
ann_file: str | Path,
transforms: Any | None,
include_masks: bool = False,
remap_category_ids: bool = False,
) -> None:
super(CocoDetection, self).__init__(img_folder, ann_file)
super().__init__(img_folder, ann_file)
self._transforms = transforms
self.include_masks = include_masks
if remap_category_ids:
Expand All @@ -189,14 +189,14 @@ def __init__(
# Reverse mapping from contiguous label indices back to COCO category_id
self.label2cat = {label: cat_id for cat_id, label in self.cat2label.items()}
# Expose label-to-category mapping on the underlying COCO API object for evaluators
setattr(self.coco, "label2cat", self.label2cat)
self.coco.label2cat = self.label2cat
else:
self.cat2label = None
self.label2cat = None
self.prepare = ConvertCoco(include_masks=include_masks, cat2label=self.cat2label)

def __getitem__(self, idx: int) -> Tuple[Any, Any]:
img, target = super(CocoDetection, self).__getitem__(idx)
def __getitem__(self, idx: int) -> tuple[Any, Any]:
img, target = super().__getitem__(idx)
image_id = self.ids[idx]
target = {"image_id": image_id, "annotations": target}
img, target = self.prepare(img, target)
Expand All @@ -207,7 +207,7 @@ def __getitem__(self, idx: int) -> Tuple[Any, Any]:
return img, target


class ConvertCoco(object):
class ConvertCoco:
"""Convert a raw COCO annotation dict into model-ready tensors.

Accepts the ``(image, target)`` pair produced by
Expand Down Expand Up @@ -237,11 +237,11 @@ class ConvertCoco(object):
that labels stay within the model's output range.
"""

def __init__(self, include_masks: bool = False, cat2label: Optional[Dict[int, int]] = None) -> None:
def __init__(self, include_masks: bool = False, cat2label: dict[int, int] | None = None) -> None:
self.include_masks = include_masks
self.cat2label = cat2label

def __call__(self, image: Image.Image, target: Dict[str, Any]) -> Tuple[Image.Image, Dict[str, Any]]:
def __call__(self, image: Image.Image, target: dict[str, Any]) -> tuple[Image.Image, dict[str, Any]]:
w, h = image.size

image_id = target["image_id"]
Expand All @@ -258,7 +258,7 @@ def __call__(self, image: Image.Image, target: Dict[str, Any]) -> Tuple[Image.Im
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)

classes: List[int] = []
classes: list[int] = []
for obj in anno:
category_id = obj["category_id"]
if getattr(self, "cat2label", None) is not None:
Expand All @@ -283,7 +283,7 @@ def __call__(self, image: Image.Image, target: Dict[str, Any]) -> Tuple[Image.Im

# for conversion to coco api
area = torch.tensor([obj["area"] for obj in anno])
iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
iscrowd = torch.tensor([obj.get("iscrowd", 0) for obj in anno])
target["area"] = area[keep]
target["iscrowd"] = iscrowd[keep]

Expand All @@ -308,11 +308,11 @@ def __call__(self, image: Image.Image, target: Dict[str, Any]) -> Tuple[Image.Im


def _build_train_resize_config(
scales: List[int],
scales: list[int],
*,
square: bool,
max_size: Optional[int] = None,
) -> List[Dict[str, Any]]:
max_size: int | None = None,
) -> list[dict[str, Any]]:
"""Build the training resize pipeline as an Albumentations config list.

Expresses the ``RandomSelect(resize_a, Compose([resize_b1, crop, resize_b2]))``
Expand All @@ -337,12 +337,12 @@ def _build_train_resize_config(
A single-element list containing a ``OneOf`` config entry.
"""
if square:
option_a: Dict[str, Any] = {
option_a: dict[str, Any] = {
"OneOf": {
"transforms": [{"Resize": {"height": s, "width": s}} for s in scales],
}
}
option_b: Dict[str, Any] = {
option_b: dict[str, Any] = {
"Sequential": {
"transforms": [
{"SmallestMaxSize": {"max_size": [400, 500, 600]}},
Expand Down Expand Up @@ -391,7 +391,8 @@ def make_coco_transforms(
skip_random_resize: bool = False,
patch_size: int = 16,
num_windows: int = 4,
aug_config: Optional[Dict[str, Dict[str, Any]]] = None,
aug_config: dict[str, dict[str, Any]] | None = None,
gpu_postprocess: bool = False,
) -> Compose:
"""Build the standard COCO transform pipeline for a given dataset split.

Expand All @@ -405,6 +406,11 @@ def make_coco_transforms(
normalisation. For ``"val"``, ``"test"``, and ``"val_speed"`` only resize and
normalisation are applied — no augmentation.

When *gpu_postprocess* is ``True``, both the Albumentations augmentation
wrappers and the ``Normalize`` step are omitted from the ``"train"`` pipeline.
The ``RFDETRDataModule`` then applies augmentation and normalization on the
device in ``on_after_batch_transfer`` instead.

Args:
image_set: Dataset split identifier — ``"train"``, ``"val"``, ``"test"``,
or ``"val_speed"``.
Expand All @@ -425,6 +431,10 @@ def make_coco_transforms(
:class:`~rfdetr.datasets.transforms.AlbumentationsWrapper`. Falls back
to the default :data:`~rfdetr.datasets.aug_config.AUG_CONFIG` when
``None``.
gpu_postprocess: When ``True``, skip Albumentations augmentation wrappers and
``Normalize`` from the CPU pipeline. The ``RFDETRDataModule`` then applies
both augmentation and normalization on the GPU in
``on_after_batch_transfer``. Has no effect on val/test splits.

Returns:
A :class:`torchvision.transforms.v2.Compose` pipeline ready to be passed
Expand All @@ -450,8 +460,14 @@ def make_coco_transforms(
resize_wrappers = AlbumentationsWrapper.from_config(
_build_train_resize_config(scales, square=False, max_size=1333)
)
aug_wrappers = AlbumentationsWrapper.from_config(resolved_aug_config)
return Compose([*resize_wrappers, *aug_wrappers, to_image, to_float, normalize])
pipeline = [*resize_wrappers]
if not gpu_postprocess:
aug_wrappers = AlbumentationsWrapper.from_config(resolved_aug_config)
pipeline += [*aug_wrappers]
pipeline += [to_image, to_float]
if not gpu_postprocess:
pipeline += [normalize]
return Compose(pipeline)

if image_set in ("val", "test"):
resize_wrappers = AlbumentationsWrapper.from_config(
Expand All @@ -476,7 +492,8 @@ def make_coco_transforms_square_div_64(
skip_random_resize: bool = False,
patch_size: int = 16,
num_windows: int = 4,
aug_config: Optional[Dict[str, Dict[str, Any]]] = None,
aug_config: dict[str, dict[str, Any]] | None = None,
gpu_postprocess: bool = False,
) -> Compose:
"""
Create COCO transforms with square resizing where the output size is divisible by 64.
Expand All @@ -486,6 +503,11 @@ def make_coco_transforms_square_div_64(
divisible by 64. It supports multi-scale training and optional random resizing and
cropping for the training split.

When *gpu_postprocess* is ``True``, both the Albumentations augmentation
wrappers and the ``Normalize`` step are omitted from the ``"train"`` pipeline.
The ``RFDETRDataModule`` then applies augmentation and normalization on the
device in ``on_after_batch_transfer`` instead.

Args:
image_set: Dataset split identifier. Expected values are "train", "val",
"test", or "val_speed". Each split uses a slightly different transform
Expand All @@ -506,6 +528,10 @@ def make_coco_transforms_square_div_64(
aug_config: Augmentation configuration dictionary compatible with
:class:`~rfdetr.datasets.transforms.AlbumentationsWrapper`. If ``None``,
the default :data:`~rfdetr.datasets.aug_config.AUG_CONFIG` is used.
gpu_postprocess: When ``True``, skip Albumentations augmentation wrappers and
``Normalize`` from the CPU pipeline. The ``RFDETRDataModule`` then applies
both augmentation and normalization on the GPU in
``on_after_batch_transfer``. Has no effect on val/test splits.

Returns:
A ``Compose`` object containing the composed image transforms appropriate
Expand All @@ -526,8 +552,14 @@ def make_coco_transforms_square_div_64(
if image_set == "train":
resolved_aug_config = aug_config if aug_config is not None else AUG_CONFIG
resize_wrappers = AlbumentationsWrapper.from_config(_build_train_resize_config(scales, square=True))
aug_wrappers = AlbumentationsWrapper.from_config(resolved_aug_config)
return Compose([*resize_wrappers, *aug_wrappers, to_image, to_float, normalize])
pipeline = [*resize_wrappers]
if not gpu_postprocess:
aug_wrappers = AlbumentationsWrapper.from_config(resolved_aug_config)
pipeline += [*aug_wrappers]
pipeline += [to_image, to_float]
if not gpu_postprocess:
pipeline += [normalize]
return Compose(pipeline)

if image_set in ("val", "test", "val_speed"):
resize_wrappers = AlbumentationsWrapper.from_config([{"Resize": {"height": resolution, "width": resolution}}])
Expand All @@ -554,6 +586,22 @@ def build_coco(image_set: str, args: Any, resolution: int) -> CocoDetection:
square_resize_div_64 = getattr(args, "square_resize_div_64", False)
include_masks = getattr(args, "segmentation_head", False)
aug_config = getattr(args, "aug_config", None)
augmentation_backend = getattr(args, "augmentation_backend", "cpu")
resolved_augmentation_backend = augmentation_backend
if include_masks and augmentation_backend != "cpu":
logger.warning(
"Segmentation training does not currently support GPU postprocess transforms; "
"forcing augmentation_backend='cpu' to retain CPU transforms and normalization."
)
resolved_augmentation_backend = "cpu"
if resolved_augmentation_backend != "cpu":
resolved_augmentation_backend = _resolve_runtime_augmentation_backend(resolved_augmentation_backend)
if resolved_augmentation_backend == "cpu":
logger.warning(
"augmentation_backend='auto' resolved to 'cpu' because CUDA or kornia is unavailable; "
"disabling GPU postprocess transforms and retaining CPU normalization."
)
gpu_postprocess = resolved_augmentation_backend != "cpu" and not include_masks

if square_resize_div_64:
logger.info(f"Building COCO {image_set} dataset with square resize at resolution {resolution}")
Expand All @@ -569,6 +617,7 @@ def build_coco(image_set: str, args: Any, resolution: int) -> CocoDetection:
patch_size=args.patch_size,
num_windows=args.num_windows,
aug_config=aug_config,
gpu_postprocess=gpu_postprocess,
),
include_masks=include_masks,
)
Expand All @@ -586,12 +635,27 @@ def build_coco(image_set: str, args: Any, resolution: int) -> CocoDetection:
patch_size=args.patch_size,
num_windows=args.num_windows,
aug_config=aug_config,
gpu_postprocess=gpu_postprocess,
),
include_masks=include_masks,
)
return dataset


def _resolve_runtime_augmentation_backend(backend: str) -> str:
    """Map an ``augmentation_backend`` setting to the backend actually used.

    This is a thin indirection around
    :func:`rfdetr.datasets.kornia_transforms.resolve_augmentation_backend`,
    retained so existing callers (e.g. in ``yolo.py``) keep working.

    An ``"auto"`` value resolves to ``"gpu"`` only when both CUDA and Kornia
    are available, and to ``"cpu"`` otherwise; explicit ``"cpu"`` / ``"gpu"``
    values are passed through unchanged.
    """
    # Imported lazily so that importing this module never requires kornia.
    from rfdetr.datasets.kornia_transforms import resolve_augmentation_backend

    return resolve_augmentation_backend(backend)


def build_roboflow_from_coco(image_set: str, args: Any, resolution: int) -> CocoDetection:
"""Build a Roboflow COCO-format dataset.

Expand All @@ -618,6 +682,8 @@ def build_roboflow_from_coco(image_set: str, args: Any, resolution: int) -> Coco
patch_size = getattr(args, "patch_size", 16)
num_windows = getattr(args, "num_windows", 4)
aug_config = getattr(args, "aug_config", None)
resolved_augmentation_backend = _resolve_runtime_augmentation_backend(getattr(args, "augmentation_backend", "cpu"))
gpu_postprocess = resolved_augmentation_backend != "cpu" and not include_masks

if square_resize_div_64:
logger.info(f"Building Roboflow {image_set} dataset with square resize at resolution {resolution}")
Expand All @@ -633,6 +699,7 @@ def build_roboflow_from_coco(image_set: str, args: Any, resolution: int) -> Coco
patch_size=patch_size,
num_windows=num_windows,
aug_config=aug_config,
gpu_postprocess=gpu_postprocess,
),
include_masks=include_masks,
remap_category_ids=True,
Expand All @@ -651,6 +718,7 @@ def build_roboflow_from_coco(image_set: str, args: Any, resolution: int) -> Coco
patch_size=patch_size,
num_windows=num_windows,
aug_config=aug_config,
gpu_postprocess=gpu_postprocess,
),
include_masks=include_masks,
remap_category_ids=True,
Expand Down
Loading
Loading