Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- `augmentation_backend` field on `TrainConfig` (`"cpu"` / `"auto"` / `"gpu"`): opt-in GPU-side augmentation via [Kornia](https://kornia.readthedocs.io) applied in `RFDETRDataModule.on_after_batch_transfer` after the batch is resident on the GPU. CPU path is unchanged and remains the default. Install with `pip install 'rfdetr[kornia]'`. Phase 1 supports detection only; segmentation mask support is planned for Phase 2.
- `RFDETR.predict(shape=...)` — optional `(height, width)` tuple overrides the default inference resolution; useful for matching the resolution used when exporting the model. Both dimensions must be positive integers divisible by 14. (closes #682)
- `BuilderArgs` — a `@runtime_checkable` `typing.Protocol` documenting the minimum attribute set consumed by `build_model()`, `build_backbone()`, `build_transformer()`, and `build_criterion_and_postprocessors()`. Enables static type-checker support for custom builder integrations. Exported from `rfdetr.models`.
- `build_model_from_config(model_config, train_config=None, defaults=MODEL_DEFAULTS)` — config-native alternative to `build_model(build_namespace(mc, tc))`; accepts Pydantic config objects directly and constructs the internal namespace automatically. Exported from `rfdetr.models`.
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ trt = [
"tensorrt>=8.6.1",
"polygraphy",
]
kornia = [
"kornia>=0.7,<1", # GPU-side augmentation via on_after_batch_transfer
]
loggers = [
"tensorboard>=2.13.0",
"protobuf>=3.20.0,<4.0.0", # Pins protobuf below 4.x to avoid TensorBoard descriptor crash with protobuf>=4 (see #844)
Expand Down
1 change: 1 addition & 0 deletions src/rfdetr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ class TrainConfig(BaseModel):
eval_interval: int = 1
log_per_class_metrics: bool = True
aug_config: Optional[Dict[str, Any]] = None
augmentation_backend: Literal["cpu", "auto", "gpu"] = "cpu"

@model_validator(mode="after")
def _warn_deprecated_train_config_fields(self) -> "TrainConfig":
Expand Down
21 changes: 21 additions & 0 deletions src/rfdetr/datasets/aug_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,27 @@
"YourCustomTransform", # Add here
}
```
## Kornia GPU Backend
When ``augmentation_backend="auto"`` or ``"gpu"`` is set in ``TrainConfig``, augmentations
run on the GPU via Kornia instead of Albumentations.
**Supported transforms** (all presets):

| Preset key | Kornia equivalent | Notes |
|---|---|---|
| ``HorizontalFlip`` | ``K.RandomHorizontalFlip`` | Direct |
| ``VerticalFlip`` | ``K.RandomVerticalFlip`` | Direct |
| ``Rotate`` | ``K.RandomRotation`` | ``limit`` may be scalar or tuple |
| ``Affine`` | ``K.RandomAffine`` | ``translate_percent`` treated as fraction |
| ``ColorJitter`` | ``K.ColorJiggle`` | Same multiplicative semantics |
| ``RandomBrightnessContrast`` | ``K.ColorJiggle`` | ``brightness_limit`` / ``contrast_limit`` direct |
| ``GaussianBlur`` | ``K.RandomGaussianBlur`` | ``blur_limit`` rounded up to odd; ``sigma=(0.1, 2.0)`` |
| ``GaussNoise`` | ``K.RandomGaussianNoise`` | Upper bound of ``std_range`` used as fixed std |

**Phase 1 limitation**: Segmentation models (``segmentation_head=True``) skip GPU augmentation;
CPU Albumentations are used instead. Mask support is planned for Phase 2.
"""

# ---------------------------------------------------------------------------
Expand Down
61 changes: 57 additions & 4 deletions src/rfdetr/datasets/coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ def make_coco_transforms(
patch_size: int = 16,
num_windows: int = 4,
aug_config: Optional[Dict[str, Dict[str, Any]]] = None,
gpu_postprocess: bool = False,
) -> Compose:
"""Build the standard COCO transform pipeline for a given dataset split.

Expand Down Expand Up @@ -373,6 +374,10 @@ def make_coco_transforms(
:class:`~rfdetr.datasets.transforms.AlbumentationsWrapper`. Falls back
to the default :data:`~rfdetr.datasets.aug_config.AUG_CONFIG` when
``None``.
gpu_postprocess: When ``True``, skip Albumentations augmentation wrappers and
``Normalize`` from the CPU pipeline. The ``RFDETRDataModule`` then applies
both augmentation and normalization on the GPU in
``on_after_batch_transfer``. Has no effect on val/test splits.

Returns:
A :class:`torchvision.transforms.v2.Compose` pipeline ready to be passed
Expand All @@ -398,8 +403,14 @@ def make_coco_transforms(
resize_wrappers = AlbumentationsWrapper.from_config(
_build_train_resize_config(scales, square=False, max_size=1333)
)
aug_wrappers = AlbumentationsWrapper.from_config(resolved_aug_config)
return Compose([*resize_wrappers, *aug_wrappers, to_image, to_float, normalize])
pipeline = [*resize_wrappers]
if not gpu_postprocess:
aug_wrappers = AlbumentationsWrapper.from_config(resolved_aug_config)
pipeline += [*aug_wrappers]
pipeline += [to_image, to_float]
if not gpu_postprocess:
pipeline += [normalize]
return Compose(pipeline)

if image_set in ("val", "test"):
resize_wrappers = AlbumentationsWrapper.from_config(
Expand All @@ -425,6 +436,7 @@ def make_coco_transforms_square_div_64(
patch_size: int = 16,
num_windows: int = 4,
aug_config: Optional[Dict[str, Dict[str, Any]]] = None,
gpu_postprocess: bool = False,
) -> Compose:
"""
Create COCO transforms with square resizing where the output size is divisible by 64.
Expand Down Expand Up @@ -454,6 +466,10 @@ def make_coco_transforms_square_div_64(
aug_config: Augmentation configuration dictionary compatible with
:class:`~rfdetr.datasets.transforms.AlbumentationsWrapper`. If ``None``,
the default :data:`~rfdetr.datasets.aug_config.AUG_CONFIG` is used.
gpu_postprocess: When ``True``, skip Albumentations augmentation wrappers and
``Normalize`` from the CPU pipeline. The ``RFDETRDataModule`` then applies
both augmentation and normalization on the GPU in
``on_after_batch_transfer``. Has no effect on val/test splits.

Returns:
A ``Compose`` object containing the composed image transforms appropriate
Expand All @@ -474,8 +490,14 @@ def make_coco_transforms_square_div_64(
if image_set == "train":
resolved_aug_config = aug_config if aug_config is not None else AUG_CONFIG
resize_wrappers = AlbumentationsWrapper.from_config(_build_train_resize_config(scales, square=True))
aug_wrappers = AlbumentationsWrapper.from_config(resolved_aug_config)
return Compose([*resize_wrappers, *aug_wrappers, to_image, to_float, normalize])
pipeline = [*resize_wrappers]
if not gpu_postprocess:
aug_wrappers = AlbumentationsWrapper.from_config(resolved_aug_config)
pipeline += [*aug_wrappers]
pipeline += [to_image, to_float]
if not gpu_postprocess:
pipeline += [normalize]
return Compose(pipeline)

if image_set in ("val", "test", "val_speed"):
resize_wrappers = AlbumentationsWrapper.from_config([{"Resize": {"height": resolution, "width": resolution}}])
Expand All @@ -502,6 +524,32 @@ def build_coco(image_set: str, args: Any, resolution: int) -> CocoDetection:
square_resize_div_64 = getattr(args, "square_resize_div_64", False)
include_masks = getattr(args, "segmentation_head", False)
aug_config = getattr(args, "aug_config", None)
augmentation_backend = getattr(args, "augmentation_backend", "cpu")
resolved_augmentation_backend = augmentation_backend
if include_masks and augmentation_backend != "cpu":
logger.warning(
"Segmentation training does not currently support GPU postprocess transforms; "
"forcing augmentation_backend='cpu' to retain CPU transforms and normalization."
)
resolved_augmentation_backend = "cpu"
if hasattr(args, "augmentation_backend"):
setattr(args, "augmentation_backend", "cpu")
if resolved_augmentation_backend == "auto":
gpu_available = torch.cuda.is_available()
if gpu_available:
try:
import kornia # type: ignore[import-not-found]
except ImportError:
gpu_available = False
if not gpu_available:
logger.warning(
"augmentation_backend='auto' resolved to 'cpu' because CUDA or kornia is unavailable; "
"disabling GPU postprocess transforms and retaining CPU normalization."
)
resolved_augmentation_backend = "cpu"
if hasattr(args, "augmentation_backend"):
setattr(args, "augmentation_backend", "cpu")
gpu_postprocess = resolved_augmentation_backend != "cpu" and not include_masks

if square_resize_div_64:
logger.info(f"Building COCO {image_set} dataset with square resize at resolution {resolution}")
Expand All @@ -517,6 +565,7 @@ def build_coco(image_set: str, args: Any, resolution: int) -> CocoDetection:
patch_size=args.patch_size,
num_windows=args.num_windows,
aug_config=aug_config,
gpu_postprocess=gpu_postprocess,
),
include_masks=include_masks,
)
Expand All @@ -534,6 +583,7 @@ def build_coco(image_set: str, args: Any, resolution: int) -> CocoDetection:
patch_size=args.patch_size,
num_windows=args.num_windows,
aug_config=aug_config,
gpu_postprocess=gpu_postprocess,
),
include_masks=include_masks,
)
Expand Down Expand Up @@ -566,6 +616,7 @@ def build_roboflow_from_coco(image_set: str, args: Any, resolution: int) -> Coco
patch_size = getattr(args, "patch_size", 16)
num_windows = getattr(args, "num_windows", 4)
aug_config = getattr(args, "aug_config", None)
gpu_postprocess = getattr(args, "augmentation_backend", "cpu") != "cpu" and not include_masks

if square_resize_div_64:
logger.info(f"Building Roboflow {image_set} dataset with square resize at resolution {resolution}")
Expand All @@ -581,6 +632,7 @@ def build_roboflow_from_coco(image_set: str, args: Any, resolution: int) -> Coco
patch_size=patch_size,
num_windows=num_windows,
aug_config=aug_config,
gpu_postprocess=gpu_postprocess,
),
include_masks=include_masks,
remap_category_ids=True,
Expand All @@ -599,6 +651,7 @@ def build_roboflow_from_coco(image_set: str, args: Any, resolution: int) -> Coco
patch_size=patch_size,
num_windows=num_windows,
aug_config=aug_config,
gpu_postprocess=gpu_postprocess,
),
include_masks=include_masks,
remap_category_ids=True,
Expand Down
Loading
Loading