127 changes: 113 additions & 14 deletions rfdetr/config.py
@@ -4,14 +4,39 @@
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------

from pydantic import BaseModel, field_validator, model_validator, Field
from pydantic_core.core_schema import ValidationInfo # for field_validator(info)
from typing import List, Optional, Literal
import os, torch
Copilot AI Feb 6, 2026

The import statement uses a comma to separate imports on the same line ('import os, torch'), which is inconsistent with PEP 8 style guidelines that recommend separate import statements. Consider splitting into 'import os' and 'import torch' on separate lines.

Suggested change
import os, torch
import os
import torch


from pydantic import BaseModel
from typing import List, Optional, Literal, Type
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# centralize all supported encoder names (add dinov3).
EncoderName = Literal[
"dinov2_windowed_small",
"dinov2_windowed_base",
"dinov3_small",
"dinov3_base",
"dinov3_large",
]

def _encoder_default():
"""Default encoder name for the model config."""
# default to v2 unless explicitly overridden by env
val = os.getenv("RFD_ENCODER", "").strip() or "dinov2_windowed_small"

# guardrail: only accept known names
allowed = {
"dinov2_windowed_small","dinov2_windowed_base",
"dinov3_small","dinov3_base","dinov3_large"
Comment on lines +30 to +31
Copilot AI Feb 6, 2026

There's inconsistent spacing in the dictionary definition - missing space after comma on line 30 between "dinov2_windowed_small","dinov2_windowed_base". Add a space after the comma for consistency with Python style conventions.

Suggested change
"dinov2_windowed_small","dinov2_windowed_base",
"dinov3_small","dinov3_base","dinov3_large"
"dinov2_windowed_small", "dinov2_windowed_base",
"dinov3_small", "dinov3_base", "dinov3_large"

}
return val if val in allowed else "dinov2_windowed_small"
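
A minimal usage sketch of the env override (an illustration, not part of the diff): RFD_ENCODER has to be set before rfdetr.config is imported, because _encoder_default() runs while the class body is evaluated.

import os
os.environ["RFD_ENCODER"] = "dinov3_base"  # must be set before the import below

from rfdetr.config import RFDETRBaseConfig
# pydantic v2 exposes per-field defaults via model_fields
print(RFDETRBaseConfig.model_fields["encoder"].default)  # "dinov3_base"; unknown names fall back to "dinov2_windowed_small"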

class ModelConfig(BaseModel):
encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"]
"""Base configuration for RF-DETR models."""
# WAS: only dinov2_windowed_*; NOW: include dinov3_* as drop-in options
encoder: EncoderName = _encoder_default()

out_feature_indexes: List[int]
dec_layers: int
two_stage: bool = True
@@ -33,39 +58,109 @@ class ModelConfig(BaseModel):
group_detr: int = 13
gradient_checkpointing: bool = False
positional_encoding_size: int
# used only when encoder startswith("dinov3")
dinov3_repo_dir: Optional[str] = None # e.g., r"D:\repos\dinov3"
dinov3_weights_path: Optional[str] = None # e.g., r"C:\models\dinov3-vitb16.pth"
dinov3_hf_token: Optional[str] = None # or rely on HUGGINGFACE_HUB_TOKEN
dinov3_prefer_hf: bool = True # try HF first, then hub fallback

# force /16 for v3
@field_validator("patch_size", mode="after")
def _coerce_patch_for_dinov3(cls, v, info: ValidationInfo):
Copilot AI Feb 6, 2026

Normal methods should have 'self', rather than 'cls', as their first parameter.

"""Ensure patch size is 16 for DINOv3 encoders."""
enc = str(info.data.get("encoder", ""))
return 16 if enc.startswith("dinov3") else v
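
For reference on the repeated cls-vs-self note above: pydantic v2 treats @field_validator methods as classmethods, so cls is the conventional first parameter. A minimal, self-contained sketch of that behavior (not code from this PR):

from pydantic import BaseModel, field_validator

class Demo(BaseModel):
    patch_size: int = 14

    @field_validator("patch_size", mode="after")
    @classmethod
    def _clamp(cls, v: int) -> int:
        # cls is the model class (Demo), not an instance
        return max(1, v)

print(Demo(patch_size=0).patch_size)  # -> 1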

# keep pos-encoding grid consistent with resolution / patch
@field_validator("positional_encoding_size", mode="after")
def _sync_pos_enc_with_resolution(cls, v, info: ValidationInfo):
Copilot AI Feb 6, 2026

Normal methods should have 'self', rather than 'cls', as their first parameter.

"""Sync positional encoding size with resolution and patch size."""
values = info.data or {}
res, ps = values.get("resolution"), values.get("patch_size")
return max(1, res // ps) if (res and ps) else v

# env fallbacks for local repo/weights when *not* preferring HF
@field_validator("dinov3_repo_dir", "dinov3_weights_path", mode="after")
def _fallback_to_env(cls, v, info: ValidationInfo):
Copilot AI Feb 6, 2026

Normal methods should have 'self', rather than 'cls', as their first parameter.

"""Fallback to environment variables if not set."""
values = info.data or {}
if (not v) and str(values.get("encoder","")).startswith("dinov3") and not values.get("dinov3_prefer_hf", True):
env_map = {"dinov3_repo_dir": "DINOV3_REPO", "dinov3_weights_path": "DINOV3_WEIGHTS"}
env_key = env_map[info.field_name]
Copilot AI Feb 6, 2026

The field_validator for 'dinov3_repo_dir' and 'dinov3_weights_path' at line 89 accesses info.field_name, but this may fail if the field_name attribute doesn't match the expected keys in the env_map dictionary. Consider adding error handling or verification that the field_name is one of the expected values.

Suggested change
env_key = env_map[info.field_name]
field_name = getattr(info, "field_name", None)
env_key = env_map.get(field_name)
if not env_key:
return v

return os.getenv(env_key, v)
return v

# neutralize windowing for v3 (avoid accidental asserts downstream)
@field_validator("num_windows", mode="after")
def _neutralize_windows_for_dinov3(cls, v, info: ValidationInfo):
Copilot AI Feb 6, 2026

Normal methods should have 'self', rather than 'cls', as their first parameter.

"""Neutralize windowing for DINOv3 encoders."""
enc = str((info.data or {}).get("encoder",""))
return 1 if enc.startswith("dinov3") else v

# auto-fit out_feature_indexes to avoid projector shape mismatches
@field_validator("out_feature_indexes", mode="after")
def _coerce_out_feats_for_backbone(cls, v, info: ValidationInfo):
Copilot AI Feb 6, 2026

Normal methods should have 'self', rather than 'cls', as their first parameter.

"""Ensure out_feature_indexes are compatible with the encoder."""
enc = str((info.data or {}).get("encoder",""))
if enc.startswith("dinov3"):
# DINOv3 path: default to fewer, stable high-level features
return v if len(v) in (2,) else [8, 11]
Copilot AI Feb 6, 2026

The validator checks if len(v) is in the tuple (2,), but using a single-element tuple is unusual. Either use len(v) == 2 for clarity, or if multiple lengths are intended to be valid in the future, document why only 2 is acceptable.

Suggested change
return v if len(v) in (2,) else [8, 11]
return v if len(v) == 2 else [8, 11]

return v

# Final safety net: once the whole model is built, normalize settings for DINOv3.
@model_validator(mode="after")
def _final_autofix_for_dinov3(self):
"""Final adjustments after model construction."""
enc = str(self.encoder)
if enc.startswith("dinov3"):
# enforce /16 patch + matching pos-enc grid
self.patch_size = 16
if self.resolution:
self.positional_encoding_size = max(1, self.resolution // self.patch_size)
# windowing is a no-op for v3
self.num_windows = 1
# most important: use 2 high-level features to match projector weights across v2/v3
if len(self.out_feature_indexes) != 2:
self.out_feature_indexes = [8, 11]
return self
Comment on lines +111 to +125
Copilot AI Feb 6, 2026

The model_validator modifies the model's attributes directly (self.patch_size, self.positional_encoding_size, etc.). While Pydantic v2 allows this in 'after' mode validators, be aware that this can make the config initialization behavior less predictable. Consider documenting this auto-correction behavior prominently or providing a way to disable it for users who want explicit control.

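
One possible shape for such an opt-out, sketched on a stripped-down model (the autofix_dinov3 flag is hypothetical and not part of this PR):

from typing import List, Optional
from pydantic import BaseModel, model_validator

class DemoConfig(BaseModel):
    encoder: str = "dinov3_base"
    patch_size: int = 14
    resolution: Optional[int] = 512
    num_windows: int = 4
    positional_encoding_size: int = 36
    out_feature_indexes: List[int] = [2, 4, 5, 9]
    autofix_dinov3: bool = True  # hypothetical switch for users who want explicit control

    @model_validator(mode="after")
    def _final_autofix_for_dinov3(self):
        if not self.autofix_dinov3 or not self.encoder.startswith("dinov3"):
            return self
        self.patch_size = 16
        if self.resolution:
            self.positional_encoding_size = max(1, self.resolution // self.patch_size)
        self.num_windows = 1
        if len(self.out_feature_indexes) != 2:
            self.out_feature_indexes = [8, 11]
        return self

print(DemoConfig().out_feature_indexes)                      # [8, 11]
print(DemoConfig(autofix_dinov3=False).out_feature_indexes)  # [2, 4, 5, 9]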

class RFDETRBaseConfig(ModelConfig):
"""
The configuration for an RF-DETR Base model.
"""
encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"] = "dinov2_windowed_small"
# Allow choosing dinov3_* without changing call sites
encoder: EncoderName = _encoder_default()
print("Using RFDETRBaseConfig with encoder:", encoder)
Copilot AI Feb 6, 2026

The print statement at line 133 will execute at import time (when the class is defined), not when an instance is created. This will print the default encoder value every time the module is imported, which is unexpected behavior. Consider moving this to init or removing it entirely if it's just for debugging.

Suggested change
print("Using RFDETRBaseConfig with encoder:", encoder)

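
If a per-instance log line is actually wanted, one hedged alternative (an assumption, not something this PR proposes) is pydantic v2's model_post_init hook, which runs at construction time rather than at import time:

import logging
from pydantic import BaseModel

logger = logging.getLogger(__name__)

class DemoBaseConfig(BaseModel):
    encoder: str = "dinov2_windowed_small"

    def model_post_init(self, context) -> None:
        # called once per instance, after validation
        logger.info("Using DemoBaseConfig with encoder: %s", self.encoder)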
hidden_dim: int = 256
patch_size: int = 14
num_windows: int = 4
patch_size: int = 14 # will auto-become 16 if encoder startswith("dinov3")
num_windows: int = 4 # ignored by DINOv3 branch
dec_layers: int = 3
sa_nheads: int = 8
ca_nheads: int = 16
dec_n_points: int = 2
num_queries: int = 300
num_select: int = 300
projector_scale: List[Literal["P3", "P4", "P5"]] = ["P4"]
out_feature_indexes: List[int] = [2, 5, 8, 11]
out_feature_indexes: List[int] = [2, 4, 5, 9]
Copilot AI Feb 6, 2026

The default out_feature_indexes is changed from [2, 5, 8, 11] to [2, 4, 5, 9], but there's a validator at lines 101-108 that forces it to [8, 11] when using dinov3 encoders. This creates inconsistency and makes it unclear what the actual indexes will be. Consider documenting why these specific indexes were chosen and whether the default should be different for v2 vs v3.

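
To make the interplay concrete, a sketch of the intended outcome (assuming the collapsed parts of the config provide defaults for any remaining required fields): dinov2 encoders keep the four-element class default, while dinov3 encoders are coerced to the two-element form by the validators above.

cfg_v2 = RFDETRBaseConfig(encoder="dinov2_windowed_small")
cfg_v3 = RFDETRBaseConfig(encoder="dinov3_base")
print(cfg_v2.out_feature_indexes)  # [2, 4, 5, 9] (class default kept)
print(cfg_v3.out_feature_indexes)  # [8, 11] (forced by _coerce_out_feats_for_backbone / _final_autofix_for_dinov3)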
pretrain_weights: Optional[str] = "rf-detr-base.pth"
resolution: int = 560
positional_encoding_size: int = 37
#resolution: int = 504 # 560//16=35 when dinov3_* is used
resolution: int = 512 # 512//16=32 → pos grid auto=32 for both v2/v3
Comment on lines +146 to +147
Copilot AI Feb 6, 2026

The commented-out code should be removed. If documentation about the resolution change is needed, add a clear comment explaining why 512 is used instead of 560/504/576.

Suggested change
#resolution: int = 504 # 560//16=35 when dinov3_* is used
resolution: int = 512 # 512//16=32 → pos grid auto=32 for both v2/v3
resolution: int = 512 # unified 512px input; 512//16=32 so pos grid=32 for both v2/v3, replacing earlier 504/560/576 variants

positional_encoding_size: int = 36 # will auto-sync to resolution//patch_size


class RFDETRLargeConfig(RFDETRBaseConfig):
"""
The configuration for an RF-DETR Large model.
"""
encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"] = "dinov2_windowed_base"
encoder: EncoderName = "dinov2_windowed_base"
hidden_dim: int = 384
sa_nheads: int = 12
ca_nheads: int = 24
dec_n_points: int = 4
projector_scale: List[Literal["P3", "P4", "P5"]] = ["P3", "P5"]
pretrain_weights: Optional[str] = "rf-detr-large.pth"


class RFDETRNanoConfig(RFDETRBaseConfig):
"""
The configuration for an RF-DETR Nano model.
@@ -74,10 +169,11 @@ class RFDETRNanoConfig(RFDETRBaseConfig):
num_windows: int = 2
dec_layers: int = 2
patch_size: int = 16
resolution: int = 384
resolution: int = 384 # 384//16=24 → pos grid auto=24 for both v2/v3
positional_encoding_size: int = 24
pretrain_weights: Optional[str] = "rf-detr-nano.pth"


class RFDETRSmallConfig(RFDETRBaseConfig):
"""
The configuration for an RF-DETR Small model.
@@ -86,10 +182,11 @@ class RFDETRSmallConfig(RFDETRBaseConfig):
num_windows: int = 2
dec_layers: int = 3
patch_size: int = 16
resolution: int = 512
resolution: int = 512 # 512//16=32 → pos grid auto=32
positional_encoding_size: int = 32
pretrain_weights: Optional[str] = "rf-detr-small.pth"


class RFDETRMediumConfig(RFDETRBaseConfig):
"""
The configuration for an RF-DETR Medium model.
@@ -98,10 +195,12 @@ class RFDETRMediumConfig(RFDETRBaseConfig):
num_windows: int = 2
dec_layers: int = 4
patch_size: int = 16
resolution: int = 576
#resolution: int = 504 # 576//16=36 → pos grid auto=36
Copilot AI Feb 6, 2026

The commented-out code should be removed. If documentation about the resolution change is needed, add a clear comment explaining why 512 is used instead of 504/576.

Suggested change
#resolution: int = 504 # 576//16=36 → pos grid auto=36
# Use 512 instead of earlier 504/576 choices; keeps a 16-pixel patch (512//16=32)
# while retaining a positional grid size of 36 (from 576//16=36) for compatibility.

resolution: int = 512
positional_encoding_size: int = 36
pretrain_weights: Optional[str] = "rf-detr-medium.pth"


class TrainConfig(BaseModel):
lr: float = 1e-4
lr_encoder: float = 1.5e-4
43 changes: 25 additions & 18 deletions rfdetr/engine.py
@@ -21,7 +21,7 @@
import sys
from typing import Iterable
import random

from contextlib import nullcontext
import torch
import torch.nn.functional as F

@@ -39,12 +39,16 @@
from rfdetr.util.misc import NestedTensor
import numpy as np


def get_autocast_args(args):
"""Return autocast arguments based on the DEPRECATED_AMP flag and args."""
use_cuda = torch.cuda.is_available()
enabled = bool(getattr(args, "amp", False) and use_cuda)
if DEPRECATED_AMP:
return {'enabled': args.amp, 'dtype': torch.bfloat16}
return {"enabled": enabled, "dtype": torch.bfloat16}
else:
return {'device_type': 'cuda', 'enabled': args.amp, 'dtype': torch.bfloat16}

# only use CUDA autocast when CUDA exists
return {"device_type": "cuda", "enabled": enabled, "dtype": torch.bfloat16}
Comment on lines +49 to +50
Copilot AI Feb 6, 2026

When CUDA is not available, the autocast function will receive device_type="cuda" which will cause an error. The enabled flag correctly checks for CUDA availability, but the device_type parameter should also be conditional. Consider using device_type based on the actual available device, or checking if the device is CUDA before applying autocast. For example: return {"device_type": "cuda" if use_cuda else "cpu", "enabled": enabled, "dtype": torch.bfloat16}

Suggested change
# only use CUDA autocast when CUDA exists
return {"device_type": "cuda", "enabled": enabled, "dtype": torch.bfloat16}
# use CUDA autocast when CUDA exists, otherwise fall back to CPU
return {
"device_type": "cuda" if use_cuda else "cpu",
"enabled": enabled,
"dtype": torch.bfloat16,
}


def train_one_epoch(
model: torch.nn.Module,
@@ -75,11 +79,11 @@ def train_one_epoch(
print("Grad accum steps: ", args.grad_accum_steps)
print("Total batch size: ", batch_size * utils.get_world_size())

# Add gradient scaler for AMP
use_amp = bool(getattr(args, "amp", False) and torch.cuda.is_available())
if DEPRECATED_AMP:
scaler = GradScaler(enabled=args.amp)
scaler = GradScaler(enabled=use_amp)
else:
scaler = GradScaler('cuda', enabled=args.amp)
scaler = GradScaler("cuda", enabled=use_amp)

optimizer.zero_grad()
assert batch_size % args.grad_accum_steps == 0
@@ -113,7 +117,9 @@
scales = compute_multi_scale_scales(args.resolution, args.expanded_scales, args.patch_size, args.num_windows)
random.seed(it)
scale = random.choice(scales)
with torch.inference_mode():
# DO NOT use inference_mode() here; it creates inference tensors
#with torch.inference_mode():
with torch.no_grad():
samples.tensors = F.interpolate(samples.tensors, size=scale, mode='bilinear', align_corners=False)
samples.mask = F.interpolate(samples.mask.unsqueeze(1).float(), size=scale, mode='nearest').squeeze(1).bool()

@@ -124,16 +130,17 @@
new_samples = NestedTensor(new_samples_tensors, samples.mask[start_idx:final_idx])
new_samples = new_samples.to(device)
new_targets = [{k: v.to(device) for k, v in t.items()} for t in targets[start_idx:final_idx]]

with autocast(**get_autocast_args(args)):
outputs = model(new_samples, new_targets)
loss_dict = criterion(outputs, new_targets)
weight_dict = criterion.weight_dict
losses = sum(
(1 / args.grad_accum_steps) * loss_dict[k] * weight_dict[k]
for k in loss_dict.keys()
if k in weight_dict
)
torch.set_grad_enabled(True) # safety
with torch.inference_mode(False):
with autocast(**get_autocast_args(args)):
outputs = model(new_samples, new_targets)
loss_dict = criterion(outputs, new_targets)
weight_dict = criterion.weight_dict
losses = sum(
(1 / args.grad_accum_steps) * loss_dict[k] * weight_dict[k]
for k in loss_dict.keys()
if k in weight_dict
)
Comment on lines +132 to +142
Copilot AI Feb 6, 2026

The indentation of the autocast block and its contents appears to be changed. While the torch.inference_mode(False) wrapper was added for safety, the call to torch.set_grad_enabled(True) at line 133 is redundant since torch.inference_mode(False) already ensures gradients are enabled. Consider removing the torch.set_grad_enabled(True) call to simplify the code.

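
A sketch of the simplification the comment suggests, keeping the inference_mode(False) guard and dropping the extra call:

# per the comment above, the explicit torch.set_grad_enabled(True) is redundant here
with torch.inference_mode(False):
    with autocast(**get_autocast_args(args)):
        outputs = model(new_samples, new_targets)
        loss_dict = criterion(outputs, new_targets)
        weight_dict = criterion.weight_dict
        losses = sum(
            (1 / args.grad_accum_steps) * loss_dict[k] * weight_dict[k]
            for k in loss_dict
            if k in weight_dict
        )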


scaler.scale(losses).backward()