diff --git a/inference/cpp/onnx/CMakeLists.txt b/inference/cpp/onnx/CMakeLists.txt new file mode 100644 index 000000000..34d6df92d --- /dev/null +++ b/inference/cpp/onnx/CMakeLists.txt @@ -0,0 +1,49 @@ +cmake_minimum_required(VERSION 3.10) +project(RF_DETR_ONNX) + +# Set C++ standard +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Find OpenCV +find_package(OpenCV REQUIRED) + +# Find ONNX Runtime +find_path(ONNXRUNTIME_INCLUDE_DIRS onnxruntime/core/session/onnxruntime_cxx_api.h) +find_library(ONNXRUNTIME_LIBRARIES NAMES onnxruntime) + +if(NOT ONNXRUNTIME_INCLUDE_DIRS OR NOT ONNXRUNTIME_LIBRARIES) + message(FATAL_ERROR "ONNX Runtime not found. Set ONNXRUNTIME_DIR to the ONNX Runtime installation path.") +endif() + +# Include directories +include_directories(${OpenCV_INCLUDE_DIRS} ${ONNXRUNTIME_INCLUDE_DIRS}) + +# Source files +set(SOURCES + main.cpp + RF_DETR_ONNX.cpp +) + +# Header files +set(HEADERS + RF_DETR_ONNX.h +) + +# Create executable +add_executable(RF_DETR_ONNX ${SOURCES} ${HEADERS}) + +# Link libraries +target_link_libraries(RF_DETR_ONNX ${OpenCV_LIBS} ${ONNXRUNTIME_LIBRARIES}) + +# Platform-specific settings +if(WIN32) + add_definitions(-DWIN32_LEAN_AND_MEAN) +elseif(UNIX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") +endif() + +message(STATUS "OpenCV include: ${OpenCV_INCLUDE_DIRS}") +message(STATUS "OpenCV libs: ${OpenCV_LIBS}") +message(STATUS "ONNX Runtime include: ${ONNXRUNTIME_INCLUDE_DIRS}") +message(STATUS "ONNX Runtime libs: ${ONNXRUNTIME_LIBRARIES}") diff --git a/inference/cpp/onnx/README.md b/inference/cpp/onnx/README.md new file mode 100644 index 000000000..58f575b8c --- /dev/null +++ b/inference/cpp/onnx/README.md @@ -0,0 +1,106 @@ +# ONNX Inference in C++ + +This guide explains how to set up, build, and run the ONNX model inference demo in C++ using ONNX Runtime and OpenCV. 
+ +--- + +## 📌 Prerequisites + +Ensure you have the following installed: + +- **CMake** (3.10 or later) +- **C++ Compiler** (with C++17 support) +- **ONNX Runtime** → Install it following [this guide](https://onnxruntime.ai/docs/install/). +- **OpenCV** → Install via a package manager: + + - **Windows (vcpkg)** → `vcpkg install opencv` + - **Ubuntu** → `sudo apt update && sudo apt install libopencv-dev` + - **macOS (Homebrew)** → `brew install opencv` + +--- + +## ⚙️ Build & Run + +### 1️⃣ Clone the Repository +```sh +git clone https://github.com/roboflow/rf-detr +cd rf-detr/inference/cpp/onnx +mkdir build && cd build +``` + +### 2️⃣ Configure & Compile + +#### **🔹 Linux/macOS** +```sh +cmake .. +cmake --build . +``` + +#### **🔹 Windows (MSVC)** +```sh +cmake .. -G "Visual Studio 17 2022" +cmake --build . --config Release +``` + +### 3️⃣ Run the Model + +This C++ demo for the RF-DETR model performs real-time object detection using an ONNX model. It supports various input sources, including images, videos, and live camera streams, with optional CUDA acceleration. 
+ +### 🔹 Key Features +✅ Loads an RF-DETR model in ONNX format +✅ Supports image, video, and live camera inference +✅ Enables CPU and CUDA (GPU) execution +✅ Configurable confidence threshold for detections +✅ Outputs annotated images/videos with detected objects +✅ Uses COCO class labels for object recognition + +### 🔹 Run Examples + +#### **Image Inference** +Detect objects in a static image and save the output: +```sh +./RF_DETR_ONNX --model path/to/model.onnx --source_type image \ + --input path/to/image.jpg --output path/to/output.jpg \ + --conf 0.6 --labels path/to/coco.names +``` + +#### **Video Inference** +Process a video file and save the annotated output: +```sh +./RF_DETR_ONNX --model path/to/model.onnx --source_type video \ + --input path/to/video.mp4 --output path/to/output.mp4 \ + --conf 0.5 --use_cuda +``` + +#### **Live Camera Inference (Default ID 0)** +Run inference on the default webcam (ID 0) with GPU acceleration: +```sh +./RF_DETR_ONNX --model path/to/model.onnx --source_type camera \ + --input 0 --conf 0.55 --use_cuda +``` + +#### **Live Camera Inference (Specific Camera ID 1)** +Run inference on a specific camera (ID 1): +```sh +./RF_DETR_ONNX --model path/to/model.onnx --source_type camera \ + --input 1 --conf 0.55 +``` + +#### **Get Help & Available Options** +```sh +./RF_DETR_ONNX --help +``` + +### 🔹 Dependencies +- **OpenCV** (for image and video processing) +- **ONNX Runtime** (for model inference) +- **CUDA** (optional, for GPU acceleration) + +--- + +## 📝 Notes + +- Ensure the ONNX model and input image are accessible. +- On Windows, make sure `onnxruntime.dll` is in the same directory as `RF_DETR_ONNX.exe` or added to the `PATH`. +- Modify `main.cpp` as needed for preprocessing or output handling. 
+ diff --git a/inference/cpp/onnx/RF_DETR_ONNX.cpp b/inference/cpp/onnx/RF_DETR_ONNX.cpp new file mode 100644 index 000000000..d3c7778fd --- /dev/null +++ b/inference/cpp/onnx/RF_DETR_ONNX.cpp @@ -0,0 +1,617 @@ +#include "RF_DETR_ONNX.h" + +#include // For cv::resize, cv::cvtColor +#include // For cv::dnn::blobFromImage + +#include +#include +#include +#include +#include +#include +#include +#include // Potentially useful, e.g., for accumulate, though vectorProduct_ uses a loop +#include + +// --- Constructor Implementation --- +RF_DETR_ONNX::RF_DETR_ONNX(const std::string& modelPath, bool useCUDA, size_t deviceId, int intraOpNumThreads) + : env_(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, DEFAULT_INSTANCE_NAME.c_str()), + sessionOptions_(), + allocator_(Ort::AllocatorWithDefaultOptions()) + // MEANS and STDS initialized directly in header (as per original code) + // numClasses_ initialized with default, updated in getInputOutputInfo +{ + initializeSession(modelPath, useCUDA, deviceId, intraOpNumThreads); + getInputOutputInfo(); // Get info after session is created +} + +// --- Public Method Implementations --- + +std::vector RF_DETR_ONNX::preprocess(const cv::Mat& inputImage) { + auto start = std::chrono::high_resolution_clock::now(); + + cv::Mat processedImage; + // Ensure inputWidth_ and inputHeight_ are valid (checked in getInputOutputInfo) + if (inputWidth_ <= 0 || inputHeight_ <= 0) { + throw std::runtime_error("[ERROR] Invalid input dimensions obtained from model."); + } + cv::resize(inputImage, processedImage, cv::Size(static_cast(inputWidth_), static_cast(inputHeight_)), 0, 0, cv::INTER_LINEAR); + + cv::cvtColor(processedImage, processedImage, cv::ColorConversionCodes::COLOR_BGR2RGB); + + processedImage.convertTo(processedImage, CV_32FC3, 1.0 / 255.0); + + // Apply normalization (Subtract Mean, Divide by Standard Deviation) + // Ensure MEANS and STDS have the correct size (3) + if (MEANS.size() != 3 || STDS.size() != 3) { + throw 
std::runtime_error("[ERROR] MEANS or STDS vectors do not have size 3."); + } + cv::Mat meanMat(inputHeight_, inputWidth_, CV_32FC3, cv::Scalar(MEANS[0], MEANS[1], MEANS[2])); + cv::Mat stdMat(inputHeight_, inputWidth_, CV_32FC3, cv::Scalar(STDS[0], STDS[1], STDS[2])); + + // Perform normalization using OpenCV functions for potentially better performance/clarity + cv::subtract(processedImage, meanMat, processedImage); + cv::divide(processedImage, stdMat, processedImage); // Element-wise division + + // Create blob from image (results in NCHW layout) + // Scale factor is 1.0 because scaling and normalization are already done. + // Mean subtraction is (0,0,0) because it's already done. + // SwapRB is false because we converted BGR->RGB earlier. + // Crop is false. + cv::Mat inputBlob = cv::dnn::blobFromImage(processedImage, 1.0, cv::Size(), cv::Scalar(), false, false); + + // Copy blob data to a std::vector + // Ensure inputTensorSize_ is correctly calculated + if (inputTensorSize_ == 0) { + throw std::runtime_error("[ERROR] Input tensor size is zero. Model info might be incorrect."); + } + std::vector inputTensorValues(inputTensorSize_); + memcpy(inputTensorValues.data(), inputBlob.ptr(), inputTensorSize_ * sizeof(float)); + + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = end - start; + std::cout << "[INFO] Preprocessing time: " << duration.count() << " ms" << std::endl; + + return inputTensorValues; +} + + +std::vector RF_DETR_ONNX::infer(const std::vector& inputTensorValues) { + if (!ortSession_) { + throw std::runtime_error("[ERROR] Inference called before session initialization or after failure."); + } + if (inputTensorValues.size() != inputTensorSize_) { + throw std::runtime_error("[ERROR] Input tensor value size mismatch. 
Expected " + std::to_string(inputTensorSize_) + ", got " + std::to_string(inputTensorValues.size())); + } + if (inputNames_.empty() || outputNames_.empty()) { + throw std::runtime_error("[ERROR] Input/Output names not initialized."); + } + + auto start = std::chrono::high_resolution_clock::now(); + + std::vector inputTensors; + // Use const_cast carefully. Ensure the tensor data is not modified by the inference engine if it's not supposed to be. + // CreateCpu implies the data is on the CPU. If using CUDA EP with pinned memory, adjust accordingly. + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); + inputTensors.push_back(Ort::Value::CreateTensor( + memoryInfo, const_cast(inputTensorValues.data()), inputTensorSize_, + inputDims_.data(), inputDims_.size() + )); + + // Prepare C-style strings for ONNX Runtime API + std::vector inputNamesCStr; + inputNamesCStr.reserve(inputNames_.size()); + for (const auto& name : inputNames_) { + inputNamesCStr.push_back(name.c_str()); + } + + std::vector outputNamesCStr; + outputNamesCStr.reserve(outputNames_.size()); + for (const auto& name : outputNames_) { + outputNamesCStr.push_back(name.c_str()); + } + + std::vector outputTensors; + try { + outputTensors = ortSession_->Run( + Ort::RunOptions{ nullptr }, + inputNamesCStr.data(), + inputTensors.data(), + inputTensors.size(), // Should be 1 for this model + outputNamesCStr.data(), + outputNamesCStr.size() // Should be 2 for this model + ); + } + catch (const Ort::Exception& e) { + std::cerr << "[ERROR] ONNX Runtime inference failed: " << e.what() << std::endl; + // Consider logging more details, e.g., input shapes/types + throw; // Re-throw the exception + } + + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = end - start; + std::cout << "[INFO] Inference time: " << duration.count() << " ms" << std::endl; + + return outputTensors; +} + + +std::vector 
RF_DETR_ONNX::postprocess(const std::vector& outputTensors, int originalWidth, int originalHeight, float confThreshold) { + // Check if output tensors are valid and have expected number + if (outputTensors.size() != 2) { + throw std::runtime_error("[ERROR] Expected 2 output tensors from inference, but got " + std::to_string(outputTensors.size())); + } + if (!outputTensors[0] || !outputTensors[1]) { + throw std::runtime_error("[ERROR] One or more output tensors are invalid (null)."); + } + if (!outputTensors[0].IsTensor() || !outputTensors[1].IsTensor()) { + throw std::runtime_error("[ERROR] Outputs are not tensors."); + } + + auto start = std::chrono::high_resolution_clock::now(); + + // Basic type check + if (outputTensors[0].GetTensorTypeAndShapeInfo().GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT || + outputTensors[1].GetTensorTypeAndShapeInfo().GetElementType() != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + throw std::runtime_error("[ERROR] Output tensors are not float tensors as expected."); + } + + // Get pointers to tensor data + const float* predBoxesData = outputTensors[0].GetTensorData(); + const float* predLogitsData = outputTensors[1].GetTensorData(); + + // Get shapes + auto boxesShapeInfo = outputTensors[0].GetTensorTypeAndShapeInfo(); + auto logitsShapeInfo = outputTensors[1].GetTensorTypeAndShapeInfo(); + std::vector boxesShape = boxesShapeInfo.GetShape(); // e.g., [1, 300, 4] + std::vector logitsShape = logitsShapeInfo.GetShape(); // e.g., [1, 300, 80] + + // Validate shapes + if (boxesShape.size() != 3 || boxesShape[0] != 1 || boxesShape[2] != 4) { + throw std::runtime_error("[ERROR] Unexpected shape for boxes output. Expected [1, N, 4]."); + } + if (logitsShape.size() != 3 || logitsShape[0] != 1) { + throw std::runtime_error("[ERROR] Unexpected shape for logits output. 
Expected [1, N, num_classes]."); + } + if (boxesShape[1] != logitsShape[1]) { + throw std::runtime_error("[ERROR] Mismatch in number of queries between boxes (" + + std::to_string(boxesShape[1]) + ") and logits (" + + std::to_string(logitsShape[1]) + ") outputs."); + } + // Check if the number of classes matches the initialized value + if (numClasses_ <= 0) { + throw std::runtime_error("[ERROR] Number of classes not properly initialized."); + } + if (logitsShape[2] != numClasses_) { + std::cerr << "[WARNING] Number of classes in model output (" << logitsShape[2] + << ") differs from expected value (" << numClasses_ + << "). Using value from model output." << std::endl; + // Optionally update numClasses_ here if you trust the model output more, + // or throw an error if they must match. + // numClasses_ = logitsShape[2]; // Example: Update based on model + } + + + const int64_t numQueries = boxesShape[1]; + // Use the dynamically determined number of classes from the logits shape + const int64_t actualNumClasses = logitsShape[2]; + + // --- Score Calculation and Filtering --- + // Store scores along with their query and class index: (score, query_idx, class_idx) + std::vector> flattenedScores; + flattenedScores.reserve(static_cast(numQueries) * actualNumClasses); // Pre-allocate memory + + for (int64_t i = 0; i < numQueries; ++i) { + // Optimization: Find max score for this query first (optional) + // float max_score_for_query = -1.0f; + // int max_class_idx = -1; + + for (int64_t j = 0; j < actualNumClasses; ++j) { + // Calculate index in the flattened logits tensor + size_t logit_idx = static_cast(i) * actualNumClasses + j; + float score = sigmoid(predLogitsData[logit_idx]); + + // Optimization: If you only care about the top class per query: + // if (score > max_score_for_query) { + // max_score_for_query = score; + // max_class_idx = j; + // } + + // Store all scores above a preliminary threshold (or all scores) + // Adding a small threshold here can reduce 
sorting time later. + if (score >= confThreshold) { // Only consider scores above the final threshold + flattenedScores.emplace_back(score, static_cast(i), static_cast(j)); + } + } + // Optimization: Add only the best class for this query if desired + // if (max_class_idx != -1 && max_score_for_query >= confThreshold) { + // flattenedScores.emplace_back(max_score_for_query, i, max_class_idx); + // } + } + + // Sort all collected scores in descending order + std::sort(flattenedScores.rbegin(), flattenedScores.rend()); // Sort descending efficiently + + // --- Box Conversion and Selection --- + std::vector detections; + detections.reserve(std::min(static_cast(MAX_NUMBER_BOXES), flattenedScores.size())); // Pre-allocate memory + + float scaleX = static_cast(originalWidth); + float scaleY = static_cast(originalHeight); + + // Iterate through the sorted scores and create detections + // Apply Non-Maximum Suppression (NMS) here if needed (DETR often doesn't require heavy NMS) + // This current implementation takes the top-K scores directly without NMS. 
+ int count = 0; + for (const auto& scoreTuple : flattenedScores) { + if (count >= MAX_NUMBER_BOXES) { + break; // Limit the number of detections + } + + float score = std::get<0>(scoreTuple); + // We already pre-filtered by confThreshold, but double-check if logic changes + // if (score < confThreshold) { + // continue; // Should not happen if pre-filtered + // } + + int queryIdx = std::get<1>(scoreTuple); + int classIdx = std::get<2>(scoreTuple); + + // Get the raw box data (cxcywh normalized) for this query index + const float* rawBoxData = predBoxesData + (static_cast(queryIdx) * 4); // 4 = box dimensions (cx, cy, w, h) + + // Convert cxcywh (normalized) to xyxy (normalized) + std::vector xyxy_norm = box_cxcywh_to_xyxy(rawBoxData); + + // Scale xyxy (normalized) to original image coordinates + float x1 = xyxy_norm[0] * scaleX; + float y1 = xyxy_norm[1] * scaleY; + float x2 = xyxy_norm[2] * scaleX; + float y2 = xyxy_norm[3] * scaleY; + + // Clip coordinates to image boundaries to prevent invalid Rect + x1 = std::max(0.0f, std::min(x1, scaleX - 1.0f)); + y1 = std::max(0.0f, std::min(y1, scaleY - 1.0f)); + x2 = std::max(0.0f, std::min(x2, scaleX - 1.0f)); + y2 = std::max(0.0f, std::min(y2, scaleY - 1.0f)); + + // Ensure width and height are non-negative after clipping + if (x2 > x1 && y2 > y1) { + Detection det; + // Convert to integer Rect (x, y, width, height) + det.box = cv::Rect(static_cast(std::round(x1)), + static_cast(std::round(y1)), + static_cast(std::round(x2 - x1)), + static_cast(std::round(y2 - y1))); + det.score = score; + det.class_id = classIdx; + detections.push_back(det); + count++; + } + } + + + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = end - start; + std::cout << "[INFO] Postprocessing time: " << duration.count() << " ms" << std::endl; + std::cout << "[INFO] Found " << detections.size() << " detections passing the confidence threshold (max " << MAX_NUMBER_BOXES << ")." 
<< std::endl; + + return detections; +} + + +std::vector RF_DETR_ONNX::detect(const cv::Mat& image, float confThreshold) { + if (image.empty()) { + throw std::runtime_error("[ERROR] Input image for detection is empty."); + } + + // 1. Get Original Dimensions (needed for postprocessing scaling) + const int originalWidth = image.cols; + const int originalHeight = image.rows; + + // 2. Preprocess the image + std::vector inputTensorValues = preprocess(image); + + // 3. Run inference + std::vector outputTensors = infer(inputTensorValues); + + // 4. Postprocess the results + std::vector detections = postprocess(outputTensors, originalWidth, originalHeight, confThreshold); + + return detections; +} + +// --- Getters Implementation --- +int RF_DETR_ONNX::getInputWidth() const { + return static_cast(inputWidth_); +} + +int RF_DETR_ONNX::getInputHeight() const { + return static_cast(inputHeight_); +} + +const std::vector& RF_DETR_ONNX::getInputNames() const { + return inputNames_; +} + +const std::vector& RF_DETR_ONNX::getOutputNames() const { + return outputNames_; +} + +int64_t RF_DETR_ONNX::getNumClasses() const { + return numClasses_; +} + +void RF_DETR_ONNX::initializeSession(const std::string& modelPath, bool useCUDA, size_t deviceId, int intraOpNumThreads) { + sessionOptions_.SetIntraOpNumThreads(intraOpNumThreads); + sessionOptions_.SetLogSeverityLevel(ORT_LOGGING_LEVEL_WARNING); // Match env level + + bool cuda_available = false; + if (useCUDA) { + std::cout << "[INFO] Attempting to use CUDA Execution Provider." 
<< std::endl; + try { + // Check available providers before appending + auto available_providers = Ort::GetAvailableProviders(); + bool provider_found = false; + for (const auto& provider_name : available_providers) { + if (provider_name == "CUDAExecutionProvider") { + provider_found = true; + break; + } + } + + if (provider_found) { + OrtCUDAProviderOptions cuda_options{}; + cuda_options.device_id = static_cast(deviceId); // Ensure deviceId fits in int + // Other options can be set here (e.g., gpu_mem_limit, arena_extend_strategy) + // cuda_options.gpu_mem_limit = N; + // cuda_options.arena_extend_strategy = 1; // kNextPowerOfTwo + + sessionOptions_.AppendExecutionProvider_CUDA(cuda_options); + sessionOptions_.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + std::cout << "[INFO] CUDA Execution Provider enabled on device " << deviceId << "." << std::endl; + cuda_available = true; + } + else { + std::cerr << "[WARNING] CUDA Execution Provider is not available in this build." << std::endl; + } + } + catch (const Ort::Exception& e) { + std::cerr << "[ERROR] Failed to initialize CUDA Execution Provider: " << e.what() << std::endl; + // Fallback logic is handled below + } + catch (const std::exception& e) { + std::cerr << "[ERROR] std::exception during CUDA setup: " << e.what() << std::endl; + } + catch (...) { + std::cerr << "[ERROR] Unknown exception during CUDA setup." << std::endl; + } + } + + if (!cuda_available) { + if (useCUDA) { // Only print fallback message if CUDA was requested but failed + std::cerr << "[INFO] Falling back to CPU Execution Provider." << std::endl; + } + else { + std::cout << "[INFO] Using CPU Execution Provider." 
<< std::endl; + } + // Set optimization level for CPU + sessionOptions_.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + // Optionally disable Per Session Threads if preferring intraOpNumThreads control + // sessionOptions_.DisablePerSessionThreads(); + } + + // Handle Model Path Encoding (Windows vs. others) +#ifdef _WIN32 + std::wstring wideModelPath = std::wstring(modelPath.begin(), modelPath.end()); + const wchar_t* modelPathW = wideModelPath.c_str(); +#else + const char* modelPathW = modelPath.c_str(); +#endif + + // Create the session + try { + ortSession_ = std::make_unique(env_, modelPathW, sessionOptions_); + std::cout << "[INFO] ONNX Runtime session initialized successfully for model: " << modelPath << std::endl; + } + catch (const Ort::Exception& e) { + std::cerr << "[ERROR] Failed to create ONNX Runtime session for model '" << modelPath << "': " << e.what() << std::endl; + throw; // Re-throw to signal failure + } + catch (const std::exception& e) { + std::cerr << "[ERROR] std::exception during session creation: " << e.what() << std::endl; + throw; + } + catch (...) { + std::cerr << "[ERROR] Unknown exception during session creation." << std::endl; + throw; + } +} + + +void RF_DETR_ONNX::getInputOutputInfo() { + if (!ortSession_) { + throw std::runtime_error("[ERROR] Session is not initialized. Cannot get input/output info."); + } + + // --- Input Info --- + size_t numInputNodes = ortSession_->GetInputCount(); + if (numInputNodes != 1) { + throw std::runtime_error("[ERROR] Expected 1 input node, but found " + std::to_string(numInputNodes)); + } + + inputNames_.resize(numInputNodes); + // Use Ort::Allocator unique_ptr for automatic memory management +#if ORT_API_VERSION > 11 // Get..NameAllocated introduced around v1.11 (API Version 11) +// Newer ONNX Runtime versions (>= 1.11): Use GetInputNameAllocated + std::cout << "[INFO] Using GetInputNameAllocated (ORT API Version >= 11)." 
<< std::endl; + auto input_name_allocated_ptr = ortSession_->GetInputNameAllocated(0, allocator_); + inputNames_[0] = input_name_allocated_ptr.get(); // Copy string from unique_ptr's managed C-string +#else + // Older ONNX Runtime versions (< 1.11): Use GetInputName + std::cout << "[INFO] Using GetInputName (ORT API Version < 11)." << std::endl; + char* input_name_ptr = ortSession_->GetInputName(0, allocator_); // Returns char* + inputNames_[0] = input_name_ptr; // Assign to std::string (performs copy) + allocator_.Free(input_name_ptr); // IMPORTANT: Free the memory allocated by ORT for the name +#endif + + Ort::TypeInfo inputTypeInfo = ortSession_->GetInputTypeInfo(0); + auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo(); + ONNXTensorElementDataType inputType = inputTensorInfo.GetElementType(); + inputDims_ = inputTensorInfo.GetShape(); // N C H W + + // Basic validation of input dimensions + if (inputDims_.size() != 4) { + throw std::runtime_error("[ERROR] Expected 4D input tensor (NCHW), but got " + std::to_string(inputDims_.size()) + "D shape."); + } + if (inputType != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + throw std::runtime_error("[ERROR] Expected FLOAT input type, but got type " + std::to_string(inputType)); + } + + // Handle dynamic dimensions (e.g., -1 or 'batch') + if (inputDims_[0] < 1) { + std::cout << "[INFO] Input batch size is dynamic, assuming batch size = 1." << std::endl; + inputDims_[0] = 1; // Set batch size to 1 + } + if (inputDims_[2] <= 0 || inputDims_[3] <= 0) { + throw std::runtime_error("[ERROR] Input height or width dimension is non-positive or dynamic. Model may need fixed input size or specific handling."); + } + + inputHeight_ = inputDims_[2]; + inputWidth_ = inputDims_[3]; + inputTensorSize_ = vectorProduct_(inputDims_); // Calculate total elements N*C*H*W + + if (inputTensorSize_ == 0) { + throw std::runtime_error("[ERROR] Calculated input tensor size is zero. 
Check model input dimensions."); + } + + std::cout << "[INFO] Input Name: " << inputNames_[0] + << ", Type: FLOAT, Shape: [" << inputDims_[0] << "," << inputDims_[1] << "," << inputDims_[2] << "," << inputDims_[3] << "]" << std::endl; + + // --- Output Info --- + size_t numOutputNodes = ortSession_->GetOutputCount(); + if (numOutputNodes != 2) { // Specific to RF-DETR's expected output (boxes, logits) + throw std::runtime_error("[ERROR] Expected 2 output nodes (boxes, logits), but found " + std::to_string(numOutputNodes)); + } + + outputNames_.resize(numOutputNodes); +#if ORT_API_VERSION > 11 + // Newer ONNX Runtime versions (>= 1.11) + std::cout << "[INFO] Using GetOutputNameAllocated (ORT API Version >= 11)." << std::endl; + auto output_name_ptr0 = ortSession_->GetOutputNameAllocated(0, allocator_); + outputNames_[0] = output_name_ptr0.get(); + auto output_name_ptr1 = ortSession_->GetOutputNameAllocated(1, allocator_); + outputNames_[1] = output_name_ptr1.get(); +#else + // Older ONNX Runtime versions (< 1.11) + std::cout << "[INFO] Using GetOutputName (ORT API Version < 11)." << std::endl; + char* output_name_ptr0 = ortSession_->GetOutputName(0, allocator_); + outputNames_[0] = output_name_ptr0; // Copy + allocator_.Free(output_name_ptr0); // Free memory + + char* output_name_ptr1 = ortSession_->GetOutputName(1, allocator_); + outputNames_[1] = output_name_ptr1; // Copy + allocator_.Free(output_name_ptr1); // Free memory +#endif + + // Verify Output 0 (Boxes) + Ort::TypeInfo outputTypeInfo0 = ortSession_->GetOutputTypeInfo(0); + auto outputTensorInfo0 = outputTypeInfo0.GetTensorTypeAndShapeInfo(); + ONNXTensorElementDataType outputType0 = outputTensorInfo0.GetElementType(); + std::vector outputDims0 = outputTensorInfo0.GetShape(); + + if (outputType0 != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + std::cerr << "[WARNING] Output 0 (Boxes) type is not FLOAT. Got type " << outputType0 << "." 
<< std::endl; + } + if (outputDims0.size() != 3 || (outputDims0[0] != inputDims_[0] && outputDims0[0] > 0) || outputDims0[2] != 4) { + std::cerr << "[WARNING] Output 0 (Boxes) shape might not be [batch, num_queries, 4]. Got shape: ["; + for (size_t j = 0; j < outputDims0.size(); ++j) std::cerr << outputDims0[j] << (j == outputDims0.size() - 1 ? "" : ","); + std::cerr << "]. Ensure postprocessing logic matches." << std::endl; + } + + // Verify Output 1 (Logits) and determine numClasses_ + Ort::TypeInfo outputTypeInfo1 = ortSession_->GetOutputTypeInfo(1); + auto outputTensorInfo1 = outputTypeInfo1.GetTensorTypeAndShapeInfo(); + ONNXTensorElementDataType outputType1 = outputTensorInfo1.GetElementType(); + std::vector outputDims1 = outputTensorInfo1.GetShape(); // E.g., [batch, num_queries, num_classes] + + if (outputType1 != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + std::cerr << "[WARNING] Output 1 (Logits) type is not FLOAT. Got type " << outputType1 << "." << std::endl; + } + if (outputDims1.size() != 3) { + throw std::runtime_error("[ERROR] Output 1 (Logits) is not 3D [batch, num_queries, num_classes]. Got " + + std::to_string(outputDims1.size()) + "D."); + } + if (outputDims1[0] != inputDims_[0] && outputDims1[0] > 0) { // Check batch size consistency + std::cerr << "[WARNING] Output 1 (Logits) batch size (" << outputDims1[0] + << ") doesn't match input batch size (" << inputDims_[0] << ")." << std::endl; + } + if (outputDims0.size() == 3 && outputDims1.size() == 3 && outputDims0[1] != outputDims1[1] && outputDims0[1] > 0 && outputDims1[1] > 0) { // Check num_queries consistency + std::cerr << "[WARNING] Mismatch in num_queries between outputs. Boxes: " << outputDims0[1] + << ", Logits: " << outputDims1[1] << "." 
<< std::endl; + } + + // Determine number of classes from the last dimension of the logits output + if (outputDims1[2] <= 0) { + throw std::runtime_error("[ERROR] Number of classes dimension in logits output is non-positive (" + std::to_string(outputDims1[2]) + ")."); + } + numClasses_ = outputDims1[2]; // Update numClasses_ based on the model + + // Print output info + std::cout << "[INFO] Output 0 Name: " << outputNames_[0] << ", Type: FLOAT, Shape: ["; + for (size_t j = 0; j < outputDims0.size(); ++j) std::cout << outputDims0[j] << (j == outputDims0.size() - 1 ? "" : ","); + std::cout << "]" << std::endl; + + std::cout << "[INFO] Output 1 Name: " << outputNames_[1] << ", Type: FLOAT, Shape: ["; + for (size_t j = 0; j < outputDims1.size(); ++j) std::cout << outputDims1[j] << (j == outputDims1.size() - 1 ? "" : ","); + std::cout << "]" << std::endl; + + std::cout << "[INFO] Determined number of classes from model: " << numClasses_ << std::endl; +} + + +std::vector RF_DETR_ONNX::box_cxcywh_to_xyxy(const float* box_start) { + // Assumes box_start points to [cx, cy, w, h] + float cx = box_start[0]; + float cy = box_start[1]; + // Ensure width and height are non-negative before calculations + float w = std::max(0.0f, box_start[2]); + float h = std::max(0.0f, box_start[3]); + + float x1 = cx - 0.5f * w; + float y1 = cy - 0.5f * h; + float x2 = cx + 0.5f * w; + float y2 = cy + 0.5f * h; + + return { x1, y1, x2, y2 }; // Return {x_min, y_min, x_max, y_max} +} + + +inline float RF_DETR_ONNX::sigmoid(float x) { + // Basic sigmoid implementation + return 1.0f / (1.0f + std::exp(-x)); + // Consider adding checks for very large/small x to prevent overflow/underflow if necessary + // e.g., using std::max(-30.0f, std::min(30.0f, -x)) inside exp for stability +} + + +size_t RF_DETR_ONNX::vectorProduct_(const std::vector& vec) { + if (vec.empty()) { + return 0; + } + size_t product = 1; + for (const auto& element : vec) { + // Ensure dimensions are positive before multiplying + 
if (element <= 0) { + // This indicates an invalid dimension (or dynamic handled incorrectly) + std::cerr << "[ERROR] Non-positive dimension encountered (" << element << ") in vector product calculation." << std::endl; + return 0; // Return 0 to signify an error or invalid size + } + // Check for potential overflow before multiplication + if (element > 0 && product > std::numeric_limits::max() / static_cast(element)) { + throw std::overflow_error("[ERROR] Size calculation overflowed (vectorProduct_)"); + } + product *= static_cast(element); + } + return product; +} \ No newline at end of file diff --git a/inference/cpp/onnx/RF_DETR_ONNX.h b/inference/cpp/onnx/RF_DETR_ONNX.h new file mode 100644 index 000000000..84b77868e --- /dev/null +++ b/inference/cpp/onnx/RF_DETR_ONNX.h @@ -0,0 +1,98 @@ +#ifndef RF_DETR_ONNX_H_ +#define RF_DETR_ONNX_H_ + +#include +#include +#include // For unique_ptr +#include // For runtime_error +#include // For timing (optional in header, but often useful) +#include // For logging (optional in header) +#include // For std::min/max etc. +#include // For std::exp, std::round +#include // For std::tuple + +#include // For cv::Mat +#include // For cv::Rect + +#include // ONNX Runtime C++ API + +// Basic structure to hold detection results +struct Detection { + cv::Rect box; // Bounding box (x, y, width, height) + float score; // Confidence score + int class_id; // Detected class ID + // std::string class_name; // Optional: Add if you have class names +}; + +class RF_DETR_ONNX { +public: + // Constructor: Initializes the ONNX Runtime environment and loads the model. + RF_DETR_ONNX(const std::string& modelPath, bool useCUDA = false, size_t deviceId = 0, int intraOpNumThreads = 1); + + // Default destructor (unique_ptr handles session cleanup) + ~RF_DETR_ONNX() = default; + + // Preprocesses the input image into a float vector (NCHW blob). + std::vector preprocess(const cv::Mat& inputImage); + + // Runs inference on the preprocessed input tensor. 
+ std::vector infer(const std::vector& inputTensorValues); + + // Postprocesses the raw model outputs into a list of detections. + std::vector postprocess(const std::vector& outputTensors, int originalWidth, int originalHeight, float confThreshold); + + // Performs the full detection pipeline: preprocess, infer, postprocess. + std::vector detect(const cv::Mat& image, float confThreshold); + + // --- Getters --- + int getInputWidth() const; + int getInputHeight() const; + const std::vector& getInputNames() const; + const std::vector& getOutputNames() const; + int64_t getNumClasses() const; + + +private: + // --- Constants --- + // Note: Initializing non-static const vectors directly here might require C++11/14/17 depending on usage/compiler. + // It's often safer to initialize them in the constructor initializer list or make them static const defined in the .cpp. + // However, mirroring your original code structure for now. + const std::vector MEANS = { 0.485f, 0.456f, 0.406f }; + const std::vector STDS = { 0.229f, 0.224f, 0.225f }; + const int MAX_NUMBER_BOXES = 300; // Max proposals from RF-DETR + const std::string DEFAULT_INSTANCE_NAME = "rfdetr-onnx-cpp-inference"; + + // --- ONNX Runtime Members --- + Ort::Env env_; + Ort::SessionOptions sessionOptions_; + Ort::AllocatorWithDefaultOptions allocator_; + std::unique_ptr ortSession_; + + // --- Model Info Members --- + std::vector inputNames_; + std::vector outputNames_; + std::vector inputDims_; // NCHW + int64_t inputWidth_ = 0; + int64_t inputHeight_ = 0; + size_t inputTensorSize_ = 0; // Total number of elements in input tensor + int64_t numClasses_ = 80; // Default, will be updated from model info + + // --- Private Helper Methods --- + + // Initializes the ONNX Runtime session. + void initializeSession(const std::string& modelPath, bool useCUDA, size_t deviceId, int intraOpNumThreads); + + // Extracts input and output names, shapes, and other info from the loaded model. 
+ void getInputOutputInfo(); + + // Converts a bounding box from center_x, center_y, width, height to x1, y1, x2, y2 format. + std::vector box_cxcywh_to_xyxy(const float* box_start); + + // Sigmoid activation function. + inline float sigmoid(float x); + + // Calculates the product of elements in a vector (for tensor size). + size_t vectorProduct_(const std::vector& vector); +}; + +#endif // RF_DETR_ONNX_H_ \ No newline at end of file diff --git a/inference/cpp/onnx/main.cpp b/inference/cpp/onnx/main.cpp new file mode 100644 index 000000000..f43150804 --- /dev/null +++ b/inference/cpp/onnx/main.cpp @@ -0,0 +1,408 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "RF_DETR_ONNX.h" + +// Generates random colors for classes (for visualization) +std::vector generateClassColors(int num_classes) { + std::vector class_colors(num_classes); + std::srand(42); // Use a fixed seed for consistent colors + for (int i = 0; i < num_classes; ++i) { + class_colors[i] = cv::Scalar(std::rand() % 256, std::rand() % 256, std::rand() % 256); + } + return class_colors; +} + +// Load class names from a file +std::vector loadClassNames(const std::string& path, int defaultNumClasses) { + std::vector classNames; + std::ifstream ifs(path); + if (ifs.is_open()) { + std::string line; + while (std::getline(ifs, line)) { + classNames.push_back(line); + } + ifs.close(); + std::cout << "[INFO] Loaded " << classNames.size() << " class names from " << path << std::endl; + } + else { + std::cerr << "[WARNING] Could not open class name file: " << path << ". Generating dummy labels." << std::endl; + } + + if (classNames.empty()) { + std::cout << "[INFO] Generating " << defaultNumClasses << " dummy class labels (0, 1, 2...)." 
<< std::endl; + for (int i = 0; i < defaultNumClasses; ++i) { + classNames.push_back(std::to_string(i)); + } + } + return classNames; +} + +// --- Drawing Function --- +void drawDetections( + cv::Mat& image, // Input image (will be modified) + const std::vector& detections, + const std::vector& classNames, + const std::vector& classColors) +{ + if (classNames.empty()) { + std::cerr << "Warning: classNames is empty. Cannot draw labels." << std::endl; + return; + } + if (classColors.empty() || classColors.size() < classNames.size()) { + std::cerr << "Warning: classColors is empty or insufficient. Cannot draw boxes with distinct colors." << std::endl; + return; // Or use a default color + } + + for (const auto& det : detections) { + if (det.class_id < 0 || det.class_id >= classNames.size()) { + std::cerr << "Warning: Invalid class_id " << det.class_id << " encountered. Skipping box." << std::endl; + continue; + } + + cv::Rect box = det.box; + cv::Scalar color = classColors[det.class_id]; + int classId = det.class_id; + float score = det.score; + + // Draw bounding box + cv::rectangle(image, box, color, 2); + + // Create label text + std::ostringstream oss; + oss << classNames[classId] << ": " << std::fixed << std::setprecision(2) << score; + std::string labelText = oss.str(); + + // Add text label background and text + int baseline = 0; + cv::Size textSize = cv::getTextSize(labelText, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline); + baseline += 1; // Adjust baseline + + // Ensure text background doesn't go out of bounds (top) + int textRectY = std::max(box.y - textSize.height - baseline, 0); + cv::Rect textRect(box.x, textRectY, textSize.width, textSize.height + baseline); + + // Draw filled rectangle for text background + cv::rectangle(image, textRect, color, cv::FILLED); + + // Put white text on the background + cv::Point textOrg(box.x, box.y - baseline); // Adjust text position slightly above the box + // Ensure text org y isn't negative + textOrg.y = 
std::max(textOrg.y, textSize.height); + cv::putText(image, labelText, textOrg, cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255), 1, cv::LINE_AA); + } +} + +int main(int argc, char* argv[]) { + // Print ONNX Runtime version using the C API (guaranteed to exist) + std::cout << "[INFO] ONNXRuntime version: " << OrtGetApiBase()->GetVersionString() << std::endl; + + + // --- Configuration Defaults --- + //std::string modelPath = "rf-detr-base.onnx"; // Provide a sensible default or require via arg + std::string labelPath = ""; // Default: No labels file, will use generic names + std::string sourceType = "camera"; // Default source: image + std::string inputPath = "0"; // Default input image path + std::string outputPath = "output.jpg"; // Default output path + float confThreshold = 0.5f; + bool useCUDA = true; // Default to CUDA if available + size_t deviceId = 0; + int cameraId = 0; + int intraOpNumThreads = 1; // Default OpenMP threads for ONNX Runtime CPU ops + + // --- Argument Parsing --- + std::cout << "[INFO] Parsing command line arguments..." 
<< std::endl; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + try { // Add try-catch for parsing values + if ((arg == "--model" || arg == "-m") && i + 1 < argc) { + modelPath = argv[++i]; + } + else if ((arg == "--labels" || arg == "-l") && i + 1 < argc) { + labelPath = argv[++i]; + } + else if (arg == "--source_type" && i + 1 < argc) { + sourceType = argv[++i]; + std::transform(sourceType.begin(), sourceType.end(), sourceType.begin(), ::tolower); // Lowercase + } + else if ((arg == "--input" || arg == "-i") && i + 1 < argc) { + inputPath = argv[++i]; // Path for image/video, ID for camera + } + else if ((arg == "--output" || arg == "-o") && i + 1 < argc) { + outputPath = argv[++i]; // Path for saving image/video + } + else if ((arg == "--conf" || arg == "-c") && i + 1 < argc) { + confThreshold = std::stof(argv[++i]); + } + else if (arg == "--use_cuda") { + useCUDA = true; + } + else if (arg == "--use_cpu") { + useCUDA = false; + } + else if (arg == "--device_id" && i + 1 < argc) { + deviceId = std::stoul(argv[++i]); + } + else if (arg == "--threads" && i + 1 < argc) { + intraOpNumThreads = std::stoi(argv[++i]); + if (intraOpNumThreads <= 0) intraOpNumThreads = 1; // Ensure at least 1 thread + } + else if (arg == "--help" || arg == "-h") { + std::cout << "\nUsage: " << argv[0] << " [options]\n\n" + << "Options:\n" + << " -m, --model Path to the ONNX model file (default: " << modelPath << ")\n" + << " -l, --labels Path to the file containing class names (one per line)\n" + << " --source_type Input source type: 'image', 'video', or 'camera' (default: " << sourceType << ")\n" + << " -i, --input Path to image/video file, or camera ID (integer) (default: " << inputPath << ")\n" + << " -o, --output Path to save the annotated image or video (default: " << outputPath << ")\n" + << " -c, --conf Confidence threshold for detections (default: " << confThreshold << ")\n" + << " --use_cuda Enable CUDA execution provider (default: " << (useCUDA ? 
"Yes" : "No") << ")\n" + << " --use_cpu Use CPU execution provider\n" + << " --device_id GPU device ID if using CUDA (default: " << deviceId << ")\n" + << " --threads Number of threads for ONNX Runtime intra-op parallelism (CPU, default: " << intraOpNumThreads << ")\n" + << " -h, --help Show this help message\n" << std::endl; + return 0; + } + else { + std::cerr << "[WARNING] Unknown or incomplete argument: " << arg << ". Use --help for options." << std::endl; + } + } + catch (const std::exception& e) { + std::cerr << "[ERROR] Invalid argument value for " << arg << ": " << e.what() << std::endl; + return 1; + } + } + + // Update camera ID if source type is camera + if (sourceType == "camera") { + try { + cameraId = std::stoi(inputPath); + } + catch (const std::exception& e) { + std::cerr << "[ERROR] Invalid camera ID '" << inputPath << "'. Please provide an integer ID. " << e.what() << std::endl; + return 1; + } + } + + + std::cout << "[INFO] Configuration:" << std::endl; + std::cout << " Model Path: " << modelPath << std::endl; + std::cout << " Labels Path: " << (labelPath.empty() ? "None (using defaults)" : labelPath) << std::endl; + std::cout << " Source Type: " << sourceType << std::endl; + std::cout << " Input: " << inputPath << (sourceType == "camera" ? " (Camera ID)" : "") << std::endl; + if (sourceType == "image" || sourceType == "video") { + std::cout << " Output Path: " << outputPath << std::endl; + } + std::cout << " Confidence Threshold: " << confThreshold << std::endl; + std::cout << " Execution Provider: " << (useCUDA ? "CUDA (Device ID: " + std::to_string(deviceId) + ")" : "CPU") << std::endl; + if (!useCUDA) { + std::cout << " Intra-op Threads: " << intraOpNumThreads << std::endl; + } + + try { + // --- Initialize Detector --- + std::cout << "[INFO] Initializing detector..." 
<< std::endl; + auto detector_start = std::chrono::high_resolution_clock::now(); + // Create the detector object using the class + RF_DETR_ONNX detector(modelPath, useCUDA, deviceId, intraOpNumThreads); + auto detector_end = std::chrono::high_resolution_clock::now(); + std::chrono::duration detector_init_duration = detector_end - detector_start; + std::cout << "[INFO] Detector initialization time: " << detector_init_duration.count() << " ms" << std::endl; + + // --- Load Class Names & Generate Colors --- + int numClasses = detector.getNumClasses(); // Get number of classes from the detector + std::cout << "[INFO] Model expects " << numClasses << " classes." << std::endl; + std::vector classNames = loadClassNames(labelPath, numClasses); // Load or generate names + std::vector classColors = generateClassColors(classNames.size()); + + + // --- Process Input Based on Source Type --- + + if (sourceType == "image") { + std::cout << "[INFO] Processing image: " << inputPath << std::endl; + cv::Mat imageBGR = cv::imread(inputPath, cv::ImreadModes::IMREAD_COLOR); + if (imageBGR.empty()) { throw std::runtime_error("Could not read image: " + inputPath); } + + // Perform detection using the detector's detect method + auto detection_start = std::chrono::high_resolution_clock::now(); + std::vector detections = detector.detect(imageBGR, confThreshold); + auto detection_end = std::chrono::high_resolution_clock::now(); + std::chrono::duration detection_duration = detection_end - detection_start; + + // The detector class already prints timing for preprocess, infer, postprocess + // std::cout << "[INFO] Total detection pipeline time: " << detection_duration.count() << " ms." 
<< std::endl; + + // Draw detections on the original image + drawDetections(imageBGR, detections, classNames, classColors); + + bool success = cv::imwrite(outputPath, imageBGR); + if (!success) { std::cerr << "[ERROR] Failed to save image to " << outputPath << std::endl; } + else { std::cout << "[INFO] Annotated image saved to: " << outputPath << std::endl; } + + } + else if (sourceType == "video") { + std::cout << "[INFO] Processing video: " << inputPath << std::endl; + cv::VideoCapture cap(inputPath); + if (!cap.isOpened()) { throw std::runtime_error("Could not open video file: " + inputPath); } + + int frameWidth = static_cast(cap.get(cv::CAP_PROP_FRAME_WIDTH)); + int frameHeight = static_cast(cap.get(cv::CAP_PROP_FRAME_HEIGHT)); + double fps = cap.get(cv::CAP_PROP_FPS); + if (fps <= 0) { // Handle cases where FPS is not reported correctly + std::cerr << "[WARNING] Video FPS reported as " << fps << ", defaulting to 30.0 for writer." << std::endl; + fps = 30.0; + } + int fourcc = cv::VideoWriter::fourcc('m', 'p', '4', 'v'); // Common codec, adjust if needed + + std::cout << "[INFO] Video properties: " << frameWidth << "x" << frameHeight << " @ " << fps << " FPS" << std::endl; + + cv::VideoWriter writer(outputPath, fourcc, fps, cv::Size(frameWidth, frameHeight)); + if (!writer.isOpened()) { throw std::runtime_error("Could not open video writer for: " + outputPath); } + std::cout << "[INFO] Saving annotated video to: " << outputPath << std::endl; + + cv::Mat frame; + int frameCount = 0; + auto total_start = std::chrono::high_resolution_clock::now(); + double total_detection_time_ms = 0.0; + + while (cap.read(frame)) { + frameCount++; + if (frame.empty()) { + std::cerr << "[WARNING] Read empty frame " << frameCount << ", stopping." 
<< std::endl; break; + } + + auto frame_detect_start = std::chrono::high_resolution_clock::now(); + // Use the detector object + std::vector detections = detector.detect(frame, confThreshold); + auto frame_detect_end = std::chrono::high_resolution_clock::now(); + total_detection_time_ms += std::chrono::duration(frame_detect_end - frame_detect_start).count(); + + drawDetections(frame, detections, classNames, classColors); + writer.write(frame); // Write annotated frame + + // Simple progress indication + if (frameCount % 30 == 0) { // Print every 30 frames + double avg_time_per_frame = total_detection_time_ms / frameCount; + std::cout << "[INFO] Processed frame " << frameCount << " (Avg detection time: " + << avg_time_per_frame << " ms/frame)" << std::endl; + } + } + + auto total_end = std::chrono::high_resolution_clock::now(); + std::chrono::duration total_duration = total_end - total_start; + std::cout << "[INFO] Finished processing video. Processed " << frameCount << " frames in " + << total_duration.count() << " seconds." 
<< std::endl; + if (frameCount > 0) { + std::cout << "[INFO] Average detection time per frame: " << (total_detection_time_ms / frameCount) << " ms" << std::endl; + } + + cap.release(); + writer.release(); + + } + else if (sourceType == "camera") { + // cameraId was already parsed from inputPath + std::cout << "[INFO] Starting camera stream: ID " << cameraId << std::endl; + cv::VideoCapture cap(cameraId); + if (!cap.isOpened()) { throw std::runtime_error("Could not open camera with ID: " + std::to_string(cameraId)); } + + // Optional: Set desired resolution (camera might ignore or adjust) + // cap.set(cv::CAP_PROP_FRAME_WIDTH, 1280); + // cap.set(cv::CAP_PROP_FRAME_HEIGHT, 720); + + int frameWidth = static_cast(cap.get(cv::CAP_PROP_FRAME_WIDTH)); + int frameHeight = static_cast(cap.get(cv::CAP_PROP_FRAME_HEIGHT)); + std::cout << "[INFO] Camera resolution: " << frameWidth << "x" << frameHeight << std::endl; + + const std::string windowName = "RF-DETR Live Detection (ONNX Runtime)"; + cv::namedWindow(windowName, cv::WINDOW_AUTOSIZE); + + cv::Mat frame; + std::cout << "[INFO] Press 'q' or ESC in the window to quit." << std::endl; + + double total_frame_time_ms = 0.0; + int frame_count_display = 0; + const int fps_update_interval = 10; // Update FPS every 10 frames + + while (true) { + auto frame_grab_start = std::chrono::high_resolution_clock::now(); + if (!cap.read(frame) || frame.empty()) { + std::cerr << "[ERROR] Failed to grab frame from camera. Exiting." 
<< std::endl; break; + } + + // Use the detector object + std::vector detections = detector.detect(frame, confThreshold); + + auto frame_end = std::chrono::high_resolution_clock::now(); + std::chrono::duration frame_duration = frame_end - frame_grab_start; + total_frame_time_ms += frame_duration.count(); + frame_count_display++; + + drawDetections(frame, detections, classNames, classColors); + + // Add FPS display (average over last interval) + if (frame_count_display >= fps_update_interval) { + double avg_fps = (1000.0 * frame_count_display) / total_frame_time_ms; + cv::putText(frame, cv::format("FPS: %.2f", avg_fps), cv::Point(10, 25), + cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 0, 255), 2); + // Reset counters for next interval + total_frame_time_ms = 0.0; + frame_count_display = 0; + } + else { + // Optionally display instantaneous FPS or nothing until interval is met + double current_fps = 1000.0 / frame_duration.count(); + cv::putText(frame, cv::format("FPS: %.2f", current_fps), cv::Point(10, 25), + cv::FONT_HERSHEY_SIMPLEX, 0.7, cv::Scalar(0, 0, 255), 2); + } + + cv::imshow(windowName, frame); + + int key = cv::waitKey(1); // Wait 1ms for key press + if (key == 'q' || key == 'Q' || key == 27) { // 27 is ASCII for ESC + std::cout << "[INFO] Quit key pressed. Exiting camera stream." << std::endl; break; + } + } + + cap.release(); + cv::destroyAllWindows(); + + } + else { + throw std::runtime_error("Invalid source_type specified: '" + sourceType + "'. Use 'image', 'video', or 'camera'."); + } + + std::cout << "[INFO] Processing finished successfully." << std::endl; + + } + catch (const Ort::Exception& ort_exception) { // Catch ONNX Runtime specific exceptions + std::cerr << "[FATAL ERROR][ONNX Runtime] " << ort_exception.what() << std::endl; + return 1; + } + catch (const std::exception& e) { // Catch standard exceptions + std::cerr << "[FATAL ERROR] " << e.what() << std::endl; + return 1; + } + catch (...) 
{ // Catch any other unknown exceptions + std::cerr << "[FATAL ERROR] An unknown error occurred." << std::endl; + return 1; + } + + + return 0; +} \ No newline at end of file