Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions rock/actions/sandbox/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ class SandboxStatusResponse(BaseModel):
namespace: str | None = None
cpus: float | None = None
memory: str | None = None
limit_disk_rootfs: str | None = None
limit_disk_log: str | None = None


class CommandResponse(BaseModel):
Expand Down
2 changes: 2 additions & 0 deletions rock/actions/sandbox/sandbox_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ class SandboxInfo(TypedDict, total=False):
create_user_gray_flag: bool
cpus: float
memory: str
limit_disk_rootfs: str
limit_disk_log: str
create_time: str
start_time: str
stop_time: str
27 changes: 27 additions & 0 deletions rock/admin/entrypoints/sandbox_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
from rock.admin.proto.response import SandboxStartResponse
from rock.common.constants import (
CPU_PREEMPT_SWITCH,
SANDBOX_LIMIT_DISK_LOG_KEY,
SANDBOX_LIMIT_DISK_ROOTFS_KEY,
GET_STATUS_SWITCH,
KATA_DIND_DISK_SIZE_KEY,
KATA_RUNTIME_SWITCH,
Expand Down Expand Up @@ -69,6 +71,29 @@ async def _apply_kata_disk_size(config: DockerDeploymentConfig) -> None:
config.kata_disk_size = disk_size


async def _apply_disk_limits(config: DockerDeploymentConfig) -> None:
    """Resolve the rootfs and log-dir disk limits and store them on ``config``.

    The values from ``sandbox_manager.rock_config.runtime`` are the baseline.
    When a Nacos provider is configured, any truthy value it returns for the
    corresponding key replaces the baseline. If neither source supplies a
    value, the field is set to ``None``.

    Args:
        config: Deployment config whose ``limit_disk_rootfs`` and
            ``limit_disk_log`` fields are overwritten in place.
    """
    runtime_cfg = sandbox_manager.rock_config.runtime
    provider = sandbox_manager.rock_config.nacos_provider

    rootfs_limit = runtime_cfg.sandbox_limit_disk_rootfs
    log_limit = runtime_cfg.sandbox_limit_disk_log

    if provider is not None:
        # A falsy Nacos answer (missing key, empty string) keeps the runtime value.
        rootfs_limit = (await provider.get_config_value(SANDBOX_LIMIT_DISK_ROOTFS_KEY)) or rootfs_limit
        log_limit = (await provider.get_config_value(SANDBOX_LIMIT_DISK_LOG_KEY)) or log_limit

    config.limit_disk_rootfs = rootfs_limit
    config.limit_disk_log = log_limit


async def _apply_cpu_preempt_switch(config: DockerDeploymentConfig) -> None:
"""Check nacos switch and enable CPU preemption on the config if the switch is on.

Expand All @@ -89,6 +114,7 @@ async def start(request: SandboxStartRequest) -> RockResponse[SandboxStartRespon
await _apply_kata_runtime_switch(config)
await _apply_kata_disk_size(config)
await _apply_cpu_preempt_switch(config)
await _apply_disk_limits(config)
sandbox_start_response = await sandbox_manager.start(config)
return RockResponse(result=sandbox_start_response)

Expand All @@ -103,6 +129,7 @@ async def start_async(
await _apply_kata_runtime_switch(config)
await _apply_kata_disk_size(config)
await _apply_cpu_preempt_switch(config)
await _apply_disk_limits(config)
sandbox_start_response = await sandbox_manager.start_async(
config,
user_info=headers.user_info,
Expand Down
6 changes: 6 additions & 0 deletions rock/admin/proto/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ class SandboxStartResponse(SandboxResponse):
host_ip: str | None = None
cpus: float | None = None
memory: str | None = None
limit_disk_rootfs: str | None = None
limit_disk_log: str | None = None


# TODO: inherit from SandboxStartResponse
Expand All @@ -30,6 +32,8 @@ class SandboxStatusResponse(BaseModel):
namespace: str | None = None
cpus: float | None = None
memory: str | None = None
limit_disk_rootfs: str | None = None
limit_disk_log: str | None = None

@classmethod
def from_sandbox_info(cls, sandbox_info: "SandboxInfo") -> "SandboxStatusResponse":
Expand All @@ -46,6 +50,8 @@ def from_sandbox_info(cls, sandbox_info: "SandboxInfo") -> "SandboxStatusRespons
namespace=sandbox_info.get("namespace"),
cpus=sandbox_info.get("cpus"),
memory=sandbox_info.get("memory"),
limit_disk_rootfs=sandbox_info.get("limit_disk_rootfs"),
limit_disk_log=sandbox_info.get("limit_disk_log"),
)


Expand Down
2 changes: 2 additions & 0 deletions rock/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
SUPPORT_KATA_SWITCH = "support_kata_enabled"
CPU_PREEMPT_SWITCH = "cpu_preempt_enabled"
KATA_DIND_DISK_SIZE_KEY = "kata_dind_disk_size"
SANDBOX_LIMIT_DISK_ROOTFS_KEY = "sandbox_limit_disk_rootfs"
SANDBOX_LIMIT_DISK_LOG_KEY = "sandbox_limit_disk_log"
PID_PREFIX = "PIDSTART"
PID_SUFFIX = "PIDEND"
SCHEDULER_LOG_NAME = "scheduler.log"
Expand Down
4 changes: 4 additions & 0 deletions rock/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,10 @@ class RuntimeConfig:
use_standard_spec_only: bool = False
metrics_endpoint: str = ""
user_defined_tags: dict = field(default_factory=dict)
sandbox_limit_disk_rootfs: str | None = None
"""Default rootfs quota per container. None means no limit. Can be overridden by nacos key 'default_limit_disk'."""
sandbox_limit_disk_log: str | None = None
"""Default log-dir quota per container. None means no limit. Can be overridden by nacos key 'default_log_dir_quota'."""

def __post_init__(self) -> None:
# Convert dict to StandardSpec if needed
Expand Down
6 changes: 6 additions & 0 deletions rock/deployments/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ class DockerDeploymentConfig(DeploymentConfig):
limit_cpus: float | None = None
"""Hard limit on the number of CPU cores the container can use. Used as --cpus when CPU preemption is enabled via nacos switch."""

limit_disk_rootfs: str | None = None
"""Maximum rootfs disk size for the container (e.g., '20g', '50g'). Maps to --storage-opt size=<value>. Only supported on overlay2 storage driver with xfs backing filesystem. None means no limit."""

limit_disk_log: str | None = None
"""XFS project quota for the sandbox log directory. Server-side only, applied via xfs_quota. None means no limit."""

container_name: str | None = None
"""Custom name for the container. If None, a random name will be generated."""

Expand Down
87 changes: 87 additions & 0 deletions rock/deployments/docker.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import datetime
import hashlib
import os
import random
import shlex
Expand Down Expand Up @@ -47,6 +48,7 @@


class DockerDeployment(AbstractDeployment):

def __init__(
self,
**kwargs: Any,
Expand All @@ -57,6 +59,8 @@ def __init__(
**kwargs: Keyword arguments (see `DockerDeploymentConfig` for details).
"""
self._config = DockerDeploymentConfig(**kwargs)
self._effective_limit_disk_rootfs: str | None = self._config.limit_disk_rootfs
self._effective_limit_disk_log: str | None = self._config.limit_disk_log
self._runtime: RemoteSandboxRuntime | None = None
self._container_process = None
self._runtime_timeout = 0.15
Expand Down Expand Up @@ -350,11 +354,82 @@ def _cpus(self):
return [f"--cpu-shares={cpu_shares}", f"--cpus={self.config.limit_cpus}"]
return [f"--cpus={self.config.cpus}"]

def _storage_opts(self):
if self._effective_limit_disk_rootfs is not None:
return ["--storage-opt", f"size={self._effective_limit_disk_rootfs}"]
return []

def _try_set_log_dir_quota(self, log_file_path: str) -> None:
"""Best-effort: set XFS project quota for sandbox log directory.

Requires the log path to be on an XFS mount with prjquota/pquota enabled.
This check is independent of Docker's storage driver (no overlay2 requirement).
"""
if self._effective_limit_disk_log is None:
return

if not DockerUtil.is_xfs_prjquota_path(log_file_path):
logger.info(f"Log path {log_file_path!r} is not on XFS+prjquota, skipping quota setup")
self._effective_limit_disk_log = None
return

# Derive a deterministic project id from container name; reserve low ids.
project_id = (int(hashlib.sha1(self.container_name.encode("utf-8")).hexdigest()[:8], 16) % 900000) + 100000
try:
findmnt_result = subprocess.run(
["findmnt", "-T", log_file_path, "-o", "TARGET", "--noheadings"],
capture_output=True,
text=True,
timeout=5,
)
if findmnt_result.returncode != 0:
logger.warning(f"Failed to find mountpoint for log path {log_file_path!r}, skip quota setup")
self._effective_limit_disk_log = None
return
mount_point = findmnt_result.stdout.strip()
if not mount_point:
logger.warning(f"Empty mountpoint for log path {log_file_path!r}, skip quota setup")
self._effective_limit_disk_log = None
return

set_project_cmd = f"project -s -p {shlex.quote(log_file_path)} {project_id}"
set_limit_cmd = f"limit -p bhard={self._effective_limit_disk_log} {project_id}"
for cmd in (set_project_cmd, set_limit_cmd):
result = subprocess.run(
["xfs_quota", "-x", "-c", cmd, mount_point],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode != 0:
logger.warning(
f"xfs_quota failed for {log_file_path!r} with cmd={cmd!r}: {result.stderr.strip() or result.stdout.strip()}"
)
self._effective_limit_disk_log = None
return
logger.info(f"Set XFS project quota {self._effective_limit_disk_log} for log path {log_file_path!r}")
except Exception as e:
logger.warning(f"Failed to set XFS project quota for {log_file_path!r}: {e}")
self._effective_limit_disk_log = None

async def start(self):
"""Starts the runtime."""
if not self.sandbox_validator.check_availability():
raise Exception("Docker is not available")

storage_opt_supported = DockerUtil.detect_storage_opt_support()
# Resolve effective rootfs quota: downgrade to None if storage-opt is not supported.
if self._config.limit_disk_rootfs is not None and not storage_opt_supported:
logger.warning(
f"[{self.config.container_name}] --storage-opt not supported on this worker "
f"(requires overlay2 + xfs + prjquota), ignoring limit_disk_rootfs={self._config.limit_disk_rootfs}"
)
self._effective_limit_disk_rootfs = None
else:
self._effective_limit_disk_rootfs = self._config.limit_disk_rootfs
# Resolve effective log quota; _try_set_log_dir_quota will downgrade to None if XFS+prjquota is unavailable.
self._effective_limit_disk_log = self._config.limit_disk_log

if self._container_name is None:
self.set_container_name(self._get_container_name())
self._service_status.set_sandbox_id(self._container_name)
Expand Down Expand Up @@ -385,6 +460,7 @@ async def start(self):
log_file_path = f"{env_vars.ROCK_LOGGING_PATH}/{self.container_name}"
os.makedirs(log_file_path, exist_ok=True)
os.chmod(log_file_path, 0o777)
self._try_set_log_dir_quota(log_file_path)
volume_args.extend(["-v", f"{log_file_path}:{env_vars.ROCK_LOGGING_PATH}"])
env_arg = [
"-e",
Expand Down Expand Up @@ -421,6 +497,7 @@ async def start(self):
f"{self._service_status.get_mapped_port(Port.SSH)}:22",
*self._memory(),
*self._cpus(),
*self._storage_opts(),
*platform_arg,
*self._config.docker_args,
"--name",
Expand Down Expand Up @@ -554,6 +631,16 @@ def config(self) -> DockerDeploymentConfig:
"""Returns the config of the deployment."""
return self._config

@property
def effective_limit_disk_rootfs(self) -> str | None:
"""Returns the actual rootfs quota in effect after runtime capability checks (may differ from config.limit_disk_rootfs)."""
return self._effective_limit_disk_rootfs

@property
def effective_limit_disk_log(self) -> str | None:
"""Returns the actual log-dir quota in effect after runtime capability checks (may differ from config.limit_disk_log)."""
return self._effective_limit_disk_log

async def _check_stop(self):
logger.info(f"Start check container to stop: {self._container_name}")
try:
Expand Down
4 changes: 4 additions & 0 deletions rock/sandbox/sandbox_actor.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ async def start(self):
logger.error(f"[{self._config.container_name}] start deployment failed: {ex}", exc_info=True)
raise ex
if isinstance(self._deployment, DockerDeployment):
self._config.limit_disk_rootfs = self._deployment.effective_limit_disk_rootfs
self._config.limit_disk_log = self._deployment.effective_limit_disk_log
self._clean_container_background()
await self._setup_monitor()

Expand Down Expand Up @@ -274,5 +276,7 @@ async def sandbox_info(self) -> SandboxInfo:
"namespace": await self.namespace(),
"cpus": self._config.cpus,
"memory": self._config.memory,
"limit_disk_rootfs": self._config.limit_disk_rootfs,
"limit_disk_log": self._config.limit_disk_log,
}
return {}
10 changes: 10 additions & 0 deletions rock/sandbox/sandbox_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ async def get_status(self, sandbox_id) -> SandboxStatusResponse:
namespace=sandbox_info.get("namespace"),
cpus=sandbox_info.get("cpus"),
memory=sandbox_info.get("memory"),
limit_disk_rootfs=sandbox_info.get("limit_disk_rootfs"),
limit_disk_log=sandbox_info.get("limit_disk_log"),
)

async def build_sandbox_info_from_redis(self, sandbox_id: str, deployment_info: SandboxInfo) -> SandboxInfo | None:
Expand Down Expand Up @@ -376,3 +378,11 @@ def validate_sandbox_spec(self, runtime_config: RuntimeConfig, deployment_config
except ValueError as e:
logger.warning(f"Invalid memory size: {deployment_config.memory}", exc_info=e)
raise BadRequestRockError(f"Invalid memory size: {deployment_config.memory}")

# Validate limit_disk_rootfs format
if deployment_config.limit_disk_rootfs is not None:
try:
parse_size_to_bytes(deployment_config.limit_disk_rootfs)
except ValueError as e:
logger.warning(f"Invalid limit_disk_rootfs size: {deployment_config.limit_disk_rootfs}", exc_info=e)
raise BadRequestRockError(f"Invalid limit_disk_rootfs size: {deployment_config.limit_disk_rootfs}")
Loading
Loading