diff --git a/Cargo.lock b/Cargo.lock index 970f5c0e..68d63c67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -949,6 +949,7 @@ dependencies = [ "cfg-if", "cfg_aliases", "libc", + "memoffset", ] [[package]] @@ -1820,11 +1821,13 @@ dependencies = [ "uhyve-interface", "uuid", "virtio-bindings", + "virtio-queue", "vm-fdt", - "vm-memory", + "vm-memory 0.18.0", "vmm-sys-util", "x86_64", "xhypervisor", + "zerocopy", ] [[package]] @@ -1887,12 +1890,34 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "091f1f09cfbf2a78563b562e7a949465cce1aef63b6065645188d995162f8868" +[[package]] +name = "virtio-queue" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e358084f32ed165fddb41d98ff1b7ff3c08b9611d8d6114a1b422e2e85688baf" +dependencies = [ + "libc", + "log", + "virtio-bindings", + "vm-memory 0.17.2", + "vmm-sys-util", +] + [[package]] name = "vm-fdt" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e21282841a059bb62627ce8441c491f09603622cd5a21c43bfedc85a2952f23" +[[package]] +name = "vm-memory" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e201ed63fdaba7bdafd07176f2060ffa6949aa05559acbc3f5189824bb285428" +dependencies = [ + "vm-memory 0.18.0", +] + [[package]] name = "vm-memory" version = "0.18.0" diff --git a/Cargo.toml b/Cargo.toml index da41b2ad..560f3be1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,7 +50,7 @@ libc = "0.2" log = "0.4" mac_address = "1.1" merge = "0.2" -nix = { version = "0.31", features = ["mman", "pthread", "signal"] } +nix = { version = "0.31", features = ["mman", "pthread", "signal", "net", "ioctl", "poll"] } nohash = "0.2" rftrace = { version = "0.3", optional = true } rftrace-frontend = { version = "0.3", optional = true } @@ -62,18 +62,22 @@ tempfile = "3.26" thiserror = "2.0.18" time = "0.3" toml = "1" -tun-tap = { version = "0.1.3", default-features = false } 
uhyve-interface = { version = "0.2.0", path = "uhyve-interface", features = ["std"] } virtio-bindings = "~0.2.7" vm-fdt = "0.3" vm-memory = { version = "0.18", features = ["backend-mmap"] } uuid = { version = "1.22.0", features = ["fast-rng", "v4"]} tar-no-std = { version = "0.4", features = ["alloc"] } +bitflags = "2.11" +virtio-queue = "0.17" +zerocopy = { version = "0.8", features = ["derive"] } [target.'cfg(target_os = "linux")'.dependencies] kvm-bindings = "0.14" kvm-ioctls = "0.24" landlock = "0.4.4" +mac_address = "1.1" +tun-tap = { version = "0.1", default-features = false } vmm-sys-util = "0.15" [target.'cfg(target_os = "macos")'.dependencies] @@ -88,7 +92,6 @@ memory_addresses = { version = "0.3", default-features = false, features = [ ] } [target.'cfg(target_arch = "aarch64")'.dependencies] -bitflags = "2.11" memory_addresses = { version = "0.3", default-features = false, features = [ "aarch64", ] } diff --git a/README.md b/README.md index 4c35b2c6..d41500ed 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,30 @@ For more options, the default values, and the corresponding environment variable uhyve --help ``` +### Networking + +**Network support is currently unstable and tested only on Linux.** + +If you require uhyve to create its own virtual ethernet interface, you will need to provide it with the `CAP_NET_ADMIN` capability: + +``` +# as root +setcap cap_net_admin+ep /path/to/uhyve # ./target/debug/uhyve +``` + +~~You can set the pre-created tap device name via an environment variable `TAP`~~ + +Currently, the device is hard-coded with the name `tap10`. You will need to create the device and connect it to a bridge (such as virbr0): + +``` +ip tuntap add tap10 mode tap user "$(whoami)" +ip link set tap10 master virbr0 +ip link set dev tap10 up +``` + +And, if desired, set the IP address and gateway of your RustyHermit instance via `HERMIT_IP` and `HERMIT_GATEWAY`. 
+ + ### Contributing If you are interested in contributing to Uhyve, make sure to check out the [Uhyve wiki][uhyve-wiki]! diff --git a/benches/benches/mod.rs b/benches/benches/mod.rs index 6e3cbbb2..f4b93b1d 100644 --- a/benches/benches/mod.rs +++ b/benches/benches/mod.rs @@ -1,2 +1,3 @@ pub mod complete_binary; +pub mod network; pub mod vm; diff --git a/benches/benches/network.rs b/benches/benches/network.rs new file mode 100644 index 00000000..38f21cfb --- /dev/null +++ b/benches/benches/network.rs @@ -0,0 +1,244 @@ +use std::{ + io::{Read, Write}, + net::{Shutdown, TcpListener, TcpStream}, + path::PathBuf, + thread, + time::Instant, +}; + +use byte_unit::{Byte, Unit}; +use criterion::{Criterion, criterion_group, measurement::Measurement}; +use log::debug; +use regex::Regex; +#[cfg(target_os = "linux")] +use uhyvelib::params::FileSandboxMode; +use uhyvelib::{ + params::{NetworkMode, Output, Params}, + vm::UhyveVm, +}; + +use crate::common::{BuildMode, HERMIT_GATEWAY, HERMIT_IP, build_hermit_bin, check_result}; + +const TOTAL_BYTES: u64 = 1024 * 1024 * 1024; + +/// Custom struct for throughput measurements in criterion. 
Must be used in connection with `iter_custom` +pub struct ThroughputMeasurement; + +impl Measurement for ThroughputMeasurement { + type Intermediate = (); + type Value = u64; + + fn start(&self) -> Self::Intermediate {} + + fn end(&self, _i: Self::Intermediate) -> Self::Value { + unreachable!("This measurement uses iter_custom") + } + + fn add(&self, v1: &Self::Value, v2: &Self::Value) -> Self::Value { + *v1 + *v2 + } + + fn zero(&self) -> Self::Value { + 0 + } + + fn to_f64(&self, value: &Self::Value) -> f64 { + *value as f64 + } + + fn formatter(&self) -> &dyn criterion::measurement::ValueFormatter { + &ThroughputFormatter + } +} + +struct ThroughputFormatter; + +impl criterion::measurement::ValueFormatter for ThroughputFormatter { + fn scale_values(&self, typical_value: f64, values: &mut [f64]) -> &'static str { + let (factor, unitstr) = match typical_value { + 0.0..1000.0 => (1.0, "bits/s"), + 1000.0..1000000.0 => (1000.0, "Kbits/s"), + 1000000.0..1000000000.0 => (1000000.0, "Mbits/s"), + 1000000000.0.. 
=> (1000000000.0, "Gbits/s"), + _ => unreachable!("Negative Throughput???"), + }; + values.iter_mut().for_each(|v| *v /= factor); + unitstr + } + + fn scale_throughputs( + &self, + _typical_value: f64, + _throughput: &criterion::Throughput, + _throughputs: &mut [f64], + ) -> &'static str { + "bits/s" + } + + fn scale_for_machines(&self, _values: &mut [f64]) -> &'static str { + "bits/s" + } +} + +fn network_receive_bench(kernel_path: PathBuf) -> u64 { + let params = Params { + cpu_count: 1.try_into().unwrap(), + memory_size: Byte::from_u64_with_unit(64, Unit::MiB) + .unwrap() + .try_into() + .unwrap(), + output: Output::Buffer, + stats: true, + aslr: false, + #[cfg(target_os = "linux")] + file_isolation: FileSandboxMode::None, + network: Some(NetworkMode::Tap { + name: "tap10".to_string(), + }), + kernel_args: vec![ + "--".to_owned(), + "testname=receive_bench".to_owned(), + "test_argument=".to_owned(), + ], + ..Default::default() + }; + + let t = thread::spawn(move || { + let mut hermit_ip = String::from(HERMIT_IP); + hermit_ip.push_str(":9975"); + let mut stream = TcpStream::connect(hermit_ip).unwrap(); + + let buf = vec![123u8; 64 * 1024]; // Bytes without meaning + let mut sent: u64 = 0; + + let start = Instant::now(); + + while sent < TOTAL_BYTES { + let remaining = (TOTAL_BYTES - sent) as usize; + let to_send = remaining.min(buf.len()); + stream.write_all(&buf[..to_send]).unwrap(); + sent += to_send as u64; + } + + stream.shutdown(Shutdown::Write).unwrap(); + let elapsed = start.elapsed(); + let secs = elapsed.as_secs_f64(); + + debug!("Sent {sent} bytes in {secs:.3} s"); + let mbit = (sent as f64 * 8.0) / (secs * 1_000_000.0); + debug!("Throughput (sending): {mbit:.2} Mbit/s"); + }); + + let res = UhyveVm::new(kernel_path.clone(), params).unwrap().run(None); + + check_result(&res); + + let re = + Regex::new(r"(?m)^Throughput \(receiving\):\s*([0-9]+(?:\.[0-9]+)?)\s+Mbit/s").unwrap(); + + let caps = re.captures(res.output.as_ref().unwrap()).unwrap(); + let 
throughput: f64 = caps[1].parse().expect("invalid number"); + + t.join().unwrap(); + (throughput * 1000000.0) as u64 +} + +fn network_send_bench(kernel_path: PathBuf) -> u64 { + let params = Params { + cpu_count: 1.try_into().unwrap(), + memory_size: Byte::from_u64_with_unit(64, Unit::MiB) + .unwrap() + .try_into() + .unwrap(), + output: Output::Buffer, + stats: true, + aslr: false, + #[cfg(target_os = "linux")] + file_isolation: FileSandboxMode::None, + network: Some(NetworkMode::Tap { + name: "tap10".to_string(), + }), + kernel_args: vec![ + "--".to_owned(), + "testname=send_bench".to_owned(), + format!("test_argument={HERMIT_GATEWAY}:9975/{TOTAL_BYTES}").to_owned(), + ], + ..Default::default() + }; + + let t = thread::spawn(move || { + let listener = TcpListener::bind(HERMIT_GATEWAY.to_string() + ":9975").unwrap(); + debug!("socket bound"); + let (mut stream, peer) = listener.accept().unwrap(); + debug!("Got connection from {}", peer); + + stream.set_nodelay(true).unwrap(); + + let mut buf = vec![0u8; 8192]; + let mut received: u64 = 0; + + let start = Instant::now(); + loop { + let n = stream.read(&mut buf).unwrap(); + if n == 0 { + // connection terminated + break; + } + received += n as u64; + } + + let elapsed = start.elapsed(); + let secs = elapsed.as_secs_f64(); + + debug!("Received {received} bytes in {secs:.3} s"); + let mbit = (received as f64 * 8.0) / (secs * 1_000_000.0); + debug!("Throughput (receiving): {mbit:.2} Mbit/s"); + }); + + let res = UhyveVm::new(kernel_path.clone(), params).unwrap().run(None); + + check_result(&res); + + let re = Regex::new(r"(?m)^Throughput \(sending\):\s*([0-9]+(?:\.[0-9]+)?)\s+Mbit/s").unwrap(); + + let caps = re.captures(res.output.as_ref().unwrap()).unwrap(); + let throughput: f64 = caps[1].parse().expect("invalid number"); + + t.join().unwrap(); + (throughput * 1000000.0) as u64 +} + +pub fn network_receive_throughput(c: &mut Criterion) { + env_logger::try_init().ok(); + let kernel_path = 
build_hermit_bin("network_test", BuildMode::Release); + c.bench_function("network_receive_throughput", |b| { + b.iter_custom(|iters| { + let mut total: u64 = 0; + for _ in 0..iters { + total += network_receive_bench(kernel_path.clone()); + } + total / iters + }); + }); +} + +pub fn network_send_throughput(c: &mut Criterion) { + env_logger::try_init().ok(); + let kernel_path = build_hermit_bin("network_test", BuildMode::Release); + + c.bench_function("network_send_throughput", |b| { + b.iter_custom(|iters| { + let mut total: u64 = 0; + for _ in 0..iters { + total += network_send_bench(kernel_path.clone()); + } + total / iters + }); + }); +} + +criterion_group!( + name = network_benchmark_group; + config = Criterion::default().with_measurement(ThroughputMeasurement).sample_size(10); + targets = network_receive_throughput, network_send_throughput +); diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs index 88832af4..5e76bc42 100644 --- a/benches/benchmarks.rs +++ b/benches/benchmarks.rs @@ -4,7 +4,18 @@ use criterion::criterion_main; pub mod benches; -use benches::{complete_binary::run_complete_binaries_group, vm::load_kernel_benchmark_group}; +use benches::{ + complete_binary::run_complete_binaries_group, network::network_benchmark_group, + vm::load_kernel_benchmark_group, +}; + +#[path = "../tests/common.rs"] +pub(crate) mod common; +pub use common::build_hermit_bin; // Add the benchmark groups that should be run -criterion_main!(load_kernel_benchmark_group, run_complete_binaries_group); +criterion_main!( + load_kernel_benchmark_group, + run_complete_binaries_group, + network_benchmark_group +); diff --git a/src/bin/uhyve.rs b/src/bin/uhyve.rs index 5f3171de..6836c881 100644 --- a/src/bin/uhyve.rs +++ b/src/bin/uhyve.rs @@ -12,7 +12,7 @@ use thiserror::Error; #[cfg(target_os = "linux")] use uhyvelib::params::FileSandboxMode; use uhyvelib::{ - params::{CpuCount, EnvVars, GuestMemorySize, Output, Params}, + params::{CpuCount, EnvVars, GuestMemorySize, 
NetworkMode, Output, Params}, vm::UhyveVm, }; @@ -191,6 +191,13 @@ struct UhyveArgs { #[merge(skip)] #[cfg(feature = "instrument")] pub trace: Option, + /// Network configuration. Specify network mode and device as colon separated string. + /// + /// Example: --net=tap:tap10 + #[serde(skip)] + #[merge(strategy = merge::option::overwrite_none)] + #[clap(short, long)] + net: Option, } /// Arguments for memory resources allocated to the guest (both guest and host). @@ -439,6 +446,7 @@ impl From for Params { config: _, #[cfg(feature = "instrument")] trace, + net, }, memory: MemoryArgs { @@ -501,6 +509,7 @@ impl From for Params { env: EnvVars::try_from(env_vars.as_slice()).unwrap(), #[cfg(feature = "instrument")] trace, + network: net.map(|net| NetworkMode::try_from(net).unwrap()), } } } @@ -697,6 +706,7 @@ mod tests { config: Some(PathBuf::from("config.txt")), #[cfg(feature = "instrument")] trace: Some(PathBuf::from(".")), + net: Some(String::from("tap10")), }, memory: MemoryArgs { memory_size: None, @@ -764,6 +774,7 @@ mod tests { config: Some(PathBuf::from("config.txt")), #[cfg(feature = "instrument")] trace: Some(PathBuf::from(".")), + net: Some(String::from("tap10")), }, memory: MemoryArgs { memory_size: Some(GuestMemorySize::from_str("16MiB").unwrap()), diff --git a/src/consts.rs b/src/consts.rs index a5a9a13f..f7904af6 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -24,10 +24,12 @@ pub const EFER_LMA: u64 = 1 << 10; /* Long mode active (read-only) */ pub const EFER_NXE: u64 = 1 << 11; /* PTE No-Execute bit enable */ pub const KERNEL_STACK_SIZE: u64 = 0x8000; pub const UHYVE_NET_MTU: usize = 1500; -pub const UHYVE_IRQ_NET: u32 = 11; +pub const UHYVE_NET_READ_TIMEOUT: u16 = 250; // maximum blocking time for a network read pub const GICD_BASE_ADDRESS: u64 = 0x800_0000; pub const GICD_SIZE: usize = 0x10000; pub const GICR_BASE_ADDRESS: u64 = 0x80A_0000; pub const GICR_SIZE: usize = 0xf60000; pub const MSI_BASE_ADDRESS: u64 = 0x808_0000; pub const MSI_SIZE: usize = 
0x20000; + +pub const GUEST_PAGE_SIZE: u64 = 0x200000; /* 2 MB pages in guest */ diff --git a/src/hypercall.rs b/src/hypercall.rs index effa46e6..b426b98d 100644 --- a/src/hypercall.rs +++ b/src/hypercall.rs @@ -21,7 +21,7 @@ use crate::{ params::EnvVars, vcpu::VcpuStopReason, virt_to_phys, - vm::{KernelInfo, VmPeripherals}, + vm::{KernelInfo, VmPeripherals, internal::VirtualizationBackendInternal}, }; /// `addr` is the address of the hypercall parameter in the guest's memory space. `data` is the @@ -107,8 +107,8 @@ pub unsafe fn address_to_hypercall_v2( /// /// When a hypercall returns an error, or the hypercall is invalid, this function might panic /// (particularly on failing write calls, due to historical legacy). -pub fn handle_hypercall_v2( - peripherals: &VmPeripherals, +pub fn handle_hypercall_v2( + peripherals: &VmPeripherals, hypercall: v2::Hypercall<'_>, ) -> Option { let file_mapping = || peripherals.file_mapping.lock().unwrap(); @@ -160,8 +160,8 @@ pub fn handle_hypercall_v2( /// /// When a hypercall returns an error, or the hypercall is invalid, this function might panic /// (particularly on failing write calls). -pub fn handle_hypercall_v1( - peripherals: &VmPeripherals, +pub fn handle_hypercall_v1( + peripherals: &VmPeripherals, kernel_info: &KernelInfo, root_pt: impl FnOnce() -> HypervisorResult, hypercall: v1::Hypercall<'_>, @@ -497,8 +497,8 @@ fn read(mem: &MmapMemory, sysread: &mut v2::parameters::ReadParams, file_map: &m /// Handles a v1 write hypercall (for which a guest-provided guest virtual address must be /// converted to a guest physical address by the host). -fn write_v1( - peripherals: &VmPeripherals, +fn write_v1( + peripherals: &VmPeripherals, syswrite: &v1::parameters::WriteParams, root_pt: GuestPhysAddr, file_map: &mut UhyveFileMap, @@ -519,8 +519,8 @@ fn write_v1( } /// Handles an write syscall on the host. 
-fn write( - peripherals: &VmPeripherals, +fn write( + peripherals: &VmPeripherals, syswrite: &mut v2::parameters::WriteParams, file_map: &mut UhyveFileMap, ) -> io::Result<()> { diff --git a/src/isolation/landlock.rs b/src/isolation/landlock.rs index 721a8be2..aac9bb7b 100644 --- a/src/isolation/landlock.rs +++ b/src/isolation/landlock.rs @@ -294,7 +294,11 @@ where PathBuf::from("/proc/stat"), ]; - let mut uhyve_rw_paths: Vec = vec![PathBuf::from("/dev/kvm")]; + let mut uhyve_rw_paths: Vec = vec![ + PathBuf::from("/dev/kvm"), + #[cfg(target_os = "linux")] + PathBuf::from("/dev/net/tun"), + ]; #[cfg(feature = "instrument")] if let Some(trace) = trace { uhyve_ro_paths.push(PathBuf::from("/proc/self/maps")); diff --git a/src/lib.rs b/src/lib.rs index 4df9a971..0b67b2e9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,7 @@ #![warn(rust_2018_idioms)] +#![allow(unused_macros)] +#![allow(clippy::missing_safety_doc)] +#![allow(clippy::useless_conversion)] use std::path::PathBuf; @@ -27,8 +30,6 @@ mod parking; mod serial; pub mod stats; mod vcpu; -mod virtio; -mod virtqueue; pub mod vm; pub use arch::*; @@ -69,6 +70,13 @@ pub enum HypervisorError { #[error("Kernel Loading Error: {0}")] LoadedKernelError(#[from] vm::LoadKernelError), + + #[error("Kernel doesn't support the necessary features: {0}")] + FeatureMismatch(&'static str), } pub type HypervisorResult = Result; + +pub mod net; +mod pci; +mod virtio; diff --git a/src/linux/x86_64/kvm_cpu.rs b/src/linux/x86_64/kvm_cpu.rs index b9cc8aeb..ed6aefc4 100644 --- a/src/linux/x86_64/kvm_cpu.rs +++ b/src/linux/x86_64/kvm_cpu.rs @@ -3,18 +3,19 @@ use std::{io, num::NonZero, ops::Add, sync::Arc}; use kvm_bindings::*; use kvm_ioctls::{VcpuExit, VcpuFd, VmFd}; use uhyve_interface::GuestPhysAddr; -use vmm_sys_util::eventfd::EventFd; use x86_64::registers::control::{Cr0Flags, Cr4Flags}; use crate::{ HypervisorResult, consts::*, hypercall, - linux::KVM, - params::Params, + linux::{KVM, x86_64::virtio_device::KvmVirtioNetDevice}, + 
mem::MmapMemory, + params::{NetworkMode, Params}, + pci::{IOBASE_U64, IOEND_U64, PciConfigurationAddress, PciDevice}, stats::{CpuStats, VmExit}, vcpu::{VcpuStopReason, VirtualCPU}, - virtio::*, + virtio::net::VirtioNetPciDevice, vm::{ KernelInfo, VirtualizationBackend, VmPeripherals, internal::VirtualizationBackendInternal, }, @@ -36,12 +37,13 @@ const KVM_32BIT_GAP_SIZE: usize = 1024 << 20; pub(crate) const KVM_32BIT_GAP_START: usize = KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE; pub struct KvmVm { - vm_fd: VmFd, - peripherals: Arc, + pub(crate) vm_fd: VmFd, + peripherals: Arc>, } impl VirtualizationBackendInternal for KvmVm { type VCPU = KvmCpu; + type VirtioNetImpl = KvmVirtioNetDevice; const NAME: &str = "KvmVm"; fn new_cpu( @@ -68,7 +70,7 @@ impl VirtualizationBackendInternal for KvmVm { Ok(kvcpu) } - fn new(peripherals: Arc, params: &Params) -> HypervisorResult { + fn new(peripherals: Arc>, params: &Params) -> HypervisorResult { let vm = KVM.create_vm().unwrap(); // Double-check that neither the (first) guest address nor the end of the guest memory @@ -151,14 +153,19 @@ impl VirtualizationBackendInternal for KvmVm { } } - let evtfd = EventFd::new(0).unwrap(); - vm.register_irqfd(&evtfd, UHYVE_IRQ_NET)?; + if let Some(virtiodevice) = &peripherals.virtio_device { + virtiodevice.lock().unwrap().setup(&vm); + } Ok(Self { vm_fd: vm, peripherals, }) } + + fn virtio_net_device(mode: NetworkMode, memory: Arc) -> Self::VirtioNetImpl { + KvmVirtioNetDevice::new(VirtioNetPciDevice::new(mode, memory)) + } } impl VirtualizationBackend for KvmVm { @@ -168,7 +175,7 @@ impl VirtualizationBackend for KvmVm { pub struct KvmCpu { id: u32, vcpu: VcpuFd, - peripherals: Arc, + peripherals: Arc>, // TODO: Remove once the getenv/getargs hypercalls are removed kernel_info: Arc, pci_addr: Option, @@ -391,7 +398,6 @@ impl VirtualCPU for KvmCpu { fn r#continue(&mut self) -> HypervisorResult { loop { - let virtio_device = || self.peripherals.virtio_device.lock().unwrap(); 
self.vcpu.set_sync_valid_reg(kvm_ioctls::SyncReg::Register); match self.vcpu.run() { Ok(vcpu_stop_reason) => match vcpu_stop_reason { @@ -412,29 +418,17 @@ impl VirtualCPU for KvmCpu { if let Some(pci_addr) = self.pci_addr && pci_addr & 0x1ff800 == 0 { - virtio_device().handle_read(pci_addr & 0x3ff, addr); + if let Some(virtio_device) = &self.peripherals.virtio_device { + virtio_device.lock().unwrap().virtio.handle_read( + PciConfigurationAddress(pci_addr & 0x3ff), + addr, + ); + } } else { unsafe { *(addr.as_ptr() as *mut u32) = 0xffffffff }; } } PCI_CONFIG_ADDRESS_PORT => {} - VIRTIO_PCI_STATUS => { - virtio_device().read_status(addr); - } - VIRTIO_PCI_HOST_FEATURES => { - virtio_device().read_host_features(addr); - } - VIRTIO_PCI_GUEST_FEATURES => { - virtio_device().read_requested_features(addr); - } - VIRTIO_PCI_CONFIG_OFF_MSIX_OFF..=VIRTIO_PCI_CONFIG_OFF_MSIX_OFF_MAX => { - virtio_device() - .read_mac_byte(addr, port - VIRTIO_PCI_CONFIG_OFF_MSIX_OFF); - } - VIRTIO_PCI_ISR => virtio_device().reset_interrupt(), - VIRTIO_PCI_LINK_STATUS_MSIX_OFF => { - virtio_device().read_link_status(addr); - } port => { warn!("guest read from unknown I/O port {port:#x}"); } @@ -493,32 +487,20 @@ impl VirtualCPU for KvmCpu { s.increment_val(VmExit::PCIWrite) } match port { - //TODO: + // Legacy PCI addressing method PCI_CONFIG_DATA_PORT => { if let Some(pci_addr) = self.pci_addr - && pci_addr & 0x1ff800 == 0 + && pci_addr & 0x1ff800 == 0 && let Some(virtio_device) = + &self.peripherals.virtio_device { - virtio_device().handle_write(pci_addr & 0x3ff, &addr); + virtio_device.lock().unwrap().virtio.handle_write( + PciConfigurationAddress(pci_addr & 0x3ff), + &addr, + ); } } PCI_CONFIG_ADDRESS_PORT => { - self.pci_addr = Some(unsafe { *(addr.as_ptr() as *mut u32) }); - } - VIRTIO_PCI_STATUS => { - virtio_device().write_status(&addr); - } - VIRTIO_PCI_GUEST_FEATURES => { - virtio_device().write_requested_features(&addr); - } - VIRTIO_PCI_QUEUE_NOTIFY => { - virtio_device() - 
.handle_notify_output(&addr, &self.peripherals.mem); - } - VIRTIO_PCI_QUEUE_SEL => { - virtio_device().write_selected_queue(&addr); - } - VIRTIO_PCI_QUEUE_PFN => { - virtio_device().write_pfn(&addr, &self.peripherals.mem); + self.pci_addr = Some(unsafe { *(addr.as_ptr() as *const u32) }); } port => { warn!("guest wrote to unknown I/O port {port:#x}"); @@ -526,19 +508,28 @@ impl VirtualCPU for KvmCpu { } }; } - VcpuExit::MmioRead(addr, _targ) => { + VcpuExit::MmioRead(addr, data) => { match addr { 0x9_F000..0xA_0000 | 0xF_0000..0x10_0000 => {} // Search for MP floating table + IOBASE_U64..IOEND_U64 => { + if let Some(virtio_device) = &self.peripherals.virtio_device { + virtio_device + .lock() + .unwrap() + .virtio + .handle_read(PciConfigurationAddress(addr as u32), data) + } + } _ => { + let l = data.len(); self.print_registers(); - panic!("mmio read to {addr:#x}"); + panic!( + "undefined mmio read of {l} bytes to {addr:#x?} (ConfigAddress {:x?})", + PciConfigurationAddress::from_guest_address(addr.into()) + ); } } } - VcpuExit::MmioWrite(addr, _targ) => { - self.print_registers(); - panic!("undefined mmio write to {addr:#x}"); - } VcpuExit::Debug(debug) => { if let Some(s) = self.stats.as_mut() { s.increment_val(VmExit::Debug) @@ -566,6 +557,22 @@ impl VirtualCPU for KvmCpu { let err = io::Error::other(format!("{debug:?}")); return Err(err.into()); } + VcpuExit::MmioWrite(addr, data) => match addr { + IOBASE_U64..IOEND_U64 => { + if let Some(virtio_device) = &self.peripherals.virtio_device { + virtio_device + .lock() + .unwrap() + .virtio + .handle_write(PciConfigurationAddress(addr as u32), data) + } + } + _ => { + let l = data.len(); + self.print_registers(); + panic!("undefined mmio write of {l} bytes to {addr:#x?}"); + } + }, vcpu_exit => { let err = io::Error::other(format!("not implemented: {vcpu_exit:?}")); return Err(err.into()); diff --git a/src/linux/x86_64/mod.rs b/src/linux/x86_64/mod.rs index 0452b284..03ac66c7 100644 --- a/src/linux/x86_64/mod.rs +++ 
b/src/linux/x86_64/mod.rs @@ -1 +1,2 @@ pub mod kvm_cpu; +pub(crate) mod virtio_device; diff --git a/src/linux/x86_64/virtio_device.rs b/src/linux/x86_64/virtio_device.rs new file mode 100644 index 00000000..35b98af4 --- /dev/null +++ b/src/linux/x86_64/virtio_device.rs @@ -0,0 +1,145 @@ +use std::{ + io, + os::fd::{AsRawFd, BorrowedFd}, +}; + +use kvm_bindings::{ + KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQCHIP_IOAPIC, KvmIrqRouting, kvm_irq_routing_entry, kvm_irqchip, +}; +use kvm_ioctls::{IoEventAddress, NoDatamatch, VmFd}; +use libc::EFD_NONBLOCK; +use uhyve_interface::GuestPhysAddr; +use vmm_sys_util::eventfd::EventFd; + +use crate::{ + pci::PciConfigurationAddress, + virtio::{ + DeviceStatus, + net::{VirtQueueInterrupter, VirtQueueNotificationWaiter, VirtioNetPciDevice}, + pci::HeaderConf, + }, +}; + +const UHYVE_IRQ_NET_PIC_PIN: u32 = 11; // Network is connected to PIC pin 11 + +/// Thin Wrapper around `EventFd` to implement `VirtQueueNotificationWaiter` +struct EventFdNotifier(EventFd); +impl VirtQueueNotificationWaiter for EventFdNotifier { + fn wait_for_notify(&self) { + self.0.read().unwrap(); + } + + fn wait_with_timeout(&self, timeout: u16) -> bool { + match wait_eventfd_with_timeout(&self.0, timeout) { + Ok(()) => { + self.wait_for_notify(); + true + } + Err(e) => { + if e.kind() == io::ErrorKind::TimedOut { + return false; + } + panic!("Could not read eventfd. Is the file nonblocking?"); + } + } + } +} + +/// Thin Wrapper around `EventFd` to implement `VirtQueueInterrupter` +struct EventFdInterrupter(EventFd); +impl VirtQueueInterrupter for EventFdInterrupter { + fn send_interrupt(&self) { + self.0.write(1).unwrap(); + } +} + +/// Wrapper around `VirtioNetPciDevice` containing the architecture specific functionality. 
+#[derive(Debug)] +pub struct KvmVirtioNetDevice { + pub virtio: VirtioNetPciDevice, +} +impl KvmVirtioNetDevice { + pub const fn new(virtio: VirtioNetPciDevice) -> Self { + Self { virtio } + } + + /// Write the capabilities to the config_space and register eventFDs to the VM + pub fn setup(&mut self, vm: &VmFd) { + self.virtio.header_caps.pci_config_hdr.status = + DeviceStatus::DEVICE_NEEDS_RESET | DeviceStatus::PCI_CAPABILITIES_LIST_ENABLE; + + let irqfd = initialize_interrupt(vm); + // Inform the kernel on which PIC pin the PCI interrupt will appear. + self.virtio.header_caps.pci_config_hdr.interrupt_line = UHYVE_IRQ_NET_PIC_PIN as u8; + + let notify_evtfd_rx = initialize_mmio_notify( + PciConfigurationAddress::new(HeaderConf::NOTIFY_0 as u32).guest_address(), + vm, + ); + let notify_evtfd_tx = initialize_mmio_notify( + PciConfigurationAddress::new(HeaderConf::NOTIFY_1 as u32).guest_address(), + vm, + ); + + self.virtio.update_config_generation(); + + self.virtio.start_network_threads( + EventFdNotifier(notify_evtfd_tx), + EventFdNotifier(notify_evtfd_rx), + EventFdInterrupter(irqfd), + ); + } +} + +fn initialize_interrupt(vm: &VmFd) -> EventFd { + let mut irqchip = kvm_irqchip { + chip_id: KVM_IRQCHIP_IOAPIC, + ..Default::default() + }; + vm.get_irqchip(&mut irqchip).unwrap(); + + let gsi: u32 = 123; // Number doesn't matter, just needs to be the same for kvm route and irqfd + + let mut kvm_route = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + kvm_route.u.irqchip.irqchip = irqchip.chip_id; + kvm_route.u.irqchip.pin = UHYVE_IRQ_NET_PIC_PIN; + + let mut irq_routing = KvmIrqRouting::new(0).unwrap(); + irq_routing.push(kvm_route).unwrap(); + vm.set_gsi_routing(&irq_routing).unwrap(); + + let eventfd = EventFd::new(EFD_NONBLOCK).unwrap(); + + vm.register_irqfd(&eventfd, gsi).unwrap(); + eventfd +} + +fn initialize_mmio_notify(addr: GuestPhysAddr, vm: &VmFd) -> EventFd { + let notifyfd = EventFd::new(0).unwrap(); + 
vm.register_ioevent(&notifyfd, &IoEventAddress::Mmio(addr.as_u64()), NoDatamatch) + .unwrap(); + notifyfd +} + +/// Waits for any activity on `fd`. Returns `Ok(())` once `fd` is readable, an `io::ErrorKind::TimedOut` error if the timeout expires first, and propagates any other poll error. +pub(crate) fn wait_eventfd_with_timeout(fd: &EventFd, timeout: u16) -> io::Result<()> { + use nix::poll::{PollFd, PollFlags, PollTimeout, poll}; + let mut pollfds = [PollFd::new( + // Safety: no ownership is leaked + unsafe { BorrowedFd::borrow_raw(fd.as_raw_fd()) }, + PollFlags::POLLIN, + )]; + match poll::(&mut pollfds, timeout.into())? { + -1 => Err(io::Error::last_os_error()), + 0 => Err(io::Error::new( + io::ErrorKind::TimedOut, + "eventfd wait timed out", + )), + 1 => Ok(()), + i => unreachable!("Poll returned {i}"), + } +} diff --git a/src/macos/aarch64/vcpu.rs b/src/macos/aarch64/vcpu.rs index c4c235c0..fefd5f73 100644 --- a/src/macos/aarch64/vcpu.rs +++ b/src/macos/aarch64/vcpu.rs @@ -17,7 +17,8 @@ use crate::{ BOOT_INFO_OFFSET, GICD_BASE_ADDRESS, GICR_BASE_ADDRESS, MSI_BASE_ADDRESS, PGT_OFFSET, }, hypercall, - params::Params, + mem::MmapMemory, + params::{NetworkMode, Params}, stats::CpuStats, vcpu::{VcpuStopReason, VirtualCPU}, vm::{ @@ -26,7 +27,7 @@ use crate::{ }; pub struct XhyveVm { - peripherals: Arc, + peripherals: Arc>, #[expect( dead_code, reason = "Gic should be created and stored throughout the struct's lifetime, not used actively" @@ -35,6 +36,7 @@ pub struct XhyveVm { } impl VirtualizationBackendInternal for XhyveVm { type VCPU = XhyveCpu; + type VirtioNetImpl = (); const NAME: &str = "XhyveVm"; fn new_cpu( @@ -56,7 +58,7 @@ impl VirtualizationBackendInternal for XhyveVm { }) } - fn new(peripherals: Arc, _params: &Params) -> HypervisorResult { + fn new(peripherals: Arc>, _params: &Params) -> HypervisorResult { trace!("Create VM..."); create_vm()?; @@ -75,6 +77,10 @@ impl VirtualizationBackendInternal for XhyveVm { Ok(Self { peripherals, gic }) } + + fn virtio_net_device(_mode: NetworkMode, _memory: Arc) -> Self::VirtioNetImpl { + 
unimplemented!(); + } } impl VirtualizationBackend for XhyveVm { @@ -84,7 +90,7 @@ impl VirtualizationBackend for XhyveVm { pub struct XhyveCpu { id: u32, vcpu: Option, - peripherals: Arc, + peripherals: Arc>, // TODO: Remove once the getenv/getargs hypercalls are removed kernel_info: Arc, stats: Option, diff --git a/src/mem.rs b/src/mem.rs index 6575ba7d..644ecf86 100644 --- a/src/mem.rs +++ b/src/mem.rs @@ -7,8 +7,8 @@ use nix::sys::mman::{MmapAdvise, madvise}; use thiserror::Error; use uhyve_interface::GuestPhysAddr; use vm_memory::{ - Address, GuestAddress, GuestMemoryRegion, GuestRegionMmap, MemoryRegionAddress, - mmap::MmapRegionBuilder, + Address, GuestAddress, GuestMemoryBackend, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, + MemoryRegionAddress, mmap::MmapRegionBuilder, }; #[derive(Error, Debug)] @@ -21,7 +21,7 @@ pub enum MemoryError { /// Uses `GuestMemoryMmap` under the hood. #[derive(Debug)] pub(crate) struct MmapMemory { - mem: GuestRegionMmap, + pub(crate) mem: GuestMemoryMmap, } impl MmapMemory { pub fn new( @@ -76,26 +76,34 @@ impl MmapMemory { } Self { - mem: GuestRegionMmap::<()>::new(mm_region, GuestAddress(guest_address.as_u64())) - .unwrap(), + mem: GuestMemoryMmap::from_regions(vec![ + GuestRegionMmap::<()>::new(mm_region, GuestAddress(guest_address.as_u64())) + .unwrap(), + ]) + .unwrap(), } } + /// Helper function to access the only Mmap region in our struct + fn region_mmap(&self) -> &GuestRegionMmap { + self.mem.iter().next().unwrap() + } + /// Returns the size of the memory in bytes pub fn size(&self) -> usize { - self.mem.size() + self.region_mmap().size() } /// Returns the first valid physical address from the gutest perspective. pub fn guest_addr(&self) -> GuestPhysAddr { - GuestPhysAddr::new(self.mem.start_addr().0) + GuestPhysAddr::new(self.mem.iter().next().unwrap().start_addr().0) } /// Returns a pointer to the beginning of the memory on the host. 
pub fn host_start(&self) -> *mut u8 { - let start_addr = self.mem.start_addr(); - let region_addr = self.mem.to_region_addr(start_addr).unwrap(); - self.mem.get_host_address(region_addr).unwrap() + let start_addr = self.region_mmap().start_addr(); + let region_addr = self.region_mmap().to_region_addr(start_addr).unwrap(); + self.region_mmap().get_host_address(region_addr).unwrap() } /// # Safety @@ -123,15 +131,15 @@ impl MmapMemory { ) -> Result { Ok(MemoryRegionAddress( addr.as_u64() - .checked_sub(self.mem.start_addr().0) + .checked_sub(self.mem.iter().next().unwrap().start_addr().0) .ok_or(MemoryError::BoundsViolation)?, )) } /// Checks if the range described by `addr` + `len` is part of this memory region fn check_range(&self, addr: MemoryRegionAddress, len: usize) -> Result { - Ok(self.mem.address_in_range(addr) - && self.mem.address_in_range( + Ok(self.region_mmap().address_in_range(addr) + && self.region_mmap().address_in_range( addr.checked_add(if len > 0 { len as u64 - 1 } else { 0 }) .ok_or(MemoryError::BoundsViolation)?, )) @@ -148,7 +156,10 @@ impl MmapMemory { let guest_addr = self.addr_to_mem_region_addr(addr)?; if self.check_range(guest_addr, len)? { Ok(unsafe { - std::slice::from_raw_parts_mut(self.mem.get_host_address(guest_addr).unwrap(), len) + std::slice::from_raw_parts_mut( + self.region_mmap().get_host_address(guest_addr).unwrap(), + len, + ) }) } else { Err(MemoryError::BoundsViolation) @@ -171,7 +182,10 @@ impl MmapMemory { let guest_addr = self.addr_to_mem_region_addr(addr)?; if self.check_range(guest_addr, len)? { Ok(unsafe { - std::slice::from_raw_parts_mut(self.mem.get_host_address(guest_addr).unwrap(), len) + std::slice::from_raw_parts_mut( + self.region_mmap().get_host_address(guest_addr).unwrap(), + len, + ) }) } else { Err(MemoryError::BoundsViolation) @@ -182,9 +196,9 @@ impl MmapMemory { /// memory, if the address is valid. 
pub fn host_address(&self, addr: GuestPhysAddr) -> Result<*const u8, MemoryError> { let ptr = self - .mem + .region_mmap() .get_host_address( - self.mem + self.region_mmap() .to_region_addr(GuestAddress(addr.as_u64())) .unwrap(), ) @@ -199,7 +213,7 @@ impl MmapMemory { } unsafe fn get_ptr_internal(&self, addr: MemoryRegionAddress) -> Result<*mut u8, MemoryError> { - self.mem + self.region_mmap() .get_host_address(addr) .map_err(|_| MemoryError::BoundsViolation) } diff --git a/src/net/mod.rs b/src/net/mod.rs new file mode 100644 index 00000000..2f46a250 --- /dev/null +++ b/src/net/mod.rs @@ -0,0 +1,52 @@ +#![cfg_attr(not(target_os = "linux"), expect(unused))] + +use std::io; + +pub use crate::consts::UHYVE_NET_MTU; + +pub const BROADCAST_MAC_ADDR: [u8; 6] = [0xff; 6]; +pub const PCI_ETHERNET_CLASS_CODE: u8 = 0x2; +pub const PCI_ETHERNET_SUBCLASS: u8 = 0x0; +pub const PCI_ETHERNET_PROG_IF: u8 = 0; +pub const PCI_ETHERNET_REVISION_ID: u8 = 0; +pub const UHYVE_QUEUE_SIZE: u16 = 256; + +pub const UHYVE_PCI_CLASS_INFO: [u8; 3] = [ + PCI_ETHERNET_REVISION_ID, + PCI_ETHERNET_PROG_IF, + PCI_ETHERNET_SUBCLASS, +]; + +// tap devices on macOS don't seem to be supported directly by Apple +// TODO: Let mac users investigate if this is possible. +// #[cfg(target_os = "linux")] +pub(crate) mod tap; + +pub(crate) trait NetworkInterface { + type RX: NetworkInterfaceRX; + type TX: NetworkInterfaceTX; + + /// Return the MAC address as a byte array + fn mac_address_as_bytes(&self) -> [u8; 6]; + + /// Split off a tx and rx object. + fn split(self) -> (Self::RX, Self::TX); +} + +pub(crate) trait NetworkInterfaceTX: Send { + /// Sends a packet to the interface. + /// + /// **NOTE**: ensure the packet has the appropriate format and header. + /// Incorrect packets will be dropped without warning. + fn send(&mut self, buf: &[u8]) -> io::Result; +} + +pub(crate) trait NetworkInterfaceRX: Send { + /// Receives a packet from the interface. 
+ /// + /// Blocks until a packet is sent into the virtual interface. At that point, the content of the + /// packet is copied into the provided buffer. + /// + /// Returns the size of the received packet + fn recv(&mut self, buf: &mut [u8], timeout: u16) -> io::Result; +} diff --git a/src/net/tap.rs b/src/net/tap.rs new file mode 100644 index 00000000..54b3690e --- /dev/null +++ b/src/net/tap.rs @@ -0,0 +1,141 @@ +use std::{ + fs::File, + io::{self, Read, Write}, + os::fd::AsFd, +}; +#[cfg(target_os = "linux")] +use std::{fs::OpenOptions, io::Error, os::unix::io::AsRawFd}; + +#[cfg(target_os = "linux")] +use libc::{IFF_NO_PI, IFF_TAP, ifreq}; +use nix::poll::{PollFd, PollFlags, PollTimeout, poll}; +#[cfg(target_os = "linux")] +use nix::{ifaddrs::getifaddrs, ioctl_write_int}; + +use crate::net::{NetworkInterface, NetworkInterfaceRX, NetworkInterfaceTX}; + +/// An existing (externally created) TAP device +pub struct Tap { + fd: File, + mac: [u8; 6], + name: String, +} + +impl Tap { + #[cfg(target_os = "linux")] + pub fn new(iface_name: &str) -> io::Result { + if iface_name.len() > 16 { + return Err(Error::other("Interface name must not exceed 16 bytes")); + } + let mut ifr_name: [i8; 16] = [0; 16]; + iface_name + .as_bytes() + .iter() + .take(15) + .map(|b| *b as i8) + .enumerate() + .for_each(|(i, b)| ifr_name[i] = b); + + let config_str = ifreq { + ifr_name, + ifr_ifru: libc::__c_anonymous_ifr_ifru { + ifru_flags: IFF_TAP as i16 | IFF_NO_PI as i16, // TODO: Investigate if IFF_NO_PI is necessary as well + }, + }; + + let fd = OpenOptions::new() + .read(true) + .write(true) + .open("/dev/net/tun")?; + + ioctl_write_int!(tun_set_iff, b'T', 202); + + let res = + unsafe { tun_set_iff(fd.as_raw_fd(), &config_str as *const ifreq as u64).unwrap() }; + + if res == -1 { + error!("Can't open TAP device {iface_name}"); + return Err(Error::other("Can't open TAP device")); + } + + // Find MAC address of the TAP device + let mut mac_addr = None; + for ifaddr in 
getifaddrs().unwrap() { + if let Some(address) = ifaddr.address + && ifaddr.interface_name == iface_name + && let Some(link_addr) = address.as_link_addr() + { + mac_addr = Some(link_addr.addr().unwrap()); + } + } + + Ok(Self { + fd, + name: iface_name.to_string(), + mac: mac_addr.expect("TAP device without MAC address?"), + }) + } +} +impl NetworkInterface for Tap { + type RX = TapRX; + type TX = TapTX; + + fn mac_address_as_bytes(&self) -> [u8; 6] { + self.mac + } + + fn split(self) -> (Self::RX, Self::TX) { + ( + Self::RX { + fd: self.fd.try_clone().unwrap(), + name: self.name.clone(), + }, + Self::TX { + fd: self.fd.try_clone().unwrap(), + name: self.name.clone(), + }, + ) + } +} + +pub struct TapTX { + fd: File, + name: String, +} +impl NetworkInterfaceTX for TapTX { + fn send(&mut self, buf: &[u8]) -> io::Result { + trace!("sending {} bytes on {}", buf.len(), self.name); + self.fd.write(buf) + } +} + +pub(crate) fn read_file_with_timeout( + file: &mut F, + target: &mut [u8], + timeout: u16, +) -> io::Result { + let mut pollfds = [PollFd::new(file.as_fd(), PollFlags::POLLIN)]; + let nready = poll::(&mut pollfds, timeout.into())?; + if nready == 0 { + Ok(0) + } else { + file.read(target) + } +} + +pub struct TapRX { + fd: File, + name: String, +} +impl NetworkInterfaceRX for TapRX { + fn recv(&mut self, buf: &mut [u8], timeout: u16) -> io::Result { + match read_file_with_timeout(&mut self.fd, buf, timeout) { + Ok(0) => Ok(0), // Timeout + Ok(i) => { + trace!("receiving {i:?} bytes on {}", self.name); + Ok(i) + } + Err(e) => Err(e), + } + } +} diff --git a/src/params.rs b/src/params.rs index 8c404b18..cab1f77f 100644 --- a/src/params.rs +++ b/src/params.rs @@ -77,6 +77,9 @@ pub struct Params { /// Store trace dumps in this directory #[cfg(feature = "instrument")] pub trace: Option, + + /// Networking configuration + pub network: Option, } impl Default for Params { @@ -106,6 +109,7 @@ impl Default for Params { aslr: true, #[cfg(feature = "instrument")] trace: 
Default::default(), + network: None, } } } @@ -286,6 +290,44 @@ impl + core::fmt::Debug + PartialEq + PartialEq<&'static str>> Try } } +#[derive(Debug, Clone, PartialEq)] +pub enum NetworkMode { + Tap { name: String }, +} +impl TryFrom for NetworkMode { + type Error = &'static str; + + fn try_from(netmode: String) -> Result { + netmode_try_from(netmode) + } +} +impl TryFrom<&str> for NetworkMode { + type Error = &'static str; + + fn try_from(netmode: &str) -> Result { + netmode_try_from(netmode) + } +} + +fn netmode_try_from>(netmode: S) -> Result { + if netmode.as_ref() == "tap" { + return Ok(NetworkMode::Tap { + name: "tap10".to_string(), + }); + } + + let (mode, device) = netmode + .as_ref() + .split_once(':') + .ok_or("invalid netmode string. Must be mode:devicename")?; + match mode { + "tap" => Ok(NetworkMode::Tap { + name: device.to_string(), + }), + _ => Err("invalid networking mode"), + } +} + /// Enforcement strictness for file sandbox /// /// Use None if you are using Uhyve as a library, as it is not currently diff --git a/src/pci.rs b/src/pci.rs new file mode 100644 index 00000000..5e41f27b --- /dev/null +++ b/src/pci.rs @@ -0,0 +1,221 @@ +#![cfg_attr(not(target_os = "linux"), expect(unused))] + +use std::ops::Add; + +use thiserror::Error; +use uhyve_interface::GuestPhysAddr; +use zerocopy::{Immutable, IntoBytes}; + +use crate::{ + consts::GUEST_PAGE_SIZE, + net::{PCI_ETHERNET_REVISION_ID, UHYVE_PCI_CLASS_INFO}, + virtio::{DeviceStatus, VIRTIO_VENDOR_ID}, +}; + +/// For now, use an address large enough to be outside of kvm_userspace, +/// as IO/MMIO writes are otherwise dismissed. 
+// pub const IOBASE: u64 = 0xFE000000; +pub const IOBASE_U64: u64 = 0xFE000000; +pub const IOBASE: GuestPhysAddr = GuestPhysAddr::new(IOBASE_U64); +pub const IOEND_U64: u64 = IOBASE.as_u64() + (1_u64 << 24); // Configuration space address length is 24 (PCI Bus Local Spec 3.2.2.3.2) +pub const IOEND: GuestPhysAddr = GuestPhysAddr::new(IOEND_U64); + +/// An address in the PCI configuration space. +/// (PCI Bus Local Specification 3.2.2.3) +/// These addresses are 24-bit long, and the last two bytes specify the transaction type. +#[derive(Eq, PartialEq, Clone, Copy, Debug)] +pub struct PciConfigurationAddress(pub(crate) u32); +impl Add for PciConfigurationAddress { + type Output = Self; + + fn add(self, rhs: usize) -> Self::Output { + PciConfigurationAddress(self.0 + rhs as u32) + } +} +impl PciConfigurationAddress { + pub const fn new(address: u32) -> Self { + Self(address) + } + + pub fn from_guest_address(address: GuestPhysAddr) -> Option { + if address & 0b11 != 0 { + warn!("PciConfigurationAddress not at word boundary"); + } + + if address < IOBASE || address >= IOEND { + return None; + } + Some(Self((address - IOBASE) as u32)) + } + + pub fn guest_address(&self) -> GuestPhysAddr { + IOBASE + self.0 as u64 + } + + pub fn offset(&self) -> PciConfigurationOffset { + PciConfigurationOffset((self.0 & 0b1111_1111) as u8) + } +} + +/// The offset is the effective addressing within a PCI function +#[derive(Eq, PartialEq, Clone, Copy, Debug)] +pub struct PciConfigurationOffset(pub(crate) u8); + +pub trait PciDevice { + fn handle_read(&mut self, address: PciConfigurationAddress, dest: &mut [u8]); + fn handle_write(&mut self, address: PciConfigurationAddress, src: &[u8]); +} + +#[derive(Error, Debug)] +pub enum PciError { + #[error("Trying to write to function's read_only field ({:#x})", .0.0)] + ReadOnlyOffset(PciConfigurationOffset), + #[error("Trying to access function at invalid offset ({:#x})", .0.0)] + InvalidOffset(PciConfigurationOffset), + #[error("Unaligned 
Access to a PCI struct ({:#x})", .0.0)] + UnalignedAccess(PciConfigurationOffset), + #[error("Read/Write data is not a power of two")] + InvalidAccessSize, +} + +#[derive(IntoBytes, Clone, Copy, Debug, Default, Immutable)] +#[repr(C)] +pub struct MemoryBar64 { + address: u64, +} +impl MemoryBar64 { + pub fn new(address: u64) -> Self { + // BAR size is 0x200000 + assert_eq!(address, address & -(GUEST_PAGE_SIZE as i64) as u64); + Self { + address: address | 0b1100, + } + } + // pub fn read(&self) -> u64 { + // self.address + // } + // pub fn read_upper(&self) -> u32 { + // (self.address >> 32) as u32 + // } + // pub fn read_lower(&self) -> u32 { + // self.address as u32 + // } + + pub fn write(&mut self, data: &[u8]) -> Result<(), PciError> { + // BAR0 -> BAR detection writes something to this register and reads it back. We protect the lowest bits to ensure it stays a 64-Bit address field + + let addr_lower = self.address & (GUEST_PAGE_SIZE - 1); + let d = match data.len() { + 1 | 2 => return Ok(()), // This is smaller than GUEST_PAGE_SIZE -> Ignore it + 4 => u64::from_le_bytes([data[0], data[1], data[2], data[3], 0, 0, 0, 0]), + 8 => u64::from_le_bytes([ + data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], + ]), + _ => return Err(PciError::InvalidAccessSize), + }; + self.address = (d & -(GUEST_PAGE_SIZE as i64) as u64) + addr_lower; + Ok(()) + } + + pub fn write_upper(&mut self, data: &[u8]) -> Result<(), PciError> { + let mut addr_bytes = self.address.to_le_bytes(); + for (d, c) in data.iter().zip(addr_bytes.iter_mut().skip(4)) { + *c = *d; + } + self.address = u64::from_le_bytes(addr_bytes); + Ok(()) + } +} + +/// Type 0 Configuration Space Header. 
+/// PCIe Base Specification Section 7.5.2 +#[derive(IntoBytes, Clone, Copy, Debug, Immutable)] +#[repr(C)] +pub struct PciType0ConfigSpaceHeader { + pub vendor_id: u16, + pub device_id: u16, + pub command: u16, + pub status: DeviceStatus, + pub revision: u8, + pub class_code: [u8; 3], + pub cache_line_size: u8, + pub master_latency_timr: u8, + pub header_type: u8, + pub bist: u8, + pub base_address_registers: [MemoryBar64; 3], + pub cardbus_cis_pointer: u32, + pub subsystem_vendor_id: u16, + pub subsystem_id: u16, + pub expansion_rom_base_address: u32, + pub capabilities_ptr: u8, + pub _reserved: [u8; 7], + pub interrupt_line: u8, + pub interrupt_pin: u8, + pub min_gnt: u8, + pub max_lat: u8, +} +impl Default for PciType0ConfigSpaceHeader { + fn default() -> Self { + Self { + vendor_id: VIRTIO_VENDOR_ID, + device_id: 0, + command: 0, + status: DeviceStatus::UNINITIALIZED, + revision: PCI_ETHERNET_REVISION_ID, + class_code: UHYVE_PCI_CLASS_INFO, + cache_line_size: 0, + master_latency_timr: 0, + header_type: 0, + bist: 0, + base_address_registers: [MemoryBar64::new(0); 3], + cardbus_cis_pointer: 0, + subsystem_vendor_id: 0, + subsystem_id: 0, + expansion_rom_base_address: 0, + capabilities_ptr: 0, + _reserved: [0; 7], + interrupt_line: 0, + interrupt_pin: 0, + min_gnt: 0, + max_lat: 0, + } + } +} +impl PciType0ConfigSpaceHeader { + pub fn write(&mut self, offset: PciConfigurationOffset, data: &[u8]) -> Result<(), PciError> { + if offset.0 + data.len() as u8 > 0x40 { + return Err(PciError::InvalidOffset(offset)); + } + match offset.0 { + 0..=0x03 | 0x06..=0x0F | 0x28..=0x33 | 0x35..=0x3B => { + Err(PciError::InvalidOffset(offset)) + } + 0x04 => { + // Command register + self.command = u16::from_le_bytes([data[0], data[1]]); + Ok(()) + } + 0x10 | 0x18 | 0x20 => { + self.base_address_registers[((offset.0 - 0x10) / 8) as usize].write(data) + } + 0x14 | 0x1c | 0x24 => { + self.base_address_registers[((offset.0 - 0x14) / 8) as usize].write_upper(data) + } + 0x05 + | 
0x11..=0x13 + | 0x15..=0x17 + | 0x19..=0x1B + | 0x1d..=0x1f + | 0x21..=0x23 + | 0x25..=0x27 => { + warn!("Unaligned PCI BAR access"); + Err(PciError::UnalignedAccess(offset)) + } + 0x34 => { + self.capabilities_ptr = data[0]; + Ok(()) + } + _ => Err(PciError::InvalidOffset(offset)), + } + } +} diff --git a/src/virtio.rs b/src/virtio.rs deleted file mode 100644 index 945942d3..00000000 --- a/src/virtio.rs +++ /dev/null @@ -1,336 +0,0 @@ -#![cfg_attr(target_os = "macos", allow(dead_code))] // no virtio implementation for macos -use std::{fmt, mem::size_of, ptr::copy_nonoverlapping, sync::Mutex, vec::Vec}; - -use log::info; -use mac_address::*; -use tun_tap::*; -use uhyve_interface::GuestPhysAddr; -use virtio_bindings::bindings::virtio_net::*; - -use crate::{mem::MmapMemory, virtqueue::*}; - -const STATUS_ACKNOWLEDGE: u8 = 0b00000001; -const STATUS_DRIVER: u8 = 0b00000010; -const STATUS_DRIVER_OK: u8 = 0b00000100; -const STATUS_FEATURES_OK: u8 = 0b00001000; -const STATUS_DRIVER_NEEDS_RESET: u8 = 0b01000000; -const STATUS_FAILED: u8 = 0b10000000; - -const VENDOR_ID_REGISTER: usize = 0x0; -const DEVICE_ID_REGISTER: usize = 0x2; -const _COMMAND_REGISTER: usize = 0x4; -const STATUS_REGISTER: u32 = 0x6; -const CLASS_REGISTER: usize = 0x8; -const BAR0_REGISTER: usize = 0x10; -const _SUBSYSTEM_VENDOR_ID_REGISTER: usize = 0x2C; -const _SUBSYSTEM_ID_REGISTER: usize = 0x2E; -const _INTERRUPT_REGISTER: usize = 0x3C; -const _RX_QUEUE: usize = 0; -const TX_QUEUE: usize = 1; -const IOBASE: u16 = 0xc000; -const ETHARP_HWADDR_LEN: u16 = 6; - -pub const VIRTIO_PCI_HOST_FEATURES: u16 = IOBASE; -pub const VIRTIO_PCI_GUEST_FEATURES: u16 = IOBASE + 4; -pub const VIRTIO_PCI_QUEUE_PFN: u16 = IOBASE + 8; -pub const _VIRTIO_PCI_QUEUE_NUM: u16 = IOBASE + 12; -pub const VIRTIO_PCI_QUEUE_SEL: u16 = IOBASE + 14; -pub const VIRTIO_PCI_QUEUE_NOTIFY: u16 = IOBASE + 16; -pub const VIRTIO_PCI_STATUS: u16 = IOBASE + 18; -pub const VIRTIO_PCI_ISR: u16 = IOBASE + 19; -pub const 
VIRTIO_PCI_CONFIG_OFF_MSIX_OFF: u16 = 20; -pub const VIRTIO_PCI_CONFIG_OFF_MSIX_OFF_MAX: u16 = VIRTIO_PCI_CONFIG_OFF_MSIX_OFF + 5; -pub const VIRTIO_PCI_LINK_STATUS_MSIX_OFF: u16 = ETHARP_HWADDR_LEN + VIRTIO_PCI_CONFIG_OFF_MSIX_OFF; - -const HOST_FEATURES: u32 = (1 << VIRTIO_NET_F_STATUS) | (1 << VIRTIO_NET_F_MAC); - -pub trait PciDevice { - fn handle_read(&self, address: u32, dest: &mut [u8]); - fn handle_write(&mut self, address: u32, src: &[u8]); -} - -type PciRegisters = [u8; 0x40]; - -pub struct VirtioNetPciDevice { - registers: PciRegisters, //Add more - requested_features: u32, - selected_queue_num: u16, - virt_queues: Vec, - iface: Option>, - mac_addr: [u8; 6], -} - -impl fmt::Debug for VirtioNetPciDevice { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("VirtioNetPciDevice") - .field("status", &self.registers[STATUS_REGISTER as usize]) - .finish() - } -} - -macro_rules! read_u16 { - ($registers:expr, $address:expr) => { - ($registers[$address] as u16) | ($registers[$address + 1] as u16) << 8 - }; -} - -macro_rules! write_u16 { - ($registers:expr, $address:expr, $value:expr) => { - $registers[$address] = ($value & 0xFF) as u8; - $registers[$address + 1] = (($value >> 8) & 0xFF) as u8; - () - }; -} - -#[expect(unused_macros)] -macro_rules! read_u32 { - ($registers:expr, $address:expr) => { - ($registers[$address] as u32) - | (($registers[$address + 1] as u32) << 8) - | (($registers[$address + 2] as u32) << 16) - | (($registers[$address + 3] as u32) << 24) - }; -} - -#[expect(unused_macros)] -macro_rules! 
write_u32 { - ($registers:expr, $address:expr, $value:expr) => { - $registers[$address] = ($value & 0xFF) as u8; - $registers[$address + 1] = (($value >> 8) & 0xFF) as u8; - $registers[$address + 2] = (($value >> 16) & 0xFF) as u8; - $registers[$address + 3] = (($value >> 24) & 0xFF) as u8; - () - }; -} - -impl VirtioNetPciDevice { - pub const fn new() -> VirtioNetPciDevice { - let mut registers: PciRegisters = [0; 0x40]; - write_u16!(registers, VENDOR_ID_REGISTER, 0x1AF4); - write_u16!(registers, DEVICE_ID_REGISTER, 0x1000); - write_u16!(registers, CLASS_REGISTER + 2, 0x0200); - write_u16!(registers, BAR0_REGISTER, IOBASE); - registers[STATUS_REGISTER as usize] = STATUS_DRIVER_NEEDS_RESET; - let virt_queues: Vec = Vec::new(); - VirtioNetPciDevice { - registers, - requested_features: 0, - selected_queue_num: 0, - virt_queues, - iface: None, - mac_addr: [0; 6], - } - } - - pub fn _poll_rx(_device: &mut VirtioNetPciDevice) { - //TODO: how to read packets without synchronization issues - } - - pub fn handle_notify_output(&mut self, dest: &[u8], mem: &MmapMemory) { - let tx_num = read_u16!(dest, 0); - if tx_num == 1 && self.read_status_reg() & STATUS_DRIVER_OK == STATUS_DRIVER_OK { - self.send_available_packets(mem); - } - } - - // Sends packets using the tun_tap crate, subject to change - fn send_available_packets(&mut self, mem: &MmapMemory) { - let tx_queue = &mut self.virt_queues[TX_QUEUE]; - let mut send_indices = Vec::new(); - for index in tx_queue.avail_iter() { - send_indices.push(index); - } - for index in send_indices { - let desc = unsafe { tx_queue.get_descriptor(index) }; - let gpa = GuestPhysAddr::new(unsafe { *(desc.addr as *const u64) }); - let hva = mem.host_address(gpa).unwrap(); - match &self.iface { - Some(tap) => unsafe { - let vec = vec![0; (desc.len as usize) - size_of::()]; - let slice: &[u8] = &vec; - copy_nonoverlapping( - hva, - slice.as_ptr() as *mut u8, - (desc.len as usize) - size_of::(), - ); - let unlocked_tap = tap.lock().unwrap(); - 
//Actually send packet - unlocked_tap.send(slice).unwrap_or(0); - }, - None => self.registers[STATUS_REGISTER as usize] |= STATUS_DRIVER_NEEDS_RESET, - } - tx_queue.add_used(index as u32, 1) - } - } - - pub fn read_status(&self, dest: &mut [u8]) { - self.handle_read(STATUS_REGISTER & 0x3FFF, dest); - } - - // Virtio handshake - pub fn write_status(&mut self, dest: &[u8]) { - let status = self.read_status_reg(); - if dest[0] == 0 { - self.write_status_reg(0); - self.requested_features = 0; - self.selected_queue_num = 0; - self.virt_queues.clear(); - self.iface = None; - } else if status == STATUS_DRIVER_NEEDS_RESET || status == 0 { - self.write_status_reset(dest); - } else if status == STATUS_ACKNOWLEDGE { - self.write_status_acknowledge(dest); - } else if status == STATUS_ACKNOWLEDGE | STATUS_DRIVER { - self.write_status_features(dest); - } else if status == STATUS_ACKNOWLEDGE | STATUS_DRIVER | STATUS_FEATURES_OK { - self.write_status_ok(dest); - } - } - - pub fn read_mac_byte(&self, dest: &mut [u8], index: u16) { - dest[0] = self.mac_addr[index as usize]; - } - - // This function is reliant on tap devices as the underlying packet sending mechanism - // Gets the tap device by name then gets its mac address - fn get_mac_addr(&mut self) { - if let Some(tap) = &self.iface { - let locked_dev = tap.lock().unwrap(); - match mac_address_by_name(locked_dev.name()) { - Ok(Some(ma)) => self.mac_addr = ma.bytes(), - Ok(None) => { - info!("No MAC address found."); - self.registers[STATUS_REGISTER as usize] |= STATUS_DRIVER_NEEDS_RESET; - } - Err(e) => { - info!("{e:?}"); - self.registers[STATUS_REGISTER as usize] |= STATUS_DRIVER_NEEDS_RESET; - } - } - } - } - - // Driver acknowledges device - fn write_status_reset(&mut self, dest: &[u8]) { - if dest[0] == STATUS_ACKNOWLEDGE { - self.write_status_reg(dest[0]); - } - } - - // Driver recognizes the device - fn write_status_acknowledge(&mut self, dest: &[u8]) { - if dest[0] == STATUS_ACKNOWLEDGE | STATUS_DRIVER { - 
self.write_status_reg(dest[0]); - } - } - - // finish negotiating features - fn write_status_features(&mut self, dest: &[u8]) { - if dest[0] == STATUS_ACKNOWLEDGE | STATUS_DRIVER | STATUS_FEATURES_OK { - self.write_status_reg(dest[0]); - } - } - - // Complete handshake - fn write_status_ok(&mut self, dest: &[u8]) { - if dest[0] == STATUS_ACKNOWLEDGE | STATUS_DRIVER | STATUS_FEATURES_OK | STATUS_DRIVER_OK { - self.write_status_reg(dest[0]); - self.iface = match Iface::new("", Mode::Tap) { - Ok(tap) => Some(Mutex::new(tap)), - Err(err) => { - info!("Error creating TAP device: {err}"); - self.registers[STATUS_REGISTER as usize] |= STATUS_DRIVER_NEEDS_RESET; - None - } - }; - self.get_mac_addr(); - } - } - - fn write_status_reg(&mut self, status: u8) { - self.registers[STATUS_REGISTER as usize] = status; - } - - fn read_status_reg(&self) -> u8 { - self.registers[STATUS_REGISTER as usize] - } - - pub fn write_selected_queue(&mut self, dest: &[u8]) { - self.selected_queue_num = unsafe { - #[expect(clippy::cast_ptr_alignment)] - *(dest.as_ptr() as *const u16) - }; - } - - // Register virtqueue - pub fn write_pfn(&mut self, dest: &[u8], mem: &MmapMemory) { - let status = self.read_status_reg(); - if status & STATUS_FEATURES_OK != 0 - && status & STATUS_DRIVER_OK == 0 - && self.selected_queue_num as usize == self.virt_queues.len() - { - let gpa = GuestPhysAddr::new(unsafe { - #[expect(clippy::cast_ptr_alignment)] - *(dest.as_ptr() as *const u64) - }); - let hva = mem.host_address(gpa).unwrap(); - let queue = unsafe { Virtqueue::new(hva as *mut u8, QUEUE_LIMIT) }; - self.virt_queues.push(queue); - } - } - - pub fn write_requested_features(&mut self, dest: &[u8]) { - if self.read_status_reg() == STATUS_ACKNOWLEDGE | STATUS_DRIVER { - let requested_features = unsafe { - #[expect(clippy::cast_ptr_alignment)] - *(dest.as_ptr() as *const u32) - }; - self.requested_features = - (self.requested_features | requested_features) & HOST_FEATURES; - } - } - - pub fn 
read_requested_features(&self, dest: &mut [u8]) { - if self.read_status_reg() == STATUS_ACKNOWLEDGE | STATUS_DRIVER { - let bytes = self.requested_features.to_ne_bytes(); - dest[0..(bytes.len())].clone_from_slice(&bytes[..]); - } - } - - pub fn read_link_status(&self, dest: &mut [u8]) { - let status = self.read_status_reg(); - if status & STATUS_FAILED != 0 || status & STATUS_DRIVER_NEEDS_RESET != 0 { - dest[0] = 0; - } else { - match &self.iface { - Some(_) => dest[0] = 1, - None => dest[0] = 0, - } - } - } - - pub fn read_host_features(&self, dest: &mut [u8]) { - let bytes = HOST_FEATURES.to_ne_bytes(); - dest[0..(bytes.len())].clone_from_slice(&bytes[..]); - } - - pub fn reset_interrupt(&mut self) { - // TODO: IRQ - } -} - -impl PciDevice for VirtioNetPciDevice { - fn handle_read(&self, address: u32, dest: &mut [u8]) { - dest.copy_from_slice(&self.registers[address as usize..][..dest.len()]); - } - - fn handle_write(&mut self, address: u32, dest: &[u8]) { - self.registers[address as usize..][..dest.len()].copy_from_slice(dest); - } -} - -impl Default for VirtioNetPciDevice { - fn default() -> Self { - Self::new() - } -} diff --git a/src/virtio/capabilities.rs b/src/virtio/capabilities.rs new file mode 100644 index 00000000..12b8103d --- /dev/null +++ b/src/virtio/capabilities.rs @@ -0,0 +1,384 @@ +//! VirtIO capability structures. + +use bitflags::bitflags; +use zerocopy::{Immutable, IntoBytes}; + +use crate::{ + net::{BROADCAST_MAC_ADDR, UHYVE_NET_MTU}, + pci::PciConfigurationOffset, + virtio::{ + VirtqueueNotification, + pci::{HeaderConf, get_offset}, + }, +}; + +/// Virtio capability type IDs. 
See section 4.1.4 virtio v1.2 +#[derive(Debug, Clone, Copy, IntoBytes, PartialEq, Eq, Immutable)] +#[repr(u8)] +#[allow(non_camel_case_types, dead_code)] +pub enum CfgType { + INVALID_CFG = 0x00, + /// Common configuration + COMMON_CFG = 0x01, + /// Notifications + NOTIFY_CFG = 0x02, + /// ISR status + ISR_CFG = 0x03, + /// Device-specific configuration + DEVICE_CFG = 0x04, + /// PCI configuration access + PCI_CFG = 0x05, + /// Shared memory region + _SHARED_MEMORY_CFG = 0x08, + /// Vendor-specific data + VENDOR_CFG = 0x09, +} + +#[repr(u32)] +#[derive(Clone, Copy, Debug, IntoBytes, Immutable)] +pub enum FeatureSelector { + Low = 0, + High = 1, +} + +impl From for FeatureSelector { + fn from(value: u32) -> Self { + match value { + 0 => Self::Low, + 1 => Self::High, + _ => Self::Low, // TODO, should this panic, or should we set to an invalid value? + } + } +} + +/// Vendor-specific PCI capability. +/// Section 4.1.4 virtio v1.2 +#[derive(IntoBytes, Clone, Copy, Debug, Immutable)] +#[repr(C)] +pub struct PciCap { + /// Generic PCI field: PCI_CAP_ID_VNDR + cap_vndr: u8, + + /// Generic PCI field: next ptr + pub cap_next: u8, + + /// Generic PCI field: capability length + pub cap_len: u8, + + /// Identifies the structure. See [`crate::net::virtio::config::cfg_type`] + pub cfg_type: CfgType, + + /// Index of the device BAR register + bar_index: u8, + + /// Identify multiple capabilities of the same type. + id: u8, + + _padding: [u8; 2], + + /// Offset of address relative to the base address within the BAR. + pub offset: u32, + + /// Length of the structure, in bytes. + /// + /// The length **MAY** include padding padding, or fields unused by the driver, etc. + pub length: u32, +} + +impl Default for PciCap { + fn default() -> Self { + Self { + cap_vndr: 0x09, // Virtio v1.2 Sec. 
4.1.4 + cap_next: 0, + cap_len: std::mem::size_of::() as u8, + cfg_type: CfgType::INVALID_CFG, + bar_index: 0, + id: 0, + _padding: [0u8; 2], + offset: 0, + length: 0, + } + } +} + +#[derive(Copy, Clone, Debug, IntoBytes, PartialEq, Eq, Immutable)] +#[repr(C)] +pub struct NetDevStatus(u16); +bitflags! { + impl NetDevStatus: u16 { + const UNINITIALIZED = 0; + const VIRTIO_NET_S_LINK_UP = 1; + const VIRTIO_NET_S_ANNOUNCE = 2; + } +} + +// TODO: Replace with virtio_bindings::Virtio_net_config? +/// Virtio device configuration layout. Virtio v1.2 Section 5.1.4 +#[derive(IntoBytes, Clone, Debug, Immutable)] +#[repr(C)] +pub struct NetDevCfg { + /// **read-only**: macaddress, always exists. + pub mac: [u8; 6], + + /// **read-write** Status field: VIRTIO_NET_S_LINK_UP and VIRTIO_NET_S_ANNOUNCE. + pub status: NetDevStatus, + + /// **read-only**: only exists if VIRTIO_F_MQ or VIRTIO_NET_F_RSS are negotiated, however + /// implements and does not use it. TODO + _max_virtqueue_pairs: u16, + /// Exists only if VIRTIO_NET_F_MTU is negotiated. Must be at least 1280 (5.1.4.1 v1.2). + /// must not modify once set. 
+ pub mtu: u16, + _speed: u32, + _duplex: u8, + _rss_max_key_size: u8, + _rss_max_indirection_table_length: u16, + _supported_hash_types: u32, +} +impl NetDevCfg { + pub const MAC_ADDRESS: u8 = get_offset!(HeaderConf::DEVICE_CFG_START, NetDevCfg, mac).0; + pub const MAC_ADDRESS_END: u8 = get_offset!(HeaderConf::DEVICE_CFG_START, NetDevCfg, mac).0 + 6; + pub const NET_STATUS: u8 = get_offset!(HeaderConf::DEVICE_CFG_START, NetDevCfg, status).0; + pub const MTU: u8 = get_offset!(HeaderConf::DEVICE_CFG_START, NetDevCfg, mtu).0; +} + +impl Default for NetDevCfg { + fn default() -> Self { + Self { + mac: BROADCAST_MAC_ADDR, + status: NetDevStatus::UNINITIALIZED, + _max_virtqueue_pairs: 0, + mtu: UHYVE_NET_MTU as u16, + _speed: 0u32, + _duplex: 0u8, + _rss_max_key_size: 0u8, + _rss_max_indirection_table_length: 0u16, + _supported_hash_types: 0u32, + } + } +} + +/// ISR capability, refers to at a single byte which ocntains an 8-bit ISR status field to be used +/// for INT#x interrupt handling. The offset has no alignment requirements. See Virtio v1.2 Sec. 4.1.4.5. +/// +/// See section 4.1.5.3 and 4.1.5.4 on usage. +#[derive(Copy, Clone, Debug, IntoBytes, PartialEq, Eq, Default, Immutable)] +#[repr(C)] +pub struct IsrStatus(u8); +bitflags! { + impl IsrStatus: u8 { + /// Notify that the buffers/Virtqueues have been changed + const NOTIFY_USED_BUFFER = 0b01; + /// Notify that the device configuration has been changed. + const NOTIFY_CONFIGURUTION_CHANGED = 0b10; + } +} +impl IsrStatus { + pub const ISR_FLAGS: u8 = PciConfigurationOffset(HeaderConf::ISR_CFG_START).0; +} + +/// Notification location. This is a standard PciCap, followed by an offset multiplier. +/// +/// ## Important +/// +/// `cap.offset` must be 2-byte aligned, `notify_off_multiplier` must be an even power of 2 or 0. +/// `cap.length` must be at least 2 and larg enough to support queue notification offset. 
+/// +/// See section 4.1.4.4.1 virtio v1.2 +#[derive(IntoBytes, Clone, Debug, Immutable)] +#[repr(C)] +pub struct NotifyCap { + pub cap: PciCap, + /// Combind with queue_notify_off to derive the Queue Notify address + /// within a BAR for a virtqueue. + /// + /// For example: if notify_off_multiplier is 0, the same Queue Notify address + /// is used for all queues. (section 4.1.4.4 virtio v1.2) + pub notify_off_multiplier: u32, +} + +impl Default for NotifyCap { + fn default() -> Self { + Self { + cap: PciCap { + cap_len: std::mem::size_of::() as u8, + cfg_type: CfgType::NOTIFY_CFG, + offset: 0, + // We have two notification addresses. TODO: We prob. only need one + length: std::mem::size_of::() as u32 * 2, + ..Default::default() + }, + notify_off_multiplier: 0, + } + } +} + +/// Common configuration, section 4.1.4.3 virtio v1.2 +/// +/// All data should be treated as little-endian. +#[derive(IntoBytes, Clone, Copy, Debug, Immutable)] +#[repr(C)] +#[allow(dead_code)] +pub struct ComCfg { + /// **read-write**: The driver uses this to select device_feature. + /// + /// Values may only be 0 for feature bits 0-31, or one for feature bits 32-63. + pub device_feature_select: FeatureSelector, + + /// **read-only**: The driver reads the currently activated feature bits. + /// + /// See: [`crate::virtio-bindings`] for `VIRTIO_NET_F_*` feature flags + pub device_feature: u32, + + /// **read-write**: The driver uses this to report which feature bits it is offering. + pub driver_feature_select: FeatureSelector, + + /// **read-write**: Driver reads activated feature bits + /// + /// See: [`crate::virtio-bindings`] for `VIRTIO_NET_F_*` feature flags + pub driver_feature: u32, + + /// **read-write**: The driver sets the Configuration Vector for MSI-X. + pub config_msix_vector: u16, + + /// **read-only**: The device specifies the maximum number of virtqueues supported here. 
+ pub num_queues: u16, + + /// **read-write**: + /// The driver writes the device status here (section 2.1 virtio v1.2). + /// Writing 0 into this field resets the device. + pub device_status: u8, + + /// **read-only**: Configuration atomicity value. The device changes this every time the + /// configuration noticeably changes. + pub config_generation: u8, // read-only for driver + + // About a specific virtqueue + /// **read-write**: The driver selects which virtqueue the following fields refer to. + pub queue_select: u16, + + /// **read-write**: On reset, specifies the maximum queue size supported by the device. + /// + /// This can be modified by the driver to reduce memory requirements. A 0 means the queue is + /// unavailable. + pub queue_size: u16, + + /// **read-write**: The driver uses this to specify the queue vector for MSI-Xw. + pub queue_msix_vector: u16, + + /// **read-write**: The driver uses this to selectively prevent the device from executing + /// requests from this virtqueue. + /// + /// 1 - enabled; 0 - disabled. + pub queue_enable: u16, + + /// **read-only**: Offset of the notification area. + /// + /// **NOTE**: This is not an offset in bytes. Section 4.1.4.4 virtio v1.2 + pub queue_notify_off: u16, + + /// **read-write**: The driver writes the physical address of Descriptor Area here. See section + /// 2.6 virtio v1.2 + pub queue_desc: u64, + + /// **read-write**: The driver writes the physical address of Driver Area here. See section 2.6 + /// virtio v1.2 + pub queue_driver: u64, + + /// **read-write**: The driver writes the physical address of Device Area here. See section 2.6 + /// virtio v1.2 + pub queue_device: u64, + + /// **read-only** for driver: The driver will use this value to put it in the ’virtqueue number’ field + /// in the available buffer notification structure. + /// This field exists only if VIRTIO_F_NOTIF_CONFIG_DATA has been negotiated. 
+ pub queue_notify_data: u16, + + /// **read-write**: The driver uses this to selectively reset the queue. + /// This field exists only if VIRTIO_F_RING_RESET has been negotiated. + pub queue_reset: u16, + + _padding: [u8; 4], +} +#[allow(dead_code)] +impl ComCfg { + pub const DEVICE_FEATURE_SELECT: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, device_feature_select).0; + + pub const DEVICE_FEATURE: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, device_feature).0; + + pub const DRIVER_FEATURE_SELECT: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, driver_feature_select).0; + + pub const DRIVER_FEATURE: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, driver_feature).0; + + pub const NUM_QUEUES: u8 = get_offset!(HeaderConf::COMMON_CFG_START, Self, num_queues).0; + + pub const CONFIG_MSIX_VECTOR: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, config_msix_vector).0; + + pub const DEVICE_STATUS: u8 = get_offset!(HeaderConf::COMMON_CFG_START, Self, device_status).0; + + pub const CONFIG_GENERATION: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, config_generation).0; + + pub const QUEUE_SELECT: u8 = get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_select).0; + + pub const QUEUE_SIZE: u8 = get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_size).0; + + pub const QUEUE_MSIX_VECTOR: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_msix_vector).0; + + pub const QUEUE_ENABLE: u8 = get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_enable).0; + + pub const QUEUE_NOTIFY_OFFSET: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_notify_off).0; + + pub const QUEUE_DESC_LOW: u8 = get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_desc).0; + pub const QUEUE_DESC_HIGH: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_desc).0 + 4; + + pub const QUEUE_DRIVER_LOW: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_driver).0; + pub const QUEUE_DRIVER_HIGH: u8 = +
get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_driver).0 + 4; + + pub const QUEUE_DEVICE_LOW: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_device).0; + pub const QUEUE_DEVICE_HIGH: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_device).0 + 4; + + pub const QUEUE_NOTIFY_DATA: u8 = + get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_notify_data).0; + + pub const QUEUE_RESET: u8 = get_offset!(HeaderConf::COMMON_CFG_START, Self, queue_reset).0; +} + +impl Default for ComCfg { + fn default() -> Self { + Self { + device_feature_select: FeatureSelector::Low, + device_feature: 0, + driver_feature_select: FeatureSelector::Low, + driver_feature: 0, + config_msix_vector: super::VIRTIO_MSI_NO_VECTOR, + num_queues: 0, + device_status: 0, + config_generation: 0, + queue_select: 0, + queue_size: 0, + queue_msix_vector: 0, + queue_enable: 0, + // we will use the same address for all queues, since we only have 2. TODO + queue_notify_off: 0, + queue_desc: 0, + queue_driver: 0, + queue_device: 0, + queue_notify_data: 0, + queue_reset: 0, + _padding: Default::default(), + } + } +} diff --git a/src/virtio/mod.rs b/src/virtio/mod.rs new file mode 100644 index 00000000..f8431f56 --- /dev/null +++ b/src/virtio/mod.rs @@ -0,0 +1,87 @@ +//! Virtio Datastructures and constants. + +pub(crate) mod capabilities; +pub(crate) mod net; +pub(crate) mod pci; + +pub mod features { + use virtio_bindings::{ + bindings::virtio_net::{VIRTIO_NET_F_MAC, VIRTIO_NET_F_MTU, VIRTIO_NET_F_STATUS}, + virtio_config::VIRTIO_F_VERSION_1, + }; + + pub const UHYVE_NET_FEATURES_LOW: u32 = + 1 << VIRTIO_NET_F_MAC | 1 << VIRTIO_NET_F_STATUS | 1 << VIRTIO_NET_F_MTU; + pub const UHYVE_NET_FEATURES_HIGH: u32 = ((1_usize << VIRTIO_F_VERSION_1) >> 32) as u32; +} + +use bitflags::bitflags; +use zerocopy::{Immutable, IntoBytes}; + +pub(crate) const QUEUE_LIMIT: usize = 256; + +// A virtqueue notification as described in the Virtio standard v1.2 Sec. 2.9 & 4.1.5.2. 
+#[repr(C)] +#[derive(IntoBytes, Debug, Default, Immutable)] +pub struct VirtqueueNotification { + /// VQ number to be notified + pub vqn: u16, + /// next_off: Offset within the ring to the next available ring entry (lower 15 bits) + /// wrap: wrap counter (msb) + pub next_off_wrap: u16, +} + +/// Virtio device status field. See section 2.1 virtio v1.2 +#[derive(Copy, Clone, Debug, IntoBytes, PartialEq, Eq, Immutable)] +#[repr(C)] +pub struct DeviceStatus(u16); +bitflags! { + impl DeviceStatus : u16 { + /// Despite not being a valid virtio Flag, 0 represents an uninitialized or reset device. + const UNINITIALIZED = 0; + /// Indicates the guest has found the device and recognises it as valid. + const ACKNOWLEDGE = 1; + + /// Indicates the guest knows how to drive the device. + const DRIVER = 2; + + /// Indicates the driver is set up and ready to drive the device. + const DRIVER_OK = 4; + + /// Indicates the driver has acknowledged the features it understands and negotiation is + /// complete. + const FEATURES_OK = 8; + + /// Indicates that the device has experienced an error from which it can’t recover. + const DEVICE_NEEDS_RESET = 64; + + /// Indicates that the PCI capabilities pointer points to a linked list at register address + /// 0x34. + /// + /// See: PCI-to-PCI bridge architecture, section 3.2.4.4 + const PCI_CAPABILITIES_LIST_ENABLE = 16; + + /// Failed to initialize. + const FAILED = 128; + } +} + +// Virtio Device IDs. +// +// The PCI device ID is calculated by adding 0x1040 to the virtio device ID as in section 5, or is a +// transitional device ID.
+// +// See sections 4.1.2.1 and 5 virtio v1.2 +const ROOT_DEVICE_ID: u16 = 0x1040; +pub const NET_DEVICE_ID: u16 = ROOT_DEVICE_ID + 1; +const _BLOCK_DEVICE_ID: u16 = ROOT_DEVICE_ID + 2; +const _CONSOLE_DEVICE_ID: u16 = ROOT_DEVICE_ID + 3; +const _SOCKET_DEVICE_ID: u16 = ROOT_DEVICE_ID + 19; + +/// Virtio PCI vendor ID, section 4.1.2 v1.2 +pub const VIRTIO_VENDOR_ID: u16 = 0x1AF4; + +/// For now, use an address large enough to be outside of kvm_userspace, +/// as IO/MMIO writes are otherwise dismissed. +pub const IOBASE: u32 = 0xFE000000; +const VIRTIO_MSI_NO_VECTOR: u16 = 0xffff; diff --git a/src/virtio/net.rs b/src/virtio/net.rs new file mode 100644 index 00000000..d1fc875e --- /dev/null +++ b/src/virtio/net.rs @@ -0,0 +1,742 @@ +#![cfg_attr(not(target_os = "linux"), expect(unused))] + +use std::{ + collections::VecDeque, + fmt, + io::{Read, Write}, + mem, + sync::{ + Arc, Mutex, + atomic::{AtomicBool, Ordering}, + mpsc::{Receiver, Sender, channel}, + }, + thread::{self, JoinHandle}, +}; + +use virtio_bindings::{ + bindings::virtio_net::virtio_net_hdr_v1, virtio_config::VIRTIO_F_RING_RESET, +}; +use virtio_queue::{Error as VirtIOError, Queue, QueueOwnedT, QueueT}; + +#[cfg(target_os = "linux")] +use crate::net::tap::Tap; +use crate::{ + consts::{UHYVE_NET_MTU, UHYVE_NET_READ_TIMEOUT}, + mem::MmapMemory, + net::{NetworkInterface, NetworkInterfaceRX, NetworkInterfaceTX, UHYVE_QUEUE_SIZE}, + params::NetworkMode, + pci::{MemoryBar64, PciConfigurationAddress, PciDevice}, + virtio::{ + DeviceStatus, IOBASE, NET_DEVICE_ID, QUEUE_LIMIT, + capabilities::{ComCfg, FeatureSelector, IsrStatus, NetDevCfg, NetDevStatus}, + features::{UHYVE_NET_FEATURES_HIGH, UHYVE_NET_FEATURES_LOW}, + pci::HeaderConf, + }, +}; + +const VIRTIO_NET_HEADER_SZ: usize = mem::size_of::(); + +/// Network -> Uhyve -> VM +const RX_QUEUE: u16 = 0; +/// VM -> Uhyve -> Network +const TX_QUEUE: u16 = 1; +/// Only one interrupt is needed, so we have to use PCI device pin 1. 
+pub const UHYVE_IRQ_NET_PCI_PIN: u8 = 1; + +pub(crate) trait VirtQueueNotificationWaiter: Send { + /// Wait until the virtqueue sends a notify + fn wait_for_notify(&self); + + /// Wait until the virtqueue sends a notify with `timeout` in milliseconds. + /// Returns `true` if notification happened, `false` on timeout. + fn wait_with_timeout(&self, timeout: u16) -> bool; +} + +pub(crate) trait VirtQueueInterrupter: Send { + fn send_interrupt(&self); +} + +/// Write access to u64 fields in virtio is done in two separate accesses. This is a helper struct to support this pattern. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub(crate) enum Area { + DescHigh, + DescLow, + DriverHigh, + DriverLow, + DeviceHigh, + DeviceLow, +} + +enum ThreadStartMsg { + Start, + Abort, +} + +/// Struct to manage uhyve's network device. +pub(crate) struct VirtioNetPciDevice { + /// PCI configuration space & VirtIO capabilities. + pub header_caps: HeaderConf, + /// records if ISR status must be alerted. This is set by the thread and + /// read by read_isr_notify + isr_changed: Arc, + /// received virtqueue + rx_queue: Arc>, + /// transmitted virtqueue + tx_queue: Arc>, + guest_mmap: Arc, + /// Store all negotiated feature sets. 
Chapter 2.2 virtio v1.2 + feature_set: u64, + config_generation: (bool, u8), // changed & counter + interface_cfg: NetworkMode, + rx_thread: Option>, + tx_thread: Option>, + thread_start_channels: (Sender, Sender), + rx_thread_start_channel_receiver: Option>, + tx_thread_start_channel_receiver: Option>, + stop_threads: Arc, +} +impl fmt::Debug for VirtioNetPciDevice { + // TODO: More exhaustive debug print + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("VirtioNetPciDevice") + .field("status", &self.header_caps.common_cfg.device_status) + .finish() + } +} + +impl VirtioNetPciDevice { + pub fn new(interface_cfg: NetworkMode, guest_mmap: Arc) -> VirtioNetPciDevice { + let mut header_caps = HeaderConf::new(); + header_caps.pci_config_hdr.device_id = NET_DEVICE_ID; + header_caps.pci_config_hdr.base_address_registers[0] = MemoryBar64::new(IOBASE as u64); + header_caps.pci_config_hdr.interrupt_pin = UHYVE_IRQ_NET_PCI_PIN; + header_caps.common_cfg.num_queues = 2; + header_caps.common_cfg.device_feature_select = FeatureSelector::Low; + header_caps.common_cfg.device_feature = UHYVE_NET_FEATURES_LOW; + header_caps.common_cfg.queue_size = UHYVE_QUEUE_SIZE; + header_caps.notify_cap.notify_off_multiplier = 4; + + // Create invalid virtqueues. Improper, unsafe and poor practice! + // Ideally, we would mark and watch the queues as ready. 
+ let rx_queue = Arc::new(Mutex::new( + Queue::new(header_caps.common_cfg.queue_size).unwrap(), + )); + let tx_queue = Arc::new(Mutex::new( + Queue::new(header_caps.common_cfg.queue_size).unwrap(), + )); + + let (tx_sender, tx_receiver) = channel(); + let (rx_sender, rx_receiver) = channel(); + + VirtioNetPciDevice { + header_caps, + isr_changed: Arc::new(AtomicBool::new(false)), + rx_queue, + tx_queue, + guest_mmap, + feature_set: (UHYVE_NET_FEATURES_LOW as u64) & ((UHYVE_NET_FEATURES_HIGH as u64) << 32), + config_generation: (false, 0), + rx_thread: None, + tx_thread: None, + thread_start_channels: (tx_sender, rx_sender), + rx_thread_start_channel_receiver: Some(rx_receiver), + tx_thread_start_channel_receiver: Some(tx_receiver), + interface_cfg, + stop_threads: Arc::new(AtomicBool::new(false)), + } + } + + /// VirtIO v1.2 - 4.1.4.3.1 requires that "The device MUST present a changed config_generation + /// after the driver has read a device-specific configuration value which has changed since any + /// part of the device-specific configuration was last read." + pub fn update_config_generation(&mut self) { + if !self.config_generation.0 { + self.config_generation.1 += 1; + self.config_generation.0 = true; + } + } + + #[inline] + pub fn read_isr_notify(&self, data: &mut [u8]) { + // We must be alerted from the thread somehow, hence fetching an AtomicBool + if self.isr_changed.swap(false, Ordering::AcqRel) { + data[0] = IsrStatus::NOTIFY_USED_BUFFER.bits(); + } + } + + /// Reset queue in common capability structure when VIRTIO_F_RING_RESET is negotiated. 
+ /// This is currently disabled, but only called in vcpu.rs + /// Virtqueue Reset: chapter 2.6.1 virtio v1.2 + pub fn write_reset_queue(&mut self) { + if self.feature_set & (1 << VIRTIO_F_RING_RESET) != 0 { + // reset only selected queue + let mut queue = match self.header_caps.common_cfg.queue_select { + RX_QUEUE => self.rx_queue.lock().unwrap(), + TX_QUEUE => self.tx_queue.lock().unwrap(), + _ => panic!("invalid queue selected!"), + }; + queue.reset(); + } + self.header_caps.common_cfg.queue_reset = 0; + self.update_config_generation(); + } + + /// Read queue_reset from common capability structure when VIRTIO_F_RING_RESET is negotiated. + /// Virtqueue Reset: chapter 2.6.1 virtio v1.2 + pub fn read_queue_reset(&self, data: &mut [u8]) { + data[0] = self.header_caps.common_cfg.queue_reset as u8; + } + + // Virtio handshake: chapter 3 virtio v1.2 + pub fn write_status(&mut self, data: &[u8]) { + let status_reg = &mut self.header_caps.pci_config_hdr.status; + + // A state machine might be a nicer way to structure the code here. + + // Device initialization procedure: See Virtio V1.2 Sec. 
4.2.2 + // Step 1: reset the device + if data[0] == DeviceStatus::UNINITIALIZED.bits() as u8 { + *status_reg = DeviceStatus::UNINITIALIZED; + self.header_caps.common_cfg.driver_feature = 0; + self.header_caps.common_cfg.queue_select = 0; + self.rx_queue.as_ref().lock().unwrap().reset(); + self.tx_queue.as_ref().lock().unwrap().reset(); + return; + } + + if status_reg.contains(DeviceStatus::DEVICE_NEEDS_RESET) { + error!("Virtio PCI device needs reset but is written to anyway"); + return; + } + + if *status_reg == DeviceStatus::UNINITIALIZED + && data[0] == DeviceStatus::ACKNOWLEDGE.bits() as u8 + { + // Step 2: Guest has noted device + status_reg.insert(DeviceStatus::ACKNOWLEDGE) + } else if *status_reg == DeviceStatus::ACKNOWLEDGE + && data[0] == (*status_reg | DeviceStatus::DRIVER).bits() as u8 + { + // Step 3: Guest knows how to drive device + status_reg.insert(DeviceStatus::DRIVER) + } else if *status_reg == DeviceStatus::ACKNOWLEDGE | DeviceStatus::DRIVER + && data[0] == (*status_reg | DeviceStatus::FEATURES_OK).bits() as u8 + { + // Step 5: Fix features + status_reg.insert(DeviceStatus::FEATURES_OK) + } else if *status_reg + == DeviceStatus::ACKNOWLEDGE | DeviceStatus::DRIVER | DeviceStatus::FEATURES_OK + && data[0] == (*status_reg | DeviceStatus::DRIVER_OK).bits() as u8 + { + // Step 8: guest OS is ready + status_reg.insert(DeviceStatus::DRIVER_OK); + debug!("Starting RX & TX Threads"); + self.thread_start_channels + .0 + .send(ThreadStartMsg::Start) + .unwrap(); + self.thread_start_channels + .1 + .send(ThreadStartMsg::Start) + .unwrap(); + } else { + error!( + "Invalid status register operation (Status register: {:?}, operation: {:b})", + status_reg, data[0] + ); + *status_reg = DeviceStatus::DEVICE_NEEDS_RESET; + } + self.update_config_generation(); + } + + pub fn read_status_reg(&self) -> u8 { + self.header_caps.pci_config_hdr.status.bits() as u8 + } + + pub fn read_mac_address_bytes(&self, offset: usize, data: &mut [u8]) { + for (d, m) in data + 
.iter_mut() + .zip(self.header_caps.dev.mac.iter()) + .take(6) + .skip(offset) + { + *d = *m; + } + } + + #[cfg(target_os = "linux")] + pub(crate) fn start_network_threads< + TXNOTIFIER: VirtQueueNotificationWaiter + 'static, + RXNOTIFIER: VirtQueueNotificationWaiter + 'static, + INTERRUPTER: VirtQueueInterrupter + 'static, + >( + &mut self, + tx_notifier: TXNOTIFIER, + rx_notifier: RXNOTIFIER, + interrupter: INTERRUPTER, + ) { + let iface = match &self.interface_cfg { + NetworkMode::Tap { name } => { + Box::new(Tap::new(name).expect("Could not create Tap device")) + } + }; + + // store the interfaces MAC address + self.header_caps.dev.mac = iface.mac_address_as_bytes(); + + let (mut rx, mut tx) = iface.split(); + + self.tx_thread = Some({ + let tx_queue = self.tx_queue.clone(); + let mmap = Arc::clone(&self.guest_mmap); + let tx_start_channel_receiver = self.tx_thread_start_channel_receiver.take().unwrap(); + let stop_threads = self.stop_threads.clone(); + thread::spawn(move || { + match tx_start_channel_receiver.recv().unwrap() { + ThreadStartMsg::Abort => return, + ThreadStartMsg::Start => {} + } + debug!("Starting TX thread."); + while !stop_threads.load(Ordering::Relaxed) { + if tx_notifier.wait_with_timeout(UHYVE_NET_READ_TIMEOUT) { + match send_available_packets(&mut tx, &tx_queue, &mmap) { + Ok(_) => {} + Err(VirtIOError::QueueNotReady) => { + error!("Sending before queue is ready!") + } + Err(e) => error!("Error sending frames: {e:?}"), + } + } + } + }) + }); + + self.rx_thread = Some({ + let rx_queue = self.rx_queue.clone(); + let alert = Arc::clone(&self.isr_changed); + let mut frame_queue: VecDeque<([u8; 1500], usize)> = + VecDeque::with_capacity(QUEUE_LIMIT / 2); + let rx_start_channel_receiver = self.rx_thread_start_channel_receiver.take().unwrap(); + let mmap = Arc::clone(&self.guest_mmap); + let stop_threads = self.stop_threads.clone(); + + // reads frames from the frame queue and puts them in the virtio queue. Notifies the driver if necessary. 
+ thread::spawn(move || { + match rx_start_channel_receiver.recv().unwrap() { + ThreadStartMsg::Abort => return, + ThreadStartMsg::Start => {} + } + debug!("Starting RX thread."); + while !stop_threads.load(Ordering::Relaxed) { + let mut buf = [0u8; UHYVE_NET_MTU]; + let len = rx.recv(&mut buf, UHYVE_NET_READ_TIMEOUT).unwrap(); + let mmap = mmap.as_ref(); + frame_queue.push_back((buf, len)); + + assert!( + len <= UHYVE_NET_MTU, + "Frame larger than MTU, was the device reconfigured?" + ); + + match write_packet(&rx_queue, &mut frame_queue, mmap, &rx_notifier) { + Ok(data_sent) => { + if data_sent + && rx_queue + .lock() + .unwrap() + .needs_notification(&mmap.mem) + .unwrap() + { + alert.store(true, Ordering::Release); + interrupter.send_interrupt(); + } + } + Err(VirtIOError::QueueNotReady) => error!("Sending before queue is ready!"), + Err(e) => error!("Could not write frames to guest: {e:?}"), + } + } + }) + }); + + self.header_caps.dev.status = NetDevStatus::VIRTIO_NET_S_LINK_UP; + self.update_config_generation(); + } + + #[inline] + pub fn read_net_status(&self, data: &mut [u8]) { + data.copy_from_slice(&self.header_caps.dev.status.bits().to_le_bytes()) + } + + #[inline] + pub fn read_mtu(&self, data: &mut [u8]) { + data.copy_from_slice(&self.header_caps.dev.mtu.to_le_bytes()) + } + + #[inline] + pub fn read_queue_size(&self, data: &mut [u8]) { + data.copy_from_slice(&self.header_caps.common_cfg.queue_size.to_le_bytes()) + } + + #[inline] + pub fn read_queue_notify_offset(&self, data: &mut [u8]) { + let offs = match self.header_caps.common_cfg.queue_select { + RX_QUEUE => 0, + TX_QUEUE => 1, + _ => { + warn!("driver reads invalid queue"); + 0 + } + }; + data.copy_from_slice(&[offs, 0]); + } + + pub fn write_selected_queue(&mut self, data: &[u8]) { + // let val = u16::from_le_bytes(dest.try_into().unwrap()); + let val = data[0] as u16; + + // VirtIO 4.1.4.3.1: Set queue_size to 0 if current queue is 'unavailable'. 
+ // We only support 2, so handling like this for now. + if val != RX_QUEUE && val != TX_QUEUE { + self.header_caps.common_cfg.queue_size = 0; + } + // trace!("Select queue: {val}"); + self.header_caps.common_cfg.queue_select = val; + self.update_config_generation(); + } + + /// Enable or disable the currently selected queue. + #[inline] + pub fn queue_enable(&mut self, data: &[u8]) { + let val = data[0] as u16; + // let val = u16::from_le_bytes(data.try_into().unwrap()); + + assert!(val == 1 || val == 0, "Invalid queue enable value provided!"); + + let stat = val == 1; + + { + let mut queue = match self.header_caps.common_cfg.queue_select { + RX_QUEUE => self.rx_queue.lock().unwrap(), + TX_QUEUE => self.tx_queue.lock().unwrap(), + _ => { + panic!("Cannot enable invalid queue!") + } + }; + queue.set_ready(stat); + // we'll need to set if we're enabling, as queue is_valid will return false + // the queue is disabled + if stat && !queue.is_valid(&self.guest_mmap.mem) { + error!("tried to set queue as ready, but is not valid") + } + } + self.header_caps.common_cfg.queue_enable = val; + self.update_config_generation(); + } + + /// The driver tells us the addresses of the queues used for communication + pub fn update_queue_addr(&mut self, area: Area, bytes: &[u8]) { + debug!("updating queue address {area:?} to {bytes:x?}"); + let status = self.header_caps.pci_config_hdr.status; + assert!( + status.contains(DeviceStatus::FEATURES_OK), + "Driver tries to set queue addresses before feature negotiation" + ); + assert!( + !status.contains(DeviceStatus::DRIVER_OK), + "Driver tries to set queue addresses after driver initialization" + ); + + { + let mut queue = match self.header_caps.common_cfg.queue_select { + RX_QUEUE => self.rx_queue.as_ref().lock().unwrap(), + TX_QUEUE => self.tx_queue.as_ref().lock().unwrap(), + _ => panic!("Invalid queue selected!"), + }; + + match bytes.len() { + 4 => { + let addr_part = u32::from_le_bytes(bytes.try_into().unwrap()); + match area { + 
Area::DescHigh => queue.set_desc_table_address(None, Some(addr_part)), + Area::DescLow => queue.set_desc_table_address(Some(addr_part), None), + Area::DriverHigh => queue.set_avail_ring_address(None, Some(addr_part)), + Area::DriverLow => queue.set_avail_ring_address(Some(addr_part), None), + Area::DeviceHigh => queue.set_used_ring_address(None, Some(addr_part)), + Area::DeviceLow => queue.set_used_ring_address(Some(addr_part), None), + } + } + 8 => { + let addr_low = u32::from_le_bytes(bytes[0..4].try_into().unwrap()); + let addr_high = u32::from_le_bytes(bytes[4..8].try_into().unwrap()); // upper half is bytes 4..8; a 3-byte slice can never convert to [u8; 4] + match area { + Area::DescLow => { + queue.set_desc_table_address(Some(addr_low), Some(addr_high)) + } + Area::DriverLow => { + queue.set_avail_ring_address(Some(addr_low), Some(addr_high)) + } + Area::DeviceLow => { + queue.set_used_ring_address(Some(addr_low), Some(addr_high)) + } + _ => panic!("Unaligned virtqueue area address"), + } + } + _ => unreachable!("Not a 4 or 8 byte access to the virtqueue configuration"), + } + } + self.update_config_generation(); + } + + pub fn read_config_generation(&mut self, data: &mut [u8; 1]) { + data[0] = self.config_generation.1; + self.config_generation.0 = false; + } + + pub fn write_requested_features(&mut self, data: &[u8]) { + if self + .header_caps + .pci_config_hdr + .status + .contains(DeviceStatus::ACKNOWLEDGE | DeviceStatus::DRIVER) + { + let requested_features: u32 = u32::from_le_bytes(data.try_into().unwrap()); + + self.header_caps.common_cfg.driver_feature = + match self.header_caps.common_cfg.driver_feature_select { + FeatureSelector::Low => { + (self.header_caps.common_cfg.driver_feature | requested_features) + & UHYVE_NET_FEATURES_LOW + } + FeatureSelector::High => { + (self.header_caps.common_cfg.driver_feature | requested_features) + & UHYVE_NET_FEATURES_HIGH + } + } + } + self.update_config_generation(); + } + + pub fn write_device_feature_select(&mut self, data: &[u8]) { +
self.header_caps.common_cfg.device_feature_select = + FeatureSelector::from(u32::from_le_bytes(data.try_into().unwrap())); + self.update_config_generation(); + } + + pub fn write_driver_feature_select(&mut self, data: &[u8]) { + self.header_caps.common_cfg.driver_feature_select = + FeatureSelector::from(u32::from_le_bytes(data.try_into().unwrap())); + self.update_config_generation(); + } + + pub fn read_host_features(&self, data: &mut [u8]) { + match self.header_caps.common_cfg.device_feature_select { + FeatureSelector::Low => data.copy_from_slice(&UHYVE_NET_FEATURES_LOW.to_le_bytes()), + FeatureSelector::High => data.copy_from_slice(&UHYVE_NET_FEATURES_HIGH.to_le_bytes()), + // _ => data.fill(0), // VirtIO 4.1.4.3.1: present zero for any invalid select + } + } + + #[allow(dead_code)] + fn reset_interrupt(&mut self) { + todo!() + } +} +impl Drop for VirtioNetPciDevice { + fn drop(&mut self) { + self.thread_start_channels + .0 + .send(ThreadStartMsg::Abort) + .unwrap(); + self.thread_start_channels + .1 + .send(ThreadStartMsg::Abort) + .unwrap(); + self.stop_threads.store(true, Ordering::Relaxed); + if let Some(rx_thread) = self.rx_thread.take() { + rx_thread.join().unwrap() + } + if let Some(tx_thread) = self.tx_thread.take() { + tx_thread.join().unwrap() + } + } +} + +impl PciDevice for VirtioNetPciDevice { + fn handle_read(&mut self, address: PciConfigurationAddress, dest: &mut [u8]) { + match address.offset().0 { + IsrStatus::ISR_FLAGS => self.read_isr_notify(dest), + ComCfg::DEVICE_STATUS => dest[0] = self.read_status_reg(), + ComCfg::DEVICE_FEATURE => self.read_host_features(dest), + ComCfg::CONFIG_GENERATION => self.read_config_generation(dest.try_into().unwrap()), + ComCfg::QUEUE_SIZE => self.read_queue_size(dest), + ComCfg::QUEUE_NOTIFY_OFFSET => self.read_queue_notify_offset(dest), + NetDevCfg::MAC_ADDRESS..NetDevCfg::MAC_ADDRESS_END => { + let offs = address.offset().0 - NetDevCfg::MAC_ADDRESS; + self.read_mac_address_bytes(offs as usize, dest); + } + 
NetDevCfg::NET_STATUS => self.read_net_status(dest), + NetDevCfg::MTU => self.read_mtu(dest), + ComCfg::QUEUE_RESET => self.read_queue_reset(dest), + _ => { + if let Err(e) = self.header_caps.read(address.offset(), dest) { + error!("PCI Read error: {e}"); + } + } + } + } + + fn handle_write(&mut self, address: PciConfigurationAddress, data: &[u8]) { + match address.offset().0 { + ComCfg::DEVICE_STATUS => self.write_status(data), + ComCfg::DRIVER_FEATURE_SELECT => self.write_driver_feature_select(data), + ComCfg::DEVICE_FEATURE_SELECT => self.write_device_feature_select(data), + ComCfg::DRIVER_FEATURE => self.write_requested_features(data), + ComCfg::QUEUE_SELECT => self.write_selected_queue(data), + ComCfg::QUEUE_DESC_LOW => self.update_queue_addr(Area::DescLow, data), + ComCfg::QUEUE_DESC_HIGH => self.update_queue_addr(Area::DescHigh, data), + ComCfg::QUEUE_ENABLE => self.queue_enable(data), + ComCfg::QUEUE_DRIVER_LOW => self.update_queue_addr(Area::DriverLow, data), + ComCfg::QUEUE_DRIVER_HIGH => self.update_queue_addr(Area::DriverHigh, data), + ComCfg::QUEUE_DEVICE_LOW => self.update_queue_addr(Area::DeviceLow, data), + ComCfg::QUEUE_DEVICE_HIGH => self.update_queue_addr(Area::DeviceHigh, data), + ComCfg::QUEUE_RESET => self.write_reset_queue(), + IsrStatus::ISR_FLAGS => { + panic!("Guest should not write to ISR!") + } + HeaderConf::NOTIFY_REGION_START..HeaderConf::NOTIFY_REGION_END => { + panic!("Writing to MemNotify address! Is IOEventFD correctly configured?") + } + _ => { + if let Err(e) = self.header_caps.write(address.offset(), data) { + error!("PCI Write error: {e}"); + } + } + } + } +} + +/// Write host-received packets to the virtio-queue. 
+/// Returns true if notification must occur +pub fn write_packet( + rx_queue: &Arc>, + frame_queue: &mut VecDeque<([u8; UHYVE_NET_MTU], usize)>, + mmap: &MmapMemory, + notifier: &NOTIFIER, +) -> Result { + let mut queue = rx_queue.lock().unwrap(); + + if !queue.is_valid(&mmap.mem) { + error!("Queue is not valid!"); + return Err(VirtIOError::InvalidSize); + } + + if !queue.ready() { + error!("QueueTx not ready!"); + return Err(VirtIOError::QueueNotReady); + } + + queue.disable_notification(&mmap.mem)?; + + for &(frame, len) in frame_queue.iter() { + debug!("Transmitting: writing host-received frame of length {len} into virtqueue"); + + let header = virtio_net_hdr_v1 { + num_buffers: 1, + ..Default::default() + }; + + let desc_chain; + loop { + if let Some(d) = queue.pop_descriptor_chain(&mmap.mem) { + desc_chain = d; + break; + } + queue.enable_notification(&mmap.mem)?; + notifier.wait_for_notify(); + queue.disable_notification(&mmap.mem)?; + } + + let mut writer = desc_chain.clone().writer(&mmap.mem).unwrap(); + writer + .write_all(unsafe { + std::slice::from_raw_parts( + &header as *const _ as *const u8, + size_of::(), + ) + }) + .unwrap(); + writer.write_all(&frame[..len]).unwrap(); // only the received bytes, matching the length reported to add_used below + trace!( + "Transmitting: Putting index {} to used ring (next used: {}, size: {})", + desc_chain.head_index(), + queue.next_used(), + queue.size() + ); + queue + .add_used( + &mmap.mem, + desc_chain.head_index(), + (len + VIRTIO_NET_HEADER_SZ) as u32, + ) + .unwrap(); + } + frame_queue.clear(); + + queue.enable_notification(&mmap.mem)?; + + Ok(true) +} + +/// Sends the packets received from the guest to the network interface +pub fn send_available_packets( + sink: &mut dyn NetworkInterfaceTX, + tx_queue_locked: &Arc>, + mem: &MmapMemory, +) -> std::result::Result { + trace!("reading frames from VM"); + let queue = &mut tx_queue_locked.lock().unwrap(); // block on contention; try_lock would panic while another thread holds the queue + if !queue.is_valid(&mem.mem) { + error!("Queue is not valid!"); + return Err(VirtIOError::InvalidSize); + } + + if
!queue.ready() { + error!("QueueTx not ready!"); + return Err(VirtIOError::QueueNotReady); + } + + queue.disable_notification(&mem.mem)?; + + while let Some(chain) = queue.iter(&mem.mem).unwrap().next() { + let mut buff = Vec::::with_capacity(1512); + let mut reader = chain.clone().reader(&mem.mem).unwrap(); + let mut packet_reader = reader.split_at(VIRTIO_NET_HEADER_SZ).unwrap(); + + let header_bytes_read = reader.read_to_end(&mut buff).unwrap(); + let packet_bytes_read = packet_reader.read_to_end(&mut buff).unwrap(); + trace!("received frame of length {packet_bytes_read} from VM"); + + match (*sink).send(&buff[VIRTIO_NET_HEADER_SZ..]) { + Ok(sent_len) => { + if sent_len != packet_bytes_read { + error!( + "Could not send all data provided! sent {sent_len}, vs {packet_bytes_read}" + ); + } + } + Err(e) => { + error!("could not send frame: {e}"); + error!("frame slice: {:x?}", &buff[VIRTIO_NET_HEADER_SZ..]); + } + } + + queue.add_used( + &mem.mem, + chain.head_index(), + (header_bytes_read + packet_bytes_read) as u32, + )?; + } + queue.enable_notification(&mem.mem)?; + + Ok(true) +} diff --git a/src/virtio/pci.rs b/src/virtio/pci.rs new file mode 100644 index 00000000..6b376f75 --- /dev/null +++ b/src/virtio/pci.rs @@ -0,0 +1,139 @@ +//! Configuration Structures for Virtio PCI devices + +#![cfg_attr(not(target_os = "linux"), expect(unused))] + +use std::mem::size_of; + +use zerocopy::{Immutable, IntoBytes}; + +use crate::{ + pci::{PciConfigurationOffset, PciError, PciType0ConfigSpaceHeader}, + virtio::{VirtqueueNotification, capabilities::*}, +}; + +/// Helper macro to calculate the byte offset of a field in a struct. +/// offset is a base offset of the struct that is added to the calculation, +/// ty is the struct and field is ty's field to calculate the offset from. +macro_rules! 
get_offset { + ($offset:expr, $ty:ty, $field:ident) => { + unsafe { + let base_ptr: *const _ = std::mem::MaybeUninit::<$ty>::uninit().as_ptr(); + let f: *const _ = std::ptr::addr_of!((*base_ptr).$field); + crate::pci::PciConfigurationOffset( + (f as *const u8).offset_from(base_ptr as *const u8) as u8 + $offset as u8, + ) + } + }; +} +// Make macro visible for uhyve +pub(crate) use get_offset; + +/// The default memory layout of the PCI header and the capabilities looks as follows: +/// ```text +/// 0x00 ┌─────────────────────────┐ +/// │ PCI Header │────┐ +/// 0x40 ├─────────────────────────┤◄───┘ +/// │ Common Capability ├────────┐ +/// 0x50 ├─────────────────────────┤ │ +/// │ ISR Capability ├──────┐ │ +/// 0x60 ├─────────────────────────┤ │ │ +/// │ Notify Capability ├────┐ │ │ +/// 0x78 ├─────────────────────────┤ │ │ │ +/// │ Device Capability ├──┐ │ │ │ +/// 0x88 ├─────────────────────────┤◄─┼─┼─┼─┘ +/// │ Common Configuration │ │ │ │ +/// 0xC8 ├─────────────────────────┤◄─┼─┼─┘ +/// │ ISR Configuration │ │ │ +/// 0xD0 ├─────────────────────────┤◄─┼─┘ +/// │ Notification Region │ │ +/// 0xD8 ├─────────────────────────┤◄─┘ +/// │ Device Configuration │ +/// 0xF0 └─────────────────────────┘ +/// ``` +#[derive(Default, Debug, IntoBytes, Immutable)] +#[repr(C)] +pub(crate) struct HeaderConf { + pub pci_config_hdr: PciType0ConfigSpaceHeader, + pub common_cap: PciCap, + pub isr_cap: PciCap, + pub notify_cap: NotifyCap, + _padding0: u32, + pub device_cap: PciCap, + pub common_cfg: ComCfg, + pub _isr: IsrStatus, + _padding1: [u8; 7], + pub _notif: [VirtqueueNotification; 2], + pub dev: NetDevCfg, +} +impl HeaderConf { + pub const HDR_START: u8 = get_offset!(0, HeaderConf, pci_config_hdr).0; + pub const HDR_END: u8 = Self::COMMON_CAP_START - 1; + pub const COMMON_CAP_START: u8 = get_offset!(0, HeaderConf, common_cap).0; + pub const ISR_CAP_START: u8 = get_offset!(0, HeaderConf, isr_cap).0; + pub const NOTIFY_CAP_START: u8 = get_offset!(0, HeaderConf, notify_cap).0; + 
pub const DEVICE_CAP_START: u8 = get_offset!(0, HeaderConf, device_cap).0; + pub const COMMON_CFG_START: u8 = get_offset!(0, HeaderConf, common_cfg).0; + pub const ISR_CFG_START: u8 = get_offset!(0, HeaderConf, _isr).0; + pub const ISR_CFG_END: u8 = Self::NOTIFY_REGION_START - 1; + pub const NOTIFY_REGION_START: u8 = get_offset!(0, HeaderConf, _notif).0; + pub const NOTIFY_0: u8 = Self::NOTIFY_REGION_START; + pub const NOTIFY_1: u8 = Self::NOTIFY_REGION_START + size_of::() as u8; + pub const NOTIFY_REGION_END: u8 = Self::DEVICE_CFG_START - 1; + pub const DEVICE_CFG_START: u8 = get_offset!(0, HeaderConf, dev).0; + pub const DEVICE_CFG_END: u8 = Self::DEVICE_CFG_START + size_of::() as u8 - 1; + + /// Provides the empty but linked datastructures for VirtioPCI. See module level description for the internal memory layout. + pub fn new() -> Self { + let mut h: Self = Default::default(); + h.pci_config_hdr.capabilities_ptr = Self::COMMON_CAP_START; + h.common_cap.cap_next = Self::ISR_CAP_START; + h.common_cap.offset = Self::COMMON_CFG_START as u32; + h.common_cap.cfg_type = CfgType::COMMON_CFG; + h.common_cap.length = size_of::() as u32; + + h.isr_cap.cap_next = Self::NOTIFY_CAP_START; + h.isr_cap.offset = Self::ISR_CFG_START as u32; + h.isr_cap.cfg_type = CfgType::ISR_CFG; + h.isr_cap.length = size_of::() as u32; + + h.notify_cap.cap.cap_next = Self::DEVICE_CAP_START; + h.notify_cap.cap.offset = Self::NOTIFY_REGION_START as u32; + h.notify_cap.cap.cfg_type = CfgType::NOTIFY_CFG; + + h.device_cap.cap_next = 0; + h.device_cap.offset = Self::DEVICE_CFG_START as u32; + h.device_cap.cfg_type = CfgType::DEVICE_CFG; + h.device_cap.length = size_of::() as u32; + + h + } + + pub fn read(&self, address: PciConfigurationOffset, dest: &mut [u8]) -> Result<(), PciError> { + let a = address.0; + match a { + Self::ISR_CFG_START..=Self::ISR_CFG_END => unreachable!(), + Self::NOTIFY_REGION_START..=Self::NOTIFY_REGION_END => unreachable!(), + 0..Self::DEVICE_CFG_END => { + 
dest.copy_from_slice( + &self.as_bytes()[address.0 as usize..address.0 as usize + dest.len()], + ); + Ok(()) + } + + _ => Err(PciError::InvalidOffset(address)), + } + } + pub fn write(&mut self, address: PciConfigurationOffset, data: &[u8]) -> Result<(), PciError> { + let a = address.0; + match a { + ComCfg::DEVICE_FEATURE_SELECT + | ComCfg::NUM_QUEUES + | ComCfg::CONFIG_GENERATION + | ComCfg::QUEUE_NOTIFY_OFFSET + | ComCfg::QUEUE_NOTIFY_DATA => Err(PciError::ReadOnlyOffset(address)), + Self::HDR_START..=Self::HDR_END => self.pci_config_hdr.write(address, data), + Self::COMMON_CAP_START..=Self::DEVICE_CFG_END => Err(PciError::ReadOnlyOffset(address)), + _ => Err(PciError::InvalidOffset(address)), + } + } +} diff --git a/src/virtqueue.rs b/src/virtqueue.rs deleted file mode 100644 index acdbe172..00000000 --- a/src/virtqueue.rs +++ /dev/null @@ -1,149 +0,0 @@ -#![cfg_attr(target_os = "macos", allow(dead_code))] // no virtio implementation for macos -use std::{marker::PhantomData, mem, mem::size_of}; - -use crate::consts::PAGE_SIZE; - -pub const QUEUE_LIMIT: usize = 256; - -#[repr(C)] -pub struct VringDescriptor { - pub addr: u64, - pub len: u32, - pub flags: u16, - pub next: u16, -} - -pub struct Vring { - mem: *const u8, - _marker: PhantomData<*const T>, -} - -impl Vring { - pub fn new(mem: *const u8) -> Self { - Vring { - mem, - _marker: PhantomData, - } - } - - pub fn _flags(&self) -> u16 { - unsafe { - #[expect(clippy::cast_ptr_alignment)] - *(self.mem as *const u16) - } - } - - pub fn index(&self) -> u16 { - unsafe { - #[expect(clippy::cast_ptr_alignment)] - *(self.mem.offset(2) as *const u16) - } - } - - pub fn advance_index(&mut self) { - unsafe { - let new_value = self.index() + 1; - #[expect(clippy::cast_ptr_alignment)] - let write_ptr = self.mem.offset(2) as *mut u16; - *write_ptr = new_value; - } - } - - pub fn ring_elem(&mut self, index: u16) -> &mut T { - let elem_size = mem::size_of::() as u16; - unsafe { &mut *(self.mem.offset((4 + index * elem_size) 
as isize) as *mut T) } - } -} - -#[repr(C)] -pub struct VringUsedElement { - pub id: u32, - pub len: u32, -} - -pub type VringAvailable = Vring; -pub type VringUsed = Vring; - -pub struct Virtqueue { - pub descriptor_table: *mut VringDescriptor, - pub available_ring: VringAvailable, - pub used_ring: VringUsed, - pub last_seen_available: u16, - #[expect(dead_code)] - pub last_seen_used: u16, - pub queue_size: u16, -} - -pub struct AvailIter<'a> { - available_ring: &'a VringAvailable, - last_seen_available: &'a mut u16, - queue_size: u16, -} - -impl Iterator for AvailIter<'_> { - type Item = u16; - - fn next(&mut self) -> Option { - if *self.last_seen_available == self.available_ring.index() { - return None; - } - - let index = *self.last_seen_available % self.queue_size; - *self.last_seen_available += 1; - Some(index) - } -} - -fn align(addr: usize, boundary: usize) -> usize { - (addr + boundary - 1) & !(boundary - 1) -} - -fn get_available_ring_offset() -> usize { - size_of::() * QUEUE_LIMIT -} - -fn get_used_ring_offset() -> usize { - align( - get_available_ring_offset() + size_of::() * (QUEUE_LIMIT + 3), - PAGE_SIZE, - ) -} - -impl Virtqueue { - pub unsafe fn new(mem: *mut u8, queue_size: usize) -> Self { - #[expect(clippy::cast_ptr_alignment)] - let descriptor_table = mem as *mut VringDescriptor; - let available_ring_ptr = unsafe { mem.add(get_available_ring_offset()) }; - let used_ring_ptr = unsafe { mem.add(get_used_ring_offset()) }; - let available_ring = VringAvailable::new(available_ring_ptr); - let used_ring = VringUsed::new(used_ring_ptr); - Virtqueue { - descriptor_table, - available_ring, - used_ring, - last_seen_available: 0, - last_seen_used: 0, - queue_size: queue_size as u16, - } - } - - pub unsafe fn get_descriptor(&mut self, index: u16) -> &mut VringDescriptor { - unsafe { &mut *self.descriptor_table.offset(index as isize) } - } - - pub fn avail_iter(&mut self) -> AvailIter<'_> { - AvailIter { - available_ring: &self.available_ring, - 
last_seen_available: &mut self.last_seen_available, - queue_size: self.queue_size, - } - } - - pub fn add_used(&mut self, desc_index: u32, len: u32) { - let tgt_index = self.used_ring.index() % self.queue_size; - let used_elem = self.used_ring.ring_elem(tgt_index); - used_elem.id = desc_index; - used_elem.len = len; - self.used_ring.advance_index(); - } -} diff --git a/src/vm.rs b/src/vm.rs index 57c0e3e3..91ca87fc 100644 --- a/src/vm.rs +++ b/src/vm.rs @@ -35,7 +35,6 @@ use crate::{ serial::{Destination, UhyveSerial}, stats::{CpuStats, VmStats}, vcpu::VirtualCPU, - virtio::*, }; #[cfg(target_os = "linux")] use crate::{ @@ -63,10 +62,12 @@ pub type DefaultBackend = crate::linux::x86_64::kvm_cpu::KvmVm; pub type DefaultBackend = crate::macos::XhyveVm; pub(crate) mod internal { - use std::sync::Arc; + use std::{fmt::Debug, sync::Arc}; use crate::{ HypervisorResult, + mem::MmapMemory, + params::NetworkMode, vcpu::VirtualCPU, vm::{KernelInfo, Params, VmPeripherals}, }; @@ -74,6 +75,7 @@ pub(crate) mod internal { /// Trait marking a interface for creating (accelerated) VMs. 
pub trait VirtualizationBackendInternal: Sized { type VCPU: 'static + VirtualCPU; + type VirtioNetImpl: Debug; const NAME: &str; /// Create a new CPU object @@ -84,7 +86,9 @@ pub(crate) mod internal { enable_stats: bool, ) -> HypervisorResult; - fn new(peripherals: Arc, params: &Params) -> HypervisorResult; + fn new(peripherals: Arc>, params: &Params) -> HypervisorResult; + + fn virtio_net_device(mode: NetworkMode, mmap: Arc) -> Self::VirtioNetImpl; } } @@ -101,17 +105,17 @@ pub struct VmResult { /// mutable devices that a vCPU interacts with #[derive(Debug)] -pub(crate) struct VmPeripherals { +pub(crate) struct VmPeripherals { pub file_mapping: Mutex, - pub mem: MmapMemory, + pub mem: Arc, pub(crate) serial: UhyveSerial, - pub virtio_device: Mutex, + pub virtio_device: Option>, } // TODO: Investigate soundness // https://github.com/hermitcore/uhyve/issues/229 -unsafe impl Send for VmPeripherals {} -unsafe impl Sync for VmPeripherals {} +unsafe impl Send for VmPeripherals {} +unsafe impl Sync for VmPeripherals {} /// static information that does not change during execution #[derive(Debug)] @@ -203,8 +207,9 @@ pub(crate) fn generate_guest_start_address( pub struct UhyveVm { pub(crate) vcpus: Vec<::VCPU>, - pub(crate) peripherals: Arc, + pub(crate) peripherals: Arc>, pub(crate) kernel_info: Arc, + _virt_backend: VirtBackend::BACKEND, } impl UhyveVm { pub fn new(kernel_path: PathBuf, mut params: Params) -> HypervisorResult> { @@ -425,10 +430,31 @@ impl UhyveVm { stack_address, }); + // create virtio interface + let mem = Arc::new(mem); + if let Some(version) = hermit_version + && kernel_info.params.network.is_some() + && (version + < HermitVersion { + major: 0, + minor: 13, + patch: 2, + }) { + return Err(HypervisorError::FeatureMismatch( + "Network requires Kernel 0.13.2 or newer", + )); + } + let virtio_device = kernel_info.params.network.as_ref().map(|mode| { + Mutex::new(VirtBackend::BACKEND::virtio_net_device( + mode.clone(), + mem.clone(), + )) + }); + let 
peripherals = Arc::new(VmPeripherals { mem, // create virtio interface - virtio_device: Mutex::new(VirtioNetPciDevice::new()), + virtio_device, // TODO: file_mapping not in kernel_info file_mapping: Mutex::new(file_mapping), serial, @@ -500,6 +526,7 @@ impl UhyveVm { peripherals, kernel_info, vcpus, + _virt_backend: virt_backend, }) } diff --git a/tests/common.rs b/tests/common.rs index 76f83e64..67d3fdb7 100644 --- a/tests/common.rs +++ b/tests/common.rs @@ -28,6 +28,9 @@ pub enum BuildMode { Release, } +pub const HERMIT_GATEWAY: &str = "10.0.5.2"; +pub const HERMIT_IP: &str = "10.0.5.3"; + /// Uses Cargo to build a kernel in the `tests/test-kernels` directory. /// Returns a path to the build binary. pub fn build_hermit_bin(kernel: impl AsRef, mode: BuildMode) -> PathBuf { @@ -57,6 +60,8 @@ pub fn build_hermit_bin(kernel: impl AsRef, mode: BuildMode) -> PathBuf { .arg("--target=x86_64-unknown-hermit") .arg("--bin") .arg(kernel) + .env("HERMIT_IP", HERMIT_IP) + .env("HERMIT_GATEWAY", HERMIT_GATEWAY) .current_dir(&kernel_src_path); cmd = if mode == BuildMode::Release { diff --git a/tests/network-test.rs b/tests/network-test.rs new file mode 100644 index 00000000..2f1e5c43 --- /dev/null +++ b/tests/network-test.rs @@ -0,0 +1,275 @@ +mod common; + +use std::{ + io::{Read, Write}, + net::{Shutdown, TcpListener, TcpStream}, + sync::Mutex, + thread, + time::Instant, +}; + +use byte_unit::{Byte, Unit}; +use common::{BuildMode, HERMIT_GATEWAY, HERMIT_IP, build_hermit_bin, check_result}; +use regex::Regex; +use uhyvelib::{ + params::{NetworkMode, Output, Params}, + vm::UhyveVm, +}; + +static NETWORK_TEST_MUTEX: Mutex<()> = Mutex::new(()); + +#[test] +fn network_guest_receive_test() { + let mut builder = env_logger::Builder::from_default_env(); + // The precise timestampe can be important when debugging networking, + builder.format_timestamp_nanos().try_init().ok(); + + let bin_path = build_hermit_bin("network_test", BuildMode::Debug); + let params = Params { + cpu_count: 
1.try_into().unwrap(), + memory_size: Byte::from_u64_with_unit(64, Unit::MiB) + .unwrap() + .try_into() + .unwrap(), + output: Output::Buffer, + stats: true, + aslr: false, + network: Some(NetworkMode::Tap { + name: "tap10".to_string(), + }), + kernel_args: vec![ + "--".to_owned(), + "testname=simple_receive_test".to_owned(), + "test_argument=".to_owned(), + ], + ..Default::default() + }; + + let _guard = NETWORK_TEST_MUTEX.lock(); + + let t = thread::spawn(move || { + let mut hermit_ip = String::from(HERMIT_IP); + hermit_ip.push_str(":9975"); + let mut stream = TcpStream::connect(hermit_ip).unwrap(); + for i in 0..10_u8 { + let mut v = Vec::with_capacity(i as usize); + for _ in 0..=i { + v.push(b'a' + i); + } + println!("Sending {v:?}"); + stream.write_all(&v).unwrap(); + } + stream.write_all(b"exit").unwrap(); + }); + + let res = UhyveVm::new(bin_path, params).unwrap().run(None); + check_result(&res); + + t.join().unwrap(); + + for t in ["a", "bb", "ccc", "dddd", "eeeee"] { + assert!(res.output.as_ref().unwrap().contains(t)); + } +} + +#[test] +fn network_guest_send_test() { + let mut builder = env_logger::Builder::from_default_env(); + // The precise timestampe can be important when debugging networking, + builder.format_timestamp_nanos().try_init().ok(); + + let bin_path = build_hermit_bin("network_test", BuildMode::Debug); + let params = Params { + cpu_count: 1.try_into().unwrap(), + memory_size: Byte::from_u64_with_unit(64, Unit::MiB) + .unwrap() + .try_into() + .unwrap(), + output: Output::Buffer, + stats: true, + aslr: false, + network: Some(NetworkMode::Tap { + name: "tap10".to_string(), + }), + kernel_args: vec![ + "--".to_owned(), + "testname=simple_send_test".to_owned(), + "test_argument=".to_owned() + HERMIT_GATEWAY + ":9975", + ], + ..Default::default() + }; + + let _guard = NETWORK_TEST_MUTEX.lock(); + let t = thread::spawn(move || { + let listener = TcpListener::bind(HERMIT_GATEWAY.to_string() + ":9975").unwrap(); + println!("socket bound"); + let 
(mut socket, _) = listener.accept().unwrap(); + println!("connection established"); + let mut received_bytes = Vec::new(); + loop { + let mut buf = [0u8; 1500]; + match socket.read(&mut buf) { + Err(e) => { + println!("read err {e:?}"); + } + Ok(received) => { + println!("read {}", std::str::from_utf8(&buf[..received]).unwrap()); + received_bytes.extend_from_slice(&buf[..received]); + if buf.windows(4).any(|window| window == b"exit") { + break; + } + } + } + } + println!("received bytes: {received_bytes:?}"); + for t in ["a", "bb", "ccc", "dddd", "eeeee"] { + assert!( + received_bytes + .windows(t.len()) + .any(|window| window == t.as_bytes()) + ); + } + }); + + let res = UhyveVm::new(bin_path, params).unwrap().run(None); + check_result(&res); + + t.join().unwrap(); +} + +const TOTAL_BYTES: u64 = 128 * 1024 * 1024; // 128 MiB + +#[test] +fn network_receive_large() { + env_logger::try_init().ok(); + let kernel_path = build_hermit_bin("network_test", BuildMode::Debug); + + let params = Params { + cpu_count: 1.try_into().unwrap(), + memory_size: Byte::from_u64_with_unit(64, Unit::MiB) + .unwrap() + .try_into() + .unwrap(), + output: Output::Buffer, + stats: true, + aslr: false, + network: Some(NetworkMode::Tap { + name: "tap10".to_string(), + }), + kernel_args: vec![ + "--".to_owned(), + "testname=receive_bench".to_owned(), + "test_argument=".to_owned(), + ], + ..Default::default() + }; + + let _guard = NETWORK_TEST_MUTEX.lock(); + let t = thread::spawn(move || { + let mut hermit_ip = String::from(HERMIT_IP); + hermit_ip.push_str(":9975"); + let mut stream = TcpStream::connect(hermit_ip).unwrap(); + + let buf = vec![123u8; 64 * 1024]; // Bytes without meaning + let mut sent: u64 = 0; + + let start = Instant::now(); + + while sent < TOTAL_BYTES { + let remaining = (TOTAL_BYTES - sent) as usize; + let to_send = remaining.min(buf.len()); + stream.write_all(&buf[..to_send]).unwrap(); + sent += to_send as u64; + } + + stream.shutdown(Shutdown::Write).unwrap(); + let 
elapsed = start.elapsed(); + let secs = elapsed.as_secs_f64(); + + println!("Sent {sent} bytes in {secs:.3} s"); + let mbit = (sent as f64 * 8.0) / (secs * 1_000_000.0); + println!("Throughput (sending): {mbit:.2} Mbit/s"); + }); + + let res = UhyveVm::new(kernel_path.clone(), params).unwrap().run(None); + + check_result(&res); + println!("Kernel Output:\n{}", res.output.as_ref().unwrap()); + + let re = Regex::new(r"(?m)^Received\s*([0-9]+)\s+Bytes").unwrap(); + + let caps = re + .captures(res.output.as_ref().unwrap()) + .expect("kernel output doesn't container received bytes"); + let bytes_received: u64 = caps[1].parse().expect("invalid number"); + + assert_eq!(TOTAL_BYTES, bytes_received); + + t.join().unwrap(); +} + +#[test] +fn network_send_large() { + env_logger::try_init().ok(); + let kernel_path = build_hermit_bin("network_test", BuildMode::Debug); + + let params = Params { + cpu_count: 1.try_into().unwrap(), + memory_size: Byte::from_u64_with_unit(64, Unit::MiB) + .unwrap() + .try_into() + .unwrap(), + output: Output::Buffer, + stats: true, + aslr: false, + network: Some(NetworkMode::Tap { + name: "tap10".to_string(), + }), + kernel_args: vec![ + "--".to_owned(), + "testname=send_bench".to_owned(), + format!("test_argument={HERMIT_GATEWAY}:9975/{TOTAL_BYTES}").to_owned(), + ], + ..Default::default() + }; + + let _guard = NETWORK_TEST_MUTEX.lock(); + let t = thread::spawn(move || { + let listener = TcpListener::bind(HERMIT_GATEWAY.to_string() + ":9975").unwrap(); + println!("socket bound"); + let (mut stream, peer) = listener.accept().unwrap(); + println!("Got connection from {}", peer); + + stream.set_nodelay(true).unwrap(); + + let mut buf = vec![0u8; 8192]; + let mut received: u64 = 0; + + let start = Instant::now(); + loop { + let n = stream.read(&mut buf).unwrap(); + if n == 0 { + // connection terminated + break; + } + received += n as u64; + } + + let elapsed = start.elapsed(); + let secs = elapsed.as_secs_f64(); + + println!("Received {received} 
bytes in {secs:.3} s"); + let mbit = (received as f64 * 8.0) / (secs * 1_000_000.0); + println!("Throughput (receiving): {mbit:.2} Mbit/s"); + received + }); + + let res = UhyveVm::new(kernel_path.clone(), params).unwrap().run(None); + + let received = t.join().unwrap(); + + check_result(&res); + println!("Kernel Output:\n{}", res.output.as_ref().unwrap()); + + assert_eq!(TOTAL_BYTES, received); +} diff --git a/tests/test-kernels/Cargo.lock b/tests/test-kernels/Cargo.lock index 5e46abdf..5b1c1587 100644 --- a/tests/test-kernels/Cargo.lock +++ b/tests/test-kernels/Cargo.lock @@ -76,8 +76,8 @@ checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "hermit" -version = "0.13.1" -source = "git+https://github.com/hermit-os/hermit-rs.git?tag=hermit-0.13.1#8d32adb0ca02460e13228e45981fbd3c6a409af9" +version = "0.13.2" +source = "git+https://github.com/hermit-os/hermit-rs.git?tag=hermit-0.13.2#0e68850e7b848656f8704b6c1fdc3a09685cb4de" dependencies = [ "cc", "home", diff --git a/tests/test-kernels/Cargo.toml b/tests/test-kernels/Cargo.toml index e03f3f7e..6b744cca 100644 --- a/tests/test-kernels/Cargo.toml +++ b/tests/test-kernels/Cargo.toml @@ -7,7 +7,11 @@ publish = false [target.'cfg(target_os = "hermit")'.dependencies.hermit] git = "https://github.com/hermit-os/hermit-rs.git" -tag = "hermit-0.13.1" +tag = "hermit-0.13.2" +# We disable dhcp to use a hard-coded IPv4 +default-features = false +features = ["pci", "pci-ids", "acpi", "fsgsbase", "smp", "tcp", "vsock", "virtio-net"] + [target.'cfg(target_arch = "x86_64")'.dependencies] x86_64 = { version = "0.15", default-features = false, features = [ diff --git a/tests/test-kernels/src/bin/network_test.rs b/tests/test-kernels/src/bin/network_test.rs new file mode 100644 index 00000000..67b921db --- /dev/null +++ b/tests/test-kernels/src/bin/network_test.rs @@ -0,0 +1,128 @@ +#![allow(dead_code)] +#![allow(unused_imports)] + +use std::{ + env, + fs::File, + io::{Error, Read, Write}, + 
net::{Ipv4Addr, Shutdown, SocketAddrV4, TcpListener, TcpStream}, + time::Instant, +}; + +#[cfg(target_os = "hermit")] +use hermit as _; + +fn simple_receive_test() -> Result<(), Error> { + let listener = TcpListener::bind("127.0.0.1:9975").unwrap(); + println!("socket bound"); + let (mut socket, _) = listener.accept().unwrap(); + println!("connection established"); + loop { + let mut buf = [0u8; 1500]; + match socket.read(&mut buf) { + Err(e) => { + println!("read err {e:?}"); + } + Ok(received) => { + println!("read {}", std::str::from_utf8(&buf[..received]).unwrap()); + if buf.windows(4).any(|window| window == b"exit") { + break; + } + } + } + } + Ok(()) +} + +fn simple_send_test(host_ip_port: &str) -> Result<(), Error> { + let mut stream = TcpStream::connect(host_ip_port).expect("Can't connect to host"); + for i in 0..10_u8 { + let mut v = Vec::with_capacity(i as usize); + for _ in 0..=i { + v.push(b'a' + i); + } + println!("Sending {v:?}"); + stream.write_all(&v)?; + } + stream.write_all(b"exit")?; + Ok(()) +} + +fn send_bench(host_ip_send_cnt: &str) -> Result<(), Error> { + let v = host_ip_send_cnt.split("/").collect::>(); + + let total_bytes: u64 = v[1].parse().unwrap(); + + let mut stream = TcpStream::connect(v[0]).expect("Can't connect to host"); + + let buf = vec![123u8; 64 * 1024]; // Bytes without meaning + let mut sent: u64 = 0; + + let start = Instant::now(); + + while sent < total_bytes { + let remaining = (total_bytes - sent) as usize; + let to_send = remaining.min(buf.len()); + stream.write_all(&buf[..to_send]).unwrap(); + sent += to_send as u64; + } + + stream.shutdown(Shutdown::Write).unwrap(); + let elapsed = start.elapsed(); + let secs = elapsed.as_secs_f64(); + + println!("Sent {sent} bytes in {secs:.3} s"); + let mbit = (sent as f64 * 8.0) / (secs * 1_000_000.0); + println!("Throughput (sending): {mbit:.2} Mbit/s"); + + Ok(()) +} + +fn receive_bench() -> Result<(), Error> { + let listener = TcpListener::bind("127.0.0.1:9975").unwrap(); + 
println!("Waiting for connection..."); + + let (mut stream, peer) = listener.accept()?; + println!("Got connection from {peer}"); + + stream.set_nodelay(true)?; + + let mut buf = vec![0u8; 8192]; + let mut received: u64 = 0; + + let start = Instant::now(); + + loop { + let n = stream.read(&mut buf)?; + if n == 0 { + // connection terminated + break; + } + received += n as u64; + } + + let elapsed = start.elapsed(); + let secs = elapsed.as_secs_f64(); + + println!("Received {received} Bytes in {secs:.3} s"); + let mbit = (received as f64 * 8.0) / (secs * 1_000_000.0); + println!("Throughput (receiving): {mbit:.2} Mbit/s"); + + Ok(()) +} + +fn main() -> Result<(), Error> { + let args: Vec = env::args().collect(); + let testname = &args[1].split('=').collect::>()[1]; + let test_argument = &args[2].split('=').collect::>()[1]; + + println!("Network Test - {testname}"); + + match *testname { + "simple_receive_test" => simple_receive_test(), + "simple_send_test" => simple_send_test(test_argument), + "receive_bench" => receive_bench(), + "send_bench" => send_bench(test_argument), + _ => panic!("test {testname} not found"), + } +}