diff --git a/CODEOWNERS b/CODEOWNERS index 6c46db6201a86..d2a65a7c201a0 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -386,6 +386,7 @@ extensions/upstreams/tcp @ggreenway @mattklein123 /*/extensions/load_balancing_policies/client_side_weighted_round_robin @wbpcode @adisuissa @efimki /*/extensions/load_balancing_policies/override_host @yanavlasov @tonya11en /*/extensions/load_balancing_policies/wrr_locality @wbpcode @adisuissa @efimki +/*/extensions/load_balancing_policies/load_aware_locality @tonya11en @jukie # Early header mutation /*/extensions/http/early_header_mutation/header_mutation @wbpcode @tyxia # Network matching extensions diff --git a/api/BUILD b/api/BUILD index 2b36b337b1c34..0977bcdf1334d 100644 --- a/api/BUILD +++ b/api/BUILD @@ -344,6 +344,7 @@ proto_library( "//envoy/extensions/load_balancing_policies/common/v3:pkg", "//envoy/extensions/load_balancing_policies/dynamic_modules/v3:pkg", "//envoy/extensions/load_balancing_policies/least_request/v3:pkg", + "//envoy/extensions/load_balancing_policies/load_aware_locality/v3:pkg", "//envoy/extensions/load_balancing_policies/maglev/v3:pkg", "//envoy/extensions/load_balancing_policies/override_host/v3:pkg", "//envoy/extensions/load_balancing_policies/pick_first/v3:pkg", diff --git a/api/envoy/extensions/load_balancing_policies/load_aware_locality/v3/BUILD b/api/envoy/extensions/load_balancing_policies/load_aware_locality/v3/BUILD new file mode 100644 index 0000000000000..674cc63b086b3 --- /dev/null +++ b/api/envoy/extensions/load_balancing_policies/load_aware_locality/v3/BUILD @@ -0,0 +1,12 @@ +# DO NOT EDIT. This file is generated by tools/proto_format/proto_sync.py. 
+ +load("@envoy_api//bazel:api_build_system.bzl", "api_proto_package") + +licenses(["notice"]) # Apache 2 + +api_proto_package( + deps = [ + "//envoy/config/cluster/v3:pkg", + "@xds//udpa/annotations:pkg", + ], +) diff --git a/api/envoy/extensions/load_balancing_policies/load_aware_locality/v3/load_aware_locality.proto b/api/envoy/extensions/load_balancing_policies/load_aware_locality/v3/load_aware_locality.proto new file mode 100644 index 0000000000000..9aa387ec33336 --- /dev/null +++ b/api/envoy/extensions/load_balancing_policies/load_aware_locality/v3/load_aware_locality.proto @@ -0,0 +1,69 @@ +syntax = "proto3"; + +package envoy.extensions.load_balancing_policies.load_aware_locality.v3; + +import "envoy/config/cluster/v3/cluster.proto"; + +import "google/protobuf/duration.proto"; +import "google/protobuf/wrappers.proto"; + +import "udpa/annotations/status.proto"; +import "validate/validate.proto"; + +option java_package = "io.envoyproxy.envoy.extensions.load_balancing_policies.load_aware_locality.v3"; +option java_outer_classname = "LoadAwareLocalityProto"; +option java_multiple_files = true; +option go_package = "github.com/envoyproxy/go-control-plane/envoy/extensions/load_balancing_policies/load_aware_locality/v3;load_aware_localityv3"; +option (udpa.annotations.file_status).package_version_status = ACTIVE; + +// [#protodoc-title: Load-Aware Locality-Picking Load Balancing Policy] +// [#extension: envoy.load_balancing_policies.load_aware_locality] + +// Configuration for the load_aware_locality LB policy which uses ORCA utilization data +// to route traffic between localities based on available headroom. +// [#next-free-field: 8] +message LoadAwareLocality { + // The child LB policy to create for endpoint-picking within each locality. + config.cluster.v3.LoadBalancingPolicy endpoint_picking_policy = 1 + [(validate.rules).message = {required: true}]; + + // How frequently ORCA weights and locality utilization are recomputed on the + // main thread. 
Defaults to 1s. google.protobuf.Duration weight_update_period = 2; + + // Named metrics from ORCA reports to use for computing utilization. + // When configured, endpoint utilization is computed by taking the max of the + // values of these metrics. For map fields in the ORCA proto, the string will + // be of the form ``<map_field_name>.<map_key>``. For example, the string + // ``named_metrics.foo`` will look for the key ``foo`` in the ORCA + // :ref:`named_metrics <envoy_v3_api_field_.xds.data.orca.v3.OrcaLoadReport.named_metrics>` + // field. If this field is not configured or none of the specified metrics are + // present in the load report, then + // :ref:`application_utilization <envoy_v3_api_field_.xds.data.orca.v3.OrcaLoadReport.application_utilization>` + // is used instead. If that field is also not set or is zero, then + // :ref:`cpu_utilization <envoy_v3_api_field_.xds.data.orca.v3.OrcaLoadReport.cpu_utilization>` + // is used. + repeated string metric_names_for_computing_utilization = 3; + + // When the local locality's utilization is at most this threshold above the + // remote average, route 100% of traffic to the local locality. This avoids + // unnecessary cross-zone routing when utilization is roughly balanced. + // Must be in [0, 1]. Defaults to 0.1. + google.protobuf.DoubleValue utilization_variance_threshold = 4 + [(validate.rules).double = {lte: 1.0 gte: 0.0}]; + + // EWMA smoothing factor for per-locality utilization. Must be in (0, 1]. + // Higher values react faster to changes, lower values are more stable. + // Defaults to 0.3. + google.protobuf.DoubleValue ewma_alpha = 5 [(validate.rules).double = {lte: 1.0 gt: 0.0}]; + + // Minimum fraction of traffic sent to non-local localities to keep + // ORCA data fresh. Applied even in all_local mode. Must be in [0, 1). + // Set to 0 to disable. Defaults to 0.03 (3%). + google.protobuf.DoubleValue probe_percentage = 6 [(validate.rules).double = {lt: 1.0 gte: 0.0}]; + + // If a given endpoint has not reported load metrics in this long, then we + // stop using the reported weight. This ensures that we do not continue to + // use very stale weights. Set to 0s to disable expiration. Defaults to 3 minutes. 
+ google.protobuf.Duration weight_expiration_period = 7 [(validate.rules).duration = {gte {}}]; +} diff --git a/api/versioning/BUILD b/api/versioning/BUILD index d9358c863733e..595edf04c6a0c 100644 --- a/api/versioning/BUILD +++ b/api/versioning/BUILD @@ -283,6 +283,7 @@ proto_library( "//envoy/extensions/load_balancing_policies/common/v3:pkg", "//envoy/extensions/load_balancing_policies/dynamic_modules/v3:pkg", "//envoy/extensions/load_balancing_policies/least_request/v3:pkg", + "//envoy/extensions/load_balancing_policies/load_aware_locality/v3:pkg", "//envoy/extensions/load_balancing_policies/maglev/v3:pkg", "//envoy/extensions/load_balancing_policies/override_host/v3:pkg", "//envoy/extensions/load_balancing_policies/pick_first/v3:pkg", diff --git a/changelogs/current.yaml b/changelogs/current.yaml index 992d38cf67bf9..14ca5bb070d25 100644 --- a/changelogs/current.yaml +++ b/changelogs/current.yaml @@ -368,6 +368,14 @@ new_features: change: | Added a per-connection filter state object to select a workload trust domain in the SPIFFE validator in the multi-tenant deployments. +- area: load_balancing + change: | + Added :ref:`load-aware locality LB policy + ` + that uses ORCA utilization data to distribute traffic across localities based on real-time + headroom. The policy computes per-locality EWMA utilization from endpoint ORCA reports and + shifts traffic away from overloaded localities while respecting a configurable variance + threshold and probe percentage to keep telemetry fresh. - area: tls change: | Extended TLS certificate compression (RFC 8879): added brotli to QUIC (which already supported zlib), diff --git a/docs/root/intro/arch_overview/upstream/load_balancing/load_aware_locality.rst b/docs/root/intro/arch_overview/upstream/load_balancing/load_aware_locality.rst new file mode 100644 index 0000000000000..61f6d54a58973 --- /dev/null +++ b/docs/root/intro/arch_overview/upstream/load_balancing/load_aware_locality.rst @@ -0,0 +1,554 @@ +.. 
_arch_overview_load_balancing_load_aware_locality: + +Load-aware locality load balancing +----------------------------------- + +.. attention:: + + This extension is **alpha** and is not yet intended for production use. + +The load-aware locality LB policy +(:ref:`envoy.load_balancing_policies.load_aware_locality +`) +is a locality-picking load balancer designed for deployments where incoming +load is not evenly distributed across zones, causing some localities to run +hotter than others. It uses per-endpoint utilization from +`ORCA `_ +reports to weight each locality by its available headroom, +preferring the local zone when load is balanced and spilling to +remote zones as the local zone heats up. + +When to use this policy +^^^^^^^^^^^^^^^^^^^^^^^ + +This policy is a good fit when: + +- You want cross-zone traffic decisions driven by actual backend load rather + than static host counts or management-server-provided weights. +- You want local-zone preference when things are balanced, but automatic + spillover when they are not. + +When not to use this policy +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This policy is **not** a good fit when: + +- **You need deterministic routing.** For session affinity or consistent + hashing, use ring hash or Maglev instead. +- **Traffic is already balanced and you just want local preference.** + Zone-aware routing is simpler, well-tested, and has no ORCA dependency. +- **You want the control plane to dictate routing weights.** Use the + :ref:`WrrLocality + ` + policy with EDS locality weights instead. + +Example configuration +^^^^^^^^^^^^^^^^^^^^^ + +Minimal configuration using round robin as the endpoint-picking child policy: + +.. 
code-block:: yaml + + load_balancing_policy: + policies: + - typed_extension_config: + name: envoy.load_balancing_policies.load_aware_locality + typed_config: + "@type": type.googleapis.com/envoy.extensions.load_balancing_policies.load_aware_locality.v3.LoadAwareLocality + endpoint_picking_policy: + policies: + - typed_extension_config: + name: envoy.load_balancing_policies.round_robin + typed_config: + "@type": type.googleapis.com/envoy.extensions.load_balancing_policies.round_robin.v3.RoundRobin + +Full configuration with all tuning parameters: + +.. code-block:: yaml + + load_balancing_policy: + policies: + - typed_extension_config: + name: envoy.load_balancing_policies.load_aware_locality + typed_config: + "@type": type.googleapis.com/envoy.extensions.load_balancing_policies.load_aware_locality.v3.LoadAwareLocality + endpoint_picking_policy: + policies: + - typed_extension_config: + name: envoy.load_balancing_policies.least_request + typed_config: + "@type": type.googleapis.com/envoy.extensions.load_balancing_policies.least_request.v3.LeastRequest + weight_update_period: 2s + utilization_variance_threshold: 0.15 + ewma_alpha: 0.4 + probe_percentage: 0.05 + weight_expiration_period: 120s + +Configuration parameters +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. list-table:: + :header-rows: 1 + :widths: 35 10 55 + + * - Parameter + - Default + - Description + * - ``endpoint_picking_policy`` + - (required) + - Child LB policy for selecting an endpoint within the chosen locality. + Any endpoint-picking policy can be used, regardless of whether the policy + implements ORCA handling (e.g. least request). + * - ``weight_update_period`` + - 1 s + - How often locality weights are recomputed from ORCA data. + * - ``metric_names_for_computing_utilization`` + - (unset) + - Named ORCA metrics to use for computing utilization. When configured, + the max of the matching metric values is used. Map field entries use + the form ``<map_field_name>.<map_key>`` (e.g. ``named_metrics.foo``). 
+ If not configured or no named metrics match, ``application_utilization`` + is used; if that is also absent or zero, ``cpu_utilization`` is used. + See :ref:`Weight computation <load_aware_locality_weight_computation>` + for the full precedence order. + * - ``utilization_variance_threshold`` + - 0.1 + - When the local locality's utilization is at most this value above the + host-count-weighted remote average, all traffic is routed locally. This + is a one-sided check -- if the local zone is less loaded than remote + zones, all-local routing always applies. Range: [0, 1]. + * - ``ewma_alpha`` + - 0.3 + - EWMA smoothing factor. Higher values react faster; lower values are more + stable. Range: (0, 1]. + * - ``probe_percentage`` + - 0.03 + - Minimum fraction of traffic sent to non-local localities to keep ORCA data + fresh. The deficit is redistributed proportionally to host count. Set to + 0 to disable (safe only if out-of-band ORCA reporting is available or if + zero cross-zone traffic is strictly required). Range: [0, 1). + * - ``weight_expiration_period`` + - 180 s + - ORCA samples older than this are discarded and the locality's EWMA state + is reset. Set to 0 s to disable expiration. + +Architecture +^^^^^^^^^^^^ + +The policy operates at two levels: + +1. **Locality picking** (this policy) -- selects which locality to route a given + request to, based on ORCA-derived utilization. +2. **Endpoint picking** (a configurable child policy) -- selects the specific + endpoint within the chosen locality. Any endpoint-picking LB policy + (e.g. client-side weighted round robin or least request) can be used. + +This separation means you can pair load-aware locality selection with whatever +endpoint-picking strategy best suits your workload. The locality-picking layer +handles *where* to send traffic; the child policy handles *which host* within +that locality. + +At a high level, the request path looks like this: + +:: + + Incoming request + | + +-- 1. 
Priority selection (standard Envoy healthy/degraded priority load) + | + +-- 2. Locality selection (this policy: weighted random by ORCA headroom) + | + +-- 3. Endpoint selection (child LB: e.g. round robin, least request) + | + v + Chosen upstream host + +Threading model +""""""""""""""" + +The policy is implemented as a ``ThreadAwareLoadBalancer``: + +- **Main thread:** A periodic timer (configurable via ``weight_update_period``) + reads ORCA utilization from each healthy host, averages utilization + per locality, applies EWMA smoothing, and computes capacity-weighted routing + weights. The resulting immutable snapshot is published to worker threads via a + thread-local slot. +- **Worker threads:** Each worker maintains per-locality child LB instances. + On the request path, the worker reads the latest weight snapshot from its + thread-local slot, performs weighted random locality selection, and delegates + endpoint selection to the chosen locality's child LB. + +This design means the request path involves no locks or cross-thread +synchronization -- workers only read from their thread-local slot. + +ORCA data flow +"""""""""""""" + +For this policy to function, upstream endpoints must report ORCA utilization +data. ORCA reports can be delivered in two ways: + +- **Per-request reports:** Utilization data is piggybacked on response trailers. + The Envoy router processes these reports and stores the utilization value in a + per-host slot so that the main-thread weight computation can read it without + locking. This is the mechanism used by this policy today. +- **Out-of-band (OOB) reports:** A periodic gRPC stream from the backend to + Envoy. OOB reporting is not yet integrated with this policy but could + eliminate the need for ``probe_percentage`` in the future, since utilization + data would arrive independently of traffic. 
+ +Because the policy currently relies on per-request reports, it can only receive +fresh utilization data from localities that are actively receiving traffic. This +is why the ``probe_percentage`` parameter exists: it ensures a minimum fraction +of traffic is sent to non-local localities to keep their ORCA data fresh. + +Endpoints should report at least one of: + +- Named metrics via ``metric_names_for_computing_utilization`` -- when + configured, these take highest precedence. +- ``application_utilization`` -- a value in [0, 1] representing + application-level load. Used when no named metrics are configured or present. +- ``cpu_utilization`` -- used as final fallback when neither named metrics nor + ``application_utilization`` are available. + +**Combining with Client-Side Weighted Round Robin** + +:ref:`Client-Side Weighted Round Robin (CSWRR) +` can be +used as the ``endpoint_picking_policy`` without conflict. This policy (for +locality selection) and CSWRR (for endpoint selection within a locality) both +consume ORCA data independently via separate per-host slots. Utilization reports +reach both policies without interference, enabling two-level ORCA-aware load +balancing: locality selection is driven by locality-level headroom, and endpoint +selection within each locality is driven by per-endpoint capacity weights. + +.. _load_aware_locality_weight_computation: + +Weight computation +^^^^^^^^^^^^^^^^^^ + +On the main thread, a periodic timer (configurable via ``weight_update_period``, +default 1 s) recomputes per-locality routing weights using the following steps: + +1. For each locality, compute the average utilization of its endpoints from their + most recent ORCA reports. The metric precedence order is: + + a. The max of any metrics listed in ``metric_names_for_computing_utilization`` + that are present in the report (e.g. ``named_metrics.foo``), if configured. + b. ``application_utilization`` if reported and > 0. + c. ``cpu_utilization``. + +2. 
Apply EWMA smoothing (controlled by ``ewma_alpha``, default 0.3) to dampen + oscillation. On the very first report for a locality (or after EWMA state has + been reset due to weight expiration), the raw utilization value is used + directly without blending, so the policy begins differentiating localities + after a single update cycle. +3. Weight each locality proportionally to ``host_count * (1 - smoothed_utilization)``. +4. If a local locality exists, compute the **host-count-weighted** average + utilization of all remote localities. If the local locality's utilization is + **at most** ``utilization_variance_threshold`` (default 0.1) **above** this + remote average, route 100 % of traffic locally to avoid unnecessary + cross-zone hops. Note that this is a one-sided check: if the local zone is + *less* loaded than the remote average (by any amount), all-local routing + always triggers. The threshold only governs how much *hotter* the local zone + can be before spillover begins. +5. Enforce a ``probe_percentage`` (default 3 %) minimum share of traffic to + non-local localities so ORCA data stays fresh even when local routing is + dominant. When the remote localities' combined weight is below this minimum, + the deficit is subtracted from the local locality's weight and redistributed + to remote localities **proportionally to their host count** (not + proportionally to their headroom weights). Host-count proportional + redistribution is intentional: when probing for fresh data, we want to sample + all remote localities fairly rather than biasing toward zones whose current + (possibly stale) utilization happens to look lower. +6. If all localities report zero headroom (utilization >= 1.0), fall back to + distributing traffic proportionally to host count. + +These weights are published to worker threads via a thread-local slot. 
+Each worker thread maintains per-locality child LB instances and uses the +published weights to perform weighted random locality selection on the +request path. + +In pseudo-formula notation: + +:: + + # Per-locality utilization (EWMA smoothing) + smoothed_util(L) = ewma_alpha * raw_util(L) + (1 - ewma_alpha) * prev_smoothed_util(L) + (first sample: smoothed_util(L) = raw_util(L)) + + # Per-locality headroom and weight + headroom(L) = max(0, 1 - smoothed_util(L)) + weight(L) = host_count(L) * headroom(L) + routing_share(L) = weight(L) / sum(weight(L_i) for all L_i) + + # Local-preference check (one-sided: local must not be too far ABOVE remote) + remote_weighted_avg = sum(smoothed_util(R_i) * host_count(R_i)) / sum(host_count(R_i)) + if smoothed_util(local) <= remote_weighted_avg + utilization_variance_threshold: + route 100% to local (subject to probe_percentage below) + + # Probe percentage enforcement + remote_share = sum(weight(R_i)) / sum(weight(L_i) for all L_i) + if remote_share < probe_percentage: + deficit = probe_percentage * total_weight - sum(weight(R_i)) + weight(local) -= deficit + for each remote R_i: + weight(R_i) += deficit * host_count(R_i) / sum(host_count(R_j)) + + # All-overloaded fallback + if sum(weight(L_i)) == 0: + weight(L_i) = host_count(L_i) # proportional to host count + +Worked example +"""""""""""""" + +Consider 3 localities with the default variance threshold of 0.1: + +- **Locality A** (local): 10 hosts, average utilization 0.7 +- **Locality B** (remote): 10 hosts, average utilization 0.3 +- **Locality C** (remote): 10 hosts, average utilization 0.4 + +The host-count-weighted remote average utilization is +``(0.3 * 10 + 0.4 * 10) / (10 + 10) = 0.35``. Locality A's utilization (0.7) +exceeds the remote average plus the threshold (``0.35 + 0.1 = 0.45``), so the +policy does **not** route 100 % locally. 
+ +Headroom-weighted routing is computed as: + +- A: ``10 * (1 - 0.7) = 3`` +- B: ``10 * (1 - 0.3) = 7`` +- C: ``10 * (1 - 0.4) = 6`` +- Total: ``3 + 7 + 6 = 16`` + +Traffic split: **A ~ 19 %, B ~ 44 %, C ~ 37 %**. Traffic flows away from the +hot local zone toward localities with more headroom. + +Now suppose load rebalances and all localities converge to roughly 0.45 +utilization. The remote average is 0.45, and locality A is within the variance +threshold (``0.45 <= 0.45 + 0.1``). The policy snaps to **100 % local +routing** (minus the 3 % probe percentage), avoiding unnecessary cross-zone +hops. + +Asymmetric host count example +''''''''''''''''''''''''''''' + +The host-count-weighted remote average matters when localities have different +sizes: + +- **Locality A** (local): 10 hosts, utilization 0.50 +- **Locality B** (remote): 20 hosts, utilization 0.40 +- **Locality C** (remote): 5 hosts, utilization 0.60 + +The host-count-weighted remote average is +``(0.40 * 20 + 0.60 * 5) / (20 + 5) = 0.44``. A simple (unweighted) average +would be ``(0.40 + 0.60) / 2 = 0.50`` -- a very different value. Because +locality A's utilization (0.50) is within the variance threshold of the weighted +remote average (``0.50 <= 0.44 + 0.1 = 0.54``), the policy routes **100 % +locally** (minus probe percentage). + +Headroom weights if spillover were active: + +- A: ``10 * (1 - 0.50) = 5.0`` +- B: ``20 * (1 - 0.40) = 12.0`` +- C: ``5 * (1 - 0.60) = 2.0`` + +Notice that locality B's larger host count amplifies its weight even though its +per-host headroom (0.60) is less than A's (0.50). + +Local preference and probe percentage +"""""""""""""""""""""""""""""""""""""" + +The ``utilization_variance_threshold`` and ``probe_percentage`` parameters work +together to balance two goals: minimizing cross-zone traffic and maintaining +fresh ORCA data. 
+ +- When the local zone's utilization is **at most** ``utilization_variance_threshold`` + **above** the host-count-weighted remote average, the policy routes 100 % of + traffic locally. This is a one-sided check: if the local zone is less loaded + than remote zones, all-local routing always applies regardless of how large + the gap is. The threshold only limits how much *hotter* the local zone can be + before spillover begins. +- Even in this "all-local" mode, ``probe_percentage`` (default 3 %) ensures + that a small fraction of traffic still reaches remote localities. The + implementation subtracts the deficit from the local locality's weight and + distributes it to remote localities proportionally to their host count (not + headroom). Without this probing, the policy would have no fresh ORCA data for + remote zones and would be blind to load changes there. +- Setting ``probe_percentage`` to 0 disables probing entirely. This is safe + only if out-of-band ORCA reporting is available or if cross-zone traffic must + be strictly avoided; be aware that the policy may react slowly to remote load + changes without it. + +Weight expiration +""""""""""""""""" + +If an endpoint has not reported ORCA metrics within ``weight_expiration_period`` +(default 3 min), its utilization sample is discarded. Additionally, the +per-locality EWMA state is **reset**: the smoothed utilization returns to 0.0 and +the locality appears to have **full headroom** (0 % utilization) until fresh +data arrives. Operators should be aware that this can cause a transient traffic +surge toward a locality whose data has just expired. Tuning +``weight_expiration_period`` higher reduces the chance of spurious resets, while +tuning it lower prevents stale data from persisting after backends are drained +or stop reporting. 
+ +Cold-start behavior +""""""""""""""""""" + +When the policy first starts (or when no ORCA data is available), each locality's +utilization defaults to 0, giving it full headroom (``1 - 0 = 1.0``). With all +localities at full headroom, routing weights reduce to host counts, making traffic +distribution proportional to the number of hosts in each locality -- equivalent +to round-robin locality selection. As ORCA reports arrive, the first utilization +sample for each locality is applied directly (without EWMA blending), so the +policy begins differentiating localities within a single ``weight_update_period`` +cycle rather than gradually ramping through EWMA. + +Priority support +"""""""""""""""" + +The policy respects Envoy's :ref:`priority levels +`. Priority selection happens first +using the standard healthy/degraded priority load calculation, then locality +selection applies within the chosen priority. Unlike zone-aware routing (which +only operates at priority 0), this policy applies locality-aware selection at +all priority levels. + +The policy computes **three separate sets** of per-locality weights for each +priority level: + +- **Healthy weights** -- used when the priority load calculation selects healthy + hosts for this priority. This is the common case. +- **Degraded weights** -- used when Envoy selects + :ref:`degraded ` hosts for this + priority. The weights are computed from the degraded host subset only. +- **All-host weights** -- used when the priority is in + :ref:`panic mode ` and Envoy + falls back to considering all hosts regardless of health status. The weights + are computed from the full host set. + +Each set is computed independently: the per-locality utilization averages, +EWMA smoothing state, and resulting headroom weights are tracked separately for +healthy, degraded, and all-host subsets. + +Compatibility notes +""""""""""""""""""" + +This policy is **not** currently compatible with :ref:`load balancer subsetting +`. 
Subsetting partitions hosts into subsets +that cut across locality boundaries, and it is not straightforward to reconcile +locality-level headroom weights with per-subset host partitioning. + +Statistics +"""""""""" + +The policy emits the following zone routing statistics under the cluster's stat +prefix (``cluster..``). These are the same counters used by +zone-aware routing, making it easy to compare behavior when migrating between +the two approaches. + +.. list-table:: + :header-rows: 1 + :widths: 40 60 + + * - Statistic + - Description + * - ``lb_zone_routing_all_directly`` + - Request was routed to the local zone while the variance threshold + triggered all-local routing (i.e. load was balanced). + * - ``lb_zone_routing_sampled`` + - Request was routed to the local zone via weighted random selection + (i.e. the local zone won the weighted draw during spillover). + * - ``lb_zone_routing_cross_zone`` + - Request was routed to a remote zone. + +Additionally, ``lb_recalculate_zone_structures`` is incremented each time the +main-thread timer fires and recomputes locality weights. + +.. _load_aware_locality_comparison: + +Comparison with other approaches +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Envoy offers three locality-selection strategies. The right choice depends on +whether you have ORCA reporting, whether the control plane supplies locality +weights, and whether you need to react to runtime load imbalance. 
+ ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Feature | Zone-aware routing | WrrLocality | Load-aware locality (this policy) | ++========================================+===============================+===============================+======================================+ +| Routing signal | Healthy host counts | Static weights from EDS | Real-time ORCA utilization | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Reacts to load imbalance | No | No | Yes | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Requires management server weights | No | Yes | No | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Requires ORCA reports from backends | No | No | Yes | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Cross-zone traffic minimization | Yes (local preference) | Depends on weights | Yes (local preference with probe | +| | | | minimum) | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Cold-start behavior | Routes by host count ratio | Routes by EDS weights | Routes proportionally to host count | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Oscillation dampening | N/A | N/A | EWMA smoothing | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Control plane 
dependency | None | Requires EDS weights | None (data-plane only) | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Priority level support | P=0 only | All | All | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Degraded / panic mode support | No | Yes | Yes (separate weight sets) | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ +| Load balancer subsetting | Yes | No | No | ++----------------------------------------+-------------------------------+-------------------------------+--------------------------------------+ + +Implementation notes +^^^^^^^^^^^^^^^^^^^^^ + +This section describes implementation internals for contributors to this policy. + +The policy is implemented as a ``ThreadAwareLoadBalancer``. On the main thread, +a ``weight_update_timer_`` fires periodically to invoke +``computeLocalityRoutingWeights()``, which reads ``host->lbPolicyData()`` from +each healthy host (one slot per locality-picker policy), averages utilization +per locality, applies EWMA smoothing, computes headroom weights, and publishes +an immutable ``RoutingWeightsSnapshot`` to workers via a +``ThreadLocal::TypedSlot``. + +:: + + LoadAwareLocalityLoadBalancer (main thread) + | + |-- weight_update_timer_ + | Fires periodically to recompute locality routing weights. 
+ | + |-- on timer callback: + | computeLocalityRoutingWeights() + | - Read host->lbPolicyData() from each healthy host + | - Average per locality, apply EWMA smoothing + | - Compute weight = host_count * (1 - smoothed_utilization) + | - Check variance threshold for local-zone preference + | - Apply probe percentage minimum + | - Publish immutable snapshot to workers via TLS + | + +-- WorkerLocalLbFactory (shared across all workers) + | + |-- child_thread_aware_lb_ (shared child ThreadAwareLoadBalancer) + | + |-- tls_ (ThreadLocal::TypedSlot) + | Main thread pushes RoutingWeightsSnapshot to workers. + | Workers read with zero synchronization. + | + +-- create() --> WorkerLocalLb (one per worker thread) + | + |-- selectLocality() [weighted random by capacity] + | + +-- per_locality_[] (one PerLocalityState per locality) + |-- PrioritySetImpl (hosts for this locality only) + +-- LoadBalancer (worker-local child, e.g. RoundRobin) + +ORCA reports arrive via the router filter which calls ``onOrcaLoadReport()`` on +the host's ``lbPolicyData()`` slot for this policy (a separate slot from any +CSWRR slot on the same host). Utilization values are stored via lock-free +atomics (``std::atomic``) so the main-thread weight computation can +read them without locking. This is why ``probe_percentage`` is needed: because +ORCA data arrives only on the request path, remote localities that receive no +traffic yield no data. 
diff --git a/docs/root/intro/arch_overview/upstream/load_balancing/load_balancing.rst b/docs/root/intro/arch_overview/upstream/load_balancing/load_balancing.rst index be284b6e99eaf..e531e559697fe 100644 --- a/docs/root/intro/arch_overview/upstream/load_balancing/load_balancing.rst +++ b/docs/root/intro/arch_overview/upstream/load_balancing/load_balancing.rst @@ -14,6 +14,7 @@ Load Balancing excluded original_dst zone_aware + load_aware_locality subsets slow_start override_host diff --git a/source/extensions/extensions_build_config.bzl b/source/extensions/extensions_build_config.bzl index daf76e5e96040..c55893ce405c9 100644 --- a/source/extensions/extensions_build_config.bzl +++ b/source/extensions/extensions_build_config.bzl @@ -576,6 +576,7 @@ EXTENSIONS = { # Load balancing policies for upstream # "envoy.load_balancing_policies.least_request": "//source/extensions/load_balancing_policies/least_request:config", + "envoy.load_balancing_policies.load_aware_locality": "//source/extensions/load_balancing_policies/load_aware_locality:config", "envoy.load_balancing_policies.random": "//source/extensions/load_balancing_policies/random:config", "envoy.load_balancing_policies.round_robin": "//source/extensions/load_balancing_policies/round_robin:config", "envoy.load_balancing_policies.maglev": "//source/extensions/load_balancing_policies/maglev:config", diff --git a/source/extensions/extensions_metadata.yaml b/source/extensions/extensions_metadata.yaml index ede636ef97d31..27485ff9a01a2 100644 --- a/source/extensions/extensions_metadata.yaml +++ b/source/extensions/extensions_metadata.yaml @@ -2049,6 +2049,13 @@ envoy.load_balancing_policies.least_request: status: stable type_urls: - envoy.extensions.load_balancing_policies.least_request.v3.LeastRequest +envoy.load_balancing_policies.load_aware_locality: + categories: + - envoy.load_balancing_policies + security_posture: unknown + status: alpha + type_urls: + - 
envoy.extensions.load_balancing_policies.load_aware_locality.v3.LoadAwareLocality envoy.load_balancing_policies.random: categories: - envoy.load_balancing_policies diff --git a/source/extensions/load_balancing_policies/load_aware_locality/BUILD b/source/extensions/load_balancing_policies/load_aware_locality/BUILD new file mode 100644 index 0000000000000..890af1964ef23 --- /dev/null +++ b/source/extensions/load_balancing_policies/load_aware_locality/BUILD @@ -0,0 +1,19 @@ +load( + "//bazel:envoy_build_system.bzl", + "envoy_cc_extension", + "envoy_extension_package", +) + +licenses(["notice"]) # Apache 2 + +envoy_extension_package() + +envoy_cc_extension( + name = "config", + srcs = ["config.cc"], + hdrs = ["config.h"], + deps = [ + "//source/common/upstream:load_balancer_factory_base_lib", + "@envoy_api//envoy/extensions/load_balancing_policies/load_aware_locality/v3:pkg_cc_proto", + ], +) diff --git a/source/extensions/load_balancing_policies/load_aware_locality/config.cc b/source/extensions/load_balancing_policies/load_aware_locality/config.cc new file mode 100644 index 0000000000000..7cb1045bd5f04 --- /dev/null +++ b/source/extensions/load_balancing_policies/load_aware_locality/config.cc @@ -0,0 +1,27 @@ +#include "source/extensions/load_balancing_policies/load_aware_locality/config.h" + +namespace Envoy { +namespace Extensions { +namespace LoadBalancingPolicies { +namespace LoadAwareLocality { + +Upstream::ThreadAwareLoadBalancerPtr Factory::create(OptRef<const Upstream::LoadBalancerConfig>, + const Upstream::ClusterInfo&, + const Upstream::PrioritySet&, Runtime::Loader&, + Envoy::Random::RandomGenerator&, TimeSource&) { + // TODO(jukie): Implement load-aware locality load balancer. + return nullptr; +} + +absl::StatusOr<Upstream::LoadBalancerConfigPtr> +Factory::loadConfig(Server::Configuration::ServerFactoryContext&, const Protobuf::Message&) { + // TODO(jukie): Implement load-aware locality config loading.
+ return nullptr; +} + +REGISTER_FACTORY(Factory, Upstream::TypedLoadBalancerFactory); + +} // namespace LoadAwareLocality +} // namespace LoadBalancingPolicies +} // namespace Extensions +} // namespace Envoy diff --git a/source/extensions/load_balancing_policies/load_aware_locality/config.h b/source/extensions/load_balancing_policies/load_aware_locality/config.h new file mode 100644 index 0000000000000..0140691580782 --- /dev/null +++ b/source/extensions/load_balancing_policies/load_aware_locality/config.h @@ -0,0 +1,37 @@ +#pragma once + +#include "envoy/extensions/load_balancing_policies/load_aware_locality/v3/load_aware_locality.pb.h" +#include "envoy/upstream/load_balancer.h" + +#include "source/common/upstream/load_balancer_factory_base.h" + +namespace Envoy { +namespace Extensions { +namespace LoadBalancingPolicies { +namespace LoadAwareLocality { + +using LoadAwareLocalityProto = + envoy::extensions::load_balancing_policies::load_aware_locality::v3::LoadAwareLocality; + +class Factory : public Upstream::TypedLoadBalancerFactoryBase<LoadAwareLocalityProto> { +public: + Factory() : TypedLoadBalancerFactoryBase("envoy.load_balancing_policies.load_aware_locality") {} + + Upstream::ThreadAwareLoadBalancerPtr create(OptRef<const Upstream::LoadBalancerConfig> lb_config, + const Upstream::ClusterInfo& cluster_info, + const Upstream::PrioritySet& priority_set, + Runtime::Loader& runtime, + Envoy::Random::RandomGenerator& random, + TimeSource& time_source) override; + + absl::StatusOr<Upstream::LoadBalancerConfigPtr> + loadConfig(Server::Configuration::ServerFactoryContext& context, + const Protobuf::Message& config) override; +}; + +DECLARE_FACTORY(Factory); + +} // namespace LoadAwareLocality +} // namespace LoadBalancingPolicies +} // namespace Extensions +} // namespace Envoy diff --git a/test/extensions/load_balancing_policies/load_aware_locality/BUILD b/test/extensions/load_balancing_policies/load_aware_locality/BUILD new file mode 100644 index 0000000000000..f60293a7e7baa --- /dev/null +++
b/test/extensions/load_balancing_policies/load_aware_locality/BUILD @@ -0,0 +1,26 @@ +load( + "//bazel:envoy_build_system.bzl", + "envoy_package", +) +load( + "//test/extensions:extensions_build_system.bzl", + "envoy_extension_cc_test", +) + +licenses(["notice"]) # Apache 2 + +envoy_package() + +envoy_extension_cc_test( + name = "config_test", + srcs = ["config_test.cc"], + extension_names = ["envoy.load_balancing_policies.load_aware_locality"], + rbe_pool = "6gig", + deps = [ + "//source/extensions/load_balancing_policies/load_aware_locality:config", + "//test/mocks/server:factory_context_mocks", + "//test/mocks/upstream:cluster_info_mocks", + "//test/mocks/upstream:priority_set_mocks", + "@envoy_api//envoy/config/core/v3:pkg_cc_proto", + ], +) diff --git a/test/extensions/load_balancing_policies/load_aware_locality/config_test.cc b/test/extensions/load_balancing_policies/load_aware_locality/config_test.cc new file mode 100644 index 0000000000000..2485d33ec62b1 --- /dev/null +++ b/test/extensions/load_balancing_policies/load_aware_locality/config_test.cc @@ -0,0 +1,43 @@ +#include "envoy/config/core/v3/extension.pb.h" + +#include "source/extensions/load_balancing_policies/load_aware_locality/config.h" + +#include "test/mocks/server/factory_context.h" + +#include "test/mocks/upstream/cluster_info.h" +#include "test/mocks/upstream/priority_set.h" + +namespace Envoy { +namespace Extensions { +namespace LoadBalancingPolicies { +namespace LoadAwareLocality { +namespace { + +TEST(LoadAwareLocalityConfigTest, CreateFactory) { + NiceMock<Server::Configuration::MockServerFactoryContext> context; + NiceMock<Upstream::MockClusterInfo> cluster_info; + NiceMock<Upstream::MockPrioritySet> main_thread_priority_set; + + envoy::config::core::v3::TypedExtensionConfig config; + config.set_name("envoy.load_balancing_policies.load_aware_locality"); + LoadAwareLocalityProto config_msg; + config.mutable_typed_config()->PackFrom(config_msg); + + auto& factory = Config::Utility::getAndCheckFactory<Upstream::TypedLoadBalancerFactory>(config); + EXPECT_EQ("envoy.load_balancing_policies.load_aware_locality", factory.name()); + +
// loadConfig is stubbed to return nullptr for now. + auto lb_config = factory.loadConfig(context, *factory.createEmptyConfigProto()); + EXPECT_TRUE(lb_config.ok()); + + // create is stubbed to return nullptr for now. + auto thread_aware_lb = + factory.create({}, cluster_info, main_thread_priority_set, context.runtime_loader_, + context.api_.random_, context.time_system_); + EXPECT_EQ(nullptr, thread_aware_lb); +} + +} // namespace +} // namespace LoadAwareLocality +} // namespace LoadBalancingPolicies +} // namespace Extensions +} // namespace Envoy