diff --git a/crates/test-util/src/wast.rs b/crates/test-util/src/wast.rs index a2f3a3304ff3..ee046a824f83 100644 --- a/crates/test-util/src/wast.rs +++ b/crates/test-util/src/wast.rs @@ -491,6 +491,7 @@ impl WastTest { "spec_testsuite/proposals/threads/exports.wast", "spec_testsuite/proposals/threads/memory.wast", "misc_testsuite/memory64/threads.wast", + "misc_testsuite/winch/rmw32_cmpxchg_u_wrap.wast", ]; if unsupported.iter().any(|part| self.path.ends_with(part)) { diff --git a/tests/disas/winch/x64/atomic/rmw/cmpxchg/i64_atomic_rmw32_cmpxchgu.wat b/tests/disas/winch/x64/atomic/rmw/cmpxchg/i64_atomic_rmw32_cmpxchgu.wat index 2fa2451eae72..a6c3a954e728 100644 --- a/tests/disas/winch/x64/atomic/rmw/cmpxchg/i64_atomic_rmw32_cmpxchgu.wat +++ b/tests/disas/winch/x64/atomic/rmw/cmpxchg/i64_atomic_rmw32_cmpxchgu.wat @@ -12,7 +12,7 @@ ;; movq 0x18(%r11), %r11 ;; addq $0x20, %r11 ;; cmpq %rsp, %r11 -;; ja 0x6f +;; ja 0x71 ;; 1c: movq %rdi, %r14 ;; subq $0x10, %rsp ;; movq %rdi, 8(%rsp) @@ -22,7 +22,7 @@ ;; movl $0, %edx ;; andl $3, %edx ;; cmpl $0, %edx -;; jne 0x71 +;; jne 0x73 ;; 4d: movl $0, %edx ;; movq 0x30(%r14), %r11 ;; movq (%r11), %rbx @@ -33,8 +33,9 @@ ;; popq %rcx ;; popq %rax ;; lock cmpxchgl %ecx, (%rbx) +;; movl %eax, %eax ;; addq $0x10, %rsp ;; popq %rbp ;; retq -;; 6f: ud2 ;; 71: ud2 +;; 73: ud2 diff --git a/tests/disas/winch/x64/atomic/rmw/cmpxchg/i64_atomic_rmw32_cmpxchgu_extend.wat b/tests/disas/winch/x64/atomic/rmw/cmpxchg/i64_atomic_rmw32_cmpxchgu_extend.wat new file mode 100644 index 000000000000..1221f51cda28 --- /dev/null +++ b/tests/disas/winch/x64/atomic/rmw/cmpxchg/i64_atomic_rmw32_cmpxchgu_extend.wat @@ -0,0 +1,44 @@ +;;! target = "x86_64" +;;! 
test = "winch" + +(module + (memory 1 1 shared) + (func (export "f") (result i64) + i32.const 0 + i64.const 0xDEADBEEF00000000 + i64.const 0x1234 + i64.atomic.rmw32.cmpxchg_u)) +;; wasm[0]::function[0]: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq 8(%rdi), %r11 +;; movq 0x18(%r11), %r11 +;; addq $0x20, %r11 +;; cmpq %rsp, %r11 +;; ja 0x76 +;; 1c: movq %rdi, %r14 +;; subq $0x10, %rsp +;; movq %rdi, 8(%rsp) +;; movq %rsi, (%rsp) +;; movl $0x1234, %eax +;; movabsq $16045690981097406464, %rcx +;; movl $0, %edx +;; andl $3, %edx +;; cmpl $0, %edx +;; jne 0x78 +;; 52: movl $0, %edx +;; movq 0x30(%r14), %r11 +;; movq (%r11), %rbx +;; movl %edx, %edx +;; addq %rdx, %rbx +;; pushq %rcx +;; pushq %rax +;; popq %rcx +;; popq %rax +;; lock cmpxchgl %ecx, (%rbx) +;; movl %eax, %eax +;; addq $0x10, %rsp +;; popq %rbp +;; retq +;; 76: ud2 +;; 78: ud2 diff --git a/tests/misc_testsuite/winch/rmw32_cmpxchg_u_wrap.wast b/tests/misc_testsuite/winch/rmw32_cmpxchg_u_wrap.wast new file mode 100644 index 000000000000..3d14edb0ca07 --- /dev/null +++ b/tests/misc_testsuite/winch/rmw32_cmpxchg_u_wrap.wast @@ -0,0 +1,11 @@ +;;! threads = true + +(module + (memory 1 1 shared) + (func (export "f") (result i64) + i32.const 0 + i64.const 0xDEADBEEF00000000 + i64.const 0x1234 + i64.atomic.rmw32.cmpxchg_u)) + +(assert_return (invoke "f") (i64.const 0)) diff --git a/winch/codegen/src/codegen/mod.rs b/winch/codegen/src/codegen/mod.rs index 0d08767192e2..732eea6b431e 100644 --- a/winch/codegen/src/codegen/mod.rs +++ b/winch/codegen/src/codegen/mod.rs @@ -1416,21 +1416,27 @@ where size: OperandSize, extend: Option>, ) -> Result<()> { - // Emission for this instruction is a bit trickier. The address for the CAS is the 3rd from - // the top of the stack, and we must emit instruction to compute the actual address with - // `emit_compute_heap_address_align_checked`, while we still have access to self. 
However, - // some ISAs have requirements with regard to the registers used for some arguments, so we - // need to pass the context to the masm. To solve this issue, we pop the two first - // arguments from the stack, compute the address, push back the arguments, and hand over - // the control to masm. The implementer of `atomic_cas` can expect to find `expected` and - // `replacement` at the top the context's stack. - - // pop the args + // At this point in the stack we have: + // [ address, expected, replacement ] + // + // Therefore, emission for this instruction is a bit + // trickier. The address for the CAS is the 3rd from the top + // of the stack, and we must emit instructions to compute the + // actual address with + // `emit_compute_heap_address_align_checked`, while we still + // have access to self. However, some ISAs have requirements + // with regard to the registers used for some arguments, so we + // need to pass the context to the masm. To solve this issue, + // we pop the first two arguments from the stack, compute the + // address, push back the arguments, and hand over the control + // to masm. The implementer of `atomic_cas` can expect to find + // `expected` and `replacement` at the top of the context's + // stack. + let replacement = self.context.pop_to_reg(self.masm, None)?; let expected = self.context.pop_to_reg(self.masm, None)?; if let Some(addr) = self.emit_compute_heap_address_align_checked(arg, size)? { - // push back the args self.context.stack.push(expected.into()); self.context.stack.push(replacement.into()); diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs index 83a0789edbe6..a6d123f3f08a 100644 --- a/winch/codegen/src/isa/x64/masm.rs +++ b/winch/codegen/src/isa/x64/masm.rs @@ -1781,23 +1781,20 @@ impl Masm for MacroAssembler { ) -> Result<()> { // `cmpxchg` expects `expected` to be in the `*a*` register. // reserve rax for the expected argument. 
- let rax = context.reg(regs::rax(), self)?; - let replacement = context.pop_to_reg(self, None)?; + let replacement = + context.without::, _, _>(&[regs::rax()], self, |cx, masm| { + cx.pop_to_reg(masm, None) + })??; - // mark `rax` as allocatable again. - context.free_reg(rax); let expected = context.pop_to_reg(self, Some(regs::rax()))?; self.asm .cmpxchg(addr, replacement.reg, writable!(expected.reg), size, flags); if let Some(extend) = extend { - // We don't need to zero-extend from 32 to 64bits. - if !(extend.from_bits() == 32 && extend.to_bits() == 64) { - self.asm - .movzx_rr(expected.reg, writable!(expected.reg), extend); - } + self.asm + .movzx_rr(expected.reg, writable!(expected.reg), extend); } context.stack.push(expected.into());