Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitgnore → .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,14 @@ dmypy.json
# Mac OS
.DS_Store


# Caches and Datasets
cache/
data/

pretrained_models/
# Rollout videos and wandb logs
rollouts/
wandb/
outputs/
experiments/logs/
evaluation_results/
1 change: 1 addition & 0 deletions LIBERO
Submodule LIBERO added at 8f1084
1 change: 1 addition & 0 deletions calvin
Submodule calvin added at fa03f0
14 changes: 14 additions & 0 deletions eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Evaluate a VLA-Adapter checkpoint on the LIBERO-10 task suite.
# Offline mode: disable all HuggingFace hub/dataset network lookups so the
# run uses only locally cached models and datasets.
export HF_HUB_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
# NOTE(review): GPU index and checkpoint path are hard-coded; adjust per machine.
CUDA_VISIBLE_DEVICES=1 python experiments/robot/libero/run_libero_eval.py \
--use_proprio True \
--num_images_in_input 2 \
--use_film False \
--pretrained_checkpoint outputs/configs+libero_10_no_noops+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--libero_10_no_noops--2025_10_27_17_22_41--use_3d_True_dim_2048_inject_all--170000_chkpt \
--task_suite_name libero_10 \
--use_pro_version True \
--use_3d True \
--inject_layers all
# (fix: removed the trailing backslash after the last argument — it continued
# the command onto the comment line below, which only parsed correctly by
# accident and would swallow the next command if the comments were removed)
# Alternative checkpoint / background-logging invocation, kept for reference:
# outputs/configs+libero_10_no_noops+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--libero_10_no_noops--1759126170--160000_chkpt \
# > eval_logs/Spatial--chkpt.log 2>&1 &
6 changes: 6 additions & 0 deletions eval2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Evaluate a VLA-Adapter checkpoint on the CALVIN ABC benchmark.
# Offline mode: disable all HuggingFace hub/dataset network lookups so the
# run uses only locally cached models and datasets.
export HF_HUB_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
# NOTE(review): GPU index and checkpoint path are hard-coded; adjust per machine.
CUDA_VISIBLE_DEVICES=7 python vla-scripts/evaluate_calvin.py \
--pretrained_checkpoint outputs/CALVIN-ABC-Pro
# Previous checkpoint, kept for reference:
# --pretrained_checkpoint outputs/configs+calvin_abc_rlds+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--calvin_abc_rlds----100000_chkpt \
25 changes: 20 additions & 5 deletions experiments/robot/libero/run_libero_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Optional, Union
from typing import Optional, Union, List

import draccus
import numpy as np
Expand Down Expand Up @@ -48,7 +48,7 @@
set_seed_everywhere,
)
from prismatic.vla.constants import NUM_ACTIONS_CHUNK

from prismatic.models.pi3_loader import load_pc_model

# Define task suite constants
class TaskSuite(str, Enum):
Expand Down Expand Up @@ -128,6 +128,11 @@ class GenerateConfig:
use_pro_version: bool = True # encourage to use the pro models we released.
phase: str = "Inference"

use_3d: bool = False
dim_3d: int = 2048
pi3_path: Path = Path("/home/ruihengwang/vla/VLA-Adapter/pretrained_models/pi3_checkpoint")
inject_layers: Optional[int | List[int] | str] = None



def validate_config(cfg: GenerateConfig) -> None:
Expand Down Expand Up @@ -292,6 +297,7 @@ def run_episode(
noisy_action_projector=None,
initial_state=None,
log_file=None,
pi3_model=None
):
"""Run a single episode in the environment."""
# Reset environment
Expand Down Expand Up @@ -342,7 +348,8 @@ def run_episode(
proprio_projector=proprio_projector,
noisy_action_projector=noisy_action_projector,
use_film=cfg.use_film,
use_minivlm=cfg.use_minivlm
use_minivlm=cfg.use_minivlm,
pi3_model=pi3_model
)

action_queue.extend(actions)
Expand Down Expand Up @@ -383,7 +390,8 @@ def run_task(
total_episodes=0,
total_successes=0,
log_file=None,
save_version=None
save_version=None,
pi3_model=None
):
"""Run evaluation for a single task."""
# Get task
Expand Down Expand Up @@ -433,6 +441,7 @@ def run_task(
noisy_action_projector,
initial_state,
log_file,
pi3_model
)

# Update counters
Expand Down Expand Up @@ -483,6 +492,10 @@ def eval_libero(cfg: GenerateConfig) -> float:

# Initialize model and components
model, action_head, proprio_projector, noisy_action_projector, processor = initialize_model(cfg)
if cfg.use_3d:
pi3_model = load_pc_model(cfg.pi3_path)
else:
pi3_model = None

# for name, param in model.named_parameters():
# if 'action_queries' in name:
Expand All @@ -500,6 +513,7 @@ def eval_libero(cfg: GenerateConfig) -> float:
num_tasks = task_suite.n_tasks

log_message(f"Task suite: {cfg.task_suite_name}", log_file)
log_message(f"Using pretrained checkpoint: {cfg.pretrained_checkpoint}", log_file)

# Start evaluation
total_episodes, total_successes = 0, 0
Expand All @@ -517,7 +531,8 @@ def eval_libero(cfg: GenerateConfig) -> float:
total_episodes,
total_successes,
log_file,
cfg.save_version
cfg.save_version,
pi3_model
)

# Calculate final success rate
Expand Down
31 changes: 30 additions & 1 deletion experiments/robot/openvla_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
ACTION_PROPRIO_NORMALIZATION_TYPE,
)
from prismatic.vla.datasets.rlds.utils.data_utils import NormalizationType

from prismatic.models.pi3_loader import load_pc_model
# Initialize important constants
DATE = time.strftime("%Y_%m_%d")
DATE_TIME = time.strftime("%Y_%m_%d-%H_%M_%S")
Expand Down Expand Up @@ -507,6 +507,9 @@ def get_action_head(cfg: Any, llm_dim: int) -> Union[L1RegressionActionHead]:
hidden_dim=llm_dim,
action_dim=ACTION_DIM,
use_pro_version=cfg.use_pro_version,
use_3d=cfg.use_3d,
dim_3d=cfg.dim_3d,
inject_layers=cfg.inject_layers,
)

else:
Expand Down Expand Up @@ -745,6 +748,8 @@ def get_vla_action(
noisy_action_projector: Optional[torch.nn.Module] = None,
use_film: bool = False,
use_minivlm: bool = False,
use_3d_model: bool = False,
pi3_model: Optional[torch.nn.Module] = None
) -> List[np.ndarray]:
"""
Generate action predictions with the VLA policy.
Expand All @@ -764,6 +769,11 @@ def get_vla_action(
List[np.ndarray]: Predicted actions
"""
with torch.inference_mode():
if use_3d_model:
assert pi3_model is not None
pi3_model = pi3_model.to(DEVICE).to(torch.bfloat16)



# Collect all input images
all_images = [obs["full_image"]]
Expand Down Expand Up @@ -795,6 +805,24 @@ def get_vla_action(
all_wrist_pixel_values = [wrist_inputs["pixel_values"] for wrist_inputs in all_wrist_inputs]
inputs["pixel_values"] = torch.cat([primary_pixel_values] + all_wrist_pixel_values, dim=1)

if use_3d_model:
img_1, img_2 = inputs["pixel_values"][:, 0:3, :, :].to(DEVICE).to(torch.bfloat16), inputs["pixel_values"][:, 6:9, :, :].to(DEVICE).to(torch.bfloat16)
pi3_num_reg_token = 5

img_tensor = torch.stack([img_1, img_2], dim=1) # [B, 2, 3, H, W] where 2 indicates 2 views
B, N, _, H, W = img_tensor.shape
img_tensor = img_tensor.reshape((B*N, _, H, W))
hidden = pi3_model.encoder(img_tensor, is_training=True)
if isinstance(hidden, dict):
hidden = hidden["x_norm_patchtokens"]
hidden, pos = pi3_model.decode(hidden, N, H, W)
hidden = hidden[:, pi3_num_reg_token:, :]
L_3d, dim_3d = hidden.shape[-2:]
hidden = hidden.reshape(B, -1, L_3d, dim_3d)
hidden = hidden.reshape(B, -1, dim_3d)
else:
hidden = None

# Process proprioception data if used
proprio = None
if cfg.use_proprio:
Expand All @@ -819,6 +847,7 @@ def get_vla_action(
noisy_action_projector=noisy_action_projector,
action_head=action_head,
use_film=use_film,
hidden_3d=hidden
)

# Extract subset of actions for open loop steps
Expand Down
5 changes: 4 additions & 1 deletion experiments/robot/robot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def get_action(
noisy_action_projector: Optional[torch.nn.Module] = None,
use_film: bool = False,
use_minivlm: bool = False,
pi3_model: Optional[torch.nn.Module] = None
) -> Union[List[np.ndarray], np.ndarray]:
"""
Query the model to get action predictions.
Expand Down Expand Up @@ -140,7 +141,9 @@ def get_action(
proprio_projector=proprio_projector,
noisy_action_projector=noisy_action_projector,
use_film=use_film,
use_minivlm=use_minivlm
use_minivlm=use_minivlm,
use_3d_model=cfg.use_3d,
pi3_model=pi3_model
)
else:
raise ValueError(f"Unsupported model family: {cfg.model_family}")
Expand Down
70 changes: 68 additions & 2 deletions pretrained_models/configs/modeling_prismatic.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,14 @@ def _replace_input_embeddings(self, input_embeddings, all_actions_mask, noisy_ac
Returns:
Modified input_embeddings tensor
"""
"""
* input_embeddings: [B, L_a + L_lang, Dim]
* all_actions_mask: [B, L_a + L_lang]
* noisy_action_features: [B, L_a, Dim]
* 此处其实是替换,我们 L_a + L_lang 这一串我们把 L_a 的部分,用 mask_indicies 索引从哪开始 L_a 这块
* 我们 action_queries (论文核心设计)是 Embedding(num_tokens, dim) 的 weight
* 这一块是 [B, L_a + L_lang, Dim] 当中 L_a 替换成 action_queries 的 weight,L_lang 不动
"""
# Clone input to avoid modifying the original tensor
new_input_embeddings = input_embeddings.clone()

Expand Down Expand Up @@ -455,13 +463,26 @@ def _replace_input_embeddings(self, input_embeddings, all_actions_mask, noisy_ac

def _process_action_masks(self, labels):
"""Helper to get action masks from labels"""
"""
* IGNORE_INDEX = -100, labels 中从第一个 -100 开始,
* ACTION_TOKEN_BEGIN_IDX = 151386
* NUM_TOKENS = 64, action 有 64 个 token ,从而 labels 一般是 64 个非 -100 。
* ACTION_DIM = 7,current_action 是 labels 里 前 6 个,next_actions 是 后 58 个
* 两个 mask 都是 Boolean。因此 1-48 是 -100, 49 - 54 是 curr_action, 55 - 110 是 next_actions, 后面都是 -100。
* 因而 all_action_mask 其实就是 [B, L] 这里 每一个 sample 中 64 个是 True,表示第几个 token 是 action 的。
* action 部分的 64 个就是 True。余下的是 False
"""
current_action_mask = get_current_action_mask(labels)
next_actions_mask = get_next_actions_mask(labels)
all_actions_mask = current_action_mask | next_actions_mask # (B, seq_len)
return all_actions_mask

def _process_vision_features(self, pixel_values, language_embeddings=None, use_film=False):
"""Process vision features with optional FiLM conditioning"""
"""
* 原设置没有 film condition,因此 language 的 feature embedding 不会传入给 vision transformer。
* [B, 3 * num_images, H, W] --(vision)--> [B, 256 * num_images, D] --(projector)--> [B, 256 * num_images, llm_dim]
"""
if use_film:
# FiLM: Infuse language inputs into visual features
patch_features = self.vision_backbone(pixel_values, language_embeddings) # (bsz, 256 * num_images, D)
Expand All @@ -473,6 +494,11 @@ def _process_vision_features(self, pixel_values, language_embeddings=None, use_f

def _process_proprio_features(self, projected_patch_embeddings, proprio, proprio_projector):
"""Process proprioceptive features and append to vision features"""
"""
* 将 proprio 投影到 [B, D] 的 vector,然后 [B, 1, D]
* 然后 append 到尾部
* 实际上没有使用。
"""
if proprio_projector is not None and proprio is not None:
# projected_patch_embeddings: (bsz, num_patches * num_images, llm_dim)
# proprio: (bsz, proprio_dim) or (propro_dim,)
Expand All @@ -486,7 +512,13 @@ def _process_proprio_features(self, projected_patch_embeddings, proprio, proprio
def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddings, attention_mask):
"""Build multimodal embeddings and attention mask"""
# Update attention mask

"""
* 这里 input_embedding 中 L_a 的部分已经被替换为 nn.Embedding 的 weight了。
* 其实就是 input_embed 和 mask 在 length 上和 vision 的 embed 里 cat
* multimodal_embeddings: [B, 1 + L_v + (L_a + L_lang -1), Dim] 注意这个 1 是 <BOS> token。L_v 被插在了这二者之间了。
* multimodal_attention_mask: [B, 1 + L_v + (L_a + L_lang -1)]。
* vision 部分的 mask [B, L_v] 是 全 True 的。
"""
projected_patch_attention_mask = None
if attention_mask is not None:
projected_patch_attention_mask = torch.full(
Expand All @@ -511,6 +543,7 @@ def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddin

def _build_multimodal_labels(self, labels, projected_patch_embeddings):
"""Build multimodal labels with IGNORE_INDEX for patch embeddings"""
#* 所有 vision 部分的 index 都标为 -100(非 action 的 label),然后和原来 label [B, 1 + L_v + (L_a + L_lang -1)] 拼接
if labels is not None:
projected_patch_labels = torch.full(
(projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]),
Expand Down Expand Up @@ -543,6 +576,22 @@ def forward(
use_film: bool = False,
) -> Union[Tuple, PrismaticCausalLMOutputWithPast]:
"""Run a forward pass through the VLM, returning a PrismaticCausalLMOutputWithPast instance."""
"""
* Debug NOTE:
* input_ids has shape: [B, 120] with dtype: torch.int64
^ input_ids:
* attention_mask has shape: [B, 120] with dtype: torch.bool
^ attention_mask [torch.where(~m)[0].tolist() for m in attention_mask]
^ [[119], [109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119], [114, 115, 116, 117, 118, 119], ...]

* pixel_values has shape: [B, 12, 224, 224] with dtype: torch.float32
* labels has shape: [B, 120] with dtype: torch.int64
^ [(r[0].item(), r[-1].item()) if len(r:=torch.where(l!=-100)[0]) else (None,None) for l in labels]
^ -100 一段 --> 非 -100 --> -100 一段
^ [(54, 118), (44, 108), (49, 113), (48, 112), (50, 114), (44, 108), (49, 113), (55, 119)]

* proprio has shape: [B, 8] with dtype: torch.float32
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
Expand Down Expand Up @@ -595,6 +644,10 @@ def forward(

# === Handle Multimodal Forward ===
elif (input_ids.shape[0] == pixel_values.shape[0]) or (inputs_embeds.shape[0] == pixel_values.shape[0]):

#! Entered here!
#* input_ids: [B, L_a+L_lang](int64) --(embedding)--> [B, L_a+L_lang, Dim](bfloat16) where 120 is the sequence len.
#* non -100 labels are acion tokens.
assert past_key_values is None, "Unexpected key `past_key_values` provided during multimodal forward!"

# Get input embeddings (from language model embeddings)
Expand All @@ -604,6 +657,11 @@ def forward(
# Extract action masks
all_actions_mask = self._process_action_masks(labels)

#* labels 有 64 个 非 -100 的 id,mask 也就是对应 64 个 位置是 True。这里也就是 labels 非 -100 的位置对应 True,说明是 action token
#* input_embeddings: [B, L_a + L_lang, Dim]
#* all_actions_mask 定位 L_a 起始终止 index。
#* language_embeddings: [B, L_lang, Dim]
#* projected_patch_embeddings: [B, L_vis, Dim]
# Extract the language portion of the input embeddings (i.e. remove the action tokens portion)

# print(input_embeddings[~all_actions_mask].size())
Expand Down Expand Up @@ -639,6 +697,10 @@ def forward(

# Build labels for multimodal sequence if needed
multimodal_labels = self._build_multimodal_labels(labels, projected_patch_embeddings)

#* multimodal_embeddings: [B, 1 + L_vis + (L_a + L_lang -1), Dim]
#* multimodal_attention_mask: [B, 1 + L_vis + (L_a + L_lang -1)]
#* mask 在 L_vis 和 L_a 为 True,余下为 False,这其实是说 Langugae 部分是 Causal 而 action,vis 是 bidirectional。

# Dispatch to language model
language_model_output = self.language_model(
Expand Down Expand Up @@ -817,6 +879,7 @@ def _regression_or_discrete_prediction(
action_head=None,
proprio=None,
proprio_projector=None,
hidden_3d=None
):
"""Run L1 regression-based continuous action prediction or discrete action tokens prediction."""

Expand Down Expand Up @@ -867,7 +930,8 @@ def _regression_or_discrete_prediction(
# L1 regression prediction
normalized_actions = action_head.predict_action(multi_layer_hidden_states,
proprio=proprio,
proprio_projector=proprio_projector)
proprio_projector=proprio_projector,
hidden_3d=hidden_3d)
normalized_actions = normalized_actions.reshape(NUM_ACTIONS_CHUNK, ACTION_DIM)
normalized_actions = normalized_actions.float().cpu().detach().numpy()
else:
Expand Down Expand Up @@ -918,6 +982,7 @@ def predict_action(

pixel_values = kwargs["pixel_values"] # [1, 12, 224, 224]
attention_mask = kwargs["attention_mask"] #
hidden_3d = kwargs.get("hidden_3d", None)

# Create fake labels tensor (needed for action mask)
labels = input_ids.clone()
Expand Down Expand Up @@ -964,6 +1029,7 @@ def predict_action(
action_head=action_head,
proprio=proprio, # [8]
proprio_projector=proprio_projector,
hidden_3d=hidden_3d,
)

# Unnormalize predicted actions
Expand Down
Loading