Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitgnore → .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,14 @@ dmypy.json
# Mac OS
.DS_Store


# Caches and Datasets
cache/
data/

pretrained_models/
# Rollout videos and wandb logs
rollouts/
wandb/
outputs/
experiments/logs/
evaluation_results/
1 change: 1 addition & 0 deletions LIBERO
Submodule LIBERO added at 8f1084
1 change: 1 addition & 0 deletions calvin
Submodule calvin added at fa03f0
14 changes: 14 additions & 0 deletions eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Evaluate a VLA-Adapter checkpoint on the LIBERO-10 task suite.
# Offline mode: disable all HuggingFace hub/dataset network lookups so the
# run uses only locally cached models and datasets.
export HF_HUB_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
# NOTE(review): GPU index and checkpoint path are hard-coded; adjust per machine.
CUDA_VISIBLE_DEVICES=1 python experiments/robot/libero/run_libero_eval.py \
--use_proprio True \
--num_images_in_input 2 \
--use_film False \
--pretrained_checkpoint outputs/configs+libero_10_no_noops+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--libero_10_no_noops--2025_10_27_17_22_41--use_3d_True_dim_2048_inject_all--170000_chkpt \
--task_suite_name libero_10 \
--use_pro_version True \
--use_3d True \
--inject_layers all
# (fix: removed the trailing backslash after the last argument — it continued
# the command onto the comment line below, which only parsed correctly by
# accident and would swallow the next command if the comments were removed)
# Alternative checkpoint / background-logging invocation, kept for reference:
# outputs/configs+libero_10_no_noops+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--libero_10_no_noops--1759126170--160000_chkpt \
# > eval_logs/Spatial--chkpt.log 2>&1 &
6 changes: 6 additions & 0 deletions eval2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Evaluate a VLA-Adapter checkpoint on the CALVIN ABC benchmark.
# Offline mode: disable all HuggingFace hub/dataset network lookups so the
# run uses only locally cached models and datasets.
export HF_HUB_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
# NOTE(review): GPU index and checkpoint path are hard-coded; adjust per machine.
CUDA_VISIBLE_DEVICES=7 python vla-scripts/evaluate_calvin.py \
--pretrained_checkpoint outputs/CALVIN-ABC-Pro
# Previous checkpoint, kept for reference:
# --pretrained_checkpoint outputs/configs+calvin_abc_rlds+b16+lr-0.0002+lora-r64+dropout-0.0--image_aug--VLA-Adapter--calvin_abc_rlds----100000_chkpt \
25 changes: 20 additions & 5 deletions experiments/robot/libero/run_libero_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Optional, Union
from typing import Optional, Union, List

import draccus
import numpy as np
Expand Down Expand Up @@ -48,7 +48,7 @@
set_seed_everywhere,
)
from prismatic.vla.constants import NUM_ACTIONS_CHUNK

from prismatic.models.pi3_loader import load_pc_model

# Define task suite constants
class TaskSuite(str, Enum):
Expand Down Expand Up @@ -128,6 +128,11 @@ class GenerateConfig:
use_pro_version: bool = True # encourage to use the pro models we released.
phase: str = "Inference"

use_3d: bool = False
dim_3d: int = 2048
pi3_path: Path = Path("/home/ruihengwang/vla/VLA-Adapter/pretrained_models/pi3_checkpoint")
inject_layers: Optional[int | List[int] | str] = None



def validate_config(cfg: GenerateConfig) -> None:
Expand Down Expand Up @@ -292,6 +297,7 @@ def run_episode(
noisy_action_projector=None,
initial_state=None,
log_file=None,
pi3_model=None
):
"""Run a single episode in the environment."""
# Reset environment
Expand Down Expand Up @@ -342,7 +348,8 @@ def run_episode(
proprio_projector=proprio_projector,
noisy_action_projector=noisy_action_projector,
use_film=cfg.use_film,
use_minivlm=cfg.use_minivlm
use_minivlm=cfg.use_minivlm,
pi3_model=pi3_model
)

action_queue.extend(actions)
Expand Down Expand Up @@ -383,7 +390,8 @@ def run_task(
total_episodes=0,
total_successes=0,
log_file=None,
save_version=None
save_version=None,
pi3_model=None
):
"""Run evaluation for a single task."""
# Get task
Expand Down Expand Up @@ -433,6 +441,7 @@ def run_task(
noisy_action_projector,
initial_state,
log_file,
pi3_model
)

# Update counters
Expand Down Expand Up @@ -483,6 +492,10 @@ def eval_libero(cfg: GenerateConfig) -> float:

# Initialize model and components
model, action_head, proprio_projector, noisy_action_projector, processor = initialize_model(cfg)
if cfg.use_3d:
pi3_model = load_pc_model(cfg.pi3_path)
else:
pi3_model = None

# for name, param in model.named_parameters():
# if 'action_queries' in name:
Expand All @@ -500,6 +513,7 @@ def eval_libero(cfg: GenerateConfig) -> float:
num_tasks = task_suite.n_tasks

log_message(f"Task suite: {cfg.task_suite_name}", log_file)
log_message(f"Using pretrained checkpoint: {cfg.pretrained_checkpoint}", log_file)

# Start evaluation
total_episodes, total_successes = 0, 0
Expand All @@ -517,7 +531,8 @@ def eval_libero(cfg: GenerateConfig) -> float:
total_episodes,
total_successes,
log_file,
cfg.save_version
cfg.save_version,
pi3_model
)

# Calculate final success rate
Expand Down
31 changes: 30 additions & 1 deletion experiments/robot/openvla_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
ACTION_PROPRIO_NORMALIZATION_TYPE,
)
from prismatic.vla.datasets.rlds.utils.data_utils import NormalizationType

from prismatic.models.pi3_loader import load_pc_model
# Initialize important constants
DATE = time.strftime("%Y_%m_%d")
DATE_TIME = time.strftime("%Y_%m_%d-%H_%M_%S")
Expand Down Expand Up @@ -507,6 +507,9 @@ def get_action_head(cfg: Any, llm_dim: int) -> Union[L1RegressionActionHead]:
hidden_dim=llm_dim,
action_dim=ACTION_DIM,
use_pro_version=cfg.use_pro_version,
use_3d=cfg.use_3d,
dim_3d=cfg.dim_3d,
inject_layers=cfg.inject_layers,
)

else:
Expand Down Expand Up @@ -745,6 +748,8 @@ def get_vla_action(
noisy_action_projector: Optional[torch.nn.Module] = None,
use_film: bool = False,
use_minivlm: bool = False,
use_3d_model: bool = False,
pi3_model: Optional[torch.nn.Module] = None
) -> List[np.ndarray]:
"""
Generate action predictions with the VLA policy.
Expand All @@ -764,6 +769,11 @@ def get_vla_action(
List[np.ndarray]: Predicted actions
"""
with torch.inference_mode():
if use_3d_model:
assert pi3_model is not None
pi3_model = pi3_model.to(DEVICE).to(torch.bfloat16)



# Collect all input images
all_images = [obs["full_image"]]
Expand Down Expand Up @@ -795,6 +805,24 @@ def get_vla_action(
all_wrist_pixel_values = [wrist_inputs["pixel_values"] for wrist_inputs in all_wrist_inputs]
inputs["pixel_values"] = torch.cat([primary_pixel_values] + all_wrist_pixel_values, dim=1)

if use_3d_model:
img_1, img_2 = inputs["pixel_values"][:, 0:3, :, :].to(DEVICE).to(torch.bfloat16), inputs["pixel_values"][:, 6:9, :, :].to(DEVICE).to(torch.bfloat16)
pi3_num_reg_token = 5

img_tensor = torch.stack([img_1, img_2], dim=1) # [B, 2, 3, H, W] where 2 indicates 2 views
B, N, _, H, W = img_tensor.shape
img_tensor = img_tensor.reshape((B*N, _, H, W))
hidden = pi3_model.encoder(img_tensor, is_training=True)
if isinstance(hidden, dict):
hidden = hidden["x_norm_patchtokens"]
hidden, pos = pi3_model.decode(hidden, N, H, W)
hidden = hidden[:, pi3_num_reg_token:, :]
L_3d, dim_3d = hidden.shape[-2:]
hidden = hidden.reshape(B, -1, L_3d, dim_3d)
hidden = hidden.reshape(B, -1, dim_3d)
else:
hidden = None

# Process proprioception data if used
proprio = None
if cfg.use_proprio:
Expand All @@ -819,6 +847,7 @@ def get_vla_action(
noisy_action_projector=noisy_action_projector,
action_head=action_head,
use_film=use_film,
hidden_3d=hidden
)

# Extract subset of actions for open loop steps
Expand Down
5 changes: 4 additions & 1 deletion experiments/robot/robot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def get_action(
noisy_action_projector: Optional[torch.nn.Module] = None,
use_film: bool = False,
use_minivlm: bool = False,
pi3_model: Optional[torch.nn.Module] = None
) -> Union[List[np.ndarray], np.ndarray]:
"""
Query the model to get action predictions.
Expand Down Expand Up @@ -140,7 +141,9 @@ def get_action(
proprio_projector=proprio_projector,
noisy_action_projector=noisy_action_projector,
use_film=use_film,
use_minivlm=use_minivlm
use_minivlm=use_minivlm,
use_3d_model=cfg.use_3d,
pi3_model=pi3_model
)
else:
raise ValueError(f"Unsupported model family: {cfg.model_family}")
Expand Down
70 changes: 68 additions & 2 deletions pretrained_models/configs/modeling_prismatic.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,14 @@ def _replace_input_embeddings(self, input_embeddings, all_actions_mask, noisy_ac
Returns:
Modified input_embeddings tensor
"""
"""
* input_embeddings: [B, L_a + L_lang, Dim]
* all_actions_mask: [B, L_a + L_lang]
* noisy_action_features: [B, L_a, Dim]
* 此处其实是替换,我们 L_a + L_lang 这一串我们把 L_a 的部分,用 mask_indicies 索引从哪开始 L_a 这块
* 我们 action_queries (论文核心设计)是 Embedding(num_tokens, dim) 的 weight
* 这一块是 [B, L_a + L_lang, Dim] 当中 L_a 替换成 action_queries 的 weight,L_lang 不动
"""
# Clone input to avoid modifying the original tensor
new_input_embeddings = input_embeddings.clone()

Expand Down Expand Up @@ -455,13 +463,26 @@ def _replace_input_embeddings(self, input_embeddings, all_actions_mask, noisy_ac

def _process_action_masks(self, labels):
"""Helper to get action masks from labels"""
"""
* IGNORE_INDEX = -100, labels 中从第一个 -100 开始,
* ACTION_TOKEN_BEGIN_IDX = 151386
* NUM_TOKENS = 64, action 有 64 个 token ,从而 labels 一般是 64 个非 -100 。
* ACTION_DIM = 7,current_action 是 labels 里 前 6 个,next_actions 是 后 58 个
* 两个 mask 都是 Boolean。因此 1-48 是 -100, 49 - 54 是 curr_action, 55 - 110 是 next_actions, 后面都是 -100。
* 因而 all_action_mask 其实就是 [B, L] 这里 每一个 sample 中 64 个是 True,表示第几个 token 是 action 的。
* action 部分的 64 个就是 True。余下的是 False
"""
current_action_mask = get_current_action_mask(labels)
next_actions_mask = get_next_actions_mask(labels)
all_actions_mask = current_action_mask | next_actions_mask # (B, seq_len)
return all_actions_mask

def _process_vision_features(self, pixel_values, language_embeddings=None, use_film=False):
"""Process vision features with optional FiLM conditioning"""
"""
* 原设置没有 film condition,因此 language 的 feature embedding 不会传入给 vision transformer。
* [B, 3 * num_images, H, W] --(vision)--> [B, 256 * num_images, D] --(projector)--> [B, 256 * num_images, llm_dim]
"""
if use_film:
# FiLM: Infuse language inputs into visual features
patch_features = self.vision_backbone(pixel_values, language_embeddings) # (bsz, 256 * num_images, D)
Expand All @@ -473,6 +494,11 @@ def _process_vision_features(self, pixel_values, language_embeddings=None, use_f

def _process_proprio_features(self, projected_patch_embeddings, proprio, proprio_projector):
"""Process proprioceptive features and append to vision features"""
"""
* 将 proprio 投影到 [B, D] 的 vector,然后 [B, 1, D]
* 然后 append 到尾部
* 实际上没有使用。
"""
if proprio_projector is not None and proprio is not None:
# projected_patch_embeddings: (bsz, num_patches * num_images, llm_dim)
# proprio: (bsz, proprio_dim) or (propro_dim,)
Expand All @@ -486,7 +512,13 @@ def _process_proprio_features(self, projected_patch_embeddings, proprio, proprio
def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddings, attention_mask):
"""Build multimodal embeddings and attention mask"""
# Update attention mask

"""
* 这里 input_embedding 中 L_a 的部分已经被替换为 nn.Embedding 的 weight了。
* 其实就是 input_embed 和 mask 在 length 上和 vision 的 embed 里 cat
* multimodal_embeddings: [B, 1 + L_v + (L_a + L_lang -1), Dim] 注意这个 1 是 <BOS> token。L_v 被插在了这二者之间了。
* multimodal_attention_mask: [B, 1 + L_v + (L_a + L_lang -1)]。
* vision 部分的 mask [B, L_v] 是 全 True 的。
"""
projected_patch_attention_mask = None
if attention_mask is not None:
projected_patch_attention_mask = torch.full(
Expand All @@ -511,6 +543,7 @@ def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddin

def _build_multimodal_labels(self, labels, projected_patch_embeddings):
"""Build multimodal labels with IGNORE_INDEX for patch embeddings"""
#* 所有 vision 部分的 index 都标为 -100(非 action 的 label),然后和原来 label [B, 1 + L_v + (L_a + L_lang -1)] 拼接
if labels is not None:
projected_patch_labels = torch.full(
(projected_patch_embeddings.shape[0], projected_patch_embeddings.shape[1]),
Expand Down Expand Up @@ -543,6 +576,22 @@ def forward(
use_film: bool = False,
) -> Union[Tuple, PrismaticCausalLMOutputWithPast]:
"""Run a forward pass through the VLM, returning a PrismaticCausalLMOutputWithPast instance."""
"""
* Debug NOTE:
* input_ids has shape: [B, 120] with dtype: torch.int64
^ input_ids:
* attention_mask has shape: [B, 120] with dtype: torch.bool
^ attention_mask [torch.where(~m)[0].tolist() for m in attention_mask]
^ [[119], [109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119], [114, 115, 116, 117, 118, 119], ...]

* pixel_values has shape: [B, 12, 224, 224] with dtype: torch.float32
* labels has shape: [B, 120] with dtype: torch.int64
^ [(r[0].item(), r[-1].item()) if len(r:=torch.where(l!=-100)[0]) else (None,None) for l in labels]
^ -100 一段 --> 非 -100 --> -100 一段
^ [(54, 118), (44, 108), (49, 113), (48, 112), (50, 114), (44, 108), (49, 113), (55, 119)]

* proprio has shape: [B, 8] with dtype: torch.float32
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
Expand Down Expand Up @@ -595,6 +644,10 @@ def forward(

# === Handle Multimodal Forward ===
elif (input_ids.shape[0] == pixel_values.shape[0]) or (inputs_embeds.shape[0] == pixel_values.shape[0]):

#! Entered here!
#* input_ids: [B, L_a+L_lang](int64) --(embedding)--> [B, L_a+L_lang, Dim](bfloat16) where 120 is the sequence len.
#* non -100 labels are acion tokens.
assert past_key_values is None, "Unexpected key `past_key_values` provided during multimodal forward!"

# Get input embeddings (from language model embeddings)
Expand All @@ -604,6 +657,11 @@ def forward(
# Extract action masks
all_actions_mask = self._process_action_masks(labels)

#* labels 有 64 个 非 -100 的 id,mask 也就是对应 64 个 位置是 True。这里也就是 labels 非 -100 的位置对应 True,说明是 action token
#* input_embeddings: [B, L_a + L_lang, Dim]
#* all_actions_mask 定位 L_a 起始终止 index。
#* language_embeddings: [B, L_lang, Dim]
#* projected_patch_embeddings: [B, L_vis, Dim]
# Extract the language portion of the input embeddings (i.e. remove the action tokens portion)

# print(input_embeddings[~all_actions_mask].size())
Expand Down Expand Up @@ -639,6 +697,10 @@ def forward(

# Build labels for multimodal sequence if needed
multimodal_labels = self._build_multimodal_labels(labels, projected_patch_embeddings)

#* multimodal_embeddings: [B, 1 + L_vis + (L_a + L_lang -1), Dim]
#* multimodal_attention_mask: [B, 1 + L_vis + (L_a + L_lang -1)]
#* mask 在 L_vis 和 L_a 为 True,余下为 False,这其实是说 Langugae 部分是 Causal 而 action,vis 是 bidirectional。

# Dispatch to language model
language_model_output = self.language_model(
Expand Down Expand Up @@ -817,6 +879,7 @@ def _regression_or_discrete_prediction(
action_head=None,
proprio=None,
proprio_projector=None,
hidden_3d=None
):
"""Run L1 regression-based continuous action prediction or discrete action tokens prediction."""

Expand Down Expand Up @@ -867,7 +930,8 @@ def _regression_or_discrete_prediction(
# L1 regression prediction
normalized_actions = action_head.predict_action(multi_layer_hidden_states,
proprio=proprio,
proprio_projector=proprio_projector)
proprio_projector=proprio_projector,
hidden_3d=hidden_3d)
normalized_actions = normalized_actions.reshape(NUM_ACTIONS_CHUNK, ACTION_DIM)
normalized_actions = normalized_actions.float().cpu().detach().numpy()
else:
Expand Down Expand Up @@ -918,6 +982,7 @@ def predict_action(

pixel_values = kwargs["pixel_values"] # [1, 12, 224, 224]
attention_mask = kwargs["attention_mask"] #
hidden_3d = kwargs.get("hidden_3d", None)

# Create fake labels tensor (needed for action mask)
labels = input_ids.clone()
Expand Down Expand Up @@ -964,6 +1029,7 @@ def predict_action(
action_head=action_head,
proprio=proprio, # [8]
proprio_projector=proprio_projector,
hidden_3d=hidden_3d,
)

# Unnormalize predicted actions
Expand Down
Loading