Using 🤗 Transformers to use the model
- Loading model
```python
import torch
from transformers import AutoProcessor

import modeling_contextvla  # shipped with the model repository

processor = AutoProcessor.from_pretrained(
    "huiwon/ContextVLA-3B-Qwen2.5VL-FAST", use_fast=True
)
processor.tokenizer.padding_side = "left"

# FAST action tokenizer from Physical Intelligence
fast_tokenizer = AutoProcessor.from_pretrained(
    "physical-intelligence/fast", trust_remote_code=True
)
# time_horizon and action_dim are task-specific and must be defined first
fast_tokenizer.time_horizon = time_horizon
fast_tokenizer.action_dim = action_dim

model = modeling_contextvla.ContextVLA_Qwen2_5_VL.from_pretrained(
    "huiwon/ContextVLA-3B-Qwen2.5VL-FAST",
    attn_implementation="flash_attention_2",
    dtype=torch.bfloat16,
)
```
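The two FAST settings above are task-specific. Purely for illustration (hypothetical values, not taken from this model card), they might be set like this before configuring the tokenizer:

```python
# Hypothetical values for illustration only; use whatever your policy was trained with.
time_horizon = 8  # number of future action steps decoded per chunk
action_dim = 7    # e.g., 6-DoF arm deltas + 1 gripper channel
```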
- Data input
```python
import numpy as np
from PIL import Image

import vision_process  # provides process_vision_info (Qwen-VL vision preprocessing, alongside modeling_contextvla)


def array_to_pil_image(frame):
    # Accept CHW or HWC arrays and convert to a uint8 PIL image.
    if len(frame.shape) == 3 and frame.shape[0] == 3:
        frame = np.transpose(frame, (1, 2, 0))
    if frame.dtype != np.uint8:
        if frame.max() <= 1.0:
            frame = (frame * 255).astype(np.uint8)
        else:
            frame = frame.astype(np.uint8)
    return Image.fromarray(frame)


# Dummy observations: 8 context frames from each of three cameras.
# Replace these with your real camera streams.
main_pixel_values = np.zeros((8, 224, 224, 3))
wrist_pixel_values = np.zeros((8, 224, 224, 3))
right_pixel_values = np.zeros((8, 224, 224, 3))

# Interleave cameras per timestep: (T, num_cams, H, W, C) -> (T * num_cams, H, W, C)
pixel_values = np.stack([main_pixel_values, wrist_pixel_values, right_pixel_values], axis=1)
pixel_values = pixel_values.reshape(-1, *pixel_values.shape[-3:])

image_contents = [{"type": "image", "image": array_to_pil_image(frame)} for frame in pixel_values]
# task_description is the natural-language instruction for the task
messages = [
    {
        "role": "user",
        "content": image_contents + [{"type": "text", "text": task_description}],
    }
]
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = vision_process.process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=False,
    return_tensors="pt",
)
```
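`task_description` used above is just the plain-language instruction string; a hypothetical example:

```python
# Hypothetical instruction; phrase it the way your training data does.
task_description = "pick up the red block and place it in the bin"
```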
- Extract actions
```python
import torch

# ContextVLA-specific: expose the tokenized context to the model's context layer.
model.model.layers[2].input_id_context = inputs["input_ids"].detach()

# Token-id range reserved for FAST action tokens in the vocabulary.
ACTION_TOKEN_MIN = 151665
ACTION_TOKEN_MAX = 153712

# norm_stats (per-dimension q01/q99 action quantiles) should be loaded first;
# see the sketch below.
action_high = np.array(norm_stats["norm_stats"]["actions"]["q99"])
action_low = np.array(norm_stats["norm_stats"]["actions"]["q01"])

# All-zeros array used to detect a failed decode (un-normalization is skipped in that case).
error_action = np.zeros((time_horizon, action_dim))

with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
        pad_token_id=processor.tokenizer.eos_token_id,
    )

# Keep only action tokens and shift them back into the FAST token space.
action_indices = (ACTION_TOKEN_MIN <= generated_ids[0]) & (generated_ids[0] <= ACTION_TOKEN_MAX)
action_indices = torch.where(action_indices)[0]
action_tokens = (generated_ids[0][action_indices] - ACTION_TOKEN_MIN).cpu().tolist()
output_action = fast_tokenizer.decode([action_tokens])[0]

# Un-normalize from [-1, 1] back to the robot's action range.
if np.allclose(error_action, output_action):
    unnorm_actions = output_action
else:
    unnorm_actions = 0.5 * (output_action + 1) * (action_high - action_low) + action_low

action = np.array(unnorm_actions)
```
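The snippet above assumes `norm_stats` is already in scope. A minimal loading sketch, assuming the statistics live in a JSON file with the `norm_stats` → `actions` → `q01`/`q99` layout accessed above (the file name is hypothetical):

```python
import json

# Hypothetical path; point this at the normalization-stats file for your dataset.
with open("norm_stats.json") as f:
    norm_stats = json.load(f)
```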