Using the model with 🤗 Transformers

  1. Loading model
import torch
from transformers import AutoProcessor

import modeling_contextvla  # custom ContextVLA model code from the model repository

processor = AutoProcessor.from_pretrained("huiwon/ContextVLA-3B-Qwen2.5VL-FAST", use_fast=True)
processor.tokenizer.padding_side = 'left'
        
# FAST action tokenizer from Physical Intelligence
fast_tokenizer = AutoProcessor.from_pretrained(
    "physical-intelligence/fast", trust_remote_code=True
)
# time_horizon and action_dim should be defined (task-specifically)
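# For example (hypothetical values for illustration only; use the chunk length
# and per-step action dimensionality your policy was trained with):
time_horizon = 10
action_dim = 7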
fast_tokenizer.time_horizon = time_horizon
fast_tokenizer.action_dim = action_dim

model = modeling_contextvla.ContextVLA_Qwen2_5_VL.from_pretrained(
    "huiwon/ContextVLA-3B-Qwen2.5VL-FAST",
    attn_implementation="flash_attention_2",
    dtype=torch.bfloat16
)
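Since no device_map is passed, the weights load on CPU; a minimal sketch of preparing the model for inference (flash_attention_2 assumes a CUDA GPU):

# Move to the GPU and disable dropout for inference
model = model.to("cuda").eval()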
  2. Data input
import numpy as np
from PIL import Image

import vision_process  # assumed to ship with the model repo (provides the Qwen-VL style process_vision_info helper)

# Convert an RGB frame (channel-first or channel-last, float or uint8) to a PIL image
def array_to_pil_image(frame):
    if len(frame.shape) == 3 and frame.shape[0] == 3:
        frame = np.transpose(frame, (1, 2, 0))
    if frame.dtype != np.uint8:
        if frame.max() <= 1.0:
            frame = (frame * 255).astype(np.uint8)
        else:
            frame = frame.astype(np.uint8)
    return Image.fromarray(frame)

# 8-frame histories from three camera views (main, wrist, right); the zero
# arrays below are placeholders for real (224, 224, 3) RGB frames.
main_pixel_values = np.zeros((8, 224, 224, 3))
wrist_pixel_values = np.zeros((8, 224, 224, 3))
right_pixel_values = np.zeros((8, 224, 224, 3))
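# One simple way to keep such a history during a rollout (not specific to
# ContextVLA): a rolling buffer per camera. The zero frame below stands in for
# a real (224, 224, 3) uint8 camera image captured at each control step.
from collections import deque

current_main_frame = np.zeros((224, 224, 3), dtype=np.uint8)
main_buffer = deque([current_main_frame] * 8, maxlen=8)   # warm-start with the first frame
main_buffer.append(current_main_frame)                    # append the newest frame every step
main_pixel_values = np.stack(main_buffer, axis=0)         # (8, 224, 224, 3)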

# Interleave the three views per timestep: (8, 3, 224, 224, 3) -> (24, 224, 224, 3)
pixel_values = np.stack([main_pixel_values, wrist_pixel_values, right_pixel_values], axis=1)
pixel_values = pixel_values.reshape(-1, pixel_values.shape[-3], pixel_values.shape[-2], pixel_values.shape[-1])

image_contents = [{"type": "image", "image": array_to_pil_image(frame)} for frame in pixel_values]

# task_description: the natural-language instruction for the current task
messages = [
    {
        "role": "user",
        "content": image_contents + [{"type": "text", "text": task_description}],
    }
]
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

image_inputs, video_inputs = vision_process.process_vision_info(messages)

inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=False,
    return_tensors="pt",
)
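Before generation, the processed batch has to be on the same device as the model; a minimal sketch, assuming the model was moved to a CUDA device in step 1:

# Moves input_ids, attention_mask, image tensors, etc. onto the model's device
inputs = inputs.to(model.device)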
  3. Extract actions
import torch

# ContextVLA-specific hook: give layer 2 access to the prompt token ids before generation
model.model.layers[2].input_id_context = inputs['input_ids'].detach()

# Token-id range reserved for FAST action tokens in the model vocabulary
ACTION_TOKEN_MIN = 151665
ACTION_TOKEN_MAX = 153712

# norm_stats (the per-dimension q01 and q99 action quantiles) must be loaded beforehand
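# A minimal sketch of loading them, assuming a JSON file with the layout used
# below ("norm_stats.json" is a hypothetical placeholder path):
import json
with open("norm_stats.json") as f:
    norm_stats = json.load(f)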
action_high = np.array(norm_stats["norm_stats"]["actions"]["q99"])
action_low = np.array(norm_stats["norm_stats"]["actions"]["q01"])
# Reference all-zero chunk, used below to skip unnormalization when decoding yields zeros
error_action = np.zeros((time_horizon, action_dim))

with torch.no_grad():
    # Greedy decoding; the returned ids are the prompt followed by the newly generated tokens
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
        pad_token_id=processor.tokenizer.eos_token_id
    )

    # Keep only generated ids that fall in the action-token range
    action_indices = (ACTION_TOKEN_MIN <= generated_ids[0]) & (generated_ids[0] <= ACTION_TOKEN_MAX)
    action_indices = torch.where(action_indices)[0]

    # Shift back to FAST token ids and decode into an action chunk
    action_tokens = (generated_ids[0][action_indices] - ACTION_TOKEN_MIN).cpu()
    output_action = fast_tokenizer.decode([action_tokens])[0]

    if np.allclose(error_action, output_action):
        # Decoding returned an all-zero chunk; pass it through unchanged
        unnorm_actions = output_action
    else:
        # Map the [-1, 1] normalized actions back to the q01-q99 range
        unnorm_actions = (
            0.5 * (output_action + 1) * (action_high - action_low)
            + action_low
        )
    action = np.array(unnorm_actions)
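A hypothetical control loop for executing the decoded chunk; env and env.step below stand in for whatever robot or simulator interface is being used:

# `action` is expected to have shape (time_horizon, action_dim)
for t in range(time_horizon):
    obs = env.step(action[t])   # hypothetical environment/robot interface
    # append the new camera frames to the 8-frame histories here, then
    # repeat steps 2-3 to decode the next chunk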