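# NOTE(review): non-code residue kept as a comment so the module parses — looks like
# repository page metadata (author / commit message / short revision): yuhangzang · update · f6e3956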
import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
# Hub identifier of the checkpoint handed to from_pretrained() for both model and processor.
MODEL_ID = "internlm/CapRL-Qwen3VL-4B"
# Text instruction sent alongside every image in the chat message.
DEFAULT_PROMPT = "Describe the image in detail."
# Upper bound on generated tokens per caption (passed to generate() as max_new_tokens).
MAX_NEW_TOKENS = 1536
# Default demo content: the image and caption pre-filled in the UI when the page loads.
DEFAULT_IMAGE = "./examples/1909.png"
DEFAULT_CAPTION = """Based on the provided bar chart from the Pew Research Center, here is a detailed description:
**Title:** Older Republicans especially likely to see Trump as fighting for their beliefs
**Subtitle:** Among Republicans and Republican leaners, % who say the phrase 'fights for what I believe in' describes Trump ...
**Source:** Survey of U.S. adults conducted Feb. 4-15, 2020.
**Legend:**
- **Very well:** 51% (All Rep/Lean Rep)
- **Fairly well:** 36% (All Rep/Lean Rep)
- **NET:** Sum of "Very well" and "Fairly well"
**Data Summary:**
**All Rep/Lean Rep:**
- Very well: 51%
- Fairly well: 36%
- NET: 87%
**By Age:**
- Ages 18-29: Very well 31%, Fairly well 45%, NET 76%
- 30-49: Very well 41%, Fairly well 42%, NET 82%
- 50-64: Very well 58%, Fairly well 33%, NET 92%
- 65+: Very well 68%, Fairly well 26%, NET 94%
**By Education:**
- Postgrad: Very well 42%, Fairly well 38%, NET 80%
- College grad: Very well 45%, Fairly well 40%, NET 85%
- Some college: Very well 51%, Fairly well 36%, NET 87%
- HS or less: Very well 56%, Fairly well 33%, NET 89%
**By Conservatism:**
- Conserv: Very well 63%, Fairly well 31%, NET 94%
- Mod/Lib: Very well 32%, Fairly well 44%, NET 75%
**By Party Identification:**
- Republican: Very well 61%, Fairly well 32%, NET 93%
- Lean Republican: Very well 36%, Fairly well 41%, NET 77%
**Analysis:**
- The overall percentage of Republicans and Republican leaners who say Trump "fights for what I believe in" is 87% (51% "very well" and 36% "fairly well").
- The group most likely to say this is those aged 65 and older (94% NET), followed by those 50-64 (92% NET) and those with a high school diploma or less (89% NET).
- The youngest group (18-29) is the least likely (76% NET).
- Among education levels, those with a high school diploma or less are most likely (89% NET), followed by some college (87%) and college graduates (85%).
- The most conservative group (Conservative) is the most likely to say this (94% NET), while the moderate/liberal group is the least likely (75% NET).
- Among party identifiers, Republicans are most likely (93% NET), while lean Republicans are less likely (77% NET).
- The 65+ age group is also the most conservative (94% NET) and the most likely to say Trump fights for their beliefs.
- The 65+ group is also the most likely to say it "very well" (68%) and the least likely to say it "fairly well" (26%).
- The most conservative group (63% "very well") is also the most likely overall (94% NET), while the most moderate/liberal group (32% "very well") is the least likely (75% NET).
- The Republican party has a 93% NET, while the lean Republican group has a 77% NET.
**Conclusion:**
The chart shows that older Republicans (65+) are the most likely to see Trump as fighting for their beliefs, with 94% saying it "very well" or "fairly well." This is followed by those aged 50-64 (92%) and those with a high school diploma or less (89%). The most conservative Republicans are also the most likely (94%), while moderate/liberal Republicans are the least likely (75%). The youngest group (18-29) is the least likely (76%)."""
# Initial value of the "Generated Tokens" field; presumably the token count of
# DEFAULT_CAPTION as produced by the model — TODO confirm if the caption is edited.
DEFAULT_CAPTION_TOKENS = 993
def get_device() -> str:
    """Return the name of the torch device to run on.

    Returns:
        ``"cuda"`` when ``torch.cuda.is_available()`` reports a usable GPU,
        otherwise ``"cpu"``.
    """
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"
def select_dtype(device: str):
    """Pick the floating-point dtype used to load the model on ``device``.

    Args:
        device: Device name. Only the exact string ``"cuda"`` selects a
            reduced-precision dtype.

    Returns:
        ``torch.bfloat16`` on CUDA when ``torch.cuda.is_bf16_supported()`` is
        true, ``torch.float16`` on CUDA otherwise, and ``torch.float32`` for
        every other device name.
    """
    # Anything that is not CUDA runs in full precision.
    if device != "cuda":
        return torch.float32
    return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
def load_model():
    """Load the captioning model and its processor from ``MODEL_ID``.

    The weight dtype is chosen by ``select_dtype`` for the device reported by
    ``get_device``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    weights_dtype = select_dtype(get_device())

    # device_map="auto" is used for proper GPU allocation with the spaces.GPU decorator.
    # NOTE(review): trust_remote_code=True is passed to both loaders — confirm it is
    # still required for this checkpoint.
    captioner = Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=weights_dtype,
        trust_remote_code=True,
    )
    return captioner, AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
# Load the model and processor once at import time; generate_caption() reads these globals.
MODEL, PROCESSOR = load_model()
@spaces.GPU
@torch.inference_mode()
def generate_caption(image: Image.Image):
    """Generate a caption for ``image`` with the module-level model.

    Args:
        image: PIL image to describe. ``None`` (no image selected) is accepted.
            The caller's image object is never modified.

    Returns:
        A ``(caption, num_generated_tokens)`` tuple. ``None`` input yields
        ``("", 0)``. On failure the first element is a human-readable error
        message and the count is ``0``; errors are reported as text instead
        of being raised so the UI can display them.
    """
    if image is None:
        return "", 0
    if not isinstance(image, Image.Image):
        return "Error: Invalid image format", 0

    try:
        # Downscale very large images to reduce the risk of running out of memory.
        # Image.thumbnail() resizes in place, so operate on a copy rather than
        # mutating the object the caller handed in.
        max_size = 4096
        if image.width > max_size or image.height > max_size:
            image = image.copy()
            image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": DEFAULT_PROMPT},
                ],
            }
        ]

        # Qwen3-VL style preparation: the chat template tokenizes text and image together.
        inputs = PROCESSOR.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(MODEL.device)

        generated_ids = MODEL.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
        )

        # The output sequence starts with the prompt tokens; keep only what
        # was generated after them.
        prompt_length = inputs["input_ids"].shape[-1]
        completion_ids = generated_ids[:, prompt_length:]

        decoded = PROCESSOR.batch_decode(
            completion_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0].strip(), int(completion_ids.shape[-1])
    except torch.cuda.OutOfMemoryError:
        # Release cached blocks so the next request has a chance to succeed.
        torch.cuda.empty_cache()
        return "Error: Out of GPU memory. Please try with a smaller image.", 0
    except Exception as e:
        # Top-level UI boundary: surface the failure as text in the caption box.
        return f"Error generating caption: {e}", 0
# Header shown at the top of the page: project links, model/dataset table and intro text.
# Blank lines separate the heading, table and paragraph so the table does not
# absorb the text that follows it.
_HEADER_MARKDOWN = """
# CapRL

📖<a href="https://arxiv.org/abs/2509.22647">Paper</a> | 🏠<a href="https://github.com/InternLM/CapRL">Github</a> | 🤗<a href="https://huggingface.co/collections/long-xing1/caprl-68d64ac32ded31596c36e189">CapRL Collection</a> | 🤗<a href="https://huggingface.co/papers/2509.22647">Daily Paper</a>

### CapRL Series Model & Dataset

| Series | Models & Resources |
| :--- | :--- |
| **CapRL 2.0 Series** | [🤗 CapRL-Qwen3VL-2B](https://huggingface.co/internlm/CapRL-Qwen3VL-2B) \| [🤗 CapRL-Qwen3VL-4B](https://huggingface.co/internlm/CapRL-Qwen3VL-4B) \| [📦 CapRL-Qwen3VL-2B-GGUF](https://huggingface.co/internlm/CapRL-Qwen3VL-2B-GGUF) \| [📦 CapRL-Qwen3VL-4B-GGUF](https://huggingface.co/internlm/CapRL-Qwen3VL-4B-GGUF) \| [🌈CapRL-Qwen3VL-4B Space](https://huggingface.co/spaces/yuhangzang/CapRL-Qwen3VL-4B)
| **CapRL 1.0 Series** | [🤗 CapRL-Qwen2.5VL-3B](https://huggingface.co/internlm/CapRL-3B) \| [🤗 CapRL-InternVL3.5-8B](https://huggingface.co/yuhangzang/CapRL-InternVL3.5-8B) \| [📊 CapRL-2M Dataset](https://huggingface.co/datasets/internlm/CapRL-2M) \| [📦 CapRL-3B-GGUF](https://huggingface.co/mradermacher/CapRL-3B-GGUF) \| [📦 CapRL-3B-i1-GGUF](https://huggingface.co/mradermacher/CapRL-3B-i1-GGUF) \| [🌈CapRL-Qwen2.5VL-3B Space](https://huggingface.co/spaces/yuhangzang/caprl)

We are excited to release the **CapRL 2.0 series**: **CapRL-Qwen3VL-2B** and **CapRL-Qwen3VL-4B**. These models feature fewer parameters while delivering even more powerful captioning performance.
We welcome everyone to try them out!
**This Space** is based on **CapRL-Qwen3VL-4B**. You can also try out **CapRL-Qwen2.5VL-3B** 🎨&nbsp;&nbsp;&nbsp;&nbsp;➡️&nbsp;&nbsp;&nbsp;&nbsp;[🌈CapRL-Qwen2.5VL-3B Space](https://huggingface.co/spaces/yuhangzang/caprl)
"""

# BibTeX entry displayed in the citation box at the bottom of the page.
citation_text = """@article{xing2025caprl,
title={{CapRL}: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
journal={arXiv preprint arXiv:2509.22647},
year={2025}
}"""

# Page layout: header, image input next to caption output, examples, citation.
with gr.Blocks(title="CapRL-Qwen3VL-4B Image Captioning") as demo:
    gr.Markdown(_HEADER_MARKDOWN)

    with gr.Row():
        with gr.Column():
            # Preload the default image that matches DEFAULT_CAPTION. The path is
            # passed directly so no PIL file handle is opened and left unclosed.
            image_input = gr.Image(type="pil", label="Input Image", value=DEFAULT_IMAGE)
            generate_button = gr.Button("Generate Caption")
        with gr.Column():
            # Show a default caption and its token count on load.
            caption_output = gr.Textbox(label="Caption", lines=6, value=DEFAULT_CAPTION)
            token_output = gr.Number(
                label="Generated Tokens", precision=0, value=DEFAULT_CAPTION_TOKENS
            )

    # Captioning runs both on button click and right after a new upload;
    # the two events share one set of arguments.
    _caption_event = {
        "fn": generate_caption,
        "inputs": image_input,
        "outputs": [caption_output, token_output],
        "show_progress": True,
    }
    generate_button.click(**_caption_event)
    image_input.upload(**_caption_event)

    gr.Examples(
        examples=[
            [DEFAULT_IMAGE],
            ["./examples/44687.jpeg"],
            ["./examples/natural.png"],
        ],
        inputs=image_input,
        outputs=[caption_output, token_output],
        fn=generate_caption,
        cache_examples=True,
        label="Example Images",
    )

    gr.Markdown("### Citation")
    gr.Markdown("If you find this project useful, please kindly cite:")
    gr.Code(value=citation_text, language="markdown", label="BibTeX Citation")
# Start the Gradio server for the interface built above.
demo.launch()