Spaces:

yuhangzang
/

CapRL-Qwen3VL-4B

Running on Zero

yuhangzang

update

f6e3956 6 months ago

10.1 kB

	import gradio as gr
	import spaces
	import torch
	from PIL import Image
	from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

	# Hub repository id passed to from_pretrained() for both the model and its processor.
	MODEL_ID = "internlm/CapRL-Qwen3VL-4B"
	# Instruction text sent with every image in the chat message.
	DEFAULT_PROMPT = "Describe the image in detail."
	# Upper bound on newly generated tokens per generate() call.
	MAX_NEW_TOKENS = 1536

	# Default demo content
	# Image preloaded into the input component when the page opens; the same
	# path is also the first entry of the example gallery.
	DEFAULT_IMAGE = "./examples/1909.png"
	# Caption text shown in the output textbox on load, before any inference runs.
	DEFAULT_CAPTION = """Based on the provided bar chart from the Pew Research Center, here is a detailed description:

	Title: Older Republicans especially likely to see Trump as fighting for their beliefs
	Subtitle: Among Republicans and Republican leaners, % who say the phrase 'fights for what I believe in' describes Trump ...

	Source: Survey of U.S. adults conducted Feb. 4-15, 2020.

	Legend:
	- Very well: 51% (All Rep/Lean Rep)
	- Fairly well: 36% (All Rep/Lean Rep)
	- NET: Sum of "Very well" and "Fairly well"

	Data Summary:

	All Rep/Lean Rep:
	- Very well: 51%
	- Fairly well: 36%
	- NET: 87%

	By Age:
	- Ages 18-29: Very well 31%, Fairly well 45%, NET 76%
	- 30-49: Very well 41%, Fairly well 42%, NET 82%
	- 50-64: Very well 58%, Fairly well 33%, NET 92%
	- 65+: Very well 68%, Fairly well 26%, NET 94%

	By Education:
	- Postgrad: Very well 42%, Fairly well 38%, NET 80%
	- College grad: Very well 45%, Fairly well 40%, NET 85%
	- Some college: Very well 51%, Fairly well 36%, NET 87%
	- HS or less: Very well 56%, Fairly well 33%, NET 89%

	By Conservatism:
	- Conserv: Very well 63%, Fairly well 31%, NET 94%
	- Mod/Lib: Very well 32%, Fairly well 44%, NET 75%

	By Party Identification:
	- Republican: Very well 61%, Fairly well 32%, NET 93%
	- Lean Republican: Very well 36%, Fairly well 41%, NET 77%

	Analysis:

	- The overall percentage of Republicans and Republican leaners who say Trump "fights for what I believe in" is 87% (51% "very well" and 36% "fairly well").
	- The group most likely to say this is those aged 65 and older (94% NET), followed by those 50-64 (92% NET) and those with a high school diploma or less (89% NET).
	- The youngest group (18-29) is the least likely (76% NET).
	- Among education levels, those with a high school diploma or less are most likely (89% NET), followed by some college (87%) and college graduates (85%).
	- The most conservative group (Conservative) is the most likely to say this (94% NET), while the moderate/liberal group is the least likely (75% NET).
	- Among party identifiers, Republicans are most likely (93% NET), while lean Republicans are less likely (77% NET).
	- The 65+ age group is also the most conservative (94% NET) and the most likely to say Trump fights for their beliefs.
	- The 65+ group is also the most likely to say it "very well" (68%) and the least likely to say it "fairly well" (26%).
	- The most conservative group (63% "very well") is also the most likely overall (94% NET), while the most moderate/liberal group (32% "very well") is the least likely (75% NET).
	- The Republican party has a 93% NET, while the lean Republican group has a 77% NET.

	Conclusion:
	The chart shows that older Republicans (65+) are the most likely to see Trump as fighting for their beliefs, with 94% saying it "very well" or "fairly well." This is followed by those aged 50-64 (92%) and those with a high school diploma or less (89%). The most conservative Republicans are also the most likely (94%), while moderate/liberal Republicans are the least likely (75%). The youngest group (18-29) is the least likely (76%)."""
	# Value shown in the "Generated Tokens" field next to DEFAULT_CAPTION on load.
	# NOTE(review): hard-coded; presumably the token count of the run that
	# produced DEFAULT_CAPTION -- re-measure if the caption text is changed.
	DEFAULT_CAPTION_TOKENS = 993


	def get_device() -> str:
	    """Return ``"cuda"`` when torch reports an available CUDA device, else ``"cpu"``."""
	    if torch.cuda.is_available():
	        return "cuda"
	    return "cpu"


	def select_dtype(device: str):
	    """Pick the torch dtype used to load the model on ``device``.

	    Args:
	        device: Device name; only the exact string ``"cuda"`` selects a
	            half-precision dtype.

	    Returns:
	        ``torch.float32`` for any non-CUDA device. On CUDA,
	        ``torch.bfloat16`` when the GPU reports bf16 support and
	        ``torch.float16`` otherwise.
	    """
	    if device != "cuda":
	        return torch.float32
	    return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16


	def load_model():
	    """Load the captioning model and its processor from ``MODEL_ID``.

	    The dtype is chosen by ``select_dtype`` from whatever ``get_device``
	    reports at call time.

	    Returns:
	        A ``(model, processor)`` tuple.
	    """
	    model_kwargs = {
	        "torch_dtype": select_dtype(get_device()),
	        # Use device_map="auto" for proper GPU allocation with spaces.GPU decorator
	        "device_map": "auto",
	        # NOTE(review): the model class is imported from transformers directly;
	        # confirm that trust_remote_code is still required for this checkpoint.
	        "trust_remote_code": True,
	    }
	    model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_ID, **model_kwargs)
	    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
	    return model, processor


	# Load the model and processor once at import time; generate_caption()
	# reads these module-level globals on every request.
	MODEL, PROCESSOR = load_model()


	@spaces.GPU
	@torch.inference_mode()
	def generate_caption(image: Image.Image):
	    """Caption ``image`` with the module-level ``MODEL`` and ``PROCESSOR``.

	    Args:
	        image: PIL image to describe, or ``None`` when no image is set.

	    Returns:
	        A ``(caption, num_generated_tokens)`` tuple. ``("", 0)`` is returned
	        for ``None``. Failures never raise: they are reported as
	        ``(error_message, 0)`` so the message appears in the caption box.
	    """
	    if image is None:
	        return "", 0
	    if not isinstance(image, Image.Image):
	        return "Error: Invalid image format", 0

	    try:
	        # Downscale oversized inputs to prevent OOM.
	        max_size = 4096
	        if image.width > max_size or image.height > max_size:
	            # thumbnail() resizes in place, so operate on a copy to leave the
	            # caller's image object untouched.
	            image = image.copy()
	            image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image", "image": image},
	                    {"type": "text", "text": DEFAULT_PROMPT},
	                ],
	            }
	        ]

	        # Preparation for inference using Qwen3-VL style
	        inputs = PROCESSOR.apply_chat_template(
	            messages,
	            tokenize=True,
	            add_generation_prompt=True,
	            return_dict=True,
	            return_tensors="pt",
	        ).to(MODEL.device)

	        generated_ids = MODEL.generate(
	            **inputs,
	            max_new_tokens=MAX_NEW_TOKENS,
	            do_sample=False,
	        )

	        # generate() returns prompt + continuation; drop the prompt tokens so
	        # both the decoded text and the token count cover only the new tokens.
	        prompt_length = inputs.input_ids.shape[-1]
	        new_token_ids = generated_ids[:, prompt_length:]

	        decoded = PROCESSOR.batch_decode(
	            new_token_ids,
	            skip_special_tokens=True,
	            clean_up_tokenization_spaces=False,
	        )
	        return decoded[0].strip(), int(new_token_ids.shape[-1])

	    except torch.cuda.OutOfMemoryError:
	        # Release cached blocks so the next request starts from a clean state.
	        torch.cuda.empty_cache()
	        return "Error: Out of GPU memory. Please try with a smaller image.", 0
	    except Exception as e:
	        # UI boundary: surface the failure in the caption box instead of raising.
	        return f"Error generating caption: {str(e)}", 0


	# Build the Gradio UI: header/links, image input + caption output, example
	# gallery, and citation. `demo` is the Blocks app launched at the end of the file.
	with gr.Blocks(title="CapRL-Qwen3VL-4B Image Captioning") as demo:
	    # Raw string: the Markdown below contains "\|" (a literal pipe inside a
	    # table cell), which is not a valid Python escape sequence in a normal
	    # string. Structural table pipes are left unescaped so the block parses
	    # as a table; only the pipes *inside* the second column are escaped.
	    gr.Markdown(
	        r"""
	# CapRL
	📖<a href="https://arxiv.org/abs/2509.22647">Paper</a> \| 🏠<a href="https://github.com/InternLM/CapRL">Github</a> \| 🤗<a href="https://huggingface.co/collections/long-xing1/caprl-68d64ac32ded31596c36e189">CapRL Collection</a> \| 🤗<a href="https://huggingface.co/papers/2509.22647">Daily Paper</a>

	### CapRL Series Model & Dataset
	| Series | Models & Resources |
	| :--- | :--- |
	| CapRL 2.0 Series | [🤗 CapRL-Qwen3VL-2B](https://huggingface.co/internlm/CapRL-Qwen3VL-2B) \| [🤗 CapRL-Qwen3VL-4B](https://huggingface.co/internlm/CapRL-Qwen3VL-4B) \| [📦 CapRL-Qwen3VL-2B-GGUF](https://huggingface.co/internlm/CapRL-Qwen3VL-2B-GGUF) \| [📦 CapRL-Qwen3VL-4B-GGUF](https://huggingface.co/internlm/CapRL-Qwen3VL-4B-GGUF) \| [🌈CapRL-Qwen3VL-4B Space](https://huggingface.co/spaces/yuhangzang/CapRL-Qwen3VL-4B)
	| CapRL 1.0 Series | [🤗 CapRL-Qwen2.5VL-3B](https://huggingface.co/internlm/CapRL-3B) \| [🤗 CapRL-InternVL3.5-8B](https://huggingface.co/yuhangzang/CapRL-InternVL3.5-8B) \| [📊 CapRL-2M Dataset](https://huggingface.co/datasets/internlm/CapRL-2M) \| [📦 CapRL-3B-GGUF](https://huggingface.co/mradermacher/CapRL-3B-GGUF) \| [📦 CapRL-3B-i1-GGUF](https://huggingface.co/mradermacher/CapRL-3B-i1-GGUF) \| [🌈CapRL-Qwen2.5VL-3B Space](https://huggingface.co/spaces/yuhangzang/caprl)


	We are excited to release the CapRL 2.0 series: CapRL-Qwen3VL-2B and CapRL-Qwen3VL-4B. These models feature fewer parameters while delivering even more powerful captioning performance.
	We welcome everyone to try them out!

	This Space is based on CapRL-Qwen3VL-4B. You can also try out CapRL-Qwen2.5VL-3B 🎨    ➡️    [🌈CapRL-Qwen2.5VL-3B Space](https://huggingface.co/spaces/yuhangzang/caprl)
	"""
	    )

	    with gr.Row():
	        with gr.Column():
	            # Preload a default image to match the provided caption. The path
	            # is passed directly so no PIL file handle is opened and left
	            # dangling at build time.
	            image_input = gr.Image(type="pil", label="Input Image", value=DEFAULT_IMAGE)
	            generate_button = gr.Button("Generate Caption")
	        with gr.Column():
	            # Show a default caption and its token count on load
	            caption_output = gr.Textbox(label="Caption", lines=6, value=DEFAULT_CAPTION)
	            token_output = gr.Number(label="Generated Tokens", precision=0, value=DEFAULT_CAPTION_TOKENS)

	    # Every trigger below writes to the same pair of output components.
	    result_components = [caption_output, token_output]

	    # Caption on explicit button press ...
	    generate_button.click(
	        fn=generate_caption,
	        inputs=image_input,
	        outputs=result_components,
	        show_progress=True,
	    )

	    # ... and automatically as soon as a new image is uploaded.
	    image_input.upload(
	        fn=generate_caption,
	        inputs=image_input,
	        outputs=result_components,
	        show_progress=True,
	    )

	    gr.Examples(
	        examples=[
	            [DEFAULT_IMAGE],
	            ["./examples/44687.jpeg"],
	            ["./examples/natural.png"],
	        ],
	        inputs=image_input,
	        outputs=result_components,
	        fn=generate_caption,
	        cache_examples=True,
	        label="Example Images",
	    )

	    gr.Markdown("### Citation")
	    gr.Markdown("If you find this project useful, please kindly cite:")

	    citation_text = """@article{xing2025caprl,
	title={{CapRL}: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
	author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
	journal={arXiv preprint arXiv:2509.22647},
	year={2025}
	}"""

	    gr.Code(value=citation_text, language="markdown", label="BibTeX Citation")


	# Start serving the Blocks app; runs whenever this module is executed.
	demo.launch()