Spaces:
Running on Zero
Running on Zero
| import gradio as gr | |
| import spaces | |
| import torch | |
| from PIL import Image | |
| from transformers import Qwen3VLForConditionalGeneration, AutoProcessor | |
# Hugging Face Hub repository id of the checkpoint loaded by load_model().
MODEL_ID: str = "internlm/CapRL-Qwen3VL-4B"
# Text instruction sent together with every image in generate_caption().
DEFAULT_PROMPT: str = "Describe the image in detail."
# Passed to MODEL.generate() as max_new_tokens, capping the caption length.
MAX_NEW_TOKENS: int = 1536

# Default demo content
# Path of the image preloaded into the input component when the UI is built.
DEFAULT_IMAGE: str = "./examples/1909.png"
# Caption shown in the output textbox on load, next to DEFAULT_IMAGE, so the
# page has content before any inference runs. Presumably a saved model output
# for that image -- keep it in sync with DEFAULT_CAPTION_TOKENS below.
DEFAULT_CAPTION: str = """Based on the provided bar chart from the Pew Research Center, here is a detailed description:
**Title:** Older Republicans especially likely to see Trump as fighting for their beliefs
**Subtitle:** Among Republicans and Republican leaners, % who say the phrase 'fights for what I believe in' describes Trump ...
**Source:** Survey of U.S. adults conducted Feb. 4-15, 2020.
**Legend:**
- **Very well:** 51% (All Rep/Lean Rep)
- **Fairly well:** 36% (All Rep/Lean Rep)
- **NET:** Sum of "Very well" and "Fairly well"
**Data Summary:**
**All Rep/Lean Rep:**
- Very well: 51%
- Fairly well: 36%
- NET: 87%
**By Age:**
- Ages 18-29: Very well 31%, Fairly well 45%, NET 76%
- 30-49: Very well 41%, Fairly well 42%, NET 82%
- 50-64: Very well 58%, Fairly well 33%, NET 92%
- 65+: Very well 68%, Fairly well 26%, NET 94%
**By Education:**
- Postgrad: Very well 42%, Fairly well 38%, NET 80%
- College grad: Very well 45%, Fairly well 40%, NET 85%
- Some college: Very well 51%, Fairly well 36%, NET 87%
- HS or less: Very well 56%, Fairly well 33%, NET 89%
**By Conservatism:**
- Conserv: Very well 63%, Fairly well 31%, NET 94%
- Mod/Lib: Very well 32%, Fairly well 44%, NET 75%
**By Party Identification:**
- Republican: Very well 61%, Fairly well 32%, NET 93%
- Lean Republican: Very well 36%, Fairly well 41%, NET 77%
**Analysis:**
- The overall percentage of Republicans and Republican leaners who say Trump "fights for what I believe in" is 87% (51% "very well" and 36% "fairly well").
- The group most likely to say this is those aged 65 and older (94% NET), followed by those 50-64 (92% NET) and those with a high school diploma or less (89% NET).
- The youngest group (18-29) is the least likely (76% NET).
- Among education levels, those with a high school diploma or less are most likely (89% NET), followed by some college (87%) and college graduates (85%).
- The most conservative group (Conservative) is the most likely to say this (94% NET), while the moderate/liberal group is the least likely (75% NET).
- Among party identifiers, Republicans are most likely (93% NET), while lean Republicans are less likely (77% NET).
- The 65+ age group is also the most conservative (94% NET) and the most likely to say Trump fights for their beliefs.
- The 65+ group is also the most likely to say it "very well" (68%) and the least likely to say it "fairly well" (26%).
- The most conservative group (63% "very well") is also the most likely overall (94% NET), while the most moderate/liberal group (32% "very well") is the least likely (75% NET).
- The Republican party has a 93% NET, while the lean Republican group has a 77% NET.
**Conclusion:**
The chart shows that older Republicans (65+) are the most likely to see Trump as fighting for their beliefs, with 94% saying it "very well" or "fairly well." This is followed by those aged 50-64 (92%) and those with a high school diploma or less (89%). The most conservative Republicans are also the most likely (94%), while moderate/liberal Republicans are the least likely (75%). The youngest group (18-29) is the least likely (76%)."""
# Value preloaded into the "Generated Tokens" field alongside DEFAULT_CAPTION.
DEFAULT_CAPTION_TOKENS: int = 993
def get_device() -> str:
    """Return the name of the torch device to run on.

    Returns:
        ``"cuda"`` when ``torch.cuda.is_available()`` is true, otherwise
        ``"cpu"``.
    """
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"
def select_dtype(device: str):
    """Pick the tensor dtype to load the model with on ``device``.

    Args:
        device: Device name. Only the exact value ``"cuda"`` selects a
            half-precision dtype.

    Returns:
        ``torch.bfloat16`` on CUDA when ``torch.cuda.is_bf16_supported()`` is
        true, ``torch.float16`` on CUDA otherwise, and ``torch.float32`` for
        any other device name.
    """
    # Anything that is not CUDA stays in full precision.
    if device != "cuda":
        return torch.float32
    return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
def load_model():
    """Load the captioning model and its processor from ``MODEL_ID``.

    The dtype is chosen by ``select_dtype`` for the device reported by
    ``get_device``.

    Returns:
        A ``(model, processor)`` tuple.
    """
    weights_dtype = select_dtype(get_device())

    # NOTE(review): trust_remote_code=True lets code shipped in the Hub repo
    # run locally; confirm it is still required for this checkpoint.
    hub_kwargs = {"trust_remote_code": True}

    # device_map="auto" is used for proper GPU allocation together with the
    # spaces.GPU decorator.
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=weights_dtype,
        device_map="auto",
        **hub_kwargs,
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID, **hub_kwargs)
    return model, processor
# Load the model and processor once at import time; generate_caption() reads
# these module-level globals on every call.
MODEL, PROCESSOR = load_model()
@spaces.GPU  # ZeroGPU only attaches a GPU to functions wrapped with this decorator.
def generate_caption(image: Image.Image):
    """Caption ``image`` with the loaded model using ``DEFAULT_PROMPT``.

    Errors are reported through the returned caption text rather than raised,
    so the UI always receives a ``(text, count)`` pair.

    Args:
        image: PIL image to describe, or ``None`` when no image is set.
            Images wider or taller than 4096 px are shrunk in place.

    Returns:
        A ``(caption, num_generated_tokens)`` tuple. ``("", 0)`` is returned
        for ``None`` input; on failure the caption holds an error message and
        the count is ``0``.
    """
    if image is None:
        return "", 0
    if not isinstance(image, Image.Image):
        return "Error: Invalid image format", 0

    try:
        # Shrink oversized images (aspect ratio preserved) to prevent OOM.
        max_size = 4096
        if image.width > max_size or image.height > max_size:
            image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": DEFAULT_PROMPT},
                ],
            }
        ]

        # Qwen3-VL style preparation: the chat template tokenizes the prompt
        # and preprocesses the image in a single call.
        inputs = PROCESSOR.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(MODEL.device)

        # Greedy decoding (do_sample=False) keeps captions deterministic.
        generated_ids = MODEL.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
        )

        # generate() returns prompt + continuation; drop the prompt so only
        # newly generated tokens are decoded and counted.
        prompt_length = inputs.input_ids.shape[-1]
        new_token_ids = generated_ids[:, prompt_length:]

        decoded = PROCESSOR.batch_decode(
            new_token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0].strip(), int(new_token_ids.shape[-1])
    except torch.cuda.OutOfMemoryError:
        # Release cached blocks so the next request has a chance to succeed.
        torch.cuda.empty_cache()
        return "Error: Out of GPU memory. Please try with a smaller image.", 0
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing.
        return f"Error generating caption: {e}", 0
# Build the Gradio UI: header/links, image input with a caption + token-count
# output, example images, and the citation block.
with gr.Blocks(title="CapRL-Qwen3VL-4B Image Captioning") as demo:
    # "\\|" keeps a literal "\|" in the Markdown (an escaped pipe inside a
    # table cell) without relying on an invalid Python escape sequence.
    gr.Markdown(
        """
        # CapRL
        ๐<a href="https://arxiv.org/abs/2509.22647">Paper</a> | ๐ <a href="https://github.com/InternLM/CapRL">Github</a> | ๐ค<a href="https://huggingface.co/collections/long-xing1/caprl-68d64ac32ded31596c36e189">CapRL Collection</a> | ๐ค<a href="https://huggingface.co/papers/2509.22647">Daily Paper</a>
        ### CapRL Series Model & Dataset
        | Series | Models & Resources |
        | :--- | :--- |
        | **CapRL 2.0 Series** | [๐ค CapRL-Qwen3VL-2B](https://huggingface.co/internlm/CapRL-Qwen3VL-2B) \\| [๐ค CapRL-Qwen3VL-4B](https://huggingface.co/internlm/CapRL-Qwen3VL-4B) \\| [๐ฆ CapRL-Qwen3VL-2B-GGUF](https://huggingface.co/internlm/CapRL-Qwen3VL-2B-GGUF) \\| [๐ฆ CapRL-Qwen3VL-4B-GGUF](https://huggingface.co/internlm/CapRL-Qwen3VL-4B-GGUF) \\| [๐CapRL-Qwen3VL-4B Space](https://huggingface.co/spaces/yuhangzang/CapRL-Qwen3VL-4B)
        | **CapRL 1.0 Series** | [๐ค CapRL-Qwen2.5VL-3B](https://huggingface.co/internlm/CapRL-3B) \\| [๐ค CapRL-InternVL3.5-8B](https://huggingface.co/yuhangzang/CapRL-InternVL3.5-8B) \\| [๐ CapRL-2M Dataset](https://huggingface.co/datasets/internlm/CapRL-2M) \\| [๐ฆ CapRL-3B-GGUF](https://huggingface.co/mradermacher/CapRL-3B-GGUF) \\| [๐ฆ CapRL-3B-i1-GGUF](https://huggingface.co/mradermacher/CapRL-3B-i1-GGUF) \\| [๐CapRL-Qwen2.5VL-3B Space](https://huggingface.co/spaces/yuhangzang/caprl)

        We are excited to release the **CapRL 2.0 series**: **CapRL-Qwen3VL-2B** and **CapRL-Qwen3VL-4B**. These models feature fewer parameters while delivering even more powerful captioning performance.
        We welcome everyone to try them out!
        **This Space** is based on **CapRL-Qwen3VL-4B**. You can also try out **CapRL-Qwen2.5VL-3B** ๐จ โก๏ธ [๐CapRL-Qwen2.5VL-3B Space](https://huggingface.co/spaces/yuhangzang/caprl)
        """
    )

    with gr.Row():
        with gr.Column():
            # Preload a default image to match the provided caption. The path
            # is handed to Gradio directly instead of Image.open(), which
            # would leave an open file handle behind at import time.
            image_input = gr.Image(type="pil", label="Input Image", value=DEFAULT_IMAGE)
            generate_button = gr.Button("Generate Caption")
        with gr.Column():
            # Show a default caption and its token count on load.
            caption_output = gr.Textbox(label="Caption", lines=6, value=DEFAULT_CAPTION)
            token_output = gr.Number(
                label="Generated Tokens",
                precision=0,
                value=DEFAULT_CAPTION_TOKENS,
            )

    # Captioning runs both on an explicit button press and right after upload.
    generate_button.click(
        fn=generate_caption,
        inputs=image_input,
        outputs=[caption_output, token_output],
        show_progress=True,
    )
    image_input.upload(
        fn=generate_caption,
        inputs=image_input,
        outputs=[caption_output, token_output],
        show_progress=True,
    )

    gr.Examples(
        examples=[
            ["./examples/1909.png"],
            ["./examples/44687.jpeg"],
            ["./examples/natural.png"],
        ],
        inputs=image_input,
        outputs=[caption_output, token_output],
        fn=generate_caption,
        cache_examples=True,
        label="Example Images",
    )

    gr.Markdown("### Citation")
    gr.Markdown("If you find this project useful, please kindly cite:")
    citation_text = """@article{xing2025caprl,
title={{CapRL}: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
journal={arXiv preprint arXiv:2509.22647},
year={2025}
}"""
    gr.Code(value=citation_text, language="markdown", label="BibTeX Citation")

demo.launch()