ButterM40 committed · Commit b9ed0c9 · 1 Parent(s): a828cd4

Optimize build: lazy model loading + CPU torch wheel

Files changed (3):
  1. Dockerfile +5 -2
  2. requirements.txt +1 -2
  3. server.py +47 -22
Dockerfile CHANGED
@@ -32,9 +32,12 @@ RUN apt-get update && \
 # Copy requirements first for better caching
 COPY requirements.txt .
 
-# Upgrade pip and install dependencies preferring binary wheels
+# Upgrade pip and install torch CPU wheel first (faster than compiling)
 RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
-    pip install --no-cache-dir --prefer-binary -r requirements.txt || \
+    pip install --no-cache-dir torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cpu
+
+# Install remaining dependencies preferring binary wheels
+RUN pip install --no-cache-dir --prefer-binary -r requirements.txt || \
     (echo "Initial pip install failed, retrying without --prefer-binary" && pip install --no-cache-dir -r requirements.txt)
 
 # Copy the rest of the application
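
Worth noting: wheels from the CPU index carry a "+cpu" local version suffix, so the build can assert it got the right one. A minimal sanity check (a suggested addition, not part of this commit) that could run as a RUN step or inside the built container:

# sanity_check.py - hypothetical helper, not included in this commit
import torch

# The CPU-only wheel from download.pytorch.org/whl/cpu reports a "+cpu"
# local version, e.g. "2.5.1+cpu".
assert torch.__version__ == "2.5.1+cpu", torch.__version__
# A CPU-only build has no CUDA support compiled in.
assert not torch.cuda.is_available()
print(f"torch {torch.__version__}: CPU-only build confirmed")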
requirements.txt CHANGED
@@ -3,9 +3,8 @@ fastapi==0.115.5
 uvicorn[standard]==0.32.1
 pydantic==2.10.2
 
-# Transformers and ML
+# Transformers and ML (torch installed separately in Dockerfile)
 transformers==4.46.3
-torch==2.5.1
 accelerate>=0.26.0
 
 # Tokenizers
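
One caveat with dropping the torch pin here: if a later install or a conflicting transitive pin resolves torch differently, pip could replace the CPU wheel with the default CUDA build. Listing which installed packages declare a torch dependency makes that easy to audit; a sketch (hypothetical helper, not part of this commit):

# audit_torch_deps.py - hypothetical helper, not part of this commit
from importlib.metadata import distributions

for dist in distributions():
    for req in dist.requires or []:
        # Requirement strings look like "torch>=1.10"; this prefix match
        # also catches torchvision/torchaudio, which is useful to see too.
        if req.startswith("torch"):
            print(f"{dist.metadata['Name']} requires: {req}")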
server.py CHANGED
@@ -47,32 +47,45 @@ def background_health_monitor():
 threading.Thread(target=background_health_monitor, daemon=True).start()
 
 # =====================================================
-# Load Models
+# Model Loading (Lazy Initialization)
 # =====================================================
-print("Loading models...")
-
-# Chat Model
 chat_model_name = "Qwen/Qwen1.5-0.5B-Chat"
-chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
-chat_model = AutoModelForCausalLM.from_pretrained(
-    chat_model_name,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    low_cpu_mem_usage=True,
-    offload_folder="offload",
-).eval()
+chat_tokenizer = None
+chat_model = None
+summary_pipe = None
+vision_model = None
+vision_processor = None
 
+def load_chat_model():
+    global chat_tokenizer, chat_model
+    if chat_tokenizer is None or chat_model is None:
+        print("Loading chat model...")
+        chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
+        chat_model = AutoModelForCausalLM.from_pretrained(
+            chat_model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            low_cpu_mem_usage=True,
+            offload_folder="offload",
+        ).eval()
+
-# Summarization Model
-summary_pipe = pipeline(
-    "summarization",
-    model="sshleifer/distilbart-cnn-6-6",
-    device=0 if torch.cuda.is_available() else -1
-)
+def load_summary_model():
+    global summary_pipe
+    if summary_pipe is None:
+        print("Loading summarization model...")
+        summary_pipe = pipeline(
+            "summarization",
+            model="sshleifer/distilbart-cnn-6-6",
+            device=0 if torch.cuda.is_available() else -1
+        )
 
-# Vision Model
-vision_model_name = "microsoft/git-base-coco"
-vision_model = AutoModelForVision2Seq.from_pretrained(vision_model_name).to("cuda" if torch.cuda.is_available() else "cpu")
-vision_processor = AutoProcessor.from_pretrained(vision_model_name)
+def load_vision_model():
+    global vision_model, vision_processor
+    if vision_model is None or vision_processor is None:
+        print("Loading vision model...")
+        vision_model_name = "microsoft/git-base-coco"
+        vision_model = AutoModelForVision2Seq.from_pretrained(vision_model_name).to("cuda" if torch.cuda.is_available() else "cpu")
+        vision_processor = AutoProcessor.from_pretrained(vision_model_name)
 
 # =====================================================
 # API Schemas
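
One caveat with the lazy loaders: FastAPI runs these plain `def` endpoints on a thread pool, so two first requests can both see `chat_model is None` and load the model twice. A double-checked-lock variant of `load_chat_model` (a sketch under that assumption, not part of this commit; `threading` is already imported for the health monitor) would close that window:

_chat_load_lock = threading.Lock()  # hypothetical guard; one per model family

def load_chat_model():
    global chat_tokenizer, chat_model
    if chat_tokenizer is None or chat_model is None:
        with _chat_load_lock:
            # Re-check inside the lock: another thread may have finished
            # the load while this one was waiting.
            if chat_tokenizer is None or chat_model is None:
                print("Loading chat model...")
                chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
                chat_model = AutoModelForCausalLM.from_pretrained(
                    chat_model_name,
                    torch_dtype=torch.bfloat16,
                    device_map="auto",
                    low_cpu_mem_usage=True,
                    offload_folder="offload",
                ).eval()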
 
@@ -97,6 +110,9 @@ class WordPredictionRequest(BaseModel):
 @app.post("/api/chat")
 def chat_generate(req: ChatRequest):
     try:
+        # Load models on first request
+        load_chat_model()
+
         # Build prompt and run generation while requesting per-step scores
         prompt = (
             "<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n"
 
@@ -181,6 +197,9 @@ def chat_generate(req: ChatRequest):
 @app.post("/predict_words")
 def predict_words(req: WordPredictionRequest):
     try:
+        # Load models on first request
+        load_chat_model()
+
         input_ids = chat_tokenizer.encode(req.word, return_tensors="pt")
         with torch.no_grad():
             outputs = chat_model(input_ids)
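
Exercising this endpoint also triggers the lazy chat-model load. The path and the `word` field follow from the route and `req.word` above; the base URL is an assumption:

import requests

# Base URL is hypothetical; adjust to wherever the server is running.
resp = requests.post(
    "http://localhost:7860/predict_words",
    json={"word": "The weather today is"},
)
print(resp.status_code, resp.json())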
 
@@ -204,6 +223,9 @@ def predict_words(req: WordPredictionRequest):
 @app.post("/api/summarize")
 def summarize_text(req: SummaryRequest):
     try:
+        # Load models on first request
+        load_summary_model()
+
         # Get word count
         word_count = len(req.text.split())
         # Adjust max_length to be ~30-50% of input length
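
The adjustment logic itself sits outside this hunk; one plausible reading of the ~30-50% comment (a hypothetical reconstruction, not code from this commit) is:

# Hypothetical reconstruction of the heuristic the comment describes.
word_count = len(req.text.split())            # e.g. 200 for a 200-word input
max_length = max(30, int(word_count * 0.5))   # upper bound: ~50% of the input
min_length = min(int(word_count * 0.3), max_length - 1)  # lower bound: ~30%
# For a 200-word input: max_length=100, min_length=60.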
 
@@ -228,6 +250,9 @@ def summarize_text(req: SummaryRequest):
 @app.post("/process_image")
 async def process_image(image: UploadFile = File(...)):
     try:
+        # Load models on first request
+        load_vision_model()
+
         contents = await image.read()
         img = Image.open(io.BytesIO(contents)).convert('RGB')
 
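
For completeness, a minimal client for this route (base URL assumed, as before); the multipart field name must be "image" so FastAPI can bind it to the `UploadFile` parameter:

import requests

with open("photo.jpg", "rb") as f:  # any local test image
    resp = requests.post(
        "http://localhost:7860/process_image",
        files={"image": ("photo.jpg", f, "image/jpeg")},
    )
print(resp.status_code, resp.json())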