"""
API Endpoints

All endpoints return JSON strings for consistency.
Can be used by the Gradio UI or by external apps.
"""

import json
from typing import Optional

from core.models import get_model_service
from core.sessions import get_session_service
from core.state import get_state
from core.logger import logger
from core.config import VERSION


def _json_response(success: bool, data: Optional[dict] = None, error: Optional[str] = None) -> str:
    """Standard JSON response format"""
    response = {"success": success}
    if data:
        response.update(data)
    if error:
        response["error"] = error
    return json.dumps(response, indent=2)
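
# Illustrative envelope shapes (serialized with indent=2 at runtime,
# shown compact here):
#   _json_response(True, {"x": 1})       -> {"success": true, "x": 1}
#   _json_response(False, error="boom")  -> {"success": false, "error": "boom"}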


def api_list_models() -> str:
    """List all installed models"""
    service = get_model_service()
    models = service.get_installed_models()
    loaded_id = get_state().get_loaded_model_id()

    return _json_response(True, {
        "models": models,
        "count": len(models),
        "loaded_model_id": loaded_id
    })


def api_get_model(model_id: str) -> str:
    """Get specific model details"""
    model = get_state().get_model_by_id(model_id)
    if model:
        return _json_response(True, {"model": model})
    return _json_response(False, error="Model not found")


def api_load_model(model_id: str) -> str:
    """Load a model"""
    service = get_model_service()
    # The service builds its own response dict, so serialize it directly.
    result = service.load_model(model_id)
    return json.dumps(result)


def api_unload_model() -> str:
    """Unload current model"""
    service = get_model_service()
    service.unload_model()
    return _json_response(True, {"message": "Model unloaded"})


def api_delete_model(model_id: str) -> str:
    """Delete an installed model"""
    service = get_model_service()
    result = service.delete_model(model_id)
    return json.dumps(result)


def api_search_models(query: str = "", max_params: float = 7.0, limit: int = 20) -> str:
    """Search HuggingFace for models"""
    service = get_model_service()
    results, status = service.search_hf_models(query, max_params, limit)
    return _json_response(True, {
        "results": results,
        "count": len(results),
        "status": status
    })


def api_get_model_files(repo_id: str) -> str:
    """Get available files for a HF model"""
    service = get_model_service()
    files = service.get_hf_model_files(repo_id)
    return _json_response(True, {
        "repo_id": repo_id,
        "files": files,
        "count": len(files)
    })


def api_download_model(repo_id: str, filename: str) -> str:
    """Download a model from HuggingFace"""
    service = get_model_service()
    result = service.download_model(repo_id, filename)
    return json.dumps(result)
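
# Illustrative search-to-download flow (repo id and file handling are
# hypothetical; the exact shape of "files" entries depends on the service):
#
#   hits = json.loads(api_search_models("llama", max_params=8.0))
#   files = json.loads(api_get_model_files("TheBloke/some-model-GGUF"))
#   if files["success"] and files["files"]:
#       api_download_model("TheBloke/some-model-GGUF", files["files"][0])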


def api_list_sessions() -> str:
    """List all sessions"""
    service = get_session_service()
    sessions = service.get_all_sessions()
    return _json_response(True, {
        "sessions": sessions,
        "count": len(sessions),
        "active_session_id": get_state().get_active_session_id()
    })


def api_get_session(session_id: str) -> str:
    """Get session with messages"""
    service = get_session_service()
    session = service.get_session(session_id)
    if session:
        return _json_response(True, {"session": session})
    return _json_response(False, error="Session not found")


def api_create_session(
    title: str = "",
    session_type: str = "chat",
    system_prompt: str = ""
) -> str:
    """Create new session"""
    service = get_session_service()
    session = service.create_session(title, session_type, system_prompt)
    return _json_response(True, {
        "session_id": session["id"],
        "title": session["title"]
    })
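
# Illustrative end-to-end usage (assumes a model is already loaded; see
# api_chat below):
#
#   created = json.loads(api_create_session(title="Demo", system_prompt="Be terse."))
#   reply = json.loads(api_chat(created["session_id"], "Hello!"))
#   if reply["success"]:
#       print(reply["response"])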


def api_delete_session(session_id: str) -> str:
    """Delete a session"""
    service = get_session_service()
    result = service.delete_session(session_id)
    if result:
        return _json_response(True, {"message": "Deleted"})
    return _json_response(False, error="Session not found")


def api_rename_session(session_id: str, new_title: str) -> str:
    """Rename a session"""
    service = get_session_service()
    result = service.rename_session(session_id, new_title)
    if result:
        return _json_response(True, {"title": new_title})
    return _json_response(False, error="Session not found")


def api_clear_session(session_id: str) -> str:
    """Clear session messages"""
    service = get_session_service()
    result = service.clear_session(session_id)
    if result:
        return _json_response(True, {"message": "Cleared"})
    return _json_response(False, error="Session not found")


def api_chat(
    session_id: str,
    message: str,
    max_tokens: int = 512,
    temperature: float = 0.7
) -> str:
    """Send chat message and get response"""
    model_service = get_model_service()
    session_service = get_session_service()

    if not model_service.is_model_loaded():
        return _json_response(False, error="No model loaded")

    session = session_service.get_session(session_id)
    if not session:
        return _json_response(False, error="Session not found")

    session_service.add_message(session_id, "user", message)

    # Rebuild the full conversation for the model, including the user
    # message that was just stored.
    messages = []
    if session.get("system_prompt"):
        messages.append({"role": "system", "content": session["system_prompt"]})

    for msg in session_service.get_messages(session_id):
        messages.append({"role": msg["role"], "content": msg["content"]})

    response = model_service.generate(messages, max_tokens, temperature)

    session_service.add_message(session_id, "assistant", response)

    return _json_response(True, {
        "response": response,
        "session_id": session_id
    })


def api_inference(
    prompt: str = "",
    messages: str = "[]",
    system_prompt: str = "",
    max_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.9,
    context: str = "",
    lora_scale: float = 1.0
) -> str:
    """
    Universal inference endpoint.
    Can use a direct prompt or a JSON-encoded message list.
    Supports system prompt, context injection, LoRA scaling (future).
    """
    model_service = get_model_service()

    if not model_service.is_model_loaded():
        return _json_response(False, error="No model loaded")

    try:
        # `messages` arrives as a JSON string so the endpoint stays
        # string-in/string-out for external callers.
        msg_list = json.loads(messages) if messages and messages != "[]" else []

        full_messages = []

        if system_prompt:
            full_messages.append({"role": "system", "content": system_prompt})

        if context:
            full_messages.append({"role": "system", "content": f"Context:\n{context}"})

        full_messages.extend(msg_list)

        # A direct prompt becomes the final user turn.
        if prompt:
            full_messages.append({"role": "user", "content": prompt})

        if not full_messages:
            return _json_response(False, error="No prompt or messages provided")

        response = model_service.generate(full_messages, max_tokens, temperature, top_p)

        loaded_model = model_service.get_loaded_model()

        return _json_response(True, {
            "response": response,
            "model_id": loaded_model["id"] if loaded_model else None,
            "config": {
                "max_tokens": max_tokens,
                "temperature": temperature,
                "top_p": top_p,
                "lora_scale": lora_scale  # echoed back; not yet applied
            }
        })

    except Exception as e:
        logger.error("API", f"Inference error: {e}")
        return _json_response(False, error=str(e))
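
# Illustrative call showing the JSON-string `messages` contract (assumes a
# model is loaded):
#
#   result = json.loads(api_inference(
#       messages=json.dumps([{"role": "user", "content": "What is 2 + 2?"}]),
#       system_prompt="Answer with a single number.",
#       max_tokens=16,
#       temperature=0.0
#   ))
#   print(result.get("response"))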


def api_chat_with_config(
    session_id: str,
    message: str,
    max_tokens: int = 512,
    temperature: float = 0.7,
    top_p: float = 0.9,
    system_prompt_override: str = "",
    context: str = "",
    lora_scale: float = 1.0
) -> str:
    """
    Chat with full configuration options.
    Supports: custom inference params, system prompt override, context injection.
    """
    model_service = get_model_service()
    session_service = get_session_service()

    if not model_service.is_model_loaded():
        return _json_response(False, error="No model loaded")

    session = session_service.get_session(session_id)
    if not session:
        return _json_response(False, error="Session not found")

    session_service.add_message(session_id, "user", message)

    messages = []

    # An explicit override takes precedence over the session's own prompt.
    sys_prompt = system_prompt_override or session.get("system_prompt", "")
    if sys_prompt:
        messages.append({"role": "system", "content": sys_prompt})

    if context:
        messages.append({"role": "system", "content": f"Context:\n{context}"})

    for msg in session_service.get_messages(session_id):
        messages.append({"role": msg["role"], "content": msg["content"]})

    response = model_service.generate(messages, max_tokens, temperature, top_p)

    session_service.add_message(session_id, "assistant", response)

    return _json_response(True, {
        "response": response,
        "session_id": session_id,
        "config_used": {
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "lora_scale": lora_scale,
            "context_provided": bool(context)
        }
    })
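
# Illustrative call with context injection and a prompt override (the
# session id and context text are hypothetical):
#
#   result = json.loads(api_chat_with_config(
#       session_id="abc123",
#       message="Summarize the notes.",
#       system_prompt_override="You are a terse assistant.",
#       context="Meeting notes: shipped v1.2; bug #42 still open.",
#       temperature=0.3
#   ))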


def api_get_status() -> str:
    """Get current system status"""
    model_service = get_model_service()
    state = get_state()

    loaded = model_service.get_loaded_model()

    return _json_response(True, {
        "version": VERSION,
        "model_loaded": model_service.is_model_loaded(),
        "loaded_model": loaded["name"] if loaded else None,
        "installed_models_count": len(state.get_installed_models()),
        "sessions_count": len(state.get_sessions()),
        "active_session_id": state.get_active_session_id()
    })


def api_get_backends() -> str:
    """Check which inference backends are available"""
    # Imported locally so that probing for optional backends happens only
    # when this endpoint is called.
    from core.models import _get_llama_cpp, _get_transformers

    return _json_response(True, {
        "backends": {
            "gguf": _get_llama_cpp() is not None,
            "transformers": _get_transformers() is not None
        }
    })


def api_health() -> str:
    """Health check endpoint"""
    return _json_response(True, {
        "status": "healthy",
        "version": VERSION
    })
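

# Minimal smoke check, runnable directly. This assumes the core services can
# initialize outside the Gradio app; if they require app context, invoke the
# endpoints from within the application instead.
if __name__ == "__main__":
    print(api_health())
    print(api_get_status())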