Upload app.py

app.py CHANGED
@@ -1,6 +1,6 @@
 import torch
 import torch.nn.functional as F
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from huggingface_hub import hf_hub_download
 import gradio as gr
 import requests
@@ -20,8 +20,8 @@ from rag_engine import RAGEngine
 from llm_client import LLMClient

 # --------- Config ----------
-REPO_ID = "dungeon29/deberta-lstm
-CKPT_NAME = "
+REPO_ID = "dungeon29/phishing-deberta-lstm"  # HF repo that holds the checkpoint
+CKPT_NAME = "deberta_lstm_checkpoint.pt"  # the .pt file name
 MODEL_NAME = "microsoft/deberta-base"  # base tokenizer/backbone
 LABELS = ["benign", "phishing"]  # adjust to your classes
@@ -33,27 +33,48 @@ LABELS = ["benign", "phishing"]  # adjust to your classes
 device = "cuda" if torch.cuda.is_available() else "cpu"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

-
+# Check if checkpoint exists locally, otherwise download from HF
+if os.path.exists(CKPT_NAME):
+    print(f"📂 Found local checkpoint: {CKPT_NAME}")
+    ckpt_path = CKPT_NAME
+else:
+    print(f"⬇️ Downloading checkpoint {CKPT_NAME} from HF Hub...")
+    try:
+        ckpt_path = hf_hub_download(repo_id=REPO_ID, filename=CKPT_NAME)
+    except Exception as e:
+        print(f"⚠️ Could not download from HF: {e}")
+        # Fallback to pytorch_model.bin if the new name fails (optional, but good for safety)
+        print("🔄 Trying fallback to pytorch_model.bin...")
+        ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="pytorch_model.bin")
+
 checkpoint = torch.load(ckpt_path, map_location=device)

 # If you saved hyperparams in the checkpoint, use them:
-
+if isinstance(checkpoint, dict):
+    model_args = checkpoint.get("model_args", {})  # e.g., {"lstm_hidden": 256, "num_labels": 2, ...}
+else:
+    model_args = {}
+model = DeBERTaLSTMClassifier(**model_args)

 # Load weights
 try:
-
-
-
-
-    state_dict =
-
-
-
-
-
-
+    state_dict = torch.load(ckpt_path, map_location=device)
+
+    # Handle the case where the file was saved as a full checkpoint (with a "model_state_dict" key)
+    if "model_state_dict" in state_dict:
+        state_dict = state_dict["model_state_dict"]
+
+    model.load_state_dict(state_dict, strict=False)
+
+    # Check the attention layer
+    if hasattr(model, 'attention') and 'attention.weight' not in state_dict:
+        print("⚠️ Loaded model without attention layer, using newly initialized attention weights")
     else:
-
+        print("✅ Loaded weights successfully!")
+
+except Exception as e:
+    print(f"❌ Error when loading weights: {e}")
+    raise e

 model.to(device).eval()
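The branches above cover two on-disk layouts: a bare state dict, or a full checkpoint dict carrying "model_args" and "model_state_dict". For reference, a minimal sketch of a training-side save that would satisfy both lookups (the hyperparameter values are assumptions, and nn.Linear stands in for the real model):

import torch
import torch.nn as nn

model = nn.Linear(4, 2)  # stand-in for the trained DeBERTaLSTMClassifier

# Hypothetical save format matching the loader: "model_args" feeds
# DeBERTaLSTMClassifier(**model_args), "model_state_dict" holds the weights.
torch.save(
    {
        "model_args": {"lstm_hidden": 256, "num_labels": 2},  # assumed values
        "model_state_dict": model.state_dict(),
    },
    "deberta_lstm_checkpoint.pt",
)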
@@ -360,7 +381,7 @@ def rag_predict_fn(text: str):

     if fetched_content:
         # Limit content length to avoid token overflow
-        truncated_content = fetched_content[:
+        truncated_content = fetched_content[:1500]
         analysis_context = f"URL: {input_text}\n\nWebsite Content:\n{truncated_content}\n..."
         print(f"✅ Successfully fetched {len(fetched_content)} chars from URL.")
     else:
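Note that fetched_content[:1500] caps characters, not tokens, so the resulting token count still varies with the text. If a hard token budget is the goal, truncating through the tokenizer is the more direct control; a sketch assuming the DeBERTa tokenizer already loaded in app.py (the 512 cap is an assumption based on deberta-base's usual sequence limit):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

def truncate_to_tokens(text: str, max_tokens: int = 512) -> str:
    # Encode under a token budget, then decode back to plain text.
    ids = tokenizer(text, truncation=True, max_length=max_tokens)["input_ids"]
    return tokenizer.decode(ids, skip_special_tokens=True)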
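For context, DeBERTaLSTMClassifier is defined elsewhere in the Space and is not part of this commit. A minimal sketch of the shape such a class could take, inferred only from the names this diff touches (lstm_hidden, num_labels, and an attention attribute are all assumptions):

import torch
import torch.nn as nn
from transformers import AutoModel

class DeBERTaLSTMClassifier(nn.Module):
    # Hypothetical skeleton: DeBERTa backbone -> BiLSTM -> attention pooling -> linear head.
    def __init__(self, lstm_hidden=256, num_labels=2, model_name="microsoft/deberta-base"):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        hidden = self.backbone.config.hidden_size
        self.lstm = nn.LSTM(hidden, lstm_hidden, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(2 * lstm_hidden, 1)  # the 'attention' layer the loader checks for
        self.classifier = nn.Linear(2 * lstm_hidden, num_labels)

    def forward(self, input_ids, attention_mask=None):
        hidden_states = self.backbone(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        seq, _ = self.lstm(hidden_states)                # (B, T, 2 * lstm_hidden)
        weights = torch.softmax(self.attention(seq), dim=1)
        pooled = (weights * seq).sum(dim=1)              # attention-weighted pooling over T
        return self.classifier(pooled)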