# Captured from a Hugging Face Space page (Space status at capture time: Sleeping).
| import gradio as gr | |
| from transformers import pipeline | |
| import PyPDF2 | |
| from docx import Document | |
| import re | |
# Load pipelines once at import time so every request reuses the same models.
# Zero-shot classifier used to decide whether a document is a contract.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# NER pipeline whose results are consumed through their `entity_group` key.
# `aggregation_strategy="simple"` is the supported spelling of the deprecated
# `grouped_entities=True` flag and selects the same grouping behaviour.
ner = pipeline(
    "ner",
    model="Jean-Baptiste/roberta-large-ner-english",
    aggregation_strategy="simple",
)
# File reading
def read_file(file_obj):
    """Extract plain text from a .txt, .pdf or .docx document.

    Args:
        file_obj: Either a filesystem path given as ``str`` or a file-like
            object that exposes a ``name`` attribute (the form the original
            code expected). NOTE(review): newer Gradio releases are
            documented to hand over a path string by default - confirm
            against the installed version.

    Returns:
        The extracted text, or the literal string
        ``"Unsupported file format"`` when the extension is not one of
        ``.txt``, ``.pdf`` or ``.docx``.
    """
    is_path = isinstance(file_obj, str)
    name = file_obj if is_path else file_obj.name
    # Compare case-insensitively so "REPORT.PDF" is treated like "report.pdf".
    lowered = str(name).lower()

    if lowered.endswith(".txt"):
        if is_path:
            with open(file_obj, "rb") as handle:
                raw = handle.read()
        else:
            raw = file_obj.read()
        # A handle opened in text mode already yields ``str``; only bytes
        # need decoding.
        if isinstance(raw, bytes):
            return raw.decode("utf-8", errors="ignore")
        return raw

    if lowered.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file_obj)
        # Extract each page exactly once and drop pages that yield no text.
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(chunk for chunk in page_texts if chunk)

    if lowered.endswith(".docx"):
        doc = Document(file_obj)
        return "\n".join(para.text for para in doc.paragraphs)

    return "Unsupported file format"
# Contract classification
def is_contract(text):
    """Ask the zero-shot classifier whether *text* looks like a contract.

    Only the first 1000 characters of *text* are sent to the classifier.

    Returns:
        A ``(flag, outcome)`` tuple. ``flag`` is True when the first entry
        of ``outcome['labels']`` is ``'contract'``; ``outcome`` is the raw
        classifier result.
    """
    candidate_labels = ["contract", "not a contract"]
    excerpt = text[:1000]
    outcome = classifier(excerpt, candidate_labels)
    top_label = outcome['labels'][0]
    return top_label == 'contract', outcome
# Rule-based + NER-based party extraction
def extract_parties_with_rules(text):
    """Collect candidate party names from *text*.

    Three sources are combined:

    1. "between X and Y" phrases (case-insensitive).
    2. Quoted names followed by a parenthesised "Party A" / "Party B" tag.
    3. ORG / PER entities found by the NER pipeline in the first 1000
       characters.

    Args:
        text: Document text to scan.

    Returns:
        A list of unique, whitespace-stripped, non-empty names in the order
        they were first found.
    """
    # A dict serves as an insertion-ordered set, so the result order is
    # stable from run to run (a plain set of strings is not).
    found = {}

    def _add(candidates):
        for candidate in candidates:
            cleaned = candidate.strip()
            if cleaned:
                found.setdefault(cleaned, None)

    # Rule-based: between X and Y. The trailing `$` alternative also catches
    # a clause that ends the text without punctuation.
    between_pairs = re.findall(
        r'between\s+(.*?)\s+and\s+(.*?)(?:[.,\n]|$)', text, re.IGNORECASE
    )
    for pair in between_pairs:
        _add(pair)

    # Rule-based: "X" (Party A), etc. Accept curly (U+201C / U+201D) and
    # straight double quotes; written as escapes so the pattern survives
    # re-encoding of this file.
    _add(re.findall(r'[\u201c"]([^\u201d"]+)[\u201d"]\s*\(.*?Party [AB]\)', text))

    # NER fallback; skipped for blank input, where there is nothing to tag.
    excerpt = text[:1000]
    if excerpt.strip():
        entities = ner(excerpt)
        _add(ent['word'] for ent in entities if ent['entity_group'] in ('ORG', 'PER'))

    return list(found)
# Main logic
def process_file(file):
    """Classify an uploaded document and list the parties it names.

    Args:
        file: The upload passed in by the interface, forwarded unchanged to
            ``read_file``; ``None`` when nothing was uploaded.

    Returns:
        A ``(status, parties)`` pair for the two output text boxes.
        ``parties`` is a comma-separated string for contracts, ``""`` for
        non-contracts, and ``None`` when the file could not be processed.
    """
    if file is None:
        return "No file uploaded.", None

    text = read_file(file)
    if not text or not text.strip():
        return "Empty or unreadable file.", None
    # read_file reports an unknown extension through this sentinel string;
    # surface it to the user instead of classifying the message itself.
    if text == "Unsupported file format":
        return text, None

    is_contract_flag, _ = is_contract(text)
    if not is_contract_flag:
        # U+274C cross mark (escaped so it survives re-encoding).
        return "\u274c This is NOT a contract.", ""

    parties = extract_parties_with_rules(text)
    # U+2705 check mark.
    return "\u2705 This is a contract.", ", ".join(parties)
# Gradio interface
# Components are built in the same order as before: file input first, then
# the two result text boxes.
_document_input = gr.File(
    file_types=[".txt", ".pdf", ".docx"],
    label="Upload a document",
)
_result_outputs = [
    gr.Textbox(label="Classification Result"),
    gr.Textbox(label="Detected Parties (ORG/PER or Rule-based)"),
]

iface = gr.Interface(
    fn=process_file,
    inputs=_document_input,
    outputs=_result_outputs,
    title="Contract Classifier with RoBERTa",
    description=(
        "Upload a document (.pdf, .txt, .docx) to detect if it's a contract "
        "and extract involved parties using RoBERTa + Rule-based matching."
    ),
)

# Start the web app as soon as this module is executed.
iface.launch()