Update pipeline.py
pipeline.py  +268 -1
@@ -742,4 +742,271 @@ class UltraRobustCallAnalytics:
             with torch.no_grad():
                 logits = model(**inputs).logits
-                probs = torch.softmax(logits, dim=-1)[0].
+                probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
+
+            # Map labels to age buckets (aggregating across genders)
+            # Labels usually look like: 'female_20-29', 'male_20-29', etc.
+            labels = model.config.id2label
+            age_scores = defaultdict(float)
+
+            for i, score in enumerate(probs):
+                label = labels[i]
+                # Extract age part (assuming format gender_age)
+                parts = label.split('_')
+                if len(parts) > 1:
+                    age_group = parts[-1]  # e.g., "20-29"
+                    age_scores[age_group] += score
+
+            # Get best age bracket
+            if age_scores:
+                best_age = max(age_scores, key=age_scores.get)
+                return best_age
+
+            return "UNKNOWN"
+
+        except Exception as e:
+            print(f" ⚠ Age detection failed: {e}")
+            return "UNKNOWN"
+
+    def _run_enhanced_diarization(self, wav, sr, file_path):
+        """
+        Run Pyannote diarization, or fall back to simple segmentation
+        """
+        if self.diarization_pipeline is None:
+            print(" ⚠ No auth token provided, using energy-based fallback segmentation")
+            return self._energy_based_segmentation(wav, sr)
+
+        try:
+            # Run the diarization pipeline on the audio file
+            diarization = self.diarization_pipeline(file_path)
+
+            segments = []
+            for turn, _, speaker in diarization.itertracks(yield_label=True):
+                segments.append({
+                    "start": turn.start,
+                    "end": turn.end,
+                    "speaker": speaker
+                })
+            return segments
+
+        except Exception as e:
+            print(f" ⚠ Diarization error: {e}, using fallback")
+            return self._energy_based_segmentation(wav, sr)
+
+    def _energy_based_segmentation(self, wav, sr):
+        """Fallback if deep-learning diarization fails"""
+        # Simple energy detection to split speech from silence,
+        # treating the recording as a single speaker (SPEAKER_00)
+        intervals = librosa.effects.split(wav, top_db=30)
+        segments = []
+        for start, end in intervals:
+            segments.append({
+                "start": start / sr,
+                "end": end / sr,
+                "speaker": "SPEAKER_00"
+            })
+        return segments
+
+    def _merge_segments_smart(self, segments, min_gap=0.5):
+        """Merge segments from the same speaker that are close together"""
+        if not segments:
+            return []
+
+        merged = []
+        current = segments[0]
+
+        for next_seg in segments[1:]:
+            # Same speaker and the gap between turns is small
+            if (next_seg['speaker'] == current['speaker'] and
+                    (next_seg['start'] - current['end']) < min_gap):
+                # Extend the current segment
+                current['end'] = next_seg['end']
+            else:
+                merged.append(current)
+                current = next_seg
+
+        merged.append(current)
+        return merged
+
+    def _is_silence(self, chunk, threshold=0.005):
+        """Check if an audio chunk is essentially silence"""
+        return np.max(np.abs(chunk)) < threshold
+
+    def _detect_emotion(self, chunk):
+        """Detect emotion from an audio chunk"""
+        try:
+            # Ensure the chunk is long enough for the model (at least 0.5 s at 16 kHz)
+            if len(chunk) < 16000 * 0.5:
+                return "neutral"
+
+            # Use the classification pipeline loaded in __init__
+            # Note: the pipeline accepts a file path or a numpy array
+            preds = self.emotion_classifier(chunk, top_k=1)
+            return preds[0]['label']
+        except Exception:
+            return "neutral"
+
+    def _calculate_tone_advanced(self, chunk, sr, text):
+        """
+        Calculate pitch, jitter, and shimmer using Parselmouth (Praat)
+        """
+        try:
+            if len(chunk) < sr * 0.1:
+                return {"pitch_hz": 0, "jitter": 0, "shimmer": 0}
+
+            snd = parselmouth.Sound(chunk, sampling_frequency=sr)
+
+            # Pitch: mean over voiced frames only
+            pitch = snd.to_pitch()
+            pitch_val = pitch.selected_array['frequency']
+            pitch_val = pitch_val[pitch_val != 0]
+            avg_pitch = np.mean(pitch_val) if len(pitch_val) > 0 else 0
+
+            # Glottal pulses for jitter/shimmer (Praat commands via parselmouth's call())
+            point_process = call(snd, "To PointProcess (periodic, cc)", 75, 500)
+
+            try:
+                jitter = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
+            except Exception:
+                jitter = 0
+
+            try:
+                shimmer = call([snd, point_process], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
+            except Exception:
+                shimmer = 0
+
+            return {
+                "pitch_hz": round(float(avg_pitch), 1),
+                "jitter": round(float(jitter * 100), 2),    # percentage
+                "shimmer": round(float(shimmer * 100), 2)   # percentage (local shimmer, not dB)
+            }
+        except Exception:
+            return {"pitch_hz": 0, "jitter": 0, "shimmer": 0}
+
+    def _assign_roles_smart(self, results):
+        """
+        Assign AGENT vs CUSTOMER roles based on content analysis
+        """
+        speakers = set(r['speaker'] for r in results)
+        if len(speakers) == 1:
+            # Monologue - assume Agent recording
+            for r in results:
+                r['role'] = "AGENT"
+            return results
+
+        speaker_scores = defaultdict(int)
+
+        # Agent keywords
+        agent_keywords = [
+            "thank you for calling", "my name is", "how can i help",
+            "assist you", "recording", "company", "representative"
+        ]
+
+        # Customer keywords
+        customer_keywords = [
+            "issue", "problem", "not working", "bill", "complain",
+            "cancel", "help me", "fix"
+        ]
+
+        for res in results:
+            text = res['text'].lower()
+            spk = res['speaker']
+
+            # Scoring: agent phrases add points, customer phrases subtract
+            if any(k in text for k in agent_keywords):
+                speaker_scores[spk] += 2
+            if any(k in text for k in customer_keywords):
+                speaker_scores[spk] -= 2
+
+        # First speaker is often the agent (intro)
+        first_spk = results[0]['speaker']
+        speaker_scores[first_spk] += 1
+
+        # Identify the agent (highest score)
+        agent_spk = max(speaker_scores, key=speaker_scores.get)
+
+        # Assign roles
+        for res in results:
+            if res['speaker'] == agent_spk:
+                res['role'] = "AGENT"
+            else:
+                res['role'] = "CUSTOMER"
+
+        return results
+
+    def _analyze_customer_journey(self, results):
+        """Analyze sentiment flow of the customer"""
+        cust_segments = [r for r in results if r['role'] == "CUSTOMER"]
+
+        if not cust_segments:
+            return {"emotional_arc": "No customer audio", "impact_score": 0}
+
+        # Map emotions to scores
+        emo_map = {
+            "happy": 1.0, "joy": 1.0, "neutral": 0.1,
+            "sad": -0.5, "angry": -1.0, "frustrated": -1.0
+        }
+
+        # Average sentiment over the first and last three customer segments
+        start_score = sum(emo_map.get(s['emotion'], 0) for s in cust_segments[:3]) / min(3, len(cust_segments))
+        end_score = sum(emo_map.get(s['emotion'], 0) for s in cust_segments[-3:]) / min(3, len(cust_segments))
+
+        impact = end_score - start_score
+
+        if impact > 0.2:
+            arc = "Positive Resolution"
+        elif impact < -0.2:
+            arc = "Negative Escalation"
+        else:
+            arc = "Neutral/Unresolved"
+
+        return {
+            "emotional_arc": arc,
+            "start_sentiment": round(start_score, 2),
+            "end_sentiment": round(end_score, 2),
+            "impact_score": round(impact, 2)
+        }
+
+    def _analyze_agent_kpi(self, results, customer_impact):
+        """Calculate Agent performance metrics"""
+        agent_segments = [r for r in results if r['role'] == "AGENT"]
+
+        if not agent_segments:
+            return {"overall_score": 0}
+
+        # 1. Politeness (keyword based)
+        polite_words = ["please", "thank", "sorry", "apologize", "appreciate"]
+        total_words = sum(len(s['text'].split()) for s in agent_segments)
+        polite_count = sum(1 for s in agent_segments if any(w in s['text'].lower() for w in polite_words))
+
+        politeness_score = min(100, (polite_count / max(1, len(agent_segments))) * 200)
+
+        # 2. Tone consistency (variance of jitter across segments)
+        jitter_vals = [s['tone']['jitter'] for s in agent_segments]
+        tone_stability = 100 - min(100, np.std(jitter_vals) * 10) if jitter_vals else 50
+
+        # 3. Resolution impact (from the customer journey)
+        # Map the -1.0..1.0 impact range to 0..100
+        resolution_score = 50 + (customer_impact * 50)
+        resolution_score = max(0, min(100, resolution_score))
+
+        # Overall weighted score: 30% politeness, 20% tone stability, 50% resolution
+        overall = (
+            (politeness_score * 0.3) +
+            (tone_stability * 0.2) +
+            (resolution_score * 0.5)
+        )
+
+        return {
+            "overall_score": int(overall),
+            "politeness": int(politeness_score),
+            "tone_stability": int(tone_stability),
+            "resolution_effectiveness": int(resolution_score)
+        }
+
+    def _flush_memory(self):
+        """Aggressive memory cleanup"""
+        gc.collect()
+        if self.device == "cuda":
+            torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+    # Example usage
+    print("Initialize with: analyzer = UltraRobustCallAnalytics(hf_token='YOUR_TOKEN')")
+    print("Process with: result = analyzer.process_call('path/to/audio.wav')")
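For reference, the role-assignment, customer-journey, and KPI helpers in this hunk all read from a shared list of per-segment records that is built earlier in the file (outside this diff). The sketch below is a hypothetical illustration of one such record, inferred only from the keys those helpers access; the values are placeholders, not real output.

# Hypothetical shape of one entry in `results`, inferred from the fields read above.
segment = {
    "speaker": "SPEAKER_00",          # from diarization or the energy-based fallback
    "text": "thank you for calling",  # transcript text for this segment
    "emotion": "neutral",             # label returned by _detect_emotion
    "tone": {"pitch_hz": 182.4, "jitter": 1.2, "shimmer": 3.4},  # from _calculate_tone_advanced
}
# _assign_roles_smart later adds segment["role"] = "AGENT" or "CUSTOMER".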