# parsing.py
import re
import tldextract
from typing import List, Dict, Any, Set, Optional
from collections import Counter

# Sentiment keywords used by naive_sentiment()
POSITIVE_WORDS = {
    "best", "recommended", "affordable", "powerful", "reliable", "intuitive",
    "robust", "excellent", "great", "good", "top", "leading", "premium",
    "efficient", "fast", "easy", "simple", "comprehensive", "complete",
}
NEGATIVE_WORDS = {
    "expensive", "buggy", "limited", "poor", "bad", "slow", "complicated",
    "difficult", "unreliable", "outdated", "weak", "basic", "incomplete",
    "problematic", "frustrating", "confusing", "overpriced",
}


def extract_domains(raw_answer: str) -> List[Dict[str, Any]]:
    """Extract registered domains from a raw answer using regex and tldextract."""
    # Patterns for full URLs and for bare domain names
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
    domain_pattern = r'\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b'
    domains = set()

    # Extract registered domains from full URLs
    urls = re.findall(url_pattern, raw_answer)
    for url in urls:
        try:
            extracted = tldextract.extract(url)
            if extracted.domain and extracted.suffix:
                domains.add(f"{extracted.domain}.{extracted.suffix}")
        except Exception:
            continue

    # Extract standalone domains written without a scheme
    standalone_domains = re.findall(domain_pattern, raw_answer)
    for domain in standalone_domains:
        try:
            extracted = tldextract.extract(domain)
            if extracted.domain and extracted.suffix:
                domains.add(f"{extracted.domain}.{extracted.suffix}")
        except Exception:
            continue

    # Count occurrences of each registered domain in the answer text
    domain_counts = Counter()
    for domain in domains:
        domain_counts[domain] = len(re.findall(re.escape(domain), raw_answer, re.IGNORECASE))

    return [{"domain": domain, "count": count} for domain, count in domain_counts.most_common()]


def extract_mentions(raw_answer: str, brand_set: Set[str]) -> List[Dict[str, Any]]:
    """Extract brand mentions with occurrence count and index of the first mention."""
    mentions = []
    for brand in brand_set:
        # Case-insensitive search for the literal brand name
        pattern = re.compile(re.escape(brand), re.IGNORECASE)
        matches = list(pattern.finditer(raw_answer))
        if matches:
            mentions.append({
                "brand": brand,
                "count": len(matches),
                "first_index": matches[0].start(),
            })

    # Sort by first_index (earliest mention first)
    mentions.sort(key=lambda x: x["first_index"])
    return mentions


def naive_sentiment(raw_answer: str, brand: str) -> Dict[str, float]:
    """Compute sentiment scores for a brand from keywords in a +/-40-character window."""
    positive_count = 0
    negative_count = 0
    total_contexts = 0

    # Examine the context around every mention of the brand
    pattern = re.compile(re.escape(brand), re.IGNORECASE)
    for match in pattern.finditer(raw_answer):
        start = max(0, match.start() - 40)
        end = min(len(raw_answer), match.end() + 40)
        context = raw_answer[start:end].lower()

        # Count positive and negative keywords in the context window
        positive_count += sum(1 for word in POSITIVE_WORDS if word in context)
        negative_count += sum(1 for word in NEGATIVE_WORDS if word in context)
        total_contexts += 1

    if total_contexts == 0:
        return {"positive": 0.0, "neutral": 1.0, "negative": 0.0}

    # Add-one smoothing keeps the denominator non-zero and reserves mass for "neutral"
    denominator = positive_count + negative_count + 1
    positive_ratio = positive_count / denominator
    negative_ratio = negative_count / denominator
    neutral_ratio = 1.0 - positive_ratio - negative_ratio

    return {
        "positive": round(positive_ratio, 3),
        "neutral": round(max(0.0, neutral_ratio), 3),
        "negative": round(negative_ratio, 3),
    }
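
# Worked example for the smoothing in naive_sentiment() (illustrative numbers only):
# with 2 positive and 1 negative keyword hits across all context windows,
#   positive = 2 / (2 + 1 + 1) = 0.5
#   negative = 1 / (2 + 1 + 1) = 0.25
#   neutral  = 1 - 0.5 - 0.25  = 0.25
# so the +1 in the denominator always leaves some share for "neutral".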


def compute_sov(mentions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Compute share of voice (each brand's fraction of total mentions)."""
    if not mentions:
        return []

    total_mentions = sum(m["count"] for m in mentions)
    sov_data = []
    for mention in mentions:
        sov_data.append({
            "brand": mention["brand"],
            "overall": round(mention["count"] / total_mentions, 3),
            "by_platform": {},  # Filled in later with platform-specific data
        })
    return sov_data
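
# Worked example (illustrative numbers only): mention counts of 3 for "BrandA" and
# 1 for "BrandB" give a total of 4, so overall share of voice is 0.75 and 0.25.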


def compute_first_position(mentions: List[Dict[str, Any]]) -> Optional[str]:
    """Return the brand with the earliest first mention, or None if there are none."""
    if not mentions:
        return None
    # The mention with the smallest first_index appears earliest in the answer
    return min(mentions, key=lambda x: x["first_index"])["brand"]


def auto_detect_brands(raw_answer: str, max_brands: int = 10) -> List[str]:
    """Auto-detect brands as capitalized tokens (a naive proper-noun heuristic)."""
    # Words starting with a capital letter are treated as potential brand names
    words = re.findall(r'\b[A-Z][a-zA-Z0-9]*\b', raw_answer)

    # Filter out common sentence-starting words and very short tokens
    common_words = {
        "The", "This", "That", "These", "Those", "And", "Or", "But", "For", "With",
        "From", "About", "When", "Where", "How", "Why", "What", "Which", "Who",
        "You", "Your", "We", "Our", "They", "Their", "He", "She", "It", "Its",
    }
    potential_brands = [
        word for word in words
        if word not in common_words and len(word) > 2
    ]

    # Count occurrences and return the most frequent candidates
    brand_counts = Counter(potential_brands)
    return [brand for brand, _ in brand_counts.most_common(max_brands)]
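

# Minimal usage sketch (illustrative only): the sample text, "Acme", and "Globex"
# below are made-up placeholders, not data from a real answer.
if __name__ == "__main__":
    sample_answer = (
        "Acme (acme.com) is the best and most reliable option, while Globex "
        "(https://www.globex.com/pricing) is powerful but expensive; details at globex.com."
    )
    brands = {"Acme", "Globex"}

    print("domains:", extract_domains(sample_answer))
    mentions = extract_mentions(sample_answer, brands)
    print("mentions:", mentions)
    print("share of voice:", compute_sov(mentions))
    print("first position:", compute_first_position(mentions))
    for brand in sorted(brands):
        print(f"sentiment for {brand}:", naive_sentiment(sample_answer, brand))
    print("auto-detected brands:", auto_detect_brands(sample_answer))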