Spaces:
Running
Running
Commit ·
bb4de9b
1
Parent(s): c311e40
added outliers reduction and coherence calculation
Browse files
app.py
CHANGED
|
@@ -32,6 +32,8 @@ import hashlib
|
|
| 32 |
from datetime import datetime
|
| 33 |
import altair as alt
|
| 34 |
|
|
|
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
# To remove the functions defined locally here, we can import them from mosaic_core instead
|
|
@@ -1505,6 +1507,87 @@ else:
|
|
| 1505 |
c2.metric("Outliers (-1)", outlier_count)
|
| 1506 |
c3.metric("Outlier rate", f"{outlier_pct:.1f}%")
|
| 1507 |
c4.metric("Units clustered", total_units)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1508 |
|
| 1509 |
with st.expander("Show topic-size overview"):
|
| 1510 |
# Show biggest topics first (excluding outliers)
|
|
@@ -1639,6 +1722,82 @@ else:
|
|
| 1639 |
labs.append("Unlabelled")
|
| 1640 |
else:
|
| 1641 |
labs.append(llm_names.get(t, "Unlabelled"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1642 |
|
| 1643 |
# VISUALISATION
|
| 1644 |
st.subheader("Experiential Topics Visualisation")
|
|
|
|
| 32 |
from datetime import datetime
|
| 33 |
import altair as alt
|
| 34 |
|
| 35 |
+
from gensim.corpora import Dictionary
|
| 36 |
+
from gensim.models import CoherenceModel
|
| 37 |
|
| 38 |
|
| 39 |
# To remove the functions defined locally here, we can import them from mosaic_core instead
|
|
|
|
| 1507 |
c2.metric("Outliers (-1)", outlier_count)
|
| 1508 |
c3.metric("Outlier rate", f"{outlier_pct:.1f}%")
|
| 1509 |
c4.metric("Units clustered", total_units)
|
| 1510 |
+
|
| 1511 |
+
|
| 1512 |
+
with st.expander("Model Quality Metrics (Coherence & Embeddings)"):
|
| 1513 |
+
st.caption(
|
| 1514 |
+
"These metrics assess topic quality. **Topic Coherence (C_v)** measures human interpretability "
|
| 1515 |
+
"(how often top words actually appear together in the text), while **Embedding Coherence** "
|
| 1516 |
+
"measures semantic tightness (how close the words are in the vector space)."
|
| 1517 |
+
)
|
| 1518 |
+
|
| 1519 |
+
if "quality_metrics" not in st.session_state or st.session_state.quality_metrics_hash != get_config_hash(current_config):
|
| 1520 |
+
with st.spinner("Calculating coherence metrics..."):
|
| 1521 |
+
# 1. Prepare Data for Gensim (C_v)
|
| 1522 |
+
# Tokenize on the fly (fast enough for inference)
|
| 1523 |
+
tokenized_docs = [d.split() for d in docs]
|
| 1524 |
+
dictionary = Dictionary(tokenized_docs)
|
| 1525 |
+
|
| 1526 |
+
# Get top 10 words for every active topic (excluding outliers)
|
| 1527 |
+
unique_topics = [t for t in set(tm.topics_) if t != -1]
|
| 1528 |
+
topics_top_words = [
|
| 1529 |
+
[word for word, _ in tm.get_topic(t)[:10]]
|
| 1530 |
+
for t in unique_topics
|
| 1531 |
+
]
|
| 1532 |
+
|
| 1533 |
+
# 2. Calculate C_v
|
| 1534 |
+
# We use processes=1 to be safe on Cloud
|
| 1535 |
+
if topics_top_words:
|
| 1536 |
+
cm = CoherenceModel(
|
| 1537 |
+
topics=topics_top_words,
|
| 1538 |
+
texts=tokenized_docs,
|
| 1539 |
+
dictionary=dictionary,
|
| 1540 |
+
coherence='c_v',
|
| 1541 |
+
processes=1
|
| 1542 |
+
)
|
| 1543 |
+
c_v_score = cm.get_coherence()
|
| 1544 |
+
else:
|
| 1545 |
+
c_v_score = 0.0
|
| 1546 |
+
|
| 1547 |
+
# 3. Calculate Embedding Coherence (Proxy)
|
| 1548 |
+
# Average pairwise inner product of each topic's top-10 word embeddings (equals cosine similarity only if the embeddings are L2-normalised - TODO confirm)
|
| 1549 |
+
emb_coh_score = 0.0
|
| 1550 |
+
if tm.embedding_model and unique_topics:
|
| 1551 |
+
total_sim = 0
|
| 1552 |
+
valid_topics = 0
|
| 1553 |
+
for words in topics_top_words:
|
| 1554 |
+
if len(words) < 2: continue
|
| 1555 |
+
|
| 1556 |
+
# Encode words to vectors
|
| 1557 |
+
word_embs = tm.embedding_model.encode(words)
|
| 1558 |
+
|
| 1559 |
+
# Compute similarity matrix
|
| 1560 |
+
sim_matrix = np.inner(word_embs, word_embs)
|
| 1561 |
+
|
| 1562 |
+
# Average of upper triangle (pairwise similarities)
|
| 1563 |
+
tri_u = sim_matrix[np.triu_indices(len(words), k=1)]
|
| 1564 |
+
|
| 1565 |
+
if len(tri_u) > 0:
|
| 1566 |
+
total_sim += np.mean(tri_u)
|
| 1567 |
+
valid_topics += 1
|
| 1568 |
+
|
| 1569 |
+
if valid_topics > 0:
|
| 1570 |
+
emb_coh_score = total_sim / valid_topics
|
| 1571 |
+
|
| 1572 |
+
# Save to session state so we don't re-calc on every interaction
|
| 1573 |
+
st.session_state.quality_metrics = (c_v_score, emb_coh_score)
|
| 1574 |
+
st.session_state.quality_metrics_hash = get_config_hash(current_config)
|
| 1575 |
+
|
| 1576 |
+
# Retrieve from cache
|
| 1577 |
+
c_v, emb_coh = st.session_state.quality_metrics
|
| 1578 |
+
|
| 1579 |
+
# Display
|
| 1580 |
+
qc1, qc2 = st.columns(2)
|
| 1581 |
+
qc1.metric(
|
| 1582 |
+
"Topic Coherence (C_v)",
|
| 1583 |
+
f"{c_v:.3f}",
|
| 1584 |
+
help="Measures how often the top words in a topic appear together in the original text. Good values: 0.5 - 0.7."
|
| 1585 |
+
)
|
| 1586 |
+
qc2.metric(
|
| 1587 |
+
"Embedding Coherence",
|
| 1588 |
+
f"{emb_coh:.3f}",
|
| 1589 |
+
help="Measures how mathematically close the top words are in the vector space. Higher means tighter semantic clusters."
|
| 1590 |
+
)
|
| 1591 |
|
| 1592 |
with st.expander("Show topic-size overview"):
|
| 1593 |
# Show biggest topics first (excluding outliers)
|
|
|
|
| 1722 |
labs.append("Unlabelled")
|
| 1723 |
else:
|
| 1724 |
labs.append(llm_names.get(t, "Unlabelled"))
|
| 1725 |
+
|
| 1726 |
+
|
| 1727 |
+
# --- START OUTLIER REDUCTION ---
|
| 1728 |
+
if outlier_count > 0:
|
| 1729 |
+
st.markdown("### Outlier Reduction")
|
| 1730 |
+
with st.expander("Assign 'Unlabelled' reports to topics"):
|
| 1731 |
+
st.caption(
|
| 1732 |
+
"**Warning:** Reducing outliers alters the scientific strictness of your model. "
|
| 1733 |
+
"It forces noise points into their nearest semantic topic."
|
| 1734 |
+
)
|
| 1735 |
+
|
| 1736 |
+
col_red1, col_red2, col_red3 = st.columns([1, 1, 1])
|
| 1737 |
+
|
| 1738 |
+
with col_red1:
|
| 1739 |
+
red_strategy = st.selectbox(
|
| 1740 |
+
"Strategy",
|
| 1741 |
+
["embeddings", "c-tf-idf"],
|
| 1742 |
+
index=0, # Default to embeddings
|
| 1743 |
+
help="Embeddings: Match based on meaning (semantic vectors). c-TF-IDF: Match based on shared keywords."
|
| 1744 |
+
)
|
| 1745 |
+
|
| 1746 |
+
with col_red2:
|
| 1747 |
+
red_threshold = st.slider(
|
| 1748 |
+
"Similarity Threshold",
|
| 1749 |
+
min_value=0.0,
|
| 1750 |
+
max_value=1.0,
|
| 1751 |
+
value=0.4,
|
| 1752 |
+
step=0.05,
|
| 1753 |
+
help="If an outlier's similarity to the nearest topic is below this number, it stays as an outlier. Higher = Stricter."
|
| 1754 |
+
)
|
| 1755 |
+
|
| 1756 |
+
with col_red3:
|
| 1757 |
+
st.write("") # Formatting spacer
|
| 1758 |
+
if st.button("Reduce Outliers", use_container_width=True):
|
| 1759 |
+
with st.spinner(f"Reassigning outliers (Strategy: {red_strategy}, Threshold: {red_threshold})..."):
|
| 1760 |
+
try:
|
| 1761 |
+
# 1. Calculate new assignments
|
| 1762 |
+
# Note: We use the cached 'docs' and 'embeddings'
|
| 1763 |
+
new_topics = tm.reduce_outliers(
|
| 1764 |
+
docs,
|
| 1765 |
+
tm.topics_,
|
| 1766 |
+
strategy=red_strategy,
|
| 1767 |
+
embeddings=embeddings,
|
| 1768 |
+
threshold=red_threshold
|
| 1769 |
+
)
|
| 1770 |
+
|
| 1771 |
+
# 2. Update the model internal state
|
| 1772 |
+
tm.update_topics(docs, topics=new_topics)
|
| 1773 |
+
|
| 1774 |
+
# 3. Regenerate labels for the visualization
|
| 1775 |
+
# (Because topic sizes and potentially keywords changed)
|
| 1776 |
+
new_info = tm.get_topic_info()
|
| 1777 |
+
new_name_map = new_info.set_index("Topic")["Name"].to_dict()
|
| 1778 |
+
|
| 1779 |
+
# Apply existing LLM labels if they map to valid topics
|
| 1780 |
+
final_labels = []
|
| 1781 |
+
current_llm_map = st.session_state.get("llm_names", {})
|
| 1782 |
+
|
| 1783 |
+
for t in new_topics:
|
| 1784 |
+
if t == -1:
|
| 1785 |
+
final_labels.append("Unlabelled")
|
| 1786 |
+
else:
|
| 1787 |
+
# Prefer LLM label, fallback to new default name
|
| 1788 |
+
lab = current_llm_map.get(t, new_name_map.get(t, f"Topic {t}"))
|
| 1789 |
+
final_labels.append(lab)
|
| 1790 |
+
|
| 1791 |
+
# 4. Save to session state
|
| 1792 |
+
st.session_state.latest_results = (tm, reduced, final_labels)
|
| 1793 |
+
|
| 1794 |
+
# 5. Refresh UI
|
| 1795 |
+
st.success(f"Outliers reduced! (Threshold {red_threshold})")
|
| 1796 |
+
st.rerun()
|
| 1797 |
+
|
| 1798 |
+
except Exception as e:
|
| 1799 |
+
st.error(f"Error reducing outliers: {e}")
|
| 1800 |
+
|
| 1801 |
|
| 1802 |
# VISUALISATION
|
| 1803 |
st.subheader("Experiential Topics Visualisation")
|