romybeaute committed on
Commit
bb4de9b
·
1 Parent(s): c311e40

added outliers reduction and coherence calculation

Browse files
Files changed (1) hide show
  1. app.py +159 -0
app.py CHANGED
@@ -32,6 +32,8 @@ import hashlib
32
  from datetime import datetime
33
  import altair as alt
34
 
 
 
35
 
36
 
37
  # to remove the functions defined locally here, we can import them from mosaic_core
@@ -1505,6 +1507,87 @@ else:
1505
  c2.metric("Outliers (-1)", outlier_count)
1506
  c3.metric("Outlier rate", f"{outlier_pct:.1f}%")
1507
  c4.metric("Units clustered", total_units)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1508
 
1509
  with st.expander("Show topic-size overview"):
1510
  # Show biggest topics first (excluding outliers)
@@ -1639,6 +1722,82 @@ else:
1639
  labs.append("Unlabelled")
1640
  else:
1641
  labs.append(llm_names.get(t, "Unlabelled"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1642
 
1643
  # VISUALISATION
1644
  st.subheader("Experiential Topics Visualisation")
 
32
  from datetime import datetime
33
  import altair as alt
34
 
35
+ from gensim.corpora import Dictionary
36
+ from gensim.models import CoherenceModel
37
 
38
 
39
  # to remove the functions defined locally here, we can import them from mosaic_core
 
1507
  c2.metric("Outliers (-1)", outlier_count)
1508
  c3.metric("Outlier rate", f"{outlier_pct:.1f}%")
1509
  c4.metric("Units clustered", total_units)
1510
+
1511
+
1512
with st.expander("Model Quality Metrics (Coherence & Embeddings)"):
    # Shows two topic-quality scores for the fitted topic model `tm`:
    #   * C_v coherence, computed by gensim from the whitespace-tokenised `docs`
    #   * mean pairwise cosine similarity of each topic's top words
    # Both are cached in st.session_state and only recomputed when no cached
    # value exists or the configuration hash has changed.
    st.caption(
        "These metrics assess topic quality. **Topic Coherence (C_v)** measures human interpretability "
        "(how often top words actually appear together in the text), while **Embedding Coherence** "
        "measures semantic tightness (how close the words are in the vector space)."
    )

    # Hash once and reuse it for both the staleness check and the cache write.
    config_hash = get_config_hash(current_config)
    cache_is_stale = (
        "quality_metrics" not in st.session_state
        # .get() avoids an AttributeError if the hash key is missing.
        or st.session_state.get("quality_metrics_hash") != config_hash
    )

    if cache_is_stale:
        with st.spinner("Calculating coherence metrics..."):
            # 1. Prepare data for gensim (C_v).
            # Tokenise on the fly with a plain whitespace split.
            # NOTE(review): this keeps case and punctuation, so tokens may not
            # line up with the topic model's own vocabulary -- confirm whether
            # the model's tokenizer should be used here instead.
            tokenized_docs = [d.split() for d in docs]
            dictionary = Dictionary(tokenized_docs)
            known_tokens = dictionary.token2id

            # Top 10 words for every active topic (outlier topic -1 excluded).
            # Empty strings are dropped so they are neither scored nor embedded.
            unique_topics = [t for t in set(tm.topics_) if t != -1]
            topics_top_words = [
                [word for word, _ in tm.get_topic(t)[:10] if word]
                for t in unique_topics
            ]

            # 2. Calculate C_v.
            # Only words present in the dictionary are scored: gensim raises
            # when it cannot resolve a topic's words. Topics left with fewer
            # than two known words are skipped.
            cv_topics = []
            for words in topics_top_words:
                known_words = [w for w in words if w in known_tokens]
                if len(known_words) >= 2:
                    cv_topics.append(known_words)

            if cv_topics:
                # processes=1 to be safe on Cloud (no multiprocessing).
                cm = CoherenceModel(
                    topics=cv_topics,
                    texts=tokenized_docs,
                    dictionary=dictionary,
                    coherence='c_v',
                    processes=1,
                )
                c_v_score = cm.get_coherence()
            else:
                c_v_score = 0.0

            # 3. Calculate embedding coherence (proxy): the average pairwise
            # cosine similarity of each topic's top words, averaged over topics.
            emb_coh_score = 0.0
            if tm.embedding_model and unique_topics:
                per_topic_means = []
                for words in topics_top_words:
                    if len(words) < 2:
                        continue

                    # NOTE(review): assumes the embedding model exposes
                    # `.encode(list_of_str)` returning one vector per word.
                    word_embs = np.asarray(
                        tm.embedding_model.encode(words), dtype=float
                    )

                    # L2-normalise each row so the inner product is a true
                    # cosine similarity rather than a raw dot product.
                    norms = np.linalg.norm(word_embs, axis=1, keepdims=True)
                    norms[norms == 0] = 1.0  # avoid dividing by zero
                    unit_embs = word_embs / norms
                    sim_matrix = unit_embs @ unit_embs.T

                    # Strict upper triangle = each unordered word pair once.
                    pair_sims = sim_matrix[np.triu_indices(len(words), k=1)]
                    per_topic_means.append(float(np.mean(pair_sims)))

                if per_topic_means:
                    emb_coh_score = float(np.mean(per_topic_means))

            # Save to session state so we don't re-calc on every interaction.
            st.session_state.quality_metrics = (c_v_score, emb_coh_score)
            st.session_state.quality_metrics_hash = config_hash

    # Retrieve from cache
    c_v, emb_coh = st.session_state.quality_metrics

    # Display
    qc1, qc2 = st.columns(2)
    qc1.metric(
        "Topic Coherence (C_v)",
        f"{c_v:.3f}",
        help="Measures how often the top words in a topic appear together in the original text. Good values: 0.5 - 0.7."
    )
    qc2.metric(
        "Embedding Coherence",
        f"{emb_coh:.3f}",
        help="Measures how mathematically close the top words are in the vector space. Higher means tighter semantic clusters."
    )
1591
 
1592
  with st.expander("Show topic-size overview"):
1593
  # Show biggest topics first (excluding outliers)
 
1722
  labs.append("Unlabelled")
1723
  else:
1724
  labs.append(llm_names.get(t, "Unlabelled"))
1725
+
1726
+
1727
# --- START OUTLIER REDUCTION ---
# Optional step: reassign outlier documents (topic -1) to a topic via
# tm.reduce_outliers, then refresh the topic model, the per-document labels
# and the cached results before rerunning the script.
if outlier_count > 0:
    st.markdown("### Outlier Reduction")
    with st.expander("Assign 'Unlabelled' reports to topics"):
        st.caption(
            "**Warning:** Reducing outliers alters the scientific strictness of your model. "
            "It forces noise points into their nearest semantic topic."
        )

        col_red1, col_red2, col_red3 = st.columns([1, 1, 1])

        with col_red1:
            red_strategy = st.selectbox(
                "Strategy",
                ["embeddings", "c-tf-idf"],
                index=0,  # Default to embeddings
                help="Embeddings: Match based on meaning (semantic vectors). c-TF-IDF: Match based on shared keywords."
            )

        with col_red2:
            red_threshold = st.slider(
                "Similarity Threshold",
                min_value=0.0,
                max_value=1.0,
                value=0.4,
                step=0.05,
                help="If an outlier's similarity to the nearest topic is below this number, it stays as an outlier. Higher = Stricter."
            )

        with col_red3:
            st.write("")  # Formatting spacer
            if st.button("Reduce Outliers", use_container_width=True):
                with st.spinner(f"Reassigning outliers (Strategy: {red_strategy}, Threshold: {red_threshold})..."):
                    try:
                        # 1. Calculate new assignments from the cached
                        #    'docs' and 'embeddings'.
                        new_topics = tm.reduce_outliers(
                            docs,
                            tm.topics_,
                            strategy=red_strategy,
                            embeddings=embeddings,
                            threshold=red_threshold,
                        )

                        # 2. Update the model's internal state.
                        tm.update_topics(docs, topics=new_topics)

                        # 3. Regenerate labels for the visualisation
                        #    (topic sizes and potentially keywords changed).
                        new_info = tm.get_topic_info()
                        new_name_map = new_info.set_index("Topic")["Name"].to_dict()

                        # Prefer an existing LLM label, then the model's new
                        # default name; outliers stay "Unlabelled".
                        # `or {}` tolerates the key being stored as None.
                        current_llm_map = st.session_state.get("llm_names", {}) or {}
                        final_labels = [
                            "Unlabelled"
                            if t == -1
                            else current_llm_map.get(t, new_name_map.get(t, f"Topic {t}"))
                            for t in new_topics
                        ]

                        # 4. Save to session state.
                        st.session_state.latest_results = (tm, reduced, final_labels)

                        # The topics changed but the configuration did not, so
                        # the cached quality metrics would otherwise be reused;
                        # drop them to force a recalculation on the next run.
                        if "quality_metrics" in st.session_state:
                            del st.session_state["quality_metrics"]

                        # 5. Refresh UI.
                        # NOTE(review): st.rerun() restarts the script right
                        # away, so this message is likely not visible -- confirm.
                        st.success(f"Outliers reduced! (Threshold {red_threshold})")
                        st.rerun()

                    except Exception as e:
                        # UI boundary: report the failure instead of crashing.
                        st.error(f"Error reducing outliers: {e}")
+
1801
 
1802
  # VISUALISATION
1803
  st.subheader("Experiential Topics Visualisation")