Spaces:
Sleeping
Sleeping
Vedant Jigarbhai Mehta committed on
Commit ·
aaf9ca5
1
Parent(s): aed5d76
fix search count, cluster data, and network filter
Browse files
- backend/routes/clusters.py +33 -12
- backend/routes/network.py +29 -16
- backend/routes/search.py +1 -1
- frontend/src/pages/Clusters.jsx +13 -1
- frontend/src/pages/Network.jsx +2 -2
- frontend/src/services/api.js +2 -2
backend/routes/clusters.py
CHANGED
|
@@ -93,7 +93,7 @@ def get_clusters():
|
|
| 93 |
# Get texts for labeling
|
| 94 |
texts = [r[0] for r in conn.execute("SELECT combined_text FROM posts ORDER BY rowid").fetchall()]
|
| 95 |
|
| 96 |
-
# Generate labels
|
| 97 |
clusters = {}
|
| 98 |
for i in range(k):
|
| 99 |
cluster_texts = [t for t, l in zip(texts, labels) if l == i]
|
|
@@ -111,26 +111,47 @@ def get_clusters():
|
|
| 111 |
label = f"Cluster {i}"
|
| 112 |
|
| 113 |
cluster_post_ids = [post_ids[j] for j in range(len(labels)) if labels[j] == i]
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
top = conn.execute(f"""
|
| 117 |
-
SELECT id, title, subreddit, score FROM posts
|
| 118 |
-
WHERE id IN ({placeholders})
|
| 119 |
-
ORDER BY score DESC LIMIT 5
|
| 120 |
-
""", pids_sample).fetchall()
|
| 121 |
-
|
| 122 |
-
clusters[i] = {
|
| 123 |
'id': i,
|
| 124 |
'label': label,
|
| 125 |
'size': len(cluster_post_ids),
|
| 126 |
-
'top_posts': [
|
|
|
|
| 127 |
}
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
conn.close()
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
result = {
|
| 132 |
-
'clusters':
|
| 133 |
'k': k,
|
|
|
|
| 134 |
}
|
| 135 |
if was_clamped:
|
| 136 |
result['warning'] = f'Requested k={original_k} was clamped to {k} (valid range: {MIN_K}-{MAX_K})'
|
|
|
|
| 93 |
# Get texts for labeling
|
| 94 |
texts = [r[0] for r in conn.execute("SELECT combined_text FROM posts ORDER BY rowid").fetchall()]
|
| 95 |
|
| 96 |
+
# Generate labels and gather full per-cluster data
|
| 97 |
clusters = {}
|
| 98 |
for i in range(k):
|
| 99 |
cluster_texts = [t for t, l in zip(texts, labels) if l == i]
|
|
|
|
| 111 |
label = f"Cluster {i}"
|
| 112 |
|
| 113 |
cluster_post_ids = [post_ids[j] for j in range(len(labels)) if labels[j] == i]
|
| 114 |
+
|
| 115 |
+
cluster_data = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
'id': i,
|
| 117 |
'label': label,
|
| 118 |
'size': len(cluster_post_ids),
|
| 119 |
+
'top_posts': [],
|
| 120 |
+
'subreddits': [],
|
| 121 |
}
|
| 122 |
|
| 123 |
+
# Top 10 posts by score and subreddit breakdown
|
| 124 |
+
if cluster_post_ids:
|
| 125 |
+
placeholders = ','.join(['?' for _ in cluster_post_ids])
|
| 126 |
+
|
| 127 |
+
top = conn.execute(f"""
|
| 128 |
+
SELECT id, title, subreddit, score, author, permalink, created_date FROM posts
|
| 129 |
+
WHERE id IN ({placeholders})
|
| 130 |
+
ORDER BY score DESC LIMIT 10
|
| 131 |
+
""", cluster_post_ids).fetchall()
|
| 132 |
+
cluster_data['top_posts'] = [
|
| 133 |
+
{'id': t[0], 'title': t[1], 'subreddit': t[2], 'score': t[3],
|
| 134 |
+
'author': t[4], 'permalink': t[5], 'date': t[6]} for t in top
|
| 135 |
+
]
|
| 136 |
+
|
| 137 |
+
sub_counts = conn.execute(f"""
|
| 138 |
+
SELECT subreddit, COUNT(*) as count FROM posts
|
| 139 |
+
WHERE id IN ({placeholders}) GROUP BY subreddit ORDER BY count DESC
|
| 140 |
+
""", cluster_post_ids).fetchall()
|
| 141 |
+
cluster_data['subreddits'] = [{'name': s[0], 'count': s[1]} for s in sub_counts]
|
| 142 |
+
|
| 143 |
+
clusters[i] = cluster_data
|
| 144 |
+
|
| 145 |
conn.close()
|
| 146 |
|
| 147 |
+
cluster_list = list(clusters.values())
|
| 148 |
+
from services.llm_service import generate_cluster_summary
|
| 149 |
+
summary = generate_cluster_summary(cluster_list, k)
|
| 150 |
+
|
| 151 |
result = {
|
| 152 |
+
'clusters': cluster_list,
|
| 153 |
'k': k,
|
| 154 |
+
'summary': summary,
|
| 155 |
}
|
| 156 |
if was_clamped:
|
| 157 |
result['warning'] = f'Requested k={original_k} was clamped to {k} (valid range: {MIN_K}-{MAX_K})'
|
backend/routes/network.py
CHANGED
|
@@ -42,16 +42,20 @@ def get_graph():
|
|
| 42 |
nodes_to_keep = [n for n in G.nodes() if G.degree(n) >= min_degree]
|
| 43 |
subgraph = G.subgraph(nodes_to_keep).copy()
|
| 44 |
|
|
|
|
|
|
|
| 45 |
edge_key = 'links'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
result = {
|
| 47 |
'nodes': [{'id': n, **subgraph.nodes[n]} for n in subgraph.nodes()],
|
| 48 |
edge_key: [{'source': u, 'target': v, **d} for u, v, d in subgraph.edges(data=True)],
|
| 49 |
-
'stats':
|
| 50 |
-
|
| 51 |
-
'num_edges': subgraph.number_of_edges(),
|
| 52 |
-
'num_components': nx.number_connected_components(subgraph),
|
| 53 |
-
'density': round(nx.density(subgraph), 6) if subgraph.number_of_nodes() > 1 else 0
|
| 54 |
-
}
|
| 55 |
}
|
| 56 |
|
| 57 |
return jsonify(result)
|
|
@@ -59,26 +63,35 @@ def get_graph():
|
|
| 59 |
|
| 60 |
@network_bp.route('/remove-node/<author>')
|
| 61 |
def remove_node(author):
|
|
|
|
| 62 |
graph_data = current_app.config['graph_data']
|
| 63 |
-
|
| 64 |
|
| 65 |
-
if author not in
|
| 66 |
return jsonify({
|
| 67 |
'error': True,
|
| 68 |
'message': f'Author "{author}" not found in the network.'
|
| 69 |
}), 404
|
| 70 |
|
| 71 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
components_before = nx.number_connected_components(G)
|
| 73 |
nodes_before = G.number_of_nodes()
|
| 74 |
edges_before = G.number_of_edges()
|
| 75 |
|
| 76 |
-
# Find which component the author belongs to
|
| 77 |
-
for comp in nx.connected_components(G):
|
| 78 |
-
if author in comp:
|
| 79 |
-
original_component_size = len(comp)
|
| 80 |
-
break
|
| 81 |
-
|
| 82 |
# Remove the node
|
| 83 |
removed_degree = G.degree(author)
|
| 84 |
removed_pagerank = G.nodes[author].get('pagerank', 0)
|
|
@@ -105,7 +118,7 @@ def remove_node(author):
|
|
| 105 |
f"Components: {components_after}. {edges_before - edges_after} edges removed."
|
| 106 |
)
|
| 107 |
|
| 108 |
-
# Return updated graph
|
| 109 |
edge_key = 'links'
|
| 110 |
result = {
|
| 111 |
'nodes': [{'id': n, **G.nodes[n]} for n in G.nodes()],
|
|
|
|
| 42 |
nodes_to_keep = [n for n in G.nodes() if G.degree(n) >= min_degree]
|
| 43 |
subgraph = G.subgraph(nodes_to_keep).copy()
|
| 44 |
|
| 45 |
+
from services.llm_service import generate_network_summary
|
| 46 |
+
|
| 47 |
edge_key = 'links'
|
| 48 |
+
filtered_stats = {
|
| 49 |
+
'num_nodes': subgraph.number_of_nodes(),
|
| 50 |
+
'num_edges': subgraph.number_of_edges(),
|
| 51 |
+
'num_components': nx.number_connected_components(subgraph),
|
| 52 |
+
'density': round(nx.density(subgraph), 6) if subgraph.number_of_nodes() > 1 else 0
|
| 53 |
+
}
|
| 54 |
result = {
|
| 55 |
'nodes': [{'id': n, **subgraph.nodes[n]} for n in subgraph.nodes()],
|
| 56 |
edge_key: [{'source': u, 'target': v, **d} for u, v, d in subgraph.edges(data=True)],
|
| 57 |
+
'stats': filtered_stats,
|
| 58 |
+
'summary': generate_network_summary(filtered_stats),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
}
|
| 60 |
|
| 61 |
return jsonify(result)
|
|
|
|
| 63 |
|
| 64 |
@network_bp.route('/remove-node/<author>')
|
| 65 |
def remove_node(author):
|
| 66 |
+
min_degree = request.args.get('min_degree', 1, type=int)
|
| 67 |
graph_data = current_app.config['graph_data']
|
| 68 |
+
G_full = graph_from_data(graph_data)
|
| 69 |
|
| 70 |
+
if author not in G_full:
|
| 71 |
return jsonify({
|
| 72 |
'error': True,
|
| 73 |
'message': f'Author "{author}" not found in the network.'
|
| 74 |
}), 404
|
| 75 |
|
| 76 |
+
# Apply the same min_degree filter the graph view is using
|
| 77 |
+
if min_degree > 1:
|
| 78 |
+
nodes_to_keep = [n for n in G_full.nodes() if G_full.degree(n) >= min_degree]
|
| 79 |
+
G = G_full.subgraph(nodes_to_keep).copy()
|
| 80 |
+
else:
|
| 81 |
+
G = G_full.copy()
|
| 82 |
+
|
| 83 |
+
# If the author was filtered out by min_degree, they're not in the visible graph
|
| 84 |
+
if author not in G:
|
| 85 |
+
return jsonify({
|
| 86 |
+
'error': True,
|
| 87 |
+
'message': f'Author "{author}" is not visible at min degree {min_degree}.'
|
| 88 |
+
}), 404
|
| 89 |
+
|
| 90 |
+
# Stats before removal (within the filtered graph)
|
| 91 |
components_before = nx.number_connected_components(G)
|
| 92 |
nodes_before = G.number_of_nodes()
|
| 93 |
edges_before = G.number_of_edges()
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
# Remove the node
|
| 96 |
removed_degree = G.degree(author)
|
| 97 |
removed_pagerank = G.nodes[author].get('pagerank', 0)
|
|
|
|
| 118 |
f"Components: {components_after}. {edges_before - edges_after} edges removed."
|
| 119 |
)
|
| 120 |
|
| 121 |
+
# Return updated graph (respecting min_degree filter)
|
| 122 |
edge_key = 'links'
|
| 123 |
result = {
|
| 124 |
'nodes': [{'id': n, **G.nodes[n]} for n in G.nodes()],
|
backend/routes/search.py
CHANGED
|
@@ -23,7 +23,7 @@ def detect_language(text):
|
|
| 23 |
def search():
|
| 24 |
data = request.get_json() or {}
|
| 25 |
query = data.get('message', '').strip()
|
| 26 |
-
limit = data.get('limit',
|
| 27 |
|
| 28 |
# Edge case: conversational/greeting queries
|
| 29 |
greetings = [
|
|
|
|
| 23 |
def search():
|
| 24 |
data = request.get_json() or {}
|
| 25 |
query = data.get('message', '').strip()
|
| 26 |
+
limit = data.get('limit', 10)
|
| 27 |
|
| 28 |
# Edge case: conversational/greeting queries
|
| 29 |
greetings = [
|
frontend/src/pages/Clusters.jsx
CHANGED
|
@@ -20,6 +20,9 @@ const SUBREDDIT_COLORS = {
|
|
| 20 |
neoliberal: '#6366f1', worldpolitics: '#14b8a6', Conservative: '#f97316', Republican: '#ea580c'
|
| 21 |
}
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
export default function Clusters() {
|
| 24 |
const [k, setK] = useState(8)
|
| 25 |
const [debouncedK, setDebouncedK] = useState(8)
|
|
@@ -71,11 +74,20 @@ export default function Clusters() {
|
|
| 71 |
onChange={e => setK(Number(e.target.value))}
|
| 72 |
className="flex-1 max-w-xs" />
|
| 73 |
<span className="text-2xl font-bold text-indigo-600 w-12 text-center">{k}</span>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
</div>
|
| 75 |
{warning && <p className="text-sm text-amber-600 mt-2">{warning}</p>}
|
| 76 |
<div className="flex items-center justify-between mt-3">
|
| 77 |
<p className="text-xs text-gray-400">
|
| 78 |
-
{clusters.length} clusters · {totalPosts.toLocaleString()} posts · KMeans on 384-dim embeddings
|
| 79 |
</p>
|
| 80 |
<Link to="/dashboard/embeddings" className="text-xs text-indigo-600 hover:text-indigo-800 font-medium">
|
| 81 |
Explore full embedding map →
|
|
|
|
| 20 |
neoliberal: '#6366f1', worldpolitics: '#14b8a6', Conservative: '#f97316', Republican: '#ea580c'
|
| 21 |
}
|
| 22 |
|
| 23 |
+
// Pre-computed k values load instantly from SQLite; other k values are computed on the fly
|
| 24 |
+
const PRECOMPUTED_K = new Set([3, 5, 8, 10, 15, 20, 30, 50])
|
| 25 |
+
|
| 26 |
export default function Clusters() {
|
| 27 |
const [k, setK] = useState(8)
|
| 28 |
const [debouncedK, setDebouncedK] = useState(8)
|
|
|
|
| 74 |
onChange={e => setK(Number(e.target.value))}
|
| 75 |
className="flex-1 max-w-xs" />
|
| 76 |
<span className="text-2xl font-bold text-indigo-600 w-12 text-center">{k}</span>
|
| 77 |
+
{PRECOMPUTED_K.has(k) ? (
|
| 78 |
+
<span className="text-[10px] font-medium text-emerald-700 bg-emerald-50 border border-emerald-200 px-2 py-0.5 rounded-full">
|
| 79 |
+
Instant · pre-computed
|
| 80 |
+
</span>
|
| 81 |
+
) : (
|
| 82 |
+
<span className="text-[10px] font-medium text-amber-700 bg-amber-50 border border-amber-200 px-2 py-0.5 rounded-full">
|
| 83 |
+
Computed on-the-fly
|
| 84 |
+
</span>
|
| 85 |
+
)}
|
| 86 |
</div>
|
| 87 |
{warning && <p className="text-sm text-amber-600 mt-2">{warning}</p>}
|
| 88 |
<div className="flex items-center justify-between mt-3">
|
| 89 |
<p className="text-xs text-gray-400">
|
| 90 |
+
{clusters.length} clusters · {totalPosts.toLocaleString()} posts · KMeans on 384-dim embeddings · Instant for k ∈ {'{'}3, 5, 8, 10, 15, 20, 30, 50{'}'}
|
| 91 |
</p>
|
| 92 |
<Link to="/dashboard/embeddings" className="text-xs text-indigo-600 hover:text-indigo-800 font-medium">
|
| 93 |
Explore full embedding map →
|
frontend/src/pages/Network.jsx
CHANGED
|
@@ -13,7 +13,7 @@ const COMMUNITY_COLORS = [
|
|
| 13 |
export default function Network() {
|
| 14 |
const [graphData, setGraphData] = useState(null)
|
| 15 |
const [stats, setStats] = useState(null)
|
| 16 |
-
const [minDegree, setMinDegree] = useState(
|
| 17 |
const [selectedNode, setSelectedNode] = useState(null)
|
| 18 |
const [removalImpact, setRemovalImpact] = useState(null)
|
| 19 |
const [loading, setLoading] = useState(true)
|
|
@@ -95,7 +95,7 @@ export default function Network() {
|
|
| 95 |
if (!selectedNode) return
|
| 96 |
setRemoving(true)
|
| 97 |
try {
|
| 98 |
-
const res = await removeNetworkNode(selectedNode.id)
|
| 99 |
setRemovalImpact(res.data)
|
| 100 |
} catch (err) {
|
| 101 |
console.error(err)
|
|
|
|
| 13 |
export default function Network() {
|
| 14 |
const [graphData, setGraphData] = useState(null)
|
| 15 |
const [stats, setStats] = useState(null)
|
| 16 |
+
const [minDegree, setMinDegree] = useState(1)
|
| 17 |
const [selectedNode, setSelectedNode] = useState(null)
|
| 18 |
const [removalImpact, setRemovalImpact] = useState(null)
|
| 19 |
const [loading, setLoading] = useState(true)
|
|
|
|
| 95 |
if (!selectedNode) return
|
| 96 |
setRemoving(true)
|
| 97 |
try {
|
| 98 |
+
const res = await removeNetworkNode(selectedNode.id, { min_degree: minDegree })
|
| 99 |
setRemovalImpact(res.data)
|
| 100 |
} catch (err) {
|
| 101 |
console.error(err)
|
frontend/src/services/api.js
CHANGED
|
@@ -32,8 +32,8 @@ export const searchTimeSeries = (data) =>
|
|
| 32 |
export const getNetworkGraph = (params) =>
|
| 33 |
api.get('/network/graph', { params })
|
| 34 |
|
| 35 |
-
export const removeNetworkNode = (author) =>
|
| 36 |
-
api.get(`/network/remove-node/${encodeURIComponent(author)}`)
|
| 37 |
|
| 38 |
// Clusters
|
| 39 |
export const getClusters = (params) =>
|
|
|
|
| 32 |
export const getNetworkGraph = (params) =>
|
| 33 |
api.get('/network/graph', { params })
|
| 34 |
|
| 35 |
+
export const removeNetworkNode = (author, params = {}) =>
|
| 36 |
+
api.get(`/network/remove-node/${encodeURIComponent(author)}`, { params })
|
| 37 |
|
| 38 |
// Clusters
|
| 39 |
export const getClusters = (params) =>
|