bradduy committed on
Commit
0d6e98d
·
verified ·
1 Parent(s): e851b80

Add macOS Swift app source (apps/macos/ — SwiftUI menu-bar overlay + MLX sidecar)

Browse files
apps/macos/Package.swift ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // swift-tools-version:5.9
2
+ import PackageDescription
3
+
4
/// The single executable target of this package; its sources live in `Sources/BanhMi`.
let banhMiTarget: Target = .executableTarget(
    name: "BanhMi",
    path: "Sources/BanhMi"
)

/// Swift Package Manager manifest for the `BanhMi` executable (macOS 13 or later).
let package = Package(
    name: "BanhMi",
    platforms: [.macOS(.v13)],
    targets: [banhMiTarget]
)
apps/macos/README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bánh mì chuyển ngữ — macOS App
2
+
3
+ Menu bar app that captures system audio and shows live translated subtitles. Zero config — install, pick your language, forget about it.
4
+
5
+ ## Requirements
6
+
7
+ - macOS 13 Ventura or later
8
+ - Swift 5.9+ (ships with Xcode 15 / Command Line Tools)
9
+
10
+ ## Build & run
11
+
12
+ ```bash
13
+ ./build.sh
14
+ open ".build/release/Bánh mì chuyển ngữ.app"
15
+ ```
16
+
17
+ For debug builds:
18
+
19
+ ```bash
20
+ CONFIG=debug ./build.sh
21
+ ```
22
+
23
+ ## Project layout
24
+
25
+ ```
26
+ apps/macos/
27
+ ├── Package.swift # Swift Package Manager manifest
28
+ ├── Sources/BanhMi/
29
+ │ ├── main.swift # NSApplication entry point
30
+ │ ├── AppDelegate.swift # Menu bar + popover wiring
31
+ │ ├── Settings.swift # Languages, text sizes, storage keys
32
+ │ └── SettingsView.swift # SwiftUI settings panel
33
+ ├── Resources/
34
+ │ └── Info.plist # LSUIElement = true (menu bar only, no dock)
35
+ └── build.sh # Builds the .app bundle
36
+ ```
37
+
38
+ ## Current status (v0.2)
39
+
40
+ - [x] Menu bar icon
41
+ - [x] Click → settings popover
42
+ - [x] Output language picker (16 languages)
43
+ - [x] Subtitle size selector (S / M / L)
44
+ - [x] Audio source picker (System audio / Microphone)
45
+ - [x] Settings persist via `UserDefaults`
46
+ - [x] Floating subtitle overlay (bottom-center, always on top, visible over fullscreen apps, click-through)
47
+ - [x] Microphone capture (AVAudioEngine)
48
+ - [x] System audio capture (ScreenCaptureKit)
49
+ - [x] Live transcription via Apple Speech framework (on-device when supported)
50
+ - [ ] Translation engine (Gemma 4 / Google Translate API)
51
+ - [ ] Dual-language overlay (original + translation)
52
+
53
+ ## Permissions needed
54
+
55
+ The app needs the following permissions (macOS prompts for some automatically; Screen Recording must be enabled by hand):
56
+
57
+ 1. **Speech Recognition** — auto-prompt, click Allow
58
+ 2. **Screen Recording** (for system audio) — manual: System Settings → Privacy & Security → Screen Recording → enable Bánh mì chuyển ngữ, then quit & relaunch
59
+ 3. **Microphone** — only if you switch source to Microphone
apps/macos/Resources/Info.plist ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3
+ <plist version="1.0">
4
+ <dict>
5
+ <key>CFBundleName</key>
6
+ <string>Bánh mì chuyển ngữ</string>
7
+ <key>CFBundleDisplayName</key>
8
+ <string>Bánh mì chuyển ngữ</string>
9
+ <key>CFBundleIdentifier</key>
10
+ <string>vn.banhmi.chuyenngu</string>
11
+ <key>CFBundleExecutable</key>
12
+ <string>BanhMi</string>
13
+ <key>CFBundlePackageType</key>
14
+ <string>APPL</string>
15
+ <key>CFBundleShortVersionString</key>
16
+ <string>0.5</string>
17
+ <key>CFBundleVersion</key>
18
+ <string>5</string>
19
+ <key>LSMinimumSystemVersion</key>
20
+ <string>13.0</string>
21
+ <key>LSUIElement</key>
22
+ <true/>
23
+ <key>NSHighResolutionCapable</key>
24
+ <true/>
25
+ <key>NSMicrophoneUsageDescription</key>
26
+ <string>Bánh mì chuyển ngữ listens to your microphone to show live transcriptions on screen.</string>
27
+ <key>NSScreenCaptureUsageDescription</key>
28
+ <string>Bánh mì chuyển ngữ captures system audio (via screen capture) to transcribe and translate what you hear.</string>
29
+ </dict>
30
+ </plist>
apps/macos/Sources/BanhMi/AppDelegate.swift ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import AppKit
2
+ import SwiftUI
3
+
4
/// Application delegate: owns the menu-bar status item, the settings popover,
/// the subtitle overlay and the transcription controller.
@MainActor
final class AppDelegate: NSObject, NSApplicationDelegate {
    private let state = AppState.shared
    private var statusItem: NSStatusItem!
    private var popover: NSPopover!
    private var overlay: SubtitleOverlayController!
    private var controller: TranscriptionController!

    // MARK: - NSApplicationDelegate

    /// Builds the UI (status item, popover, overlay) and then boots the
    /// transcription controller in the background.
    func applicationDidFinishLaunching(_ notification: Notification) {
        installStatusItem()
        installPopover()

        overlay = SubtitleOverlayController(state: state)
        overlay.show()

        controller = TranscriptionController(state: state)
        Task { await controller.bootstrap() }
    }

    /// Asks the controller to shut down.
    /// NOTE(review): this is fire-and-forget — the task may not get a chance to
    /// finish before the process exits; confirm whether that matters for the sidecar.
    func applicationWillTerminate(_ notification: Notification) {
        Task { await controller.shutdown() }
    }

    // MARK: - Setup

    /// Creates the variable-length status item and wires its button to `togglePopover(_:)`.
    private func installStatusItem() {
        statusItem = NSStatusBar.system.statusItem(withLength: NSStatusItem.variableLength)
        guard let button = statusItem.button else { return }

        let icon = NSImage(systemSymbolName: "waveform", accessibilityDescription: "Bánh mì chuyển ngữ")
        // Template images are tinted by the system to match the menu bar appearance.
        icon?.isTemplate = true
        button.image = icon
        button.action = #selector(togglePopover(_:))
        button.target = self
    }

    /// Creates the transient popover hosting the SwiftUI settings panel.
    private func installPopover() {
        let settings = SettingsView(state: state, onToggleEnabled: { [weak self] in
            self?.toggleEnabled()
        })

        let settingsPopover = NSPopover()
        settingsPopover.contentSize = NSSize(width: 340, height: 520)
        settingsPopover.behavior = .transient
        settingsPopover.animates = true
        settingsPopover.contentViewController = NSHostingController(rootView: settings)
        popover = settingsPopover
    }

    // MARK: - Actions

    /// Shows the popover under the status-item button, or closes it if it is already open.
    @objc private func togglePopover(_ sender: Any?) {
        guard let button = statusItem.button else { return }
        guard !popover.isShown else {
            popover.performClose(sender)
            return
        }
        popover.show(relativeTo: button.bounds, of: button, preferredEdge: .minY)
        // Make the popover's window key so its controls receive keyboard focus.
        popover.contentViewController?.view.window?.makeKey()
    }

    /// Pauses the controller when the app is currently enabled, resumes it otherwise.
    private func toggleEnabled() {
        Task { @MainActor in
            let shouldPause = state.isEnabled
            if shouldPause {
                await controller.pause()
            } else {
                await controller.resume()
            }
        }
    }
}
apps/macos/Sources/BanhMi/AppState.swift ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Foundation
2
+ import Combine
3
+ import AppKit
4
+
5
/// A single text box shown in the subtitle overlay.
///
/// While partial results are streaming the line is "live" (`isFinal == false`);
/// once it is finalized it is locked and later partials start a new line.
struct SubtitleLine: Identifiable, Equatable {
    /// Stable identity of the box.
    let id: UUID
    /// Text currently displayed in the box.
    var text: String
    /// `true` once the line is locked and no longer receives updates.
    var isFinal: Bool

    /// Creates a line; lines start out live unless `isFinal` is passed.
    init(id: UUID, text: String, isFinal: Bool = false) {
        self.id = id
        self.text = text
        self.isFinal = isFinal
    }
}
12
+
13
/// Single source of truth for the running app.
/// Views observe this, services write to it.
@MainActor
final class AppState: ObservableObject {
    static let shared = AppState()

    // MARK: - Overlay tuning

    /// Maximum number of lines kept in the overlay buffer.
    static let maxLines: Int = 3
    /// Maximum words displayed per subtitle box. Slightly higher than the
    /// recognizer's flush cap so translator expansion (e.g. Vietnamese often
    /// expands vs. English) rarely triggers an extra re-split.
    static let maxWordsPerLine: Int = 26
    /// Short auto-hide delay for brief lines (≤ threshold words).
    static let lineHideShortSeconds: TimeInterval = 3.0
    /// Long auto-hide delay for longer lines (> threshold words).
    static let lineHideLongSeconds: TimeInterval = 5.0
    /// Word count threshold switching between short and long hide delays.
    static let lineHideWordThreshold: Int = 10
    /// When a new box arrives, every existing box has its hide timer replaced
    /// by one of this length, so old content fades soon after newer content
    /// pushes it up.
    static let olderLineHideSeconds: TimeInterval = 1.2
    /// A live line that receives no update for this long is force-finalized
    /// so it can fade out instead of sticking forever.
    static let liveStaleSeconds: TimeInterval = 2.5

    // MARK: - Published state

    /// Accumulated raw text (for debugging/future use); the overlay does not
    /// render this directly anymore.
    @Published var transcript: String = ""
    /// Most recent translated text written by the translation pipeline.
    @Published var translatedTranscript: String = ""

    /// Lines currently shown in the overlay. New lines are appended; old
    /// lines are removed by their hide timers.
    @Published private(set) var lines: [SubtitleLine] = []

    @Published var isListening: Bool = false
    @Published var isEnabled: Bool = true {
        didSet { UserDefaults.standard.set(isEnabled, forKey: SettingsKey.isEnabled) }
    }
    @Published var statusMessage: String = "Starting…"
    @Published var errorMessage: String?

    /// Detected language of the incoming speech (BCP-47 base code). Nil until known.
    @Published var detectedSourceLanguage: String?

    // MARK: - Persisted settings

    @Published var languageID: String {
        didSet { UserDefaults.standard.set(languageID, forKey: SettingsKey.outputLanguage) }
    }
    @Published var textSize: TextSize {
        didSet { UserDefaults.standard.set(textSize.rawValue, forKey: SettingsKey.textSize) }
    }
    @Published var audioSource: AudioSource {
        didSet { UserDefaults.standard.set(audioSource.rawValue, forKey: SettingsKey.audioSource) }
    }
    @Published var asrEngine: ASREngine {
        didSet { UserDefaults.standard.set(asrEngine.rawValue, forKey: SettingsKey.asrEngine) }
    }

    // MARK: - Overlay geometry (persisted)

    @Published var overlayWidth: CGFloat {
        didSet { UserDefaults.standard.set(Double(overlayWidth), forKey: SettingsKey.overlayWidth) }
    }
    /// Saved overlay origin, or `nil` when no position has been stored.
    @Published var overlayOrigin: CGPoint? {
        didSet {
            let defaults = UserDefaults.standard
            if let origin = overlayOrigin {
                defaults.set(Double(origin.x), forKey: SettingsKey.overlayOriginX)
                defaults.set(Double(origin.y), forKey: SettingsKey.overlayOriginY)
            } else {
                // Clearing the origin must also clear the stored values;
                // otherwise the old position is restored on the next launch.
                defaults.removeObject(forKey: SettingsKey.overlayOriginX)
                defaults.removeObject(forKey: SettingsKey.overlayOriginY)
            }
        }
    }

    // MARK: - Timers

    /// Pending hide timer per line id.
    private var hideWorkByLine: [UUID: DispatchWorkItem] = [:]
    /// Watchdog that force-finalizes a live line which stopped receiving updates.
    private var liveStaleTimer: DispatchWorkItem?

    // MARK: - Init

    /// Restores persisted settings from `UserDefaults`, falling back to
    /// defaults for missing or unparseable values.
    private init() {
        let defaults = UserDefaults.standard
        self.languageID = defaults.string(forKey: SettingsKey.outputLanguage) ?? SupportedLanguages.systemDefault
        self.textSize = TextSize(rawValue: defaults.string(forKey: SettingsKey.textSize) ?? "") ?? .medium
        self.audioSource = AudioSource(rawValue: defaults.string(forKey: SettingsKey.audioSource) ?? "") ?? .systemAudio
        self.asrEngine = ASREngine(rawValue: defaults.string(forKey: SettingsKey.asrEngine) ?? "") ?? .gemmaMLX
        // `bool(forKey:)` returns false for a missing key, so check presence
        // first to keep "enabled" as the first-launch default.
        if defaults.object(forKey: SettingsKey.isEnabled) != nil {
            self.isEnabled = defaults.bool(forKey: SettingsKey.isEnabled)
        } else {
            self.isEnabled = true
        }

        let savedWidth = defaults.double(forKey: SettingsKey.overlayWidth)
        self.overlayWidth = savedWidth > 0 ? CGFloat(savedWidth) : 900

        if defaults.object(forKey: SettingsKey.overlayOriginX) != nil,
           defaults.object(forKey: SettingsKey.overlayOriginY) != nil {
            self.overlayOrigin = CGPoint(
                x: defaults.double(forKey: SettingsKey.overlayOriginX),
                y: defaults.double(forKey: SettingsKey.overlayOriginY)
            )
        } else {
            self.overlayOrigin = nil
        }
    }

    /// The language matching `languageID`, or the first supported language
    /// when the stored id is unknown.
    var currentLanguage: Language {
        SupportedLanguages.named(languageID) ?? SupportedLanguages.all[0]
    }

    // MARK: - Live / finalized line management

    /// Drive the overlay from streaming recognition updates.
    ///
    /// - If the last line is live (non-final) it is updated in place with the
    ///   first chunk of `text`; otherwise a new line is created.
    /// - Text longer than `maxWordsPerLine` words is split into several boxes.
    ///   Every box except the last is finalized immediately; the last one is
    ///   final only when `isFinal` is true.
    /// - After a final update the next call starts a new line.
    ///
    /// - Parameters:
    ///   - text: Recognized (or translated) text; blank text is ignored.
    ///   - isFinal: Whether the recognizer considers this text final.
    func showLive(text: String, isFinal: Bool) {
        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return }
        scheduleLiveStaleCheck()

        let chunks = Self.splitIntoChunks(trimmed, maxWords: Self.maxWordsPerLine)
        let lastChunkIndex = chunks.count - 1

        // Reuse an existing live line for the first chunk.
        var firstNewChunk = 0
        if let liveIndex = lines.indices.last, !lines[liveIndex].isFinal {
            if lines[liveIndex].text != chunks[0] {
                lines[liveIndex].text = chunks[0]
            }
            // The live line is locked when the update is final or when the
            // text overflowed into further boxes.
            if isFinal || lastChunkIndex > 0 {
                lines[liveIndex].isFinal = true
                scheduleExpiry(for: lines[liveIndex])
            }
            firstNewChunk = 1
        }

        // Only the very last chunk may stay live. Finalizing the middle ones
        // gives them an expiry timer; a non-final line that is not last would
        // never be finalized by the stale-line watchdog.
        for (index, chunk) in chunks.enumerated().dropFirst(firstNewChunk) {
            appendNewLine(text: chunk, isFinal: isFinal || index < lastChunkIndex)
        }
    }

    /// Appends a new box, trims the buffer to `maxLines`, and starts the
    /// box's hide timer if it is already final.
    private func appendNewLine(text: String, isFinal: Bool) {
        // Accelerate fade on existing older lines so the new line stands out.
        shortenOlderTimers()

        let line = SubtitleLine(id: UUID(), text: text, isFinal: isFinal)
        lines.append(line)
        while lines.count > Self.maxLines {
            let removed = lines.removeFirst()
            hideWorkByLine.removeValue(forKey: removed.id)?.cancel()
        }
        // Non-final lines get their expiry when they are finalized.
        if isFinal {
            scheduleExpiry(for: line)
        }
    }

    /// Append a finalized utterance as one or more new boxes. Kept for
    /// backwards compatibility; callers should prefer `showLive`.
    ///
    /// A chunk identical to the text of the current last line is skipped.
    func appendLine(_ text: String) {
        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return }

        let chunks = Self.splitIntoChunks(trimmed, maxWords: Self.maxWordsPerLine)
        for chunk in chunks where lines.last?.text != chunk {
            appendNewLine(text: chunk, isFinal: true)
        }
    }

    /// Replace the hide timer of every existing line with a short one so
    /// older boxes fade out quickly once a newer one arrives.
    private func shortenOlderTimers() {
        for existing in lines {
            scheduleRemoval(of: existing.id, after: Self.olderLineHideSeconds)
        }
    }

    /// Split text into successive chunks of at most `maxWords` whitespace-
    /// separated words, preserving word order.
    ///
    /// Text that already fits (or a non-positive `maxWords`) is returned
    /// unchanged as a single chunk.
    private static func splitIntoChunks(_ text: String, maxWords: Int) -> [String] {
        let words = text.split(whereSeparator: \.isWhitespace)
        // `maxWords > 0` guards against a zero stride, which would never advance.
        guard maxWords > 0, words.count > maxWords else { return [text] }
        return stride(from: 0, to: words.count, by: maxWords).map { start in
            words[start..<min(start + maxWords, words.count)].joined(separator: " ")
        }
    }

    /// Remove all lines and cancel pending timers. Used on pause/stop.
    func clearLines() {
        hideWorkByLine.values.forEach { $0.cancel() }
        hideWorkByLine.removeAll()
        liveStaleTimer?.cancel()
        liveStaleTimer = nil
        lines.removeAll()
    }

    /// (Re)start the watchdog that force-finalizes a live line that stops
    /// getting updates, so it can fade out instead of staying forever.
    private func scheduleLiveStaleCheck() {
        liveStaleTimer?.cancel()
        let work = DispatchWorkItem { [weak self] in
            Task { @MainActor in self?.forceFinalizeLive() }
        }
        liveStaleTimer = work
        DispatchQueue.main.asyncAfter(deadline: .now() + Self.liveStaleSeconds, execute: work)
    }

    /// Locks the last line if it is still live and starts its hide timer.
    private func forceFinalizeLive() {
        guard let lastIndex = lines.indices.last, !lines[lastIndex].isFinal else { return }
        lines[lastIndex].isFinal = true
        scheduleExpiry(for: lines[lastIndex])
    }

    /// Starts the hide timer for `line`: the long delay when it has more than
    /// `lineHideWordThreshold` words, the short delay otherwise.
    private func scheduleExpiry(for line: SubtitleLine) {
        let wordCount = line.text.split(whereSeparator: \.isWhitespace).count
        let delay = wordCount > Self.lineHideWordThreshold
            ? Self.lineHideLongSeconds
            : Self.lineHideShortSeconds
        scheduleRemoval(of: line.id, after: delay)
    }

    /// Schedules removal of the line with `id` after `delay` seconds.
    /// Any timer already pending for that line is cancelled first, so a stale
    /// timer cannot remove the line earlier than the newest schedule.
    private func scheduleRemoval(of id: UUID, after delay: TimeInterval) {
        hideWorkByLine.removeValue(forKey: id)?.cancel()
        let work = DispatchWorkItem { [weak self] in
            Task { @MainActor in self?.expire(id: id) }
        }
        hideWorkByLine[id] = work
        DispatchQueue.main.asyncAfter(deadline: .now() + delay, execute: work)
    }

    /// Removes the line with `id` (if still present) and forgets its timer.
    private func expire(id: UUID) {
        hideWorkByLine.removeValue(forKey: id)
        lines.removeAll { $0.id == id }
    }
}
apps/macos/Sources/BanhMi/BanhMiMain.swift ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import AppKit
2
+
3
/// Process entry point: configures `NSApplication` as an accessory app and runs it.
@main
struct BanhMiMain {
    static func main() {
        MainActor.assumeIsolated {
            launch()
        }
    }

    /// Performs the actual start-up on the main actor.
    @MainActor
    private static func launch() {
        // Touch the session logger first so it is created before anything else runs.
        _ = SessionLogger.shared

        let application = NSApplication.shared
        let appDelegate = AppDelegate()
        application.delegate = appDelegate
        application.setActivationPolicy(.accessory)
        application.run()
    }
}
apps/macos/Sources/BanhMi/GemmaMLXRecognizer.swift ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Foundation
2
+ @preconcurrency import AVFoundation
3
+ import OSLog
4
+
5
+ private let recLog = Logger(subsystem: "vn.banhmi.chuyenngu", category: "gemma-mlx")
6
+
7
+ /// Offline speech recognition + translation via the Gemma 4 MLX sidecar.
8
+ /// Conforms to the `AudioRecognizer` protocol so `TranscriptionController`
9
+ /// can drive it the same way as any future engine.
10
+ ///
11
+ /// Gemma's audio tower is encoder-based — it needs the full clip at once,
12
+ /// so this isn't true streaming. We accumulate audio with an energy-based
13
+ /// VAD and run inference once per detected utterance.
14
+ @MainActor
15
+ final class GemmaMLXRecognizer {
16
+ // Callbacks (AudioRecognizer surface)
17
+ var onStableUpdate: ((String, Bool, String?) -> Void)?
18
+ var onFinalUtterance: ((String, String?) -> Void)?
19
+ var onFinal: (() -> Void)?
20
+ var onError: ((Error) -> Void)?
21
+
22
+ private(set) var latestDetectedLanguage: String?
23
+
24
+ // VAD chunking — cut audio on speech boundaries rather than fixed windows
25
+ // so Gemma always sees complete utterances.
26
+ static let sampleRate: Double = 16_000
27
+ static let frameDurationSec: TimeInterval = 0.03 // 30 ms frames
28
+ static let minSpeechSec: TimeInterval = 0.6 // drop sub-600ms utterances
29
+ static let maxSpeechSec: TimeInterval = 8.0 // force flush after 8s
30
+ static let trailingSilenceSec: TimeInterval = 0.35 // silence that ends an utterance
31
+ static let speechActivationSec: TimeInterval = 0.15 // 150ms of voice to start a segment
32
+
33
+ /// Hysteresis thresholds for voice activity detection.
34
+ ///
35
+ /// A frame counts as voice only if its RMS is above `voiceActivateRMS`
36
+ /// (clear speech energy, not ambient room noise). A frame counts as
37
+ /// silence only if its RMS drops below `voiceDeactivateRMS`. Frames in
38
+ /// between leave the VAD state unchanged, which is what lets us span
39
+ /// brief quiet moments inside a word without losing speech context.
40
+ static let voiceActivateRMS: Float = 0.005 // >= this → voice
41
+ static let voiceDeactivateRMS: Float = 0.0015 // < this → silence
42
+ /// Minimum overall chunk RMS — below this we skip the whole utterance
43
+ /// (pure noise / mic tap). Evaluated on the chunk before peak normalize.
44
+ static let silenceRMSThreshold: Float = 0.001
45
+
46
+ private let service: GemmaMLXService
47
+ private var targetLanguageName: String = "English"
48
+
49
+ // Audio conversion to 16 kHz int16 mono
50
+ private let desiredFormat: AVAudioFormat
51
+ private var converter: AVAudioConverter?
52
+ private var converterInputFormat: AVAudioFormat?
53
+
54
+ // VAD state machine
55
+ private var pcmBuffer = Data() // all audio since start of current utterance (or since last flush)
56
+ private var isRunning = false
57
+ private var inFlight = false
58
+ private var inSpeech = false
59
+ private var speechRunSec: TimeInterval = 0 // accumulated voice-frames at start of utterance
60
+ private var silenceRunSec: TimeInterval = 0 // consecutive silence since last voice frame
61
+ private var utteranceSec: TimeInterval = 0 // length of current utterance
62
+
63
+ /// A queued or in-flight utterance with enough metadata to log metrics
64
+ /// and attach the saved WAV for offline review.
65
+ private struct PendingUtterance {
66
+ let pcm: Data
67
+ let rmsIn: Float
68
+ let rmsOut: Float
69
+ let durationSec: Double
70
+ }
71
+
72
+ /// Queue of preprocessed utterances waiting for Gemma. Keeping this
73
+ /// small — if the user speaks faster than the model, we drop the
74
+ /// *oldest* pending entry to keep overlay text fresh instead of lagging.
75
+ private var queue: [PendingUtterance] = []
76
+ private static let maxQueueDepth = 2
77
+
78
/// Creates the recognizer and its sidecar service.
///
/// - Parameters:
///   - sidecarScriptPath: Path of the sidecar script handed to `GemmaMLXService`.
///   - pythonPath: Launcher executable handed to `GemmaMLXService`; defaults to `/usr/bin/env`.
///   - modelID: Model identifier handed to `GemmaMLXService`.
/// - Throws: Whatever `GemmaMLXService.init` throws.
init(sidecarScriptPath: String,
     pythonPath: String = "/usr/bin/env",
     modelID: String = "unsloth/gemma-4-E2B-it-UD-MLX-4bit") throws {
    // Target format for all captured audio: 16 kHz, mono, interleaved int16.
    let int16Mono16k = AVAudioFormat(
        commonFormat: .pcmFormatInt16,
        sampleRate: 16_000,
        channels: 1,
        interleaved: true
    )!
    self.desiredFormat = int16Mono16k

    // With the default `/usr/bin/env` launcher the sidecar is presumably run as
    // `/usr/bin/env python3 <script>`, which picks up whichever python3 is on
    // the user's PATH (venv, system, homebrew, …) — confirm in GemmaMLXService.
    self.service = try GemmaMLXService(
        pythonPath: pythonPath,
        sidecarScript: sidecarScriptPath,
        modelID: modelID
    )
}
95
+
96
/// Sets the language the sidecar should translate into.
///
/// - Parameters:
///   - sourceLocale: Ignored here — no source-language hint is used by this method.
///   - targetLanguageCode: Code of the output language; mapped to a language
///     name via `Self.targetLanguageName(from:)`.
///   - context: Ignored here.
/// - Throws: Nothing in this implementation; `throws` is kept for the shared
///   recognizer surface.
func configure(sourceLocale: Locale,
               targetLanguageCode: String,
               context: String = "") throws {
    let resolvedName = Self.targetLanguageName(from: targetLanguageCode)
    targetLanguageName = resolvedName
}
103
+
104
/// Starts accepting audio. Returns immediately; sidecar readiness is awaited
/// in a background task and a failure is reported through `onError`.
/// Calling this while already running is a no-op.
func startTask() throws {
    if isRunning { return }
    isRunning = true
    pcmBuffer.removeAll(keepingCapacity: true)

    // Don't block the caller while the sidecar gets ready.
    Task { [weak self] in
        guard let sidecar = self?.service else { return }
        do {
            try await sidecar.waitUntilReady()
        } catch {
            self?.onError?(error)
        }
    }
}
118
+
119
/// Bytes left over from the previous `appendConverted` call that did not fill
/// a whole VAD frame. They are prepended to the next incoming buffer so no
/// audio is lost at buffer boundaries.
private var frameRemainder = Data()

/// Stops processing: clears buffered audio, VAD state and the pending queue,
/// then stops the sidecar service.
func stop() {
    isRunning = false
    pcmBuffer.removeAll(keepingCapacity: false)
    frameRemainder.removeAll(keepingCapacity: false)
    inSpeech = false
    speechRunSec = 0
    silenceRunSec = 0
    utteranceSec = 0
    queue.removeAll(keepingCapacity: false)
    service.stop()
}

/// Feed an audio buffer. Converts to 16 kHz int16 mono, runs
/// energy-based VAD, and flushes a chunk to Gemma when we see a
/// complete utterance (voice + trailing silence, or hit max length).
/// Buffers are ignored while the recognizer is not running.
func append(_ buffer: AVAudioPCMBuffer) {
    guard isRunning else { return }
    guard let int16Data = convertToInt16Mono16k(buffer), !int16Data.isEmpty else {
        return
    }
    appendConverted(int16Data)
}

/// Walks the incoming int16 audio in fixed 30 ms frames and feeds each frame
/// to the VAD state machine.
///
/// Incoming buffers are rarely a whole number of frames. The trailing partial
/// frame is kept in `frameRemainder` and completed by the next call; without
/// that carry-over, part of every buffer (and every buffer shorter than one
/// frame) would be dropped.
private func appendConverted(_ int16Data: Data) {
    let frameSamples = Int((Self.sampleRate * Self.frameDurationSec).rounded()) // 480 @ 16 kHz
    let frameBytes = frameSamples * MemoryLayout<Int16>.size

    var pending = frameRemainder
    pending.append(int16Data)

    // Advance an offset instead of removing from the front of `pending`,
    // which would shift the remaining bytes on every frame.
    var offset = pending.startIndex
    while pending.endIndex - offset >= frameBytes {
        let frameEnd = offset + frameBytes
        // Copy into a fresh `Data` so the frame's indices start at zero.
        processVADFrame(Data(pending[offset..<frameEnd]))
        offset = frameEnd
    }
    frameRemainder = Data(pending[offset...])
}

/// Advances the VAD state machine by one frame.
///
/// Hysteresis: a frame is voice when its RMS is at least `voiceActivateRMS`
/// and silence when it is below `voiceDeactivateRMS`. While an utterance is
/// in progress, frames in between leave the silence run unchanged.
private func processVADFrame(_ frame: Data) {
    let frameRMS = Self.chunkRMS(frame)
    let isVoice = frameRMS >= Self.voiceActivateRMS
    let isSilence = frameRMS < Self.voiceDeactivateRMS

    guard inSpeech else {
        // Waiting for enough consecutive voice frames to start a new utterance.
        if isVoice {
            speechRunSec += Self.frameDurationSec
            pcmBuffer.append(frame)
            if speechRunSec >= Self.speechActivationSec {
                inSpeech = true
                utteranceSec = speechRunSec
                silenceRunSec = 0
            }
        } else {
            // Any non-voice frame before activation discards the candidate run.
            speechRunSec = 0
            pcmBuffer.removeAll(keepingCapacity: true)
        }
        return
    }

    // Mid-utterance: keep accumulating.
    pcmBuffer.append(frame)
    utteranceSec += Self.frameDurationSec

    if isVoice {
        silenceRunSec = 0
    } else if isSilence {
        silenceRunSec += Self.frameDurationSec
    }
    // Otherwise (in-between energy): keep the existing silenceRunSec.

    let endedBySilence =
        silenceRunSec >= Self.trailingSilenceSec && utteranceSec >= Self.minSpeechSec
    let hitMaxLength = utteranceSec >= Self.maxSpeechSec
    if endedBySilence || hitMaxLength {
        flushCurrentUtterance()
    }
}
199
+
200
/// Hands the buffered utterance to the inference pipeline and resets the VAD
/// state so a new utterance can start immediately.
///
/// Utterances whose overall RMS is below `silenceRMSThreshold` are dropped.
/// If an inference call is already in flight the utterance is queued; when
/// the queue is full the oldest pending entry is discarded first.
private func flushCurrentUtterance() {
    let chunk = pcmBuffer
    // Reset state so we can start a new utterance immediately.
    pcmBuffer.removeAll(keepingCapacity: true)
    inSpeech = false
    speechRunSec = 0
    silenceRunSec = 0
    utteranceSec = 0

    // Measured on the raw chunk, before preprocessing.
    let rmsIn = Self.chunkRMS(chunk)
    if rmsIn < Self.silenceRMSThreshold {
        recLog.info("VAD flush: utterance too quiet (rms=\(rmsIn, format: .fixed(precision: 4))) — dropping")
        return
    }

    let normalized = Self.preprocessForASR(chunk)
    let rmsOut = Self.chunkRMS(normalized)
    // Two bytes per int16 sample.
    let durSec = Double(chunk.count / 2) / Self.sampleRate
    recLog.info("VAD flush: utterance \(durSec, format: .fixed(precision: 2))s, rms_in=\(rmsIn, format: .fixed(precision: 4)) rms_out=\(rmsOut, format: .fixed(precision: 4))")

    // No trailing comma after the last argument: that is only accepted from
    // Swift 6.1 on, and this package declares swift-tools-version 5.9.
    let pending = PendingUtterance(
        pcm: normalized,
        rmsIn: rmsIn,
        rmsOut: rmsOut,
        durationSec: durSec
    )

    if inFlight {
        while queue.count >= Self.maxQueueDepth {
            queue.removeFirst()
            recLog.info("Queue full — dropping oldest pending utterance")
        }
        queue.append(pending)
        return
    }

    inFlight = true
    Task { [weak self] in
        await self?.runInference(pending)
    }
}
238
+
239
/// Starts inference for the oldest queued utterance, provided the recognizer
/// is running, nothing is in flight, and the queue is not empty.
private func drainQueue() {
    let canStart = isRunning && !inFlight
    guard canStart, let next = queue.first else { return }
    queue.removeFirst()
    inFlight = true
    Task { [weak self] in
        await self?.runInference(next)
    }
}
248
+
249
+ // MARK: - Private
250
+
251
/// Sends one utterance to the sidecar, records the outcome in the session
/// log, and publishes the resulting text through the callbacks.
///
/// The enclosing class is `@MainActor`, so everything here except the awaited
/// sidecar call already runs on the main actor; no explicit `MainActor.run`
/// hops are needed. When the call finishes (success or failure) `inFlight`
/// is cleared and the next queued utterance, if any, is started.
///
/// - Parameter pending: The preprocessed utterance plus its logging metadata.
private func runInference(_ pending: PendingUtterance) async {
    defer {
        inFlight = false
        drainQueue()
    }
    do {
        // Single-pass translate: half the inference time of
        // transcribe_translate because Gemma is only prompted once.
        // The overlay only ever shows translated text anyway.
        let result = try await service.translate(
            pcm16: pending.pcm,
            sampleRate: 16_000,
            targetLanguage: targetLanguageName
        )
        recLog.info("Sidecar returned: source=\"\(result.sourceText)\" translated=\"\(result.translatedText)\" latency=\(result.latencyMs)ms")

        SessionLogger.shared.record(
            pcm16: pending.pcm,
            sampleRate: 16_000,
            rmsIn: pending.rmsIn,
            rmsOut: pending.rmsOut,
            durationSec: pending.durationSec,
            latencyMs: result.latencyMs,
            sourceText: result.sourceText,
            translatedText: result.translatedText,
            targetLang: targetLanguageName,
            engine: "gemma_mlx"
        )

        // The result is logged even if we were stopped meanwhile, but it is
        // only published while still running.
        guard isRunning else { return }
        // Fall back to the source text when no translation came back.
        let out = result.translatedText.isEmpty
            ? result.sourceText
            : result.translatedText
        guard !out.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
            return
        }
        onStableUpdate?(out, true, latestDetectedLanguage)
        onFinalUtterance?(out, latestDetectedLanguage)
    } catch {
        recLog.error("Sidecar call failed: \(String(describing: error))")
        SessionLogger.shared.record(
            pcm16: pending.pcm,
            sampleRate: 16_000,
            rmsIn: pending.rmsIn,
            rmsOut: pending.rmsOut,
            durationSec: pending.durationSec,
            latencyMs: nil,
            sourceText: nil,
            translatedText: nil,
            targetLang: targetLanguageName,
            engine: "gemma_mlx",
            error: String(describing: error)
        )
        onError?(error)
    }
}
313
+
314
/// Minimal ASR preprocessing for Gemma 4: DC-offset removal followed by peak
/// normalization, on int16 PCM.
///
/// Deliberately NOT done here (per the original author's notes):
/// - Pre-emphasis — reported to distort the waveform enough that the model's
///   language detection misfires (e.g. English → "Korean").
/// - High-pass filtering — reported to hurt accuracy with this model.
///
/// - Parameter pcm: Int16 PCM bytes. Inputs shorter than two samples are
///   returned unchanged.
/// - Returns: PCM of the same byte length, centred around zero and, when the
///   peak is below 0.85 of full scale, amplified towards 0.85 with the gain
///   capped at 30x so near-silence is not blown up.
private static func preprocessForASR(_ pcm: Data) -> Data {
    guard pcm.count >= 4 else { return pcm }
    let sampleCount = pcm.count / 2
    let fullScale = Float(Int16.max)
    let toUnit = Float(1.0 / Float(Int16.max))

    // Decode int16 → float in [-1, 1].
    var samples: [Float] = pcm.withUnsafeBytes { raw in
        let ints = raw.bindMemory(to: Int16.self)
        return (0..<sampleCount).map { Float(ints[$0]) * toUnit }
    }

    // Remove the DC offset unless it is negligible.
    let mean = samples.reduce(0, +) / Float(sampleCount)
    if abs(mean) > 1e-5 {
        samples = samples.map { $0 - mean }
    }

    // Peak normalize: only ever amplify, never attenuate.
    let peak = samples.reduce(0) { max($0, abs($1)) }
    if peak > 0 {
        let gain = min(0.85 / peak, 30.0)
        if gain > 1.0 {
            samples = samples.map { $0 * gain }
        }
    }

    // Re-encode float → int16, clamping anything that overshoots full scale.
    var encoded = Data(count: pcm.count)
    encoded.withUnsafeMutableBytes { raw in
        let ints = raw.bindMemory(to: Int16.self)
        for (index, sample) in samples.enumerated() {
            ints[index] = Int16(clamping: Int((sample * fullScale).rounded()))
        }
    }
    return encoded
}
369
+
370
/// Scales int16 PCM so its loudest sample lands at about 85% of full scale,
/// leaving a little headroom for transients.
///
/// The gain is capped at 30x so a silent or noise-only chunk is not blown up,
/// and input that would need a gain of 1.05 or less is returned untouched.
///
/// - Parameter pcm: 16-bit signed PCM samples in native byte order.
/// - Returns: The normalised samples, or `pcm` itself when no scaling applies.
private static func peakNormalize(_ pcm: Data) -> Data {
    if pcm.count < 2 { return pcm }
    let frameCount = pcm.count / 2

    let loudest: Int16 = pcm.withUnsafeBytes { bytes in
        let ints = bytes.bindMemory(to: Int16.self)
        var best: Int16 = 0
        for slot in 0..<frameCount {
            // abs(Int16.min) would overflow, so treat it as Int16.max.
            let magnitude = ints[slot] == Int16.min ? Int16.max : abs(ints[slot])
            if magnitude > best { best = magnitude }
        }
        return best
    }
    if loudest == 0 { return pcm }

    let boost = min((0.85 * Float(Int16.max)) / Float(loudest), 30.0)
    if boost <= 1.05 { return pcm }

    var scaledPCM = Data(count: pcm.count)
    pcm.withUnsafeBytes { inBytes in
        let source = inBytes.bindMemory(to: Int16.self)
        scaledPCM.withUnsafeMutableBytes { outBytes in
            let sink = outBytes.bindMemory(to: Int16.self)
            for slot in 0..<frameCount {
                let amplified = (Float(source[slot]) * boost).rounded()
                sink[slot] = Int16(clamping: Int(amplified))
            }
        }
    }
    return scaledPCM
}
403
+
404
/// Root-mean-square level of an int16 PCM chunk, with each sample scaled by
/// `Int16.max` so the result is approximately in the 0...1 range.
///
/// - Parameter pcm: 16-bit signed PCM samples in native byte order.
/// - Returns: The RMS level, or 0 when the chunk holds no complete sample.
private static func chunkRMS(_ pcm: Data) -> Float {
    if pcm.count < 2 { return 0 }

    let frameCount = pcm.count / 2
    let fullScale = Double(Int16.max)

    // Accumulate in Double to keep precision over long chunks.
    let energy: Double = pcm.withUnsafeBytes { bytes in
        let ints = bytes.bindMemory(to: Int16.self)
        return (0..<frameCount).reduce(0.0) { total, slot in
            let unit = Double(ints[slot]) / fullScale
            return total + unit * unit
        }
    }
    return Float((energy / Double(frameCount)).squareRoot())
}
418
+
419
/// Resolves a BCP-47 tag such as "vi" or "zh-CN" to the English language
/// name used in Gemma prompts.
///
/// Lookup is case-insensitive. The full tag is tried first, then its primary
/// subtag (the part before the first "-"). Unknown tags resolve to "English".
private static func targetLanguageName(from code: String) -> String {
    let names: [String: String] = [
        "en": "English", "vi": "Vietnamese", "es": "Spanish",
        "zh": "Chinese", "zh-cn": "Chinese", "zh-tw": "Chinese",
        "ja": "Japanese", "ko": "Korean", "fr": "French",
        "de": "German", "pt": "Portuguese", "ru": "Russian",
        "ar": "Arabic", "hi": "Hindi", "id": "Indonesian",
        "th": "Thai", "it": "Italian", "tr": "Turkish",
    ]
    let fallback = "English"
    let normalized = code.lowercased()

    if let exact = names[normalized] {
        return exact
    }
    // No subtag at all (empty string or only dashes): the exact lookup above
    // was the only candidate, and it missed.
    guard let primary = normalized.split(separator: "-").first else {
        return fallback
    }
    return names[String(primary)] ?? fallback
}
435
+
436
/// Converts one captured buffer into raw int16 bytes in `desiredFormat`.
///
/// The `AVAudioConverter` is cached and rebuilt only when the incoming
/// buffer's format differs from the one the cached converter was built for.
///
/// NOTE(review): the method name and the 2-bytes-per-frame arithmetic below
/// assume `desiredFormat` is 16 kHz mono int16 — confirm against its
/// declaration, which is outside this block.
///
/// - Returns: The converted samples, or `nil` if the converter or output
///   buffer cannot be created or the conversion reports an error.
private func convertToInt16Mono16k(_ buffer: AVAudioPCMBuffer) -> Data? {
    let sourceFormat = buffer.format

    if converter == nil || converterInputFormat != sourceFormat {
        converter = AVAudioConverter(from: sourceFormat, to: desiredFormat)
        converterInputFormat = sourceFormat
    }
    guard let activeConverter = converter else { return nil }

    // Expected frame count after resampling, plus 1024 frames of slack.
    let estimatedFrames = Double(buffer.frameLength) *
        desiredFormat.sampleRate / sourceFormat.sampleRate
    let capacity = AVAudioFrameCount(estimatedFrames) + 1024
    guard let target = AVAudioPCMBuffer(
        pcmFormat: desiredFormat, frameCapacity: capacity
    ) else { return nil }

    // Hand the converter this one buffer, then report "no data right now"
    // on every later pull.
    var alreadyFed = false
    let feedOnce: AVAudioConverterInputBlock = { _, outStatus in
        guard !alreadyFed else {
            outStatus.pointee = .noDataNow
            return nil
        }
        alreadyFed = true
        outStatus.pointee = .haveData
        return buffer
    }

    var conversionError: NSError?
    let outcome = activeConverter.convert(
        to: target, error: &conversionError, withInputFrom: feedOnce
    )
    if outcome == .error || conversionError != nil { return nil }

    guard let samples = target.int16ChannelData?.pointee else { return nil }
    return Data(bytes: samples, count: Int(target.frameLength) * 2)
}
471
+ }
apps/macos/Sources/BanhMi/GemmaMLXService.swift ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Foundation
2
+ import AVFoundation
3
+
4
/// Bridges the Swift app to the Python MLX sidecar process that runs
/// Gemma 4 E2B UD-MLX-4bit locally on Apple Silicon. The sidecar loads
/// the model once at startup; each request is a JSON line over stdin
/// and a JSON line back over stdout.
///
/// Not a `TranslationService` — the sidecar takes *audio* in and emits
/// transcript + translation, fully offline on-device.
///
/// Threading: all mutable state (`buffer`, `waiters`, `closeReason`) is
/// confined to `stateQueue`. A request is written to stdin and its waiter is
/// registered in the same `stateQueue` work item, so responses are paired
/// with requests in FIFO order even when callers overlap.
final class GemmaMLXService {
    struct Result: Sendable {
        let sourceText: String
        let translatedText: String
        let latencyMs: Int
    }

    enum ServiceError: Error, CustomStringConvertible {
        case pythonNotFound
        case sidecarScriptMissing(String)
        case processNotStarted
        case sidecarFailed(String)
        case invalidResponse(String)

        var description: String {
            switch self {
            case .pythonNotFound: return "python3 not found on PATH"
            case .sidecarScriptMissing(let p): return "sidecar script missing: \(p)"
            case .processNotStarted: return "sidecar process failed to start"
            case .sidecarFailed(let msg): return "sidecar error: \(msg)"
            case .invalidResponse(let s): return "sidecar invalid response: \(s)"
            }
        }
    }

    /// A caller suspended until the next stdout line arrives.
    private struct Waiter {
        let continuation: CheckedContinuation<String, Error>
        /// `true` only for `waitUntilReady()`. Request waiters skip lines that
        /// carry an `"event"` key so a lifecycle event is never mistaken for
        /// a response.
        let acceptsEvents: Bool
    }

    private let process: Process
    private let stdinPipe = Pipe()
    private let stdoutPipe = Pipe()
    private let stderrPipe = Pipe()
    /// Serial queue that owns `buffer`, `waiters` and `closeReason`.
    private let stateQueue = DispatchQueue(label: "gemma-mlx-read")
    /// Bytes read from stdout that have not yet been handed out as lines.
    private var buffer = Data()
    private var waiters: [Waiter] = []
    /// Non-nil once stdout hit EOF or `stop()` ran; later reads fail with it.
    private var closeReason: String?

    /// Creates and starts the sidecar process. The model loads lazily
    /// inside the Python process — callers should await `waitUntilReady()`
    /// before sending real work.
    ///
    /// - Parameters:
    ///   - pythonPath: Path to the interpreter, or to an `env` binary.
    ///   - sidecarScript: Path to the sidecar Python script.
    ///   - modelID: Passed to the sidecar via `GEMMA_MLX_MODEL`.
    /// - Throws: `ServiceError.sidecarScriptMissing` if the script does not
    ///   exist, or whatever `Process.run()` throws.
    init(
        pythonPath: String,
        sidecarScript: String,
        modelID: String = "unsloth/gemma-4-E2B-it-UD-MLX-4bit"
    ) throws {
        guard FileManager.default.fileExists(atPath: sidecarScript) else {
            throw ServiceError.sidecarScriptMissing(sidecarScript)
        }

        process = Process()
        process.executableURL = URL(fileURLWithPath: pythonPath)
        if pythonPath.hasSuffix("/env") {
            // `/usr/bin/env python3 <script>` — python3 must be first arg
            process.arguments = ["python3", sidecarScript]
        } else {
            // Direct python3 binary — script is first arg
            process.arguments = [sidecarScript]
        }
        process.standardInput = stdinPipe
        process.standardOutput = stdoutPipe
        process.standardError = stderrPipe

        var env = ProcessInfo.processInfo.environment
        env["GEMMA_MLX_MODEL"] = modelID
        env["PYTHONUNBUFFERED"] = "1"
        process.environment = env

        try process.run()

        // Read stdout continuously. An empty read means EOF: detach the
        // handler (it would otherwise keep firing) and let the state queue
        // fail anyone still waiting.
        stdoutPipe.fileHandleForReading.readabilityHandler = { [weak self] handle in
            let chunk = handle.availableData
            if chunk.isEmpty { handle.readabilityHandler = nil }
            self?.handleStdoutChunk(chunk)
        }
        // Forward stderr to Console so we can see sidecar errors / progress
        stderrPipe.fileHandleForReading.readabilityHandler = { handle in
            let data = handle.availableData
            guard !data.isEmpty else {
                handle.readabilityHandler = nil
                return
            }
            guard let text = String(data: data, encoding: .utf8) else { return }
            let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
            if !trimmed.isEmpty {
                NSLog("[gemma_sidecar] %@", trimmed)
            }
        }
    }

    deinit {
        // No waiter can be pending here: a suspended caller keeps `self` alive.
        tearDownProcess()
    }

    /// Terminates the sidecar and fails every call still awaiting a response.
    func stop() {
        tearDownProcess()
        stateQueue.async { [self] in
            closeAndFailWaiters(reason: "sidecar stopped")
        }
    }

    /// Awaits the sidecar's first JSON line, which carries `{"event":"ready"}`.
    ///
    /// - Throws: `ServiceError.invalidResponse` if the line is anything else,
    ///   or `ServiceError.sidecarFailed` if the sidecar exits first.
    func waitUntilReady() async throws {
        let line = try await readNextLine(acceptingEvents: true)
        guard let data = line.data(using: .utf8),
              let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
              obj["event"] as? String == "ready" else {
            throw ServiceError.invalidResponse(line)
        }
    }

    /// Sends a transcribe+translate request for an audio file that already
    /// exists on disk. Returns both source and translated text.
    func transcribeAndTranslate(
        audioFileURL: URL,
        targetLanguage: String
    ) async throws -> Result {
        let req: [String: Any] = [
            "task": "transcribe_translate",
            "audio_path": audioFileURL.path,
            "target_lang": targetLanguage,
        ]
        return try await send(req)
    }

    /// Convenience: writes in-memory int16 PCM to a temp WAV, processes it,
    /// and deletes the file afterwards.
    func transcribeAndTranslate(
        pcm16: Data,
        sampleRate: Double,
        targetLanguage: String
    ) async throws -> Result {
        let url = try writeTempWAV(pcm16: pcm16, sampleRate: sampleRate)
        defer { try? FileManager.default.removeItem(at: url) }
        return try await transcribeAndTranslate(
            audioFileURL: url, targetLanguage: targetLanguage
        )
    }

    /// Single-pass translate. Half the inference cost of
    /// transcribe_translate because Gemma is only prompted once. Returns
    /// only translated text — sourceText is empty.
    func translate(
        pcm16: Data,
        sampleRate: Double,
        targetLanguage: String
    ) async throws -> Result {
        let url = try writeTempWAV(pcm16: pcm16, sampleRate: sampleRate)
        defer { try? FileManager.default.removeItem(at: url) }
        let req: [String: Any] = [
            "task": "translate",
            "audio_path": url.path,
            "target_lang": targetLanguage,
        ]
        return try await send(req)
    }

    // MARK: - Private

    /// Detaches the pipe handlers and terminates the process if it is running.
    private func tearDownProcess() {
        stdoutPipe.fileHandleForReading.readabilityHandler = nil
        stderrPipe.fileHandleForReading.readabilityHandler = nil
        if process.isRunning {
            process.terminate()
        }
    }

    /// Writes one JSON request line and decodes the matching response line.
    private func send(_ request: [String: Any]) async throws -> Result {
        var payload = try JSONSerialization.data(withJSONObject: request, options: [])
        payload.append(0x0A) // newline terminates the request
        let responseLine = try await readNextLine(afterWriting: payload)
        guard let respData = responseLine.data(using: .utf8),
              let json = try? JSONSerialization.jsonObject(with: respData) as? [String: Any] else {
            throw ServiceError.invalidResponse(responseLine)
        }
        if let ok = json["ok"] as? Bool, ok == false {
            let msg = json["error"] as? String ?? "unknown"
            throw ServiceError.sidecarFailed(msg)
        }
        let source = (json["source_text"] as? String) ?? ""
        let translated = (json["translated_text"] as? String) ?? ""
        // Go through NSNumber so a fractional latency is truncated rather
        // than silently reported as 0.
        let latency = (json["latency_ms"] as? NSNumber)?.intValue ?? 0
        return Result(
            sourceText: source,
            translatedText: translated,
            latencyMs: latency
        )
    }

    /// Suspends until the next stdout line is available for this caller.
    ///
    /// - Parameters:
    ///   - payload: If non-nil, written to the sidecar's stdin immediately
    ///     before the waiter is registered.
    ///   - acceptingEvents: Whether a line carrying an `"event"` key may be
    ///     delivered to this caller.
    /// - Throws: `ServiceError.sidecarFailed` if the write fails or the
    ///   sidecar's stdout has closed.
    private func readNextLine(
        afterWriting payload: Data? = nil,
        acceptingEvents: Bool = false
    ) async throws -> String {
        try await withCheckedThrowingContinuation { (cont: CheckedContinuation<String, Error>) in
            stateQueue.async { [self] in
                if let payload {
                    guard closeReason == nil, process.isRunning else {
                        let why = closeReason ?? "sidecar process is not running"
                        cont.resume(throwing: ServiceError.sidecarFailed(why))
                        return
                    }
                    do {
                        // Throwing variant: the legacy `write(_:)` raises an
                        // Objective-C exception on a broken pipe.
                        // NOTE(review): a write racing with sidecar exit may
                        // still raise SIGPIPE unless the app ignores it.
                        try stdinPipe.fileHandleForWriting.write(contentsOf: payload)
                    } catch {
                        cont.resume(throwing: ServiceError.sidecarFailed(
                            "failed to write request: \(error.localizedDescription)"
                        ))
                        return
                    }
                }
                waiters.append(Waiter(continuation: cont, acceptsEvents: acceptingEvents))
                deliverBufferedLines()
                // Already closed and nothing buffered for us: fail now
                // instead of hanging forever.
                if let reason = closeReason {
                    closeAndFailWaiters(reason: reason)
                }
            }
        }
    }

    private func handleStdoutChunk(_ chunk: Data) {
        stateQueue.async { [weak self] in
            guard let self else { return }
            if chunk.isEmpty {
                self.closeAndFailWaiters(reason: "sidecar closed its output stream")
                return
            }
            self.buffer.append(chunk)
            self.deliverBufferedLines()
        }
    }

    /// Hands complete buffered lines to waiters in FIFO order. Must run on
    /// `stateQueue`. Lines stay buffered while nobody is waiting, so a
    /// `ready` event emitted before `waitUntilReady()` is called is not lost.
    private func deliverBufferedLines() {
        let newline: UInt8 = 0x0A
        while let head = waiters.first, let idx = buffer.firstIndex(of: newline) {
            let line = String(data: buffer.prefix(upTo: idx), encoding: .utf8) ?? ""
            buffer.removeSubrange(buffer.startIndex...idx)

            // A lifecycle event is not a response to a request; drop it.
            // NOTE(review): assumes response objects never carry an "event"
            // key — confirm against the sidecar protocol.
            if !head.acceptsEvents && Self.isEventLine(line) { continue }

            waiters.removeFirst()
            head.continuation.resume(returning: line)
        }
    }

    /// Marks the stream closed and resumes every pending waiter with an
    /// error. Must run on `stateQueue`.
    private func closeAndFailWaiters(reason: String) {
        if closeReason == nil { closeReason = reason }
        let message = closeReason ?? reason
        let stranded = waiters
        waiters.removeAll()
        for waiter in stranded {
            waiter.continuation.resume(throwing: ServiceError.sidecarFailed(message))
        }
    }

    /// Whether `line` is a JSON object with an `"event"` key.
    private static func isEventLine(_ line: String) -> Bool {
        guard let data = line.data(using: .utf8),
              let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
            return false
        }
        return obj["event"] != nil
    }

    /// Writes 16-bit PCM samples to a minimal WAV file. Mono assumed.
    /// `sampleRate` must be positive and `pcm16` must fit in a 32-bit size
    /// field; both conversions below trap otherwise.
    private func writeTempWAV(pcm16: Data, sampleRate: Double) throws -> URL {
        let tmpDir = FileManager.default.temporaryDirectory
        let url = tmpDir.appendingPathComponent(
            "banhmi-\(UUID().uuidString).wav"
        )

        var header = Data()
        let dataSize = UInt32(pcm16.count)
        let byteRate = UInt32(sampleRate) * 2  // 16-bit mono
        let blockAlign: UInt16 = 2

        header.append("RIFF".data(using: .ascii)!)
        header.append(uint32LE(36 + dataSize))
        header.append("WAVE".data(using: .ascii)!)
        header.append("fmt ".data(using: .ascii)!)
        header.append(uint32LE(16))          // Subchunk1Size
        header.append(uint16LE(1))           // PCM
        header.append(uint16LE(1))           // mono
        header.append(uint32LE(UInt32(sampleRate)))
        header.append(uint32LE(byteRate))
        header.append(uint16LE(blockAlign))
        header.append(uint16LE(16))          // bits per sample
        header.append("data".data(using: .ascii)!)
        header.append(uint32LE(dataSize))

        var out = header
        out.append(pcm16)
        try out.write(to: url)
        return url
    }

    private func uint16LE(_ v: UInt16) -> Data {
        var le = v.littleEndian
        return Data(bytes: &le, count: 2)
    }

    private func uint32LE(_ v: UInt32) -> Data {
        var le = v.littleEndian
        return Data(bytes: &le, count: 4)
    }
}
apps/macos/Sources/BanhMi/MicrophoneCapture.swift ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Foundation
2
+ import AVFoundation
3
+
4
/// Captures audio from the engine's input node (the default input device).
/// Delivers each tapped `AVAudioPCMBuffer` to `onBuffer`; the tap block may
/// be invoked on a thread other than the main thread.
final class MicrophoneCapture {
    /// Failures detected before the engine is started.
    enum CaptureError: Error, CustomStringConvertible {
        /// The input node reported a format with no channels or a zero
        /// sample rate, so a tap cannot be installed.
        case noInputDevice

        var description: String {
            switch self {
            case .noInputDevice: return "no usable audio input device"
            }
        }
    }

    private let engine = AVAudioEngine()
    private(set) var isRunning = false

    var onBuffer: ((AVAudioPCMBuffer, AVAudioTime) -> Void)?

    /// Installs a tap on the input node and starts the engine. A no-op if
    /// capture is already running.
    ///
    /// - Throws: `CaptureError.noInputDevice` when the input format is
    ///   unusable, or whatever `AVAudioEngine.start()` throws. On failure no
    ///   tap is left installed, so `start()` can safely be retried.
    func start() throws {
        guard !isRunning else { return }

        let input = engine.inputNode
        let format = input.outputFormat(forBus: 0)

        // Installing a tap with a 0 Hz / 0-channel format raises an
        // Objective-C exception; surface it as a Swift error instead.
        guard format.sampleRate > 0, format.channelCount > 0 else {
            throw CaptureError.noInputDevice
        }

        input.installTap(onBus: 0, bufferSize: 1024, format: format) { [weak self] buffer, when in
            self?.onBuffer?(buffer, when)
        }

        engine.prepare()
        do {
            try engine.start()
        } catch {
            // Only one tap may be installed per bus. Without this, the next
            // start() attempt would install a second tap on bus 0.
            input.removeTap(onBus: 0)
            throw error
        }
        isRunning = true
    }

    /// Removes the tap and stops the engine. A no-op if not running.
    func stop() {
        guard isRunning else { return }
        engine.inputNode.removeTap(onBus: 0)
        engine.stop()
        isRunning = false
    }

    /// Prompts for microphone access if needed and reports the outcome.
    static func requestPermission() async -> Bool {
        await withCheckedContinuation { continuation in
            AVCaptureDevice.requestAccess(for: .audio) { granted in
                continuation.resume(returning: granted)
            }
        }
    }

    /// Whether microphone access is currently authorized.
    static var hasPermission: Bool {
        AVCaptureDevice.authorizationStatus(for: .audio) == .authorized
    }
}
apps/macos/Sources/BanhMi/SessionLogger.swift ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Foundation
2
+ import OSLog
3
+
4
/// Per-session logger. Each time the app launches we start a new session
/// directory under `~/Library/Logs/BanhMi/session-<timestamp>/` that holds:
///   - `session.jsonl` — one line per utterance with timing + transcripts
///   - `utt-NNNN.wav`  — the exact audio we sent to Gemma (post-preprocess)
///
/// Call `record(...)` from the recognizer once per model round-trip.
/// Open the folder from the Settings popover to review a run.
///
/// Isolation: the counter and formatter are touched only on the main actor.
/// All file I/O runs on a private serial queue through `nonisolated` static
/// helpers that receive everything they need as arguments.
@MainActor
final class SessionLogger {
    static let shared = SessionLogger()

    private let log = Logger(subsystem: "vn.banhmi.chuyenngu", category: "session-log")
    /// Serial queue: keeps JSONL lines and WAV writes in submission order.
    private let queue = DispatchQueue(label: "session-logger", qos: .utility)
    private let fm = FileManager.default
    private let sessionDir: URL
    private let jsonlURL: URL
    private var utteranceIndex: Int = 0
    private let isoFormatter: ISO8601DateFormatter = {
        let f = ISO8601DateFormatter()
        f.formatOptions = [.withInternetDateTime, .withFractionalSeconds]
        return f
    }()

    private init() {
        // ~/Library/Logs/BanhMi/session-2026-04-22T00-05-12Z/
        let libraryLogs = fm.urls(for: .libraryDirectory, in: .userDomainMask)[0]
            .appendingPathComponent("Logs")
            .appendingPathComponent("BanhMi")
        let ts = Self.timestampSlug(Date())
        sessionDir = libraryLogs.appendingPathComponent("session-\(ts)")
        jsonlURL = sessionDir.appendingPathComponent("session.jsonl")

        do {
            try fm.createDirectory(at: sessionDir, withIntermediateDirectories: true)
        } catch {
            // Logging is best-effort; keep running, but say why files are missing.
            log.error("Could not create session dir: \(error.localizedDescription, privacy: .public)")
        }
        // Write a tiny header line so jq can read it too
        let header: [String: Any] = [
            "event": "session_start",
            "ts": isoFormatter.string(from: Date()),
            "session_dir": sessionDir.path,
        ]
        appendJSON(header)
        log.info("Session log: \(self.sessionDir.path, privacy: .public)")
    }

    /// Open this session's folder in Finder. Safe to call from the UI.
    func revealInFinder() {
        NSWorkspace.shared.open(sessionDir)
    }

    var sessionDirectory: URL { sessionDir }

    /// Record one utterance round-trip.
    /// - Parameters:
    ///   - pcm16: the exact int16 PCM we sent to Gemma (post-preprocess)
    ///   - sampleRate: audio sample rate (typically 16000)
    ///   - rmsIn: RMS of the raw utterance before preprocessing
    ///   - rmsOut: RMS after preprocessing
    ///   - durationSec: utterance length
    ///   - latencyMs: sidecar round-trip time
    ///   - sourceText: transcript Gemma returned
    ///   - translatedText: translation Gemma returned (may equal source)
    ///   - targetLang: target language name
    ///   - engine: engine identifier stored with the entry
    ///   - error: non-nil if the call failed
    func record(
        pcm16: Data,
        sampleRate: Double,
        rmsIn: Float,
        rmsOut: Float,
        durationSec: Double,
        latencyMs: Int?,
        sourceText: String?,
        translatedText: String?,
        targetLang: String,
        engine: String,
        error: String? = nil
    ) {
        // Snapshot state that needs MainActor access before jumping queues
        let index = utteranceIndex
        utteranceIndex += 1

        let nowStr = isoFormatter.string(from: Date())
        let wavName = String(format: "utt-%04d.wav", index)
        let wavURL = sessionDir.appendingPathComponent(wavName)
        let logURL = jsonlURL

        queue.async {
            // Write WAV
            SessionLogger.writeWAV(pcm16: pcm16, sampleRate: sampleRate, to: wavURL)

            // Write JSONL entry
            var entry: [String: Any] = [
                "event": "utterance",
                "ts": nowStr,
                "idx": index,
                "engine": engine,
                "audio": wavName,
                "dur_sec": SessionLogger.round2(durationSec),
                "rms_in": SessionLogger.round4(Double(rmsIn)),
                "rms_out": SessionLogger.round4(Double(rmsOut)),
                "target_lang": targetLang,
            ]
            if let latencyMs { entry["gemma_ms"] = latencyMs }
            if let sourceText, !sourceText.isEmpty { entry["source"] = sourceText }
            if let translatedText, !translatedText.isEmpty { entry["translated"] = translatedText }
            if let error { entry["error"] = error }

            SessionLogger.appendLine(entry, to: logURL)
        }
    }

    // MARK: - File writers

    /// Queues one JSON object to be appended to `session.jsonl`.
    private func appendJSON(_ obj: [String: Any]) {
        let logURL = jsonlURL
        queue.async { SessionLogger.appendLine(obj, to: logURL) }
    }

    /// Serializes `obj` as one JSON line and appends it to `url`, creating
    /// the file on first use. Runs on `queue`; failures are logged, not thrown.
    nonisolated private static func appendLine(_ obj: [String: Any], to url: URL) {
        guard let data = try? JSONSerialization.data(
            withJSONObject: obj,
            options: [.withoutEscapingSlashes, .sortedKeys]
        ) else { return }
        var line = data
        line.append(UInt8(ascii: "\n"))
        do {
            if let handle = try? FileHandle(forWritingTo: url) {
                defer { try? handle.close() }
                // Throwing variants: the legacy seekToEndOfFile()/write(_:)
                // raise Objective-C exceptions on I/O errors.
                try handle.seekToEnd()
                try handle.write(contentsOf: line)
            } else {
                // File does not exist yet (or cannot be opened): create it.
                try line.write(to: url, options: [.atomic])
            }
        } catch {
            NSLog("[session-log] append to %@ failed: %@", url.path, String(describing: error))
        }
    }

    /// Writes `pcm16` as a 16-bit mono PCM WAV at `url`. Failures are logged.
    nonisolated private static func writeWAV(pcm16: Data, sampleRate: Double, to url: URL) {
        var out = Data()
        let dataSize = UInt32(pcm16.count)
        let sr = UInt32(sampleRate)
        let byteRate = sr * 2
        func u16(_ v: UInt16) -> Data {
            var le = v.littleEndian
            return Data(bytes: &le, count: 2)
        }
        func u32(_ v: UInt32) -> Data {
            var le = v.littleEndian
            return Data(bytes: &le, count: 4)
        }
        out.append("RIFF".data(using: .ascii)!)
        out.append(u32(36 + dataSize))
        out.append("WAVE".data(using: .ascii)!)
        out.append("fmt ".data(using: .ascii)!)
        out.append(u32(16))
        out.append(u16(1))   // PCM
        out.append(u16(1))   // mono
        out.append(u32(sr))
        out.append(u32(byteRate))
        out.append(u16(2))   // block align
        out.append(u16(16))  // bits per sample
        out.append("data".data(using: .ascii)!)
        out.append(u32(dataSize))
        out.append(pcm16)
        do {
            try out.write(to: url, options: [.atomic])
        } catch {
            NSLog("[session-log] writing %@ failed: %@", url.path, String(describing: error))
        }
    }

    // MARK: - Helpers

    /// UTC timestamp safe for use in a directory name (no colons).
    nonisolated private static func timestampSlug(_ date: Date) -> String {
        let f = DateFormatter()
        f.locale = Locale(identifier: "en_US_POSIX")
        f.timeZone = TimeZone(secondsFromGMT: 0)
        f.dateFormat = "yyyy-MM-dd'T'HH-mm-ss'Z'"
        return f.string(from: date)
    }

    /// Rounds to two decimal places.
    nonisolated private static func round2(_ v: Double) -> Double {
        (v * 100).rounded() / 100
    }

    /// Rounds to four decimal places.
    nonisolated private static func round4(_ v: Double) -> Double {
        (v * 10_000).rounded() / 10_000
    }
}
183
+
184
+ #if canImport(AppKit)
185
+ import AppKit
186
+ #endif
apps/macos/Sources/BanhMi/Settings.swift ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Foundation
2
+ import CoreGraphics
3
+
4
/// Subtitle size presets. The raw value is the short label ("S", "M", "L").
enum TextSize: String, CaseIterable, Identifiable {
    case small = "S"
    case medium = "M"
    case large = "L"

    var id: String { rawValue }

    /// Full English name of the preset.
    var displayName: String { presentation.name }

    /// Font size in points for this preset.
    var pointSize: CGFloat { presentation.points }

    /// Single table behind `displayName` and `pointSize`.
    private var presentation: (name: String, points: CGFloat) {
        switch self {
        case .small: return ("Small", 20)
        case .medium: return ("Medium", 28)
        case .large: return ("Large", 40)
        }
    }
}
27
+
28
/// Speech engines the user can pick from. Only one exists today.
enum ASREngine: String, CaseIterable, Identifiable {
    /// Fully offline Gemma 4 on Apple Silicon via MLX 4-bit.
    case gemmaMLX

    var id: String { rawValue }

    /// Label shown in the engine picker.
    var displayName: String { presentation.title }

    /// SF Symbol name shown next to the label.
    var iconName: String { presentation.symbol }

    /// Single table behind `displayName` and `iconName`.
    private var presentation: (title: String, symbol: String) {
        switch self {
        case .gemmaMLX: return ("Gemma 4 (offline)", "cpu.fill")
        }
    }
}
45
+
46
/// Where the audio to be translated comes from.
enum AudioSource: String, CaseIterable, Identifiable {
    case systemAudio
    case microphone

    var id: String { rawValue }

    /// Label shown in the audio-source picker.
    var displayName: String { presentation.title }

    /// SF Symbol name shown next to the label.
    var iconName: String { presentation.symbol }

    /// Single table behind `displayName` and `iconName`.
    private var presentation: (title: String, symbol: String) {
        switch self {
        case .systemAudio: return ("System audio", "speaker.wave.2.fill")
        case .microphone: return ("Microphone", "mic.fill")
        }
    }
}
66
+
67
/// One selectable output language.
struct Language: Identifiable, Hashable {
    /// App-level tag used for storage (BCP-47 simple form).
    let id: String
    /// English name of the language.
    let name: String
    /// Name of the language written in that language.
    let nativeName: String
    /// BCP-47 locale identifier used as a recognition hint.
    let speechLocale: String
}
73
+
74
/// Catalogue of the output languages offered in Settings.
enum SupportedLanguages {
    /// Every supported language, in picker order.
    static let all: [Language] = {
        // (id, English name, native name, speech locale)
        let rows: [(String, String, String, String)] = [
            ("en", "English", "English", "en-US"),
            ("vi", "Vietnamese", "Tiếng Việt", "vi-VN"),
            ("es", "Spanish", "Español", "es-ES"),
            ("zh-CN", "Chinese (Simplified)", "简体中文", "zh-CN"),
            ("ja", "Japanese", "日本語", "ja-JP"),
            ("ko", "Korean", "한국어", "ko-KR"),
            ("fr", "French", "Français", "fr-FR"),
            ("de", "German", "Deutsch", "de-DE"),
            ("pt", "Portuguese", "Português", "pt-BR"),
            ("ru", "Russian", "Русский", "ru-RU"),
            ("ar", "Arabic", "العربية", "ar-SA"),
            ("hi", "Hindi", "हिन्दी", "hi-IN"),
            ("id", "Indonesian", "Bahasa Indonesia", "id-ID"),
            ("th", "Thai", "ไทย", "th-TH"),
            ("it", "Italian", "Italiano", "it-IT"),
            ("tr", "Turkish", "Türkçe", "tr-TR"),
        ]
        return rows.map { row in
            Language(id: row.0, name: row.1, nativeName: row.2, speechLocale: row.3)
        }
    }()

    /// Looks up a language by its storage id (exact, case-sensitive match).
    static func named(_ id: String) -> Language? {
        all.first(where: { $0.id == id })
    }

    /// Best supported match for the user's first preferred system language:
    /// "code-REGION" if listed, else the bare language code, else "en".
    static var systemDefault: String {
        let locale = Locale(identifier: Locale.preferredLanguages.first ?? "en")
        let code = locale.language.languageCode?.identifier ?? "en"

        // Most specific candidate first.
        var candidates = [code]
        if let region = locale.language.region?.identifier {
            candidates.insert("\(code)-\(region)", at: 0)
        }
        return candidates.first(where: { named($0) != nil }) ?? "en"
    }
}
108
+
109
/// Storage keys for persisted settings. Each constant's value equals its
/// name; renaming a value would orphan whatever is already stored under it.
enum SettingsKey {
    // User preferences
    static let outputLanguage = "outputLanguage"
    static let textSize = "textSize"
    static let audioSource = "audioSource"
    static let asrEngine = "asrEngine"
    static let isEnabled = "isEnabled"

    // Overlay geometry
    static let overlayWidth = "overlayWidth"
    static let overlayOriginX = "overlayOriginX"
    static let overlayOriginY = "overlayOriginY"
}
apps/macos/Sources/BanhMi/SettingsView.swift ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import SwiftUI
2
+
3
/// Settings panel: pause/resume, audio source, engine, output language and
/// subtitle size, plus a footer with Quit, Logs and the current error or
/// version label.
struct SettingsView: View {
    @ObservedObject var state: AppState
    /// Invoked when the pause/resume button is pressed.
    let onToggleEnabled: () -> Void

    var body: some View {
        VStack(alignment: .leading, spacing: 18) {
            header
            Divider()
            transcriptionControl
            audioSourceSection
            engineSection
            languageSection
            textSizeSection
            Spacer()
            footer
        }
        .padding(20)
        .frame(width: 340, height: 520)
    }

    // MARK: - Shared building blocks

    /// A titled group: secondary-styled label on top, controls underneath.
    private func section<Content: View>(
        _ title: LocalizedStringKey,
        systemImage: String,
        @ViewBuilder content: () -> Content
    ) -> some View {
        VStack(alignment: .leading, spacing: 8) {
            Label(title, systemImage: systemImage)
                .font(.subheadline)
                .foregroundStyle(.secondary)
            content()
        }
    }

    /// Small explanatory text that wraps instead of truncating.
    private func hint(_ text: LocalizedStringKey) -> some View {
        Text(text)
            .font(.caption2)
            .foregroundStyle(.secondary)
            .fixedSize(horizontal: false, vertical: true)
    }

    // MARK: - Sections

    private var header: some View {
        HStack(spacing: 10) {
            Image(systemName: "waveform.circle")
                .font(.system(size: 28))
                .foregroundStyle(.secondary)
            VStack(alignment: .leading, spacing: 2) {
                Text("Bánh mì chuyển ngữ")
                    .font(.headline)
                Text(state.statusMessage)
                    .font(.caption)
                    .foregroundStyle(.secondary)
                    .lineLimit(1)
            }
            Spacer()
        }
    }

    /// Full-width pause/resume button: orange while running, green while paused.
    private var transcriptionControl: some View {
        Button(action: { onToggleEnabled() }) {
            HStack(spacing: 8) {
                Image(systemName: state.isEnabled ? "pause.circle.fill" : "play.circle.fill")
                    .font(.system(size: 18))
                Text(state.isEnabled ? "Pause transcription" : "Resume transcription")
                    .fontWeight(.medium)
                Spacer()
            }
            .frame(maxWidth: .infinity)
            .padding(.vertical, 10)
            .padding(.horizontal, 14)
            .background(
                RoundedRectangle(cornerRadius: 10, style: .continuous)
                    .fill(state.isEnabled ? Color.orange.opacity(0.15) : Color.green.opacity(0.18))
            )
            .foregroundStyle(state.isEnabled ? Color.orange : Color.green)
        }
        .buttonStyle(.plain)
    }

    private var audioSourceSection: some View {
        section("Audio source", systemImage: "waveform.badge.plus") {
            Picker("", selection: $state.audioSource) {
                ForEach(AudioSource.allCases) { source in
                    Label(source.displayName, systemImage: source.iconName)
                        .tag(source)
                }
            }
            .labelsHidden()
            .pickerStyle(.segmented)
            if state.audioSource == .systemAudio {
                hint("System audio uses macOS Screen Recording. You'll be asked to approve once — the grant is remembered for this build.")
            }
        }
    }

    private var engineSection: some View {
        section("Speech engine", systemImage: "cpu") {
            Picker("", selection: $state.asrEngine) {
                ForEach(ASREngine.allCases) { engine in
                    Label(engine.displayName, systemImage: engine.iconName)
                        .tag(engine)
                }
            }
            .labelsHidden()
            .pickerStyle(.segmented)
            if state.asrEngine == .gemmaMLX {
                hint("Gemma 4 runs fully offline on Apple Silicon (~4.8 GB). First launch downloads the model.")
            }
        }
    }

    private var languageSection: some View {
        section("Your language", systemImage: "globe") {
            Picker("", selection: $state.languageID) {
                ForEach(SupportedLanguages.all) { lang in
                    Text(lang.nativeName).tag(lang.id)
                }
            }
            .labelsHidden()
            .pickerStyle(.menu)
        }
    }

    private var textSizeSection: some View {
        section("Subtitle size", systemImage: "textformat.size") {
            Picker("", selection: $state.textSize) {
                ForEach(TextSize.allCases) { size in
                    Text(size.rawValue).tag(size)
                }
            }
            .labelsHidden()
            .pickerStyle(.segmented)
            Text(state.textSize.displayName)
                .font(.caption)
                .foregroundStyle(.secondary)
        }
    }

    /// Quit and Logs buttons on the left; the current error, or the version
    /// label when there is none, on the right.
    private var footer: some View {
        HStack {
            Button("Quit") {
                NSApplication.shared.terminate(nil)
            }
            .keyboardShortcut("q", modifiers: .command)

            Button(action: { SessionLogger.shared.revealInFinder() }) {
                Label("Logs", systemImage: "doc.text.magnifyingglass")
            }
            .buttonStyle(.borderless)
            .font(.caption)

            Spacer()

            if let error = state.errorMessage {
                Text(error)
                    .font(.caption2)
                    .foregroundStyle(.red)
                    .lineLimit(2)
                    .frame(maxWidth: 200, alignment: .trailing)
            } else {
                Text("v0.5")
                    .font(.caption2)
                    .foregroundStyle(.secondary)
            }
        }
    }
}
+ }
apps/macos/Sources/BanhMi/SubtitleOverlay.swift ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import AppKit
2
+ import SwiftUI
3
+ import Combine
4
+
5
/// Borderless, transparent panel that floats above other windows and hosts the
/// subtitle bubble. It never becomes the key or main window.
final class SubtitleOverlayPanel: NSPanel {
    override var canBecomeKey: Bool { false }
    override var canBecomeMain: Bool { false }

    init() {
        let initialRect = NSRect(x: 0, y: 0, width: 900, height: 220)
        super.init(
            contentRect: initialRect,
            styleMask: [.borderless, .nonactivatingPanel],
            backing: .buffered,
            defer: false
        )
        configureAppearance()
        configureBehavior()
    }

    /// Fully transparent chrome: only the hosted content is drawn.
    private func configureAppearance() {
        isOpaque = false
        backgroundColor = .clear
        hasShadow = false
    }

    /// Window-server behavior: stacking level, Spaces membership, and event handling.
    private func configureBehavior() {
        level = .screenSaver
        // Events are accepted at the window level; the content view decides
        // per-point whether to handle them.
        ignoresMouseEvents = false
        collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary, .stationary, .ignoresCycle]
        isMovableByWindowBackground = false
        hidesOnDeactivate = false
        isReleasedWhenClosed = false
    }
}
30
+
31
/// Hosting view that reports a hit only when a descendant view is hit.
/// When the hit test resolves to this root view itself, `nil` is returned so
/// the point is treated as not belonging to the overlay.
final class ClickThroughHostingView<Content: View>: NSHostingView<Content> {
    override func hitTest(_ point: NSPoint) -> NSView? {
        // A miss, or a hit on the root host itself, both count as "no target".
        guard let target = super.hitTest(point), target !== self else {
            return nil
        }
        return target
    }
}
41
+
42
/// Owns the subtitle overlay panel: builds its SwiftUI content, positions it,
/// moves it in response to drag deltas, and keeps it inside the visible area
/// of the display it is on.
@MainActor
final class SubtitleOverlayController {
    static let defaultWidth: CGFloat = 900
    static let defaultHeight: CGFloat = 220
    static let minWidth: CGFloat = 360
    static let maxWidth: CGFloat = 1600

    private let panel: SubtitleOverlayPanel
    private let state: AppState
    /// Token for the block-based screen-parameters observer; retained so the
    /// observer can be unregistered when the controller goes away.
    private var screenObserver: NSObjectProtocol?

    /// - Parameter state: Shared app state; read for the saved overlay
    ///   width/origin and written with the new origin after each drag.
    init(state: AppState) {
        self.state = state
        self.panel = SubtitleOverlayPanel()

        let root = SubtitleView(
            state: state,
            onDrag: { [weak self] delta in self?.dragPanel(by: delta) }
        )
        let host = ClickThroughHostingView(rootView: root)
        host.translatesAutoresizingMaskIntoConstraints = true
        panel.contentView = host

        // Re-clamp whenever displays are added, removed, or resized.
        screenObserver = NotificationCenter.default.addObserver(
            forName: NSApplication.didChangeScreenParametersNotification,
            object: nil,
            queue: .main
        ) { [weak self] _ in
            Task { @MainActor in self?.clampToScreen() }
        }
    }

    deinit {
        // Block-based observers stay registered until explicitly removed.
        if let screenObserver {
            NotificationCenter.default.removeObserver(screenObserver)
        }
    }

    /// Positions the panel (saved origin, or bottom-center of the main screen)
    /// and brings it to the front without activating the app.
    func show() {
        applyInitialFrame()
        panel.orderFrontRegardless()
    }

    /// Removes the panel from the screen.
    func hide() {
        panel.orderOut(nil)
    }

    /// Applies the saved origin when there is one; otherwise centers the panel
    /// horizontally 60 pt above the bottom of the main screen's visible frame.
    private func applyInitialFrame() {
        let width = clampWidth(state.overlayWidth)
        let height = Self.defaultHeight

        let x: CGFloat
        let y: CGFloat
        if let saved = state.overlayOrigin {
            x = saved.x
            y = saved.y
        } else {
            guard let visible = NSScreen.main?.visibleFrame else { return }
            x = visible.midX - width / 2
            y = visible.minY + 60
        }
        panel.setFrame(NSRect(x: x, y: y, width: width, height: height), display: true, animate: false)
        clampToScreen()
    }

    /// Moves the panel by an incremental drag delta and persists the result.
    private func dragPanel(by delta: CGSize) {
        var frame = panel.frame
        // SwiftUI drag deltas: +x right, +y down. AppKit window coords: +y up.
        frame.origin.x += delta.width
        frame.origin.y -= delta.height
        panel.setFrame(frame, display: true)
        clampToScreen()
        state.overlayOrigin = CGPoint(x: panel.frame.origin.x, y: panel.frame.origin.y)
    }

    private func clampWidth(_ w: CGFloat) -> CGFloat {
        min(Self.maxWidth, max(Self.minWidth, w))
    }

    /// The display the panel currently occupies. Falls back to any display
    /// intersecting the panel's frame, then to the main screen, when AppKit
    /// reports no screen for the window.
    private func screenForPanel() -> NSScreen? {
        if let current = panel.screen { return current }
        let frame = panel.frame
        return NSScreen.screens.first { $0.frame.intersects(frame) } ?? NSScreen.main
    }

    /// Keeps the panel inside the visible frame of its own display.
    ///
    /// The panel is first shrunk to fit (the configured width may exceed a
    /// small display), so the upper bound of each clamp can never fall below
    /// the lower bound and push the panel off the left/bottom edge.
    private func clampToScreen() {
        guard let visible = screenForPanel()?.visibleFrame else { return }
        var frame = panel.frame
        frame.size.width = min(frame.width, visible.width)
        frame.size.height = min(frame.height, visible.height)
        frame.origin.x = min(max(frame.origin.x, visible.minX), visible.maxX - frame.width)
        frame.origin.y = min(max(frame.origin.y, visible.minY), visible.maxY - frame.height)
        // Skip the window-server round trip when nothing changed.
        guard frame != panel.frame else { return }
        panel.setFrame(frame, display: true)
    }
}
125
+
126
+ // MARK: - SwiftUI content
127
+
128
/// SwiftUI content of the overlay: either an error bubble or a stack of
/// subtitle lines pinned to the bottom of the panel.
struct SubtitleView: View {
    @ObservedObject var state: AppState
    /// Receives the incremental movement of each drag update.
    let onDrag: (CGSize) -> Void

    // Translation reported by the previous drag update; subtracting it from
    // the current translation yields the incremental delta.
    @State private var dragLast: CGSize = .zero

    var body: some View {
        ZStack(alignment: .bottom) {
            Color.clear
            VStack {
                Spacer()
                content
            }
        }
    }

    /// Error bubble when an error is set, otherwise the line stack when there
    /// are lines, otherwise nothing.
    @ViewBuilder
    private var content: some View {
        if let failure = state.errorMessage {
            errorBubble(failure)
        } else if !state.lines.isEmpty {
            lineStack
        }
    }

    /// Subtitle lines in array order, last element at the bottom. New lines
    /// slide in from the bottom edge; removed lines slide out through the top.
    private var lineStack: some View {
        VStack(spacing: 6) {
            ForEach(Array(state.lines.enumerated()), id: \.element.id) { position, entry in
                // Distance from the end of the array: 0 is the newest line.
                let age = state.lines.count - 1 - position
                lineBubble(text: entry.text, age: age)
                    .transition(.asymmetric(
                        insertion: .move(edge: .bottom).combined(with: .opacity),
                        removal: .move(edge: .top).combined(with: .opacity)
                    ))
            }
        }
        .padding(.horizontal, 8)
        .animation(.easeInOut(duration: 0.35), value: state.lines.map(\.id))
        .gesture(dragGesture)
    }

    /// Drag recognizer that converts cumulative translations into per-update
    /// deltas and forwards them to `onDrag`.
    private var dragGesture: some Gesture {
        DragGesture(minimumDistance: 2)
            .onChanged { value in
                let current = value.translation
                let step = CGSize(
                    width: current.width - dragLast.width,
                    height: current.height - dragLast.height
                )
                dragLast = current
                onDrag(step)
            }
            .onEnded { _ in dragLast = .zero }
    }

    /// One subtitle line. Opacity drops by 0.25 per step of age, floored at 0.45.
    private func lineBubble(text: String, age: Int) -> some View {
        let dimmed = max(0.45, 1.0 - Double(age) * 0.25)
        let backdrop = RoundedRectangle(cornerRadius: 14, style: .continuous)
            .fill(Color.black.opacity(0.72))
        return Text(text)
            .font(.system(size: state.textSize.pointSize, weight: .semibold, design: .rounded))
            .foregroundStyle(.white)
            .multilineTextAlignment(.center)
            .lineLimit(3)
            .padding(.horizontal, 24)
            .padding(.vertical, 10)
            .background(backdrop)
            .shadow(color: .black.opacity(0.35), radius: 8, y: 3)
            .opacity(dimmed)
    }

    /// Red bubble used to surface an error message in the overlay.
    private func errorBubble(_ message: String) -> some View {
        let backdrop = RoundedRectangle(cornerRadius: 12, style: .continuous)
            .fill(Color.red.opacity(0.8))
        return Text(message)
            .font(.system(size: 14, weight: .medium, design: .rounded))
            .foregroundStyle(.white)
            .multilineTextAlignment(.center)
            .padding(.horizontal, 20)
            .padding(.vertical, 10)
            .background(backdrop)
    }
}
215
+
apps/macos/Sources/BanhMi/SystemAudioCapture.swift ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Foundation
2
+ import AVFoundation
3
+ import ScreenCaptureKit
4
+ import CoreMedia
5
+ import CoreGraphics
6
+
7
/// Captures system audio (anything playing through macOS output) via ScreenCaptureKit.
/// Requires Screen Recording permission.
///
/// `onBuffer` is invoked on the capture's private audio queue; `onError` is
/// invoked from the stream delegate callback.
final class SystemAudioCapture: NSObject, SCStreamOutput, SCStreamDelegate {
    private var stream: SCStream?
    private let audioQueue = DispatchQueue(label: "vn.banhmi.audioQueue")
    private(set) var isRunning = false

    /// Called with each converted PCM buffer and its capture time.
    var onBuffer: ((AVAudioPCMBuffer, AVAudioTime) -> Void)?
    /// Called when the stream stops because of an error.
    var onError: ((Error) -> Void)?

    /// Starts capturing audio from the first available display.
    ///
    /// Does nothing when a capture is already running.
    /// - Throws: An `NSError` in domain `"SystemAudioCapture"` (code 403 when
    ///   Screen Recording access is not granted, code 1 when no display is
    ///   found), or any error raised by ScreenCaptureKit.
    func start() async throws {
        guard !isRunning else { return }

        // Explicitly request Screen Recording access. This forces macOS to
        // show the prompt on first use (menu-bar apps otherwise sometimes
        // have SCShareableContent fail silently). If this returns false,
        // we surface a clear error instead of failing without feedback.
        if !CGPreflightScreenCaptureAccess(), !CGRequestScreenCaptureAccess() {
            throw NSError(
                domain: "SystemAudioCapture",
                code: 403,
                userInfo: [NSLocalizedDescriptionKey:
                    "Screen Recording permission is required for system audio. Open System Settings → Privacy & Security → Screen & System Audio Recording, enable Bánh mì chuyển ngữ, then relaunch the app."]
            )
        }

        // Pick any display — we capture the full display's audio output.
        let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: true)
        guard let display = content.displays.first else {
            throw NSError(domain: "SystemAudioCapture", code: 1,
                          userInfo: [NSLocalizedDescriptionKey: "No display found."])
        }

        let filter = SCContentFilter(display: display, excludingWindows: [])

        let config = SCStreamConfiguration()
        config.capturesAudio = true
        config.sampleRate = 48_000
        config.channelCount = 2
        config.excludesCurrentProcessAudio = true
        // Minimal video config — we don't use video frames but the stream requires
        // a reasonable configuration.
        config.width = 2
        config.height = 2
        config.minimumFrameInterval = CMTime(value: 1, timescale: 1)
        config.queueDepth = 5

        let stream = SCStream(filter: filter, configuration: config, delegate: self)
        try stream.addStreamOutput(self, type: .audio, sampleHandlerQueue: audioQueue)
        try await stream.startCapture()

        self.stream = stream
        self.isRunning = true
    }

    /// Stops the capture if one is active. Errors from ScreenCaptureKit are
    /// ignored because the stream is being torn down anyway.
    func stop() async {
        guard let stream else { return }
        try? await stream.stopCapture()
        self.stream = nil
        self.isRunning = false
    }

    // MARK: - SCStreamOutput

    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer, of type: SCStreamOutputType) {
        guard type == .audio, sampleBuffer.isValid, sampleBuffer.numSamples > 0 else { return }
        guard let pcmBuffer = Self.pcmBuffer(from: sampleBuffer) else { return }

        onBuffer?(pcmBuffer, Self.captureTime(of: sampleBuffer))
    }

    // MARK: - SCStreamDelegate

    func stream(_ stream: SCStream, didStopWithError error: Error) {
        // Drop the dead stream so a later stop() does not call stopCapture()
        // on it; only clear when it is the stream we are tracking.
        if self.stream === stream {
            self.stream = nil
        }
        isRunning = false
        onError?(error)
    }

    // MARK: - Helpers

    /// Capture time of a sample buffer, taken from its presentation timestamp.
    /// Falls back to the current host time when the timestamp is unusable.
    private static func captureTime(of sampleBuffer: CMSampleBuffer) -> AVAudioTime {
        let presentation = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
        guard presentation.isValid, presentation.isNumeric else {
            return AVAudioTime(hostTime: mach_absolute_time())
        }
        // NOTE(review): assumes ScreenCaptureKit timestamps are on the host
        // clock timeline — confirm against the ScreenCaptureKit documentation.
        return AVAudioTime(hostTime: CMClockConvertHostTimeToSystemUnits(presentation))
    }

    /// Converts a CMSampleBuffer (from ScreenCaptureKit) into an AVAudioPCMBuffer.
    /// Returns `nil` when the buffer has no audio format description, the
    /// PCM buffer cannot be allocated, or the sample copy fails.
    private static func pcmBuffer(from sampleBuffer: CMSampleBuffer) -> AVAudioPCMBuffer? {
        guard let formatDescription = CMSampleBufferGetFormatDescription(sampleBuffer),
              let asbdPointer = CMAudioFormatDescriptionGetStreamBasicDescription(formatDescription),
              let format = AVAudioFormat(streamDescription: asbdPointer) else {
            return nil
        }

        let numSamples = AVAudioFrameCount(CMSampleBufferGetNumSamples(sampleBuffer))
        guard let pcmBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: numSamples) else {
            return nil
        }
        pcmBuffer.frameLength = numSamples

        let status = CMSampleBufferCopyPCMDataIntoAudioBufferList(
            sampleBuffer,
            at: 0,
            frameCount: Int32(numSamples),
            into: pcmBuffer.mutableAudioBufferList
        )
        return status == noErr ? pcmBuffer : nil
    }
}
apps/macos/Sources/BanhMi/TranscriptionController.swift ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Foundation
2
+ import AVFoundation
3
+ import AppKit
4
+ import Combine
5
+ import OSLog
6
+
7
/// Engine-agnostic recognizer surface implemented by GemmaMLXRecognizer.
/// Keeps TranscriptionController free of per-engine branches.
@MainActor
protocol AudioRecognizer: AnyObject {
    /// Streaming update: the text, whether it is final, and the detected
    /// source language when one is reported.
    var onStableUpdate: ((String, Bool, String?) -> Void)? { get set }
    /// Called per finished utterance.
    /// NOTE(review): presumably (text, detected language) — confirm against the recognizer.
    var onFinalUtterance: ((String, String?) -> Void)? { get set }
    /// Called when the recognizer finishes.
    var onFinal: (() -> Void)? { get set }
    /// Called when recognition fails.
    var onError: ((Error) -> Void)? { get set }

    /// Sets the source locale, the target language code, and a context string.
    func configure(sourceLocale: Locale, targetLanguageCode: String, context: String) throws
    /// Starts (or restarts) a recognition task.
    func startTask() throws
    /// Feeds one captured audio buffer to the recognizer.
    func append(_ buffer: AVAudioPCMBuffer)
    /// Stops recognition.
    func stop()
}

extension GemmaMLXRecognizer: AudioRecognizer {}
22
+
23
/// Orchestrates audio capture + streaming recognition (Gemma 4 MLX
/// offline) and publishes stable updates to `AppState`.
///
/// Restarts the pipeline whenever the audio source, language, or engine
/// setting changes while transcription is enabled.
@MainActor
final class TranscriptionController {
    private let state: AppState
    private let mic = MicrophoneCapture()
    private let system = SystemAudioCapture()

    private var recognizer: AudioRecognizer?
    private var cancellables = Set<AnyCancellable>()
    private var recognitionRestartWork: DispatchWorkItem?

    // Snapshot of the settings the running pipeline was started with; compared
    // against `state` to decide whether a settings change needs a restart.
    private var activeSource: AudioSource?
    private var configuredLanguageID: String?
    private var configuredEngine: ASREngine?

    private let log = Logger(subsystem: "vn.banhmi.chuyenngu", category: "stt")

    init(state: AppState) {
        self.state = state

        system.onError = { [weak self] error in
            Task { @MainActor in
                guard let self else { return }
                self.state.errorMessage = "System audio stopped: \(error.localizedDescription)"
                await self.stopAll()
                self.state.isListening = false
                self.state.statusMessage = "Paused"
            }
        }

        // Both capture sources feed the same recognizer.
        // NOTE(review): SystemAudioCapture invokes this on its audio queue, not
        // the main thread — confirm the recognizer's append is safe to call there.
        let forward: (AVAudioPCMBuffer, AVAudioTime) -> Void = { [weak self] buf, _ in
            self?.recognizer?.append(buf)
        }
        mic.onBuffer = forward
        system.onBuffer = forward

        state.$audioSource
            .dropFirst()
            .removeDuplicates()
            .sink { [weak self] _ in
                Task { await self?.handleSettingsChange() }
            }
            .store(in: &cancellables)

        state.$languageID
            .dropFirst()
            .removeDuplicates()
            .sink { [weak self] _ in
                Task { await self?.handleSettingsChange() }
            }
            .store(in: &cancellables)

        state.$asrEngine
            .dropFirst()
            .removeDuplicates()
            .sink { [weak self] _ in
                Task { await self?.handleSettingsChange() }
            }
            .store(in: &cancellables)
    }

    // MARK: - Public API

    /// Starts transcription at launch when it is enabled; otherwise shows "Paused".
    func bootstrap() async {
        if state.isEnabled {
            await start()
        } else {
            state.statusMessage = "Paused"
        }
    }

    /// Enables transcription and (re)starts the pipeline.
    func resume() async {
        state.isEnabled = true
        await start()
    }

    /// Disables transcription, stops everything, and clears displayed text.
    func pause() async {
        state.isEnabled = false
        await stopAll()
        state.isListening = false
        state.transcript = ""
        state.translatedTranscript = ""
        state.detectedSourceLanguage = nil
        state.clearLines()
        state.statusMessage = "Paused"
    }

    /// Stops capture and recognition without touching the enabled flag.
    func shutdown() async {
        recognitionRestartWork?.cancel()
        await stopAll()
    }

    // MARK: - Start / stop

    private func start() async {
        // Tear down a pipeline that is still running (e.g. resume() while
        // already listening); otherwise the previous recognizer would be
        // replaced without ever being stopped.
        if recognizer != nil || activeSource != nil {
            await stopAll()
        }

        state.errorMessage = nil
        state.transcript = ""
        state.translatedTranscript = ""
        state.detectedSourceLanguage = nil
        state.clearLines()
        state.statusMessage = "Starting…"

        let recognizer: AudioRecognizer
        switch state.asrEngine {
        case .gemmaMLX:
            guard let sidecarPath = Self.findSidecarScript() else {
                state.statusMessage = "Gemma sidecar not found"
                state.errorMessage = "Expected gemma_sidecar.py under apps/macos/scripts/. Rebuild with ./build.sh."
                state.isListening = false
                return
            }
            let pythonPath = Self.findPythonWithMLX()
                ?? "/usr/bin/env" // last-ditch fallback
            do {
                recognizer = try GemmaMLXRecognizer(
                    sidecarScriptPath: sidecarPath,
                    pythonPath: pythonPath
                )
                state.statusMessage = "Loading Gemma 4 (first launch ~1 min)…"
            } catch {
                state.isListening = false
                state.statusMessage = "Could not start Gemma"
                state.errorMessage = error.localizedDescription
                return
            }
        }

        wireCallbacks(on: recognizer)
        self.recognizer = recognizer

        do {
            try recognizer.configure(
                sourceLocale: Locale(identifier: state.currentLanguage.speechLocale),
                targetLanguageCode: baseLanguageCode(state.languageID),
                context: ""
            )
            try recognizer.startTask()
        } catch {
            self.recognizer = nil
            state.isListening = false
            state.statusMessage = "Could not start \(state.asrEngine.displayName)"
            state.errorMessage = error.localizedDescription
            return
        }

        switch state.audioSource {
        case .microphone:
            await startMicrophone()
        case .systemAudio:
            await startSystemAudio()
        }
        configuredLanguageID = state.languageID
        configuredEngine = state.asrEngine
    }

    /// Locate a python3 interpreter that has `mlx-vlm` importable. Launched
    /// .app bundles don't inherit shell PATH, so we search known locations.
    ///
    /// Each executable candidate is probed by running `import mlx_vlm`; the
    /// first one that exits with status 0 wins. Runs synchronously.
    private static func findPythonWithMLX() -> String? {
        let candidates: [String] = [
            // 1) User override
            ProcessInfo.processInfo.environment["BANHMI_PYTHON"] ?? "",
            // 2) Unsloth's installer venv (recommended path)
            (NSHomeDirectory() as NSString)
                .appendingPathComponent(".unsloth/unsloth_gemma4_mlx/bin/python3"),
            // 3) Project venv (dev machine)
            // NOTE(review): machine-specific absolute path; only resolves on
            // the original developer's Mac.
            "/Users/duytran10/Documents/Others/Any2AnyModels/venv_local/bin/python3",
            // 4) Homebrew python3
            "/opt/homebrew/bin/python3",
            // 5) System python3 (unlikely to have mlx-vlm, but try)
            "/usr/bin/python3",
        ].filter { !$0.isEmpty }

        let fm = FileManager.default
        for candidate in candidates where fm.isExecutableFile(atPath: candidate) {
            // Quick check: does this python import mlx_vlm?
            let probe = Process()
            probe.executableURL = URL(fileURLWithPath: candidate)
            probe.arguments = ["-c", "import mlx_vlm"]
            // Discard output instead of attaching pipes nobody reads: an
            // undrained pipe blocks the child once its buffer fills up.
            probe.standardOutput = FileHandle.nullDevice
            probe.standardError = FileHandle.nullDevice
            do {
                try probe.run()
                probe.waitUntilExit()
                if probe.terminationStatus == 0 {
                    return candidate
                }
            } catch {
                continue
            }
        }
        return nil
    }

    /// Locate the Gemma sidecar Python script: first inside the app bundle's
    /// Resources, then four directories above the executable, then at a
    /// fixed development path.
    private static func findSidecarScript() -> String? {
        let fm = FileManager.default
        // 1) Bundled under Resources/scripts/gemma_sidecar.py
        if let bundled = Bundle.main.resourceURL?
            .appendingPathComponent("scripts/gemma_sidecar.py").path,
           fm.fileExists(atPath: bundled) {
            return bundled
        }
        // 2) Dev path: apps/macos/scripts/gemma_sidecar.py relative to binary
        if let exe = Bundle.main.executableURL {
            let dev = exe
                .deletingLastPathComponent() // Contents/MacOS
                .deletingLastPathComponent() // Contents
                .deletingLastPathComponent() // <App>.app
                .deletingLastPathComponent() // parent
                .appendingPathComponent("scripts/gemma_sidecar.py").path
            if fm.fileExists(atPath: dev) { return dev }
        }
        // 3) Absolute fallback during development
        // NOTE(review): machine-specific absolute path; only resolves on the
        // original developer's Mac.
        let repoGuess = "/Users/duytran10/Documents/Others/Any2AnyModels/apps/macos/scripts/gemma_sidecar.py"
        if fm.fileExists(atPath: repoGuess) { return repoGuess }
        return nil
    }

    /// Requests microphone access if needed and starts microphone capture.
    /// On any failure the recognizer is stopped and an error is published.
    private func startMicrophone() async {
        let granted = MicrophoneCapture.hasPermission
            ? true
            : await MicrophoneCapture.requestPermission()
        guard granted else {
            recognizer?.stop()
            recognizer = nil
            state.statusMessage = "Microphone not authorized"
            state.errorMessage = "Enable in System Settings → Privacy & Security → Microphone."
            state.isListening = false
            return
        }

        do {
            try mic.start()
            activeSource = .microphone
            state.isListening = true
            state.statusMessage = "Listening to microphone"
        } catch {
            recognizer?.stop()
            recognizer = nil
            state.isListening = false
            state.statusMessage = "Could not start microphone"
            state.errorMessage = error.localizedDescription
        }
    }

    /// Starts system-audio capture. On failure the recognizer is stopped and
    /// permission-looking errors are replaced by a Screen Recording hint.
    private func startSystemAudio() async {
        do {
            try await system.start()
            activeSource = .systemAudio
            state.isListening = true
            state.statusMessage = "Listening to system audio"
        } catch {
            recognizer?.stop()
            recognizer = nil
            state.isListening = false
            state.statusMessage = "Could not capture system audio"
            let message = error.localizedDescription.lowercased()
            if message.contains("tcc") || message.contains("declined") || message.contains("permission") {
                state.errorMessage = "Grant Screen Recording in System Settings → Privacy & Security → Screen Recording, then relaunch."
            } else {
                state.errorMessage = error.localizedDescription
            }
        }
    }

    /// Cancels any pending restart, stops both capture sources and the
    /// recognizer, and clears the "configured with" snapshot.
    private func stopAll() async {
        recognitionRestartWork?.cancel()
        recognitionRestartWork = nil
        mic.stop()
        await system.stop()
        recognizer?.stop()
        recognizer = nil
        activeSource = nil
        configuredLanguageID = nil
        configuredEngine = nil
    }

    // MARK: - Wiring

    private func wireCallbacks(on recognizer: AudioRecognizer) {
        recognizer.onStableUpdate = { [weak self] text, isFinal, detected in
            guard let self else { return }
            // A successful update means any earlier error is stale.
            if self.state.errorMessage != nil {
                self.state.errorMessage = nil
            }
            if let detected { self.state.detectedSourceLanguage = detected }
            self.state.translatedTranscript = text
            self.state.showLive(text: text, isFinal: isFinal)
        }
        recognizer.onFinalUtterance = { _, _ in
            // No-op: onStableUpdate with isFinal=true already drove the UI.
        }
        recognizer.onFinal = {}
        recognizer.onError = { [weak self] error in
            guard let self else { return }
            self.state.errorMessage = error.localizedDescription
            self.scheduleRecognitionRestart(delay: 2.0)
        }
    }

    /// Currently returns the language ID unchanged.
    private func baseLanguageCode(_ id: String) -> String {
        return id
    }

    // MARK: - Settings-triggered updates

    /// Restarts the pipeline when the source, language, or engine differs
    /// from what the running pipeline was started with.
    private func handleSettingsChange() async {
        guard state.isEnabled else { return }
        let sourceChanged = state.audioSource != activeSource
        let languageChanged = state.languageID != configuredLanguageID
        let engineChanged = state.asrEngine != configuredEngine
        guard sourceChanged || languageChanged || engineChanged else { return }

        await stopAll()
        await start()
    }

    /// Schedules a recognizer restart after `delay` seconds, replacing any
    /// restart that is still pending. The restart is skipped when
    /// transcription was disabled or torn down in the meantime.
    private func scheduleRecognitionRestart(delay: TimeInterval) {
        recognitionRestartWork?.cancel()
        let work = DispatchWorkItem { [weak self] in
            Task { @MainActor in
                guard let self,
                      self.state.isEnabled,
                      self.activeSource != nil,
                      let recognizer = self.recognizer else { return }
                do {
                    try recognizer.startTask()
                } catch {
                    self.state.errorMessage = error.localizedDescription
                }
            }
        }
        recognitionRestartWork = work
        DispatchQueue.main.asyncAfter(deadline: .now() + delay, execute: work)
    }
}
+ }
apps/macos/Sources/BanhMi/TranslationService.swift ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Foundation
2
+
3
/// Abstraction over a text translation backend (a remote API or a local
/// Gemma 4 model).
protocol TranslationService: AnyObject {
    /// Translates `text` from the language `from` into the language `to`.
    ///
    /// - Returns: The translated text. Implementations should hand back the
    ///   original text when `from == to`.
    /// - Throws: On transport/auth errors.
    func translate(_ text: String, from: String, to: String) async throws -> String
}
10
+
11
/// No-op translator: hands every input back untouched, so the UI shows the
/// transcript as-is. Used when no translation backend is configured.
final class PassthroughTranslator: TranslationService {
    /// Returns `text` unchanged; `from` and `to` are ignored.
    func translate(_ text: String, from: String, to: String) async throws -> String {
        text
    }
}
18
+
19
/// Free, keyless translator backed by Google's public `translate_a/single`
/// endpoint. Rate-limited and unofficial — use as a zero-config fallback until
/// Gemma 4 / an authenticated API is wired in.
final class FreeGoogleTranslator: TranslationService {
    private let session: URLSession

    init(session: URLSession = .shared) {
        self.session = session
    }

    /// Translates `text`, returning it unchanged when it is empty, when both
    /// languages reduce to the same code, on a non-2xx response, or when the
    /// response cannot be parsed.
    ///
    /// - Throws: Errors raised by the URL session (transport failures).
    func translate(_ text: String, from: String, to: String) async throws -> String {
        guard !text.isEmpty else { return text }
        let src = Self.shortCode(from)
        let tgt = Self.shortCode(to)
        guard src != tgt else { return text }

        guard var components = URLComponents(string: "https://translate.googleapis.com/translate_a/single") else {
            return text
        }
        components.queryItems = [
            URLQueryItem(name: "client", value: "gtx"),
            URLQueryItem(name: "sl", value: src.isEmpty ? "auto" : src),
            URLQueryItem(name: "tl", value: tgt),
            URLQueryItem(name: "dt", value: "t"),
            URLQueryItem(name: "q", value: text)
        ]
        // URLComponents leaves "+" unescaped in the query, and servers decode
        // a literal "+" as a space — escape it so text like "1+1" survives.
        components.percentEncodedQuery = components.percentEncodedQuery?
            .replacingOccurrences(of: "+", with: "%2B")
        guard let url = components.url else { return text }

        var request = URLRequest(url: url)
        request.setValue("Mozilla/5.0", forHTTPHeaderField: "User-Agent")

        let (data, response) = try await session.data(for: request)
        guard let http = response as? HTTPURLResponse, (200..<300).contains(http.statusCode) else {
            return text
        }
        // Response shape: [[["translated","original",...],...],...]
        guard let outer = try? JSONSerialization.jsonObject(with: data) as? [Any],
              let sentences = outer.first as? [Any] else {
            return text
        }
        // The first element of each sentence entry is its translated piece.
        let out = sentences
            .compactMap { ($0 as? [Any])?.first as? String }
            .joined()
        return out.isEmpty ? text : out
    }

    /// Reduce "en-US" → "en" for the translate endpoint (with a few exceptions).
    ///
    /// Accepts both "-" and "_" as separators. Chinese keeps a regional code:
    /// `zh-CN`/`zh-Hans…` → "zh-CN", `zh-TW`/`zh-Hant…` → "zh-TW".
    private static func shortCode(_ id: String) -> String {
        let lower = id.lowercased().replacingOccurrences(of: "_", with: "-")
        // Google expects these specific regional codes.
        if lower.hasPrefix("zh-cn") || lower.hasPrefix("zh-hans") { return "zh-CN" }
        if lower.hasPrefix("zh-tw") || lower.hasPrefix("zh-hant") { return "zh-TW" }
        if let dash = lower.firstIndex(of: "-") {
            return String(lower[..<dash])
        }
        return lower
    }
}
79
+
80
/// Google Cloud Translation v2 — simple REST call. Returns the original
/// text on a non-2xx response or a parsing failure so the overlay keeps
/// updating.
final class GoogleTranslator: TranslationService {
    private let apiKey: String
    private let session: URLSession

    init(apiKey: String, session: URLSession = .shared) {
        self.apiKey = apiKey
        self.session = session
    }

    /// Translates `text`, returning it unchanged when no API key is set, when
    /// `from == to`, on a non-2xx response, or when the response cannot be
    /// parsed.
    ///
    /// - Throws: Errors raised by the URL session (transport failures).
    func translate(_ text: String, from: String, to: String) async throws -> String {
        guard !apiKey.isEmpty else { return text }
        guard from != to else { return text }

        guard var components = URLComponents(string: "https://translation.googleapis.com/language/translate/v2") else {
            return text
        }
        components.queryItems = [
            URLQueryItem(name: "key", value: apiKey),
            URLQueryItem(name: "q", value: text),
            URLQueryItem(name: "source", value: from),
            URLQueryItem(name: "target", value: to),
            URLQueryItem(name: "format", value: "text")
        ]
        // URLComponents leaves "+" unescaped in the query, and servers decode
        // a literal "+" as a space — escape it so text like "1+1" survives.
        components.percentEncodedQuery = components.percentEncodedQuery?
            .replacingOccurrences(of: "+", with: "%2B")
        guard let url = components.url else { return text }

        var request = URLRequest(url: url)
        request.httpMethod = "POST"

        let (data, response) = try await session.data(for: request)
        guard let http = response as? HTTPURLResponse, (200..<300).contains(http.statusCode) else {
            return text
        }

        // Expected shape: {"data": {"translations": [{"translatedText": "..."}]}}
        guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
              let dataField = json["data"] as? [String: Any],
              let translations = dataField["translations"] as? [[String: Any]],
              let translated = translations.first?["translatedText"] as? String else {
            return text
        }
        return translated
    }
}
apps/macos/build.sh ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Builds the Swift executable, assembles and signs the .app bundle in a staging
# directory, then installs it into $INSTALL_DIR (default: ~/Applications).
#
# Environment overrides:
#   CONFIG       SwiftPM configuration: release (default) or debug
#   INSTALL_DIR  Destination directory for the installed bundle
set -euo pipefail

cd "$(dirname "$0")"

APP_NAME="Bánh mì chuyển ngữ"
EXEC_NAME="BanhMi"
CONFIG="${CONFIG:-release}"
BUILD_DIR=".build/$CONFIG"
STAGE_BUNDLE="$BUILD_DIR/$APP_NAME.app"
SIDECAR_SRC="scripts/gemma_sidecar.py"

# The app is installed outside Dropbox so TCC permissions (Screen Recording,
# Microphone) survive across rebuilds. Dropbox rewrites xattrs/mtimes during
# sync and silently invalidates ad-hoc signatures — that's what was causing
# macOS to re-prompt for permissions.
INSTALL_DIR="${INSTALL_DIR:-$HOME/Applications}"
INSTALLED_BUNDLE="$INSTALL_DIR/$APP_NAME.app"

# Scratch directory for generated files; removed on every exit path.
WORK_DIR="$(mktemp -d -t banhmi-build)"
trap 'rm -rf "$WORK_DIR"' EXIT

echo "▸ Building $CONFIG binary..."
swift build -c "$CONFIG"

echo "▸ Assembling app bundle in staging..."
rm -rf "$STAGE_BUNDLE"
mkdir -p "$STAGE_BUNDLE/Contents/MacOS"
mkdir -p "$STAGE_BUNDLE/Contents/Resources/scripts"

cp "$BUILD_DIR/$EXEC_NAME" "$STAGE_BUNDLE/Contents/MacOS/$EXEC_NAME"
cp Resources/Info.plist "$STAGE_BUNDLE/Contents/Info.plist"

# The app looks for the sidecar at Resources/scripts/gemma_sidecar.py inside
# the bundle, so it must be copied in before signing.
if [[ -f "$SIDECAR_SRC" ]]; then
  cp "$SIDECAR_SRC" "$STAGE_BUNDLE/Contents/Resources/scripts/gemma_sidecar.py"
else
  echo "  ⚠️  $SIDECAR_SRC not found — the installed app will not find the Gemma sidecar."
fi

echo "▸ Signing staging bundle…"
ENTITLEMENTS_FILE="$WORK_DIR/entitlements.plist"
cat > "$ENTITLEMENTS_FILE" <<'PLIST'
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>com.apple.security.device.audio-input</key>
    <true/>
</dict>
</plist>
PLIST

# Prefer an "Apple Development" identity from Xcode — it gives the bundle a
# stable code-signing identity, so macOS TCC (Screen Recording, Microphone)
# grants persist across rebuilds. Falls back to ad-hoc if not found.
# Only identities valid for code signing are considered.
SIGN_IDENTITY="$(security find-identity -v -p codesigning 2>/dev/null \
  | awk '/Apple Development/ {print $2; exit}')"
if [[ -z "$SIGN_IDENTITY" ]]; then
  # Fallback: first valid code-signing identity of any kind (Xcode-managed
  # dev certs sometimes only show as UUID labels instead of the friendly name).
  SIGN_IDENTITY="$(security find-identity -v -p codesigning 2>/dev/null \
    | awk '/^ *[0-9]+\)/ {print $2; exit}')"
fi
if [[ -z "$SIGN_IDENTITY" ]]; then
  echo "  ⚠️  No code-signing identity found — falling back to ad-hoc."
  echo "     TCC permissions will reset on every rebuild."
  SIGN_IDENTITY="-"
else
  echo "  ↳ Using signing identity: $SIGN_IDENTITY"
fi

codesign --force --deep --sign "$SIGN_IDENTITY" \
  --options runtime \
  --entitlements "$ENTITLEMENTS_FILE" \
  "$STAGE_BUNDLE"

echo "▸ Installing to $INSTALLED_BUNDLE (outside Dropbox)…"
mkdir -p "$INSTALL_DIR"

# If the bundle already exists and the signatures match, skip copying so TCC
# keeps its grant for the exact same on-disk identity. If they differ, the
# installed bundle is replaced (user will need to re-grant only when the
# executable actually changed).
STAGE_HASH="$(codesign -dvvv "$STAGE_BUNDLE" 2>&1 | awk -F'=' '/^CDHash=/{print $2}')"
INSTALLED_HASH=""
if [[ -d "$INSTALLED_BUNDLE" ]]; then
  INSTALLED_HASH="$(codesign -dvvv "$INSTALLED_BUNDLE" 2>&1 | awk -F'=' '/^CDHash=/{print $2}' || true)"
fi

if [[ -n "$INSTALLED_HASH" && "$STAGE_HASH" == "$INSTALLED_HASH" ]]; then
  echo "  ↳ Signature unchanged — keeping existing install."
else
  rm -rf "$INSTALLED_BUNDLE"
  cp -R "$STAGE_BUNDLE" "$INSTALLED_BUNDLE"
  echo "  ↳ Installed new build."
  if [[ "$SIGN_IDENTITY" == "-" ]]; then
    echo "  ⚠️  Ad-hoc signed — macOS may re-prompt for Screen Recording / Microphone."
  fi
fi

echo ""
echo "✓ Installed: $INSTALLED_BUNDLE"
echo ""
echo "Run with:"
echo "    open \"$INSTALLED_BUNDLE\""
apps/macos/scripts/gemma_sidecar.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Gemma 4 MLX sidecar for the macOS Bánh mì chuyển ngữ app.
4
+
5
+ Reads NDJSON requests from stdin, one per line. Each request names an
6
+ audio clip by file path ("audio_path") and a task. Writes one NDJSON
7
+ response per request to stdout.
8
+
9
+ Request format (JSON, one per line):
10
+ {"task": "transcribe_translate",
11
+ "audio_path": "/tmp/clip.wav",
12
+ "target_lang": "Vietnamese"}
13
+
14
+ Response format (JSON, one per line):
15
+ {"ok": true,
16
+ "source_text": "...",
17
+ "translated_text": "...",
18
+ "latency_ms": 1234}
19
+
20
+ On error:
21
+ {"ok": false, "error": "..."}
22
+
23
+ The model is loaded once at startup so subsequent requests are fast.
24
+
25
+ Usage from Swift:
26
+ let proc = Process()
27
+ proc.executableURL = URL(fileURLWithPath: "/usr/bin/env")
28
+ proc.arguments = ["python3", "/path/to/gemma_sidecar.py"]
29
+ // Write one JSON object + newline per request, read back one JSON per line.
30
+ """
31
+
32
+ import json
33
+ import os
34
+ import sys
35
+ import tempfile
36
+ import time
37
+
38
+
39
+ MODEL_ID_DEFAULT = "unsloth/gemma-4-E2B-it-UD-MLX-4bit"
40
+
41
+
42
def eprint(*args, **kwargs):
    """Print ``args`` to stderr and flush immediately.

    Takes the same keyword arguments as ``print`` except ``file`` and
    ``flush``, which are fixed here; passing either raises ``TypeError``.
    """
    print(*args, **kwargs, file=sys.stderr, flush=True)
44
+
45
+
46
def load_model(model_id: str):
    """Load ``model_id`` through ``mlx_vlm`` and return a prompt-running bundle.

    ``mlx_vlm`` is imported inside the function, so the import only happens
    when this function is called.

    Returns:
        A 5-tuple ``(model, processor, config, generate, apply_chat_template)``.
        ``config`` is ``model.config`` when the model has that attribute,
        otherwise ``None``; the last two items are the ``mlx_vlm`` callables.
    """
    from mlx_vlm import apply_chat_template, generate, load

    model, processor = load(model_id)
    return model, processor, getattr(model, "config", None), generate, apply_chat_template
51
+
52
+
53
def run_prompt(bundle, audio_path: str, prompt_text: str, max_tokens: int = 256) -> str:
    """Run one audio prompt through the model and return the stripped reply.

    Args:
        bundle: The 5-tuple ``(model, processor, config, generate,
            apply_chat_template)``.
        audio_path: Path of the audio clip, forwarded as ``audio=``.
        prompt_text: Instruction text to wrap in the chat template.
        max_tokens: Generation limit forwarded to ``generate``.

    Returns:
        The generated text with surrounding whitespace removed.
    """
    model, processor, config, generate, apply_chat_template = bundle

    chat_prompt = apply_chat_template(processor, config, prompt_text, num_audios=1)
    generation = generate(
        model,
        processor,
        chat_prompt,
        audio=audio_path,
        max_tokens=max_tokens,
        temperature=0.0,
        verbose=False,
    )

    # The result may be an object carrying `.text` or something else;
    # anything without `.text` is stringified.
    if hasattr(generation, "text"):
        reply = generation.text
    else:
        reply = str(generation)
    return reply.strip()
64
+
65
+
66
def _translation_prompt(target_lang: str) -> str:
    """Build the instruction asking the model to translate the clip into ``target_lang``."""
    return (
        f"Translate the speech in this audio into {target_lang}. "
        f"The speaker may be using any language — detect it and translate. "
        f"If the speech is already in {target_lang}, output the speech as-is. "
        f"Reply with only the {target_lang} text, no explanations or quotes."
    )


def handle(req: dict, bundle) -> dict:
    """Execute one decoded request and return the response dict.

    Recognised request keys:
        task: ``"transcribe"`` (default), ``"translate"`` or
            ``"transcribe_translate"``.
        audio_path: Path of an existing audio file (required).
        target_lang: Output language; required by both translate tasks.
        max_tokens: Generation limit, default 256.

    Returns:
        On success ``{"ok": True, "source_text", "translated_text",
        "latency_ms"}`` (a text field is ``None`` when the task does not
        produce it); on a validation failure ``{"ok": False, "error": ...}``.

    Raises:
        ValueError / TypeError: if ``max_tokens`` cannot be converted to int.
        Anything raised by ``run_prompt`` propagates to the caller.
    """
    # JSON allows top-level lists/strings/numbers; reject them explicitly
    # instead of failing later with an opaque AttributeError on `.get`.
    if not isinstance(req, dict):
        return {
            "ok": False,
            "error": f"request must be a JSON object, got {type(req).__name__}",
        }

    task = req.get("task", "transcribe")
    audio_path = req.get("audio_path")
    target_lang = req.get("target_lang")
    max_tokens = int(req.get("max_tokens", 256))

    # `os.path.exists` also accepts integers (as open file descriptors) and
    # directories, so require a real path to a regular file.
    if (
        not audio_path
        or not isinstance(audio_path, (str, os.PathLike))
        or not os.path.isfile(audio_path)
    ):
        return {"ok": False, "error": f"audio_path missing or not found: {audio_path!r}"}

    # Debugging aid: keep a copy of the most recent chunk so it can be
    # inspected by hand. Best-effort; a failed copy never fails the request.
    try:
        import shutil
        shutil.copy(audio_path, "/tmp/banhmi_last_chunk.wav")
        eprint(f"[sidecar] chunk {os.path.getsize(audio_path)} bytes -> /tmp/banhmi_last_chunk.wav")
    except Exception as exc:
        eprint(f"[sidecar] debug copy failed: {exc}")

    t0 = time.time()

    def elapsed_ms() -> int:
        # Wall-clock time since model work for this request started.
        return int((time.time() - t0) * 1000)

    if task == "transcribe":
        source = run_prompt(bundle, audio_path, "Transcribe this audio", max_tokens)
        return {
            "ok": True,
            "source_text": source,
            "translated_text": None,
            "latency_ms": elapsed_ms(),
        }

    if task == "translate":
        if not target_lang:
            return {"ok": False, "error": "target_lang required for translate task"}
        translated = run_prompt(bundle, audio_path, _translation_prompt(target_lang), max_tokens)
        return {
            "ok": True,
            "source_text": None,
            "translated_text": translated,
            "latency_ms": elapsed_ms(),
        }

    if task == "transcribe_translate":
        if not target_lang:
            return {"ok": False, "error": "target_lang required"}
        # Two sequential model calls: the translation first, then the transcript.
        translated = run_prompt(bundle, audio_path, _translation_prompt(target_lang), max_tokens)
        source = run_prompt(bundle, audio_path, "Transcribe this audio", max_tokens)
        return {
            "ok": True,
            "source_text": source,
            "translated_text": translated,
            "latency_ms": elapsed_ms(),
        }

    return {"ok": False, "error": f"unknown task: {task}"}
131
+
132
+
133
def main():
    """Load the model, announce readiness, then serve NDJSON requests from stdin.

    The model id comes from the ``GEMMA_MLX_MODEL`` environment variable,
    falling back to ``MODEL_ID_DEFAULT``. Diagnostics go to stderr via
    ``eprint``; stdout carries only JSON lines: one ``{"event": "ready"}``
    message after loading, then exactly one response per non-empty input line.
    The loop ends when stdin reaches EOF.
    """
    model_id = os.environ.get("GEMMA_MLX_MODEL", MODEL_ID_DEFAULT)
    eprint(f"[sidecar] loading {model_id}")
    bundle = load_model(model_id)
    eprint("[sidecar] ready — awaiting requests on stdin")

    # Emit a "ready" message so the host knows the model is loaded.
    print(json.dumps({"event": "ready", "model": model_id}), flush=True)

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            req = json.loads(line)
        except json.JSONDecodeError as exc:
            print(json.dumps({"ok": False, "error": f"invalid JSON: {exc}"}), flush=True)
            continue
        try:
            resp = handle(req, bundle)
        except Exception as exc:
            # Any failure becomes an error response so one bad request
            # cannot terminate the serving loop.
            resp = {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
        # ensure_ascii=False emits non-ASCII text as-is rather than as \u escapes.
        print(json.dumps(resp, ensure_ascii=False), flush=True)
158
+
159
+
160
+ if __name__ == "__main__":
161
+ main()