Text Generation
PEFT
Safetensors
GGUF
gemma4
unsloth
lora
qlora
fine-tuning
hackathon
gemma-4-good-hackathon
kaggle
translation
speech-recognition
accessibility
on-device
conversational
Instructions to use bradduy/banhmi-gemma4-e4b with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use bradduy/banhmi-gemma4-e4b with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("unsloth/gemma-4-E4B-it-unsloth-bnb-4bit")
model = PeftModel.from_pretrained(base_model, "bradduy/banhmi-gemma4-e4b")
- llama-cpp-python
How to use bradduy/banhmi-gemma4-e4b with llama-cpp-python:
# !pip install llama-cpp-python
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="bradduy/banhmi-gemma4-e4b",
    filename="banhmi-gemma4.Q3_K_S.gguf",
)
llm.create_chat_completion(
    messages = [
        { "role": "user", "content": "What is the capital of France?" }
    ]
)
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- llama.cpp
How to use bradduy/banhmi-gemma4-e4b with llama.cpp:
Install from brew
brew install llama.cpp # Start a local OpenAI-compatible server with a web UI: llama-server -hf bradduy/banhmi-gemma4-e4b:Q3_K_S # Run inference directly in the terminal: llama-cli -hf bradduy/banhmi-gemma4-e4b:Q3_K_S
Install from WinGet (Windows)
winget install llama.cpp # Start a local OpenAI-compatible server with a web UI: llama-server -hf bradduy/banhmi-gemma4-e4b:Q3_K_S # Run inference directly in the terminal: llama-cli -hf bradduy/banhmi-gemma4-e4b:Q3_K_S
Use pre-built binary
# Download pre-built binary from: # https://github.com/ggerganov/llama.cpp/releases # Start a local OpenAI-compatible server with a web UI: ./llama-server -hf bradduy/banhmi-gemma4-e4b:Q3_K_S # Run inference directly in the terminal: ./llama-cli -hf bradduy/banhmi-gemma4-e4b:Q3_K_S
Build from source code
git clone https://github.com/ggerganov/llama.cpp.git cd llama.cpp cmake -B build cmake --build build -j --target llama-server llama-cli # Start a local OpenAI-compatible server with a web UI: ./build/bin/llama-server -hf bradduy/banhmi-gemma4-e4b:Q3_K_S # Run inference directly in the terminal: ./build/bin/llama-cli -hf bradduy/banhmi-gemma4-e4b:Q3_K_S
Use Docker
docker model run hf.co/bradduy/banhmi-gemma4-e4b:Q3_K_S
- LM Studio
- Jan
- vLLM
How to use bradduy/banhmi-gemma4-e4b with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "bradduy/banhmi-gemma4-e4b"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "bradduy/banhmi-gemma4-e4b", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'
Use Docker
docker model run hf.co/bradduy/banhmi-gemma4-e4b:Q3_K_S
- Ollama
How to use bradduy/banhmi-gemma4-e4b with Ollama:
ollama run hf.co/bradduy/banhmi-gemma4-e4b:Q3_K_S
- Unsloth Studio new
How to use bradduy/banhmi-gemma4-e4b with Unsloth Studio:
Install Unsloth Studio (macOS, Linux, WSL)
curl -fsSL https://unsloth.ai/install.sh | sh # Run unsloth studio unsloth studio -H 0.0.0.0 -p 8888 # Then open http://localhost:8888 in your browser # Search for bradduy/banhmi-gemma4-e4b to start chatting
Install Unsloth Studio (Windows)
irm https://unsloth.ai/install.ps1 | iex # Run unsloth studio unsloth studio -H 0.0.0.0 -p 8888 # Then open http://localhost:8888 in your browser # Search for bradduy/banhmi-gemma4-e4b to start chatting
Using HuggingFace Spaces for Unsloth
# No setup required # Open https://huggingface.co/spaces/unsloth/studio in your browser # Search for bradduy/banhmi-gemma4-e4b to start chatting
- Docker Model Runner
How to use bradduy/banhmi-gemma4-e4b with Docker Model Runner:
docker model run hf.co/bradduy/banhmi-gemma4-e4b:Q3_K_S
- Lemonade
How to use bradduy/banhmi-gemma4-e4b with Lemonade:
Pull the model
# Download Lemonade from https://lemonade-server.ai/ lemonade pull bradduy/banhmi-gemma4-e4b:Q3_K_S
Run and chat with the model
lemonade run user.banhmi-gemma4-e4b-Q3_K_S
List all available models
lemonade list
Add macOS Swift app source (apps/macos/ — SwiftUI menu-bar overlay + MLX sidecar)
Browse files
- apps/macos/Package.swift +13 -0
- apps/macos/README.md +59 -0
- apps/macos/Resources/Info.plist +30 -0
- apps/macos/Sources/BanhMi/AppDelegate.swift +62 -0
- apps/macos/Sources/BanhMi/AppState.swift +283 -0
- apps/macos/Sources/BanhMi/BanhMiMain.swift +17 -0
- apps/macos/Sources/BanhMi/GemmaMLXRecognizer.swift +471 -0
- apps/macos/Sources/BanhMi/GemmaMLXService.swift +271 -0
- apps/macos/Sources/BanhMi/MicrophoneCapture.swift +45 -0
- apps/macos/Sources/BanhMi/SessionLogger.swift +186 -0
- apps/macos/Sources/BanhMi/Settings.swift +118 -0
- apps/macos/Sources/BanhMi/SettingsView.swift +170 -0
- apps/macos/Sources/BanhMi/SubtitleOverlay.swift +215 -0
- apps/macos/Sources/BanhMi/SystemAudioCapture.swift +120 -0
- apps/macos/Sources/BanhMi/TranscriptionController.swift +362 -0
- apps/macos/Sources/BanhMi/TranslationService.swift +122 -0
- apps/macos/build.sh +96 -0
- apps/macos/scripts/gemma_sidecar.py +161 -0
apps/macos/Package.swift
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// swift-tools-version:5.9
|
| 2 |
+
import PackageDescription
|
| 3 |
+
|
| 4 |
+
// The single executable target of the package; its sources live under
// Sources/BanhMi.
let banhMiTarget: Target = .executableTarget(
    name: "BanhMi",
    path: "Sources/BanhMi"
)

// Package manifest: one macOS 13+ executable named "BanhMi".
let package = Package(
    name: "BanhMi",
    platforms: [.macOS(.v13)],
    targets: [banhMiTarget]
)
|
apps/macos/README.md
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Bánh mì chuyển ngữ — macOS App
|
| 2 |
+
|
| 3 |
+
Menu bar app that captures system audio and shows live translated subtitles. Zero config — install, pick your language, forget about it.
|
| 4 |
+
|
| 5 |
+
## Requirements
|
| 6 |
+
|
| 7 |
+
- macOS 13 Ventura or later
|
| 8 |
+
- Swift 5.9+ (ships with Xcode 15 / Command Line Tools)
|
| 9 |
+
|
| 10 |
+
## Build & run
|
| 11 |
+
|
| 12 |
+
```bash
|
| 13 |
+
./build.sh
|
| 14 |
+
open ".build/release/Bánh mì chuyển ngữ.app"
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
For debug builds:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
CONFIG=debug ./build.sh
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## Project layout
|
| 24 |
+
|
| 25 |
+
```
|
| 26 |
+
apps/macos/
|
| 27 |
+
├── Package.swift # Swift Package Manager manifest
|
| 28 |
+
├── Sources/BanhMi/
|
| 29 |
+
│ ├── BanhMiMain.swift # NSApplication entry point
|
| 30 |
+
│ ├── AppDelegate.swift # Menu bar + popover wiring
|
| 31 |
+
│ ├── Settings.swift # Languages, text sizes, storage keys
|
| 32 |
+
│ └── SettingsView.swift # SwiftUI settings panel
|
| 33 |
+
├── Resources/
|
| 34 |
+
│ └── Info.plist # LSUIElement = true (menu bar only, no dock)
|
| 35 |
+
└── build.sh # Builds the .app bundle
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Current status (v0.2)
|
| 39 |
+
|
| 40 |
+
- [x] Menu bar icon
|
| 41 |
+
- [x] Click → settings popover
|
| 42 |
+
- [x] Output language picker (16 languages)
|
| 43 |
+
- [x] Subtitle size selector (S / M / L)
|
| 44 |
+
- [x] Audio source picker (System audio / Microphone)
|
| 45 |
+
- [x] Settings persist via `UserDefaults`
|
| 46 |
+
- [x] Floating subtitle overlay (bottom-center, always on top, visible over fullscreen apps, click-through)
|
| 47 |
+
- [x] Microphone capture (AVAudioEngine)
|
| 48 |
+
- [x] System audio capture (ScreenCaptureKit)
|
| 49 |
+
- [x] Live transcription via Apple Speech framework (on-device when supported)
|
| 50 |
+
- [ ] Translation engine (Gemma 4 / Google Translate API)
|
| 51 |
+
- [ ] Dual-language overlay (original + translation)
|
| 52 |
+
|
| 53 |
+
## Permissions needed
|
| 54 |
+
|
| 55 |
+
On first launch, macOS will prompt for:
|
| 56 |
+
|
| 57 |
+
1. **Speech Recognition** — auto-prompt, click Allow
|
| 58 |
+
2. **Screen Recording** (for system audio) — manual: System Settings → Privacy & Security → Screen Recording → enable Bánh mì chuyển ngữ, then quit & relaunch
|
| 59 |
+
3. **Microphone** — only if you switch source to Microphone
|
apps/macos/Resources/Info.plist
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
| 3 |
+
<plist version="1.0">
|
| 4 |
+
<dict>
|
| 5 |
+
<key>CFBundleName</key>
|
| 6 |
+
<string>Bánh mì chuyển ngữ</string>
|
| 7 |
+
<key>CFBundleDisplayName</key>
|
| 8 |
+
<string>Bánh mì chuyển ngữ</string>
|
| 9 |
+
<key>CFBundleIdentifier</key>
|
| 10 |
+
<string>vn.banhmi.chuyenngu</string>
|
| 11 |
+
<key>CFBundleExecutable</key>
|
| 12 |
+
<string>BanhMi</string>
|
| 13 |
+
<key>CFBundlePackageType</key>
|
| 14 |
+
<string>APPL</string>
|
| 15 |
+
<key>CFBundleShortVersionString</key>
|
| 16 |
+
<string>0.5</string>
|
| 17 |
+
<key>CFBundleVersion</key>
|
| 18 |
+
<string>5</string>
|
| 19 |
+
<key>LSMinimumSystemVersion</key>
|
| 20 |
+
<string>13.0</string>
|
| 21 |
+
<key>LSUIElement</key>
|
| 22 |
+
<true/>
|
| 23 |
+
<key>NSHighResolutionCapable</key>
|
| 24 |
+
<true/>
|
| 25 |
+
<key>NSMicrophoneUsageDescription</key>
|
| 26 |
+
<string>Bánh mì chuyển ngữ listens to your microphone to show live transcriptions on screen.</string>
|
| 27 |
+
<key>NSScreenCaptureUsageDescription</key>
|
| 28 |
+
<string>Bánh mì chuyển ngữ captures system audio (via screen capture) to transcribe and translate what you hear.</string>
|
| 29 |
+
</dict>
|
| 30 |
+
</plist>
|
apps/macos/Sources/BanhMi/AppDelegate.swift
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import AppKit
|
| 2 |
+
import SwiftUI
|
| 3 |
+
|
| 4 |
+
/// Application delegate: owns the menu-bar status item, the settings popover,
/// the subtitle overlay and the transcription controller, and wires them to
/// the shared `AppState`.
@MainActor
final class AppDelegate: NSObject, NSApplicationDelegate {
    private var statusItem: NSStatusItem!
    private var popover: NSPopover!
    private let state = AppState.shared
    private var overlay: SubtitleOverlayController!
    private var controller: TranscriptionController!
    /// True once an asynchronous shutdown has been started, so a repeated
    /// quit request does not start a second one.
    private var isShuttingDown = false

    /// Builds the status item, the settings popover and the overlay, then
    /// starts bootstrapping the transcription controller.
    func applicationDidFinishLaunching(_ notification: Notification) {
        statusItem = NSStatusBar.system.statusItem(withLength: NSStatusItem.variableLength)

        if let button = statusItem.button {
            let image = NSImage(systemSymbolName: "waveform", accessibilityDescription: "Bánh mì chuyển ngữ")
            // Template images are tinted by the system to match the menu bar.
            image?.isTemplate = true
            button.image = image
            button.action = #selector(togglePopover(_:))
            button.target = self
        }

        // Weak capture: the popover's view tree must not keep the delegate alive.
        let rootView = SettingsView(state: state, onToggleEnabled: { [weak self] in
            self?.toggleEnabled()
        })
        popover = NSPopover()
        popover.contentSize = NSSize(width: 340, height: 520)
        popover.behavior = .transient
        popover.animates = true
        popover.contentViewController = NSHostingController(rootView: rootView)

        overlay = SubtitleOverlayController(state: state)
        overlay.show()

        controller = TranscriptionController(state: state)
        Task { await controller.bootstrap() }
    }

    /// Delays termination until the transcription controller has finished
    /// shutting down.
    ///
    /// A `Task` started from `applicationWillTerminate` is not awaited and the
    /// process may exit before it ever runs. Returning `.terminateLater` keeps
    /// the app alive until `reply(toApplicationShouldTerminate:)` is called.
    func applicationShouldTerminate(_ sender: NSApplication) -> NSApplication.TerminateReply {
        // Nothing to tear down if launch never got as far as creating the controller.
        guard let controller else { return .terminateNow }
        // A shutdown is already in flight; its completion will send the reply.
        guard !isShuttingDown else { return .terminateLater }
        isShuttingDown = true

        Task { @MainActor in
            await controller.shutdown()
            sender.reply(toApplicationShouldTerminate: true)
        }
        return .terminateLater
    }

    /// Intentionally empty: shutdown is awaited in
    /// `applicationShouldTerminate(_:)`, because asynchronous work started
    /// here would not be guaranteed to run before the process exits.
    func applicationWillTerminate(_ notification: Notification) {
    }

    /// Status-item click handler: shows the popover below the button, or
    /// closes it if it is already visible.
    @objc private func togglePopover(_ sender: Any?) {
        guard let button = statusItem.button else { return }
        if popover.isShown {
            popover.performClose(sender)
        } else {
            popover.show(relativeTo: button.bounds, of: button, preferredEdge: .minY)
            popover.contentViewController?.view.window?.makeKey()
        }
    }

    /// Pauses the controller when the app is currently enabled, resumes it
    /// otherwise. `state.isEnabled` is only read here, not written.
    private func toggleEnabled() {
        Task { @MainActor in
            if state.isEnabled {
                await controller.pause()
            } else {
                await controller.resume()
            }
        }
    }
}
|
apps/macos/Sources/BanhMi/AppState.swift
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Foundation
|
| 2 |
+
import Combine
|
| 3 |
+
import AppKit
|
| 4 |
+
|
| 5 |
+
/// A single subtitle box shown in the overlay.
///
/// While partial results are streaming the line is "live" (`isFinal == false`);
/// after it is finalized it is locked, and later partials start a new line.
struct SubtitleLine: Identifiable, Equatable {
    /// Stable identity of the box.
    let id: UUID
    /// Text currently displayed in the box.
    var text: String
    /// Whether the box has been finalized (locked).
    var isFinal: Bool

    /// Creates a line; it starts out live unless `isFinal` is passed as `true`.
    init(id: UUID, text: String, isFinal: Bool = false) {
        self.id = id
        self.text = text
        self.isFinal = isFinal
    }
}
|
| 12 |
+
|
| 13 |
+
/// Single source of truth for the running app.
/// Views observe this, services write to it.
@MainActor
final class AppState: ObservableObject {
    static let shared = AppState()

    /// Maximum number of lines kept in the overlay buffer.
    static let maxLines: Int = 3
    /// Maximum words displayed per subtitle box. Slightly higher than the
    /// recognizer's flush cap so translator expansion (e.g. Vietnamese often
    /// expands vs. English) rarely triggers an extra re-split.
    static let maxWordsPerLine: Int = 26
    /// Short auto-hide delay for brief lines (≤ threshold words).
    static let lineHideShortSeconds: TimeInterval = 3.0
    /// Long auto-hide delay for longer lines (> threshold words).
    static let lineHideLongSeconds: TimeInterval = 5.0
    /// Word count threshold switching between short and long hide delays.
    static let lineHideWordThreshold: Int = 10
    /// When a new box arrives, older boxes have their remaining visible time
    /// shortened to this value. Keeps the overlay "fresh" — old content fades
    /// quickly once newer content pushes it up.
    static let olderLineHideSeconds: TimeInterval = 1.2

    // Accumulated raw text (for debugging/future use); the overlay does not
    // render this directly anymore.
    @Published var transcript: String = ""
    /// Most recent translated text written by the translation pipeline.
    @Published var translatedTranscript: String = ""

    /// Stable lines currently shown in the overlay. New lines are appended;
    /// old lines are removed via `expire(id:)` after their hide delay.
    @Published private(set) var lines: [SubtitleLine] = []

    @Published var isListening: Bool = false
    @Published var isEnabled: Bool = true {
        didSet { UserDefaults.standard.set(isEnabled, forKey: SettingsKey.isEnabled) }
    }
    @Published var statusMessage: String = "Starting…"
    @Published var errorMessage: String?

    /// Detected language of the incoming speech (BCP-47 base code). Nil until known.
    @Published var detectedSourceLanguage: String?

    // Persisted settings.
    @Published var languageID: String {
        didSet { UserDefaults.standard.set(languageID, forKey: SettingsKey.outputLanguage) }
    }
    @Published var textSize: TextSize {
        didSet { UserDefaults.standard.set(textSize.rawValue, forKey: SettingsKey.textSize) }
    }
    @Published var audioSource: AudioSource {
        didSet { UserDefaults.standard.set(audioSource.rawValue, forKey: SettingsKey.audioSource) }
    }
    @Published var asrEngine: ASREngine {
        didSet { UserDefaults.standard.set(asrEngine.rawValue, forKey: SettingsKey.asrEngine) }
    }

    // Overlay geometry (persisted).
    @Published var overlayWidth: CGFloat {
        didSet { UserDefaults.standard.set(Double(overlayWidth), forKey: SettingsKey.overlayWidth) }
    }
    /// Saved overlay origin, or nil when no origin is stored.
    @Published var overlayOrigin: CGPoint? {
        didSet {
            let defaults = UserDefaults.standard
            if let p = overlayOrigin {
                defaults.set(Double(p.x), forKey: SettingsKey.overlayOriginX)
                defaults.set(Double(p.y), forKey: SettingsKey.overlayOriginY)
            } else {
                // Clearing the origin must also clear the stored values;
                // otherwise `init` would restore the stale position on the
                // next launch.
                defaults.removeObject(forKey: SettingsKey.overlayOriginX)
                defaults.removeObject(forKey: SettingsKey.overlayOriginY)
            }
        }
    }

    /// Pending auto-hide work item per visible line, keyed by line id.
    private var hideWorkByLine: [UUID: DispatchWorkItem] = [:]
    /// Safety timer: if a live line hasn't been updated in this window, we
    /// force-finalize it so it can fade out instead of sticking forever.
    private var liveStaleTimer: DispatchWorkItem?
    static let liveStaleSeconds: TimeInterval = 2.5

    /// Restores persisted settings from `UserDefaults`, falling back to
    /// defaults for anything missing or unparsable. Property observers do not
    /// run during initialization, so nothing is written back here.
    private init() {
        let defaults = UserDefaults.standard
        self.languageID = defaults.string(forKey: SettingsKey.outputLanguage) ?? SupportedLanguages.systemDefault
        self.textSize = TextSize(rawValue: defaults.string(forKey: SettingsKey.textSize) ?? "") ?? .medium
        self.audioSource = AudioSource(rawValue: defaults.string(forKey: SettingsKey.audioSource) ?? "") ?? .systemAudio
        self.asrEngine = ASREngine(rawValue: defaults.string(forKey: SettingsKey.asrEngine) ?? "") ?? .gemmaMLX
        // `bool(forKey:)` returns false for a missing key, so check presence
        // first to keep "enabled" as the first-launch default.
        if defaults.object(forKey: SettingsKey.isEnabled) != nil {
            self.isEnabled = defaults.bool(forKey: SettingsKey.isEnabled)
        } else {
            self.isEnabled = true
        }

        let savedWidth = defaults.double(forKey: SettingsKey.overlayWidth)
        self.overlayWidth = savedWidth > 0 ? CGFloat(savedWidth) : 900

        // Only restore an origin when both coordinates were stored.
        if defaults.object(forKey: SettingsKey.overlayOriginX) != nil,
           defaults.object(forKey: SettingsKey.overlayOriginY) != nil {
            self.overlayOrigin = CGPoint(
                x: defaults.double(forKey: SettingsKey.overlayOriginX),
                y: defaults.double(forKey: SettingsKey.overlayOriginY)
            )
        } else {
            self.overlayOrigin = nil
        }
    }

    /// The language matching `languageID`, or the first supported language
    /// when the id is unknown.
    var currentLanguage: Language {
        SupportedLanguages.named(languageID) ?? SupportedLanguages.all[0]
    }

    // MARK: - Live / finalized line management

    /// Drive the overlay from streaming recognition updates.
    /// - If there is an existing live (non-final) line, it is updated in place.
    /// - If there isn't, a new live line is created.
    /// - When `isFinal` is true, the last live line is locked and the next
    ///   `showLive` call will start a new line.
    ///
    /// Text longer than `maxWordsPerLine` words is split into several boxes;
    /// every box except the last is finalized immediately, and the last one
    /// stays live unless `isFinal` is true. Blank text is ignored.
    func showLive(text: String, isFinal: Bool) {
        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return }
        scheduleLiveStaleCheck()

        // `trimmed` is non-empty, so `chunks` always has at least one element.
        let chunks = Self.splitIntoChunks(trimmed, maxWords: Self.maxWordsPerLine)

        // No live line: every chunk becomes a new box.
        guard let liveIndex = lines.indices.last, !lines[liveIndex].isFinal else {
            appendChunks(chunks, lastIsFinal: isFinal)
            return
        }

        // The live line always takes the first chunk. Skip the write when the
        // text is unchanged to avoid a redundant publish.
        if lines[liveIndex].text != chunks[0] {
            lines[liveIndex].text = chunks[0]
        }

        if chunks.count == 1 {
            if isFinal {
                lines[liveIndex].isFinal = true
                scheduleExpiry(for: lines[liveIndex])
            }
            return
        }

        // The update overflowed the live box: lock it and spill the remaining
        // chunks into new boxes. `liveIndex` is not used after this point
        // because appending may trim lines from the front.
        lines[liveIndex].isFinal = true
        scheduleExpiry(for: lines[liveIndex])
        appendChunks(Array(chunks.dropFirst()), lastIsFinal: isFinal)
    }

    /// Append each chunk as its own box. Every chunk but the last is final;
    /// the last one is final only when `lastIsFinal` is true.
    private func appendChunks(_ chunks: [String], lastIsFinal: Bool) {
        for (offset, chunk) in chunks.enumerated() {
            let isLastChunk = offset == chunks.count - 1
            appendNewLine(text: chunk, isFinal: lastIsFinal || !isLastChunk)
        }
    }

    /// Append one box, trimming the buffer to `maxLines` and scheduling the
    /// box's auto-hide when it is already final.
    private func appendNewLine(text: String, isFinal: Bool) {
        // Accelerate fade on existing older lines so the new line stands out.
        shortenOlderTimers()

        let line = SubtitleLine(id: UUID(), text: text, isFinal: isFinal)
        lines.append(line)
        while lines.count > Self.maxLines {
            let removed = lines.removeFirst()
            hideWorkByLine.removeValue(forKey: removed.id)?.cancel()
        }
        // Non-final lines don't schedule expiry yet; they get one when finalized.
        if isFinal {
            scheduleExpiry(for: line)
        }
    }

    /// Append a finalized utterance as one or more new boxes. Kept for
    /// backwards compatibility; callers should prefer `showLive`.
    ///
    /// A chunk equal to the text of the current last line is skipped.
    func appendLine(_ text: String) {
        let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !trimmed.isEmpty else { return }

        for chunk in Self.splitIntoChunks(trimmed, maxWords: Self.maxWordsPerLine) {
            if let last = lines.last, last.text == chunk { continue }
            // These boxes are final, so a later `showLive` call must not
            // treat them as the live line and overwrite their text.
            appendNewLine(text: chunk, isFinal: true)
        }
    }

    /// Replace any pending hide timer on existing lines with a short one so
    /// older boxes fade out quickly once a newer one arrives.
    private func shortenOlderTimers() {
        for existing in lines {
            scheduleHide(of: existing.id, after: Self.olderLineHideSeconds)
        }
    }

    /// Split text into successive chunks of at most `maxWords` whitespace-
    /// separated words. Preserves original word order. Text that already fits
    /// (or a non-positive `maxWords`) is returned unchanged as a single chunk.
    private static func splitIntoChunks(_ text: String, maxWords: Int) -> [String] {
        let words = text.split(whereSeparator: { $0.isWhitespace }).map(String.init)
        // `maxWords > 0` guards the stride below, which would never advance
        // with a zero or negative step.
        guard maxWords > 0, words.count > maxWords else { return [text] }
        return stride(from: 0, to: words.count, by: maxWords).map { start in
            let end = min(start + maxWords, words.count)
            return words[start..<end].joined(separator: " ")
        }
    }

    /// Remove all lines and cancel pending timers. Used on pause/stop.
    func clearLines() {
        hideWorkByLine.values.forEach { $0.cancel() }
        hideWorkByLine.removeAll()
        liveStaleTimer?.cancel()
        liveStaleTimer = nil
        lines.removeAll()
    }

    /// (Re)start the watchdog that force-finalizes a live line that stops
    /// getting updates, so it can fade out instead of staying forever.
    private func scheduleLiveStaleCheck() {
        liveStaleTimer?.cancel()
        let work = DispatchWorkItem { [weak self] in
            Task { @MainActor in self?.forceFinalizeLive() }
        }
        liveStaleTimer = work
        DispatchQueue.main.asyncAfter(deadline: .now() + Self.liveStaleSeconds, execute: work)
    }

    /// Lock the last line if it is still live and start its hide timer.
    private func forceFinalizeLive() {
        guard let liveIndex = lines.indices.last, !lines[liveIndex].isFinal else { return }
        lines[liveIndex].isFinal = true
        scheduleExpiry(for: lines[liveIndex])
    }

    /// Schedule the auto-hide of a finalized line; lines with more than
    /// `lineHideWordThreshold` words get the long delay.
    private func scheduleExpiry(for line: SubtitleLine) {
        let wordCount = line.text.split(whereSeparator: { $0.isWhitespace }).count
        let delay: TimeInterval = wordCount > Self.lineHideWordThreshold
            ? Self.lineHideLongSeconds
            : Self.lineHideShortSeconds
        scheduleHide(of: line.id, after: delay)
    }

    /// Schedule removal of the line with `id` after `delay` seconds,
    /// cancelling any hide timer already pending for that line so two timers
    /// never compete for the same box.
    private func scheduleHide(of id: UUID, after delay: TimeInterval) {
        hideWorkByLine.removeValue(forKey: id)?.cancel()
        let work = DispatchWorkItem { [weak self] in
            Task { @MainActor in self?.expire(id: id) }
        }
        hideWorkByLine[id] = work
        DispatchQueue.main.asyncAfter(deadline: .now() + delay, execute: work)
    }

    /// Remove the line with `id` (if still present) and forget its timer.
    private func expire(id: UUID) {
        hideWorkByLine.removeValue(forKey: id)
        lines.removeAll { $0.id == id }
    }
}
|
apps/macos/Sources/BanhMi/BanhMiMain.swift
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import AppKit
|
| 2 |
+
|
| 3 |
+
/// Program entry point: configures `NSApplication` as a menu-bar-only
/// (accessory) app and runs its event loop.
@main
struct BanhMiMain {
    static func main() {
        // `main()` is not actor-isolated, but AppKit and `AppDelegate` are
        // main-actor bound.
        MainActor.assumeIsolated {
            // Start the session logger early so we capture everything.
            _ = SessionLogger.shared

            let app = NSApplication.shared
            let delegate = AppDelegate()
            app.delegate = delegate
            app.setActivationPolicy(.accessory)
            // `NSApplication.delegate` does not retain the delegate, and the
            // local has no use after the assignment above, so the compiler
            // may release it early. Pin it for the whole run loop.
            withExtendedLifetime(delegate) {
                app.run()
            }
        }
    }
}
|
apps/macos/Sources/BanhMi/GemmaMLXRecognizer.swift
ADDED
|
@@ -0,0 +1,471 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Foundation
|
| 2 |
+
@preconcurrency import AVFoundation
|
| 3 |
+
import OSLog
|
| 4 |
+
|
| 5 |
+
private let recLog = Logger(subsystem: "vn.banhmi.chuyenngu", category: "gemma-mlx")
|
| 6 |
+
|
| 7 |
+
/// Offline speech recognition + translation via the Gemma 4 MLX sidecar.
/// Designed to satisfy the `AudioRecognizer` callback surface so
/// `TranscriptionController` can drive it the same way as any future engine.
///
/// Gemma's audio tower is encoder-based — it needs the full clip at once,
/// so this isn't true streaming. We accumulate audio with an energy-based
/// VAD and run inference once per detected utterance.
///
/// All state is main-actor isolated; audio must be handed to `append(_:)`
/// on the main actor.
@MainActor
final class GemmaMLXRecognizer {
    // MARK: - Callbacks (AudioRecognizer surface)

    /// Called with (text, isFinal, detectedLanguage) for each finished utterance.
    var onStableUpdate: ((String, Bool, String?) -> Void)?
    /// Called with (text, detectedLanguage) right after `onStableUpdate`.
    var onFinalUtterance: ((String, String?) -> Void)?
    /// Part of the callback surface; never invoked by this class.
    var onFinal: (() -> Void)?
    /// Called when the sidecar fails to become ready or an inference call fails.
    var onError: ((Error) -> Void)?

    /// Forwarded to the callbacks. Nothing in this class assigns it, so it is
    /// currently always `nil`.
    private(set) var latestDetectedLanguage: String?

    // MARK: - VAD tuning

    // VAD chunking — cut audio on speech boundaries rather than fixed windows
    // so Gemma always sees complete utterances.
    static let sampleRate: Double = 16_000
    static let frameDurationSec: TimeInterval = 0.03      // 30 ms frames
    static let minSpeechSec: TimeInterval = 0.6           // no flush on silence before 600 ms
    static let maxSpeechSec: TimeInterval = 8.0           // force flush after 8 s
    static let trailingSilenceSec: TimeInterval = 0.35    // silence that ends an utterance
    static let speechActivationSec: TimeInterval = 0.15   // 150 ms of voice to start a segment

    /// Hysteresis thresholds for voice activity detection.
    ///
    /// A frame counts as voice only if its RMS is at or above
    /// `voiceActivateRMS`. A frame counts as silence only if its RMS drops
    /// below `voiceDeactivateRMS`. While an utterance is open, frames in
    /// between leave the silence counter unchanged, which lets us span brief
    /// quiet moments inside a word without ending the utterance.
    static let voiceActivateRMS: Float = 0.005     // >= this → voice
    static let voiceDeactivateRMS: Float = 0.0015  // <  this → silence
    /// Minimum overall chunk RMS — below this we skip the whole utterance
    /// (pure noise / mic tap). Evaluated on the chunk before peak normalize.
    static let silenceRMSThreshold: Float = 0.001

    // MARK: - Collaborators and configuration

    private let service: GemmaMLXService
    private var targetLanguageName: String = "English"

    // Audio conversion to 16 kHz int16 mono
    private let desiredFormat: AVAudioFormat
    private var converter: AVAudioConverter?
    private var converterInputFormat: AVAudioFormat?

    // MARK: - VAD state machine

    private var pcmBuffer = Data()                 // audio of the utterance currently being built
    /// Converted bytes that did not fill a whole VAD frame on the previous
    /// `append`; prepended to the next buffer so no samples are lost at
    /// buffer boundaries. Always shorter than one frame.
    private var frameRemainder = Data()
    private var isRunning = false
    private var inFlight = false                   // true while one sidecar call is outstanding
    private var inSpeech = false
    private var speechRunSec: TimeInterval = 0     // consecutive voice frames before activation
    private var silenceRunSec: TimeInterval = 0    // silence accumulated since last voice frame
    private var utteranceSec: TimeInterval = 0     // length of current utterance

    /// A queued or in-flight utterance with enough metadata to log metrics
    /// and attach the saved WAV for offline review.
    private struct PendingUtterance {
        let pcm: Data
        let rmsIn: Float
        let rmsOut: Float
        let durationSec: Double
    }

    /// Queue of preprocessed utterances waiting for Gemma. Kept small — if
    /// the user speaks faster than the model, we drop the *oldest* pending
    /// entry to keep overlay text fresh instead of lagging.
    private var queue: [PendingUtterance] = []
    private static let maxQueueDepth = 2

    /// BCP-47 primary/regional codes (lowercased) → English language names.
    private static let languageNames: [String: String] = [
        "en": "English", "vi": "Vietnamese", "es": "Spanish",
        "zh": "Chinese", "zh-cn": "Chinese", "zh-tw": "Chinese",
        "ja": "Japanese", "ko": "Korean", "fr": "French",
        "de": "German", "pt": "Portuguese", "ru": "Russian",
        "ar": "Arabic", "hi": "Hindi", "id": "Indonesian",
        "th": "Thai", "it": "Italian", "tr": "Turkish"
    ]

    // MARK: - Lifecycle

    /// Creates the recognizer and launches the sidecar process.
    ///
    /// - Parameters:
    ///   - sidecarScriptPath: Path to the Python sidecar script.
    ///   - pythonPath: Interpreter path. The default `/usr/bin/env` makes the
    ///     service launch `/usr/bin/env python3 <script>`, which works with
    ///     any user's python3 (venv, system, homebrew, etc).
    ///   - modelID: Model identifier forwarded to the sidecar.
    /// - Throws: Whatever `GemmaMLXService.init` throws.
    init(sidecarScriptPath: String,
         pythonPath: String = "/usr/bin/env",
         modelID: String = "unsloth/gemma-4-E2B-it-UD-MLX-4bit") throws {
        // 16 kHz / int16 / mono / interleaved is a valid PCM layout, so the
        // failable initializer is force-unwrapped as in the original code.
        self.desiredFormat = AVAudioFormat(
            commonFormat: .pcmFormatInt16,
            sampleRate: Self.sampleRate,
            channels: 1,
            interleaved: true
        )!
        self.service = try GemmaMLXService(
            pythonPath: pythonPath,
            sidecarScript: sidecarScriptPath,
            modelID: modelID
        )
    }

    /// Configure target language. Source language is always auto-detected by
    /// the model — `sourceLocale` and `context` are accepted for interface
    /// compatibility and ignored here.
    func configure(sourceLocale: Locale,
                   targetLanguageCode: String,
                   context: String = "") throws {
        targetLanguageName = Self.targetLanguageName(from: targetLanguageCode)
    }

    /// Start accepting audio. Returns immediately; sidecar readiness is
    /// awaited in a background task and a failure is reported via `onError`.
    func startTask() throws {
        guard !isRunning else { return }
        isRunning = true
        pcmBuffer.removeAll(keepingCapacity: true)
        frameRemainder.removeAll(keepingCapacity: true)
        // Wait for sidecar ready asynchronously — don't block the caller.
        Task { [weak self] in
            do {
                try await self?.service.waitUntilReady()
            } catch {
                self?.onError?(error)
            }
        }
    }

    /// Stop accepting audio, discard all buffered/queued audio and stop the
    /// sidecar service.
    // NOTE(review): `service.stop()` terminates the sidecar process and this
    // class never creates a new service, so a later `startTask()` presumably
    // cannot produce results — confirm against the caller's lifecycle.
    func stop() {
        isRunning = false
        pcmBuffer.removeAll(keepingCapacity: false)
        frameRemainder.removeAll(keepingCapacity: false)
        inSpeech = false
        speechRunSec = 0
        silenceRunSec = 0
        utteranceSec = 0
        queue.removeAll(keepingCapacity: false)
        service.stop()
    }

    // MARK: - Audio input

    /// Feed an audio buffer. Converts to 16 kHz int16 mono, runs
    /// energy-based VAD, and flushes a chunk to Gemma when we see a
    /// complete utterance (voice + trailing silence, or hit max length).
    func append(_ buffer: AVAudioPCMBuffer) {
        guard isRunning else { return }
        guard let int16Data = convertToInt16Mono16k(buffer), !int16Data.isEmpty else {
            return
        }
        appendConverted(int16Data)
    }

    /// Splits converted audio into 30 ms frames and feeds each to the VAD.
    ///
    /// Bytes that do not fill a whole frame are carried over in
    /// `frameRemainder` and prepended to the next call, so the audio stream
    /// seen by the VAD (and ultimately by the model) stays contiguous.
    private func appendConverted(_ int16Data: Data) {
        let frameSamples = Int(Self.sampleRate * Self.frameDurationSec) // 480 @ 16kHz
        let frameBytes = frameSamples * MemoryLayout<Int16>.size

        var working = frameRemainder
        working.append(int16Data)
        frameRemainder.removeAll(keepingCapacity: true)

        var offset = working.startIndex
        while working.endIndex - offset >= frameBytes {
            // `subdata` copies into a fresh, zero-based Data, which keeps the
            // Int16 rebinding in `chunkRMS` aligned.
            let frame = working.subdata(in: offset..<(offset + frameBytes))
            offset += frameBytes
            processFrame(frame)
        }
        if offset < working.endIndex {
            frameRemainder = working.subdata(in: offset..<working.endIndex)
        }
    }

    /// Advances the VAD state machine by exactly one frame.
    private func processFrame(_ frame: Data) {
        let frameRMS = Self.chunkRMS(frame)
        // Hysteresis: voice frames must cross the higher activation bar;
        // silence frames must drop below the lower deactivation bar.
        let isVoice = frameRMS >= Self.voiceActivateRMS
        let isSilence = frameRMS < Self.voiceDeactivateRMS

        guard inSpeech else {
            // Waiting for enough consecutive voice to open a new utterance.
            if isVoice {
                speechRunSec += Self.frameDurationSec
                pcmBuffer.append(frame)
                if speechRunSec >= Self.speechActivationSec {
                    inSpeech = true
                    utteranceSec = speechRunSec
                    silenceRunSec = 0
                }
            } else {
                // Any non-voice frame before activation discards the run.
                speechRunSec = 0
                pcmBuffer.removeAll(keepingCapacity: true)
            }
            return
        }

        // Mid-utterance: keep accumulating.
        pcmBuffer.append(frame)
        utteranceSec += Self.frameDurationSec

        if isVoice {
            silenceRunSec = 0
        } else if isSilence {
            silenceRunSec += Self.frameDurationSec
        }
        // else: ambient (in-between) — keep existing silenceRunSec

        let endedBySilence =
            silenceRunSec >= Self.trailingSilenceSec && utteranceSec >= Self.minSpeechSec
        let hitMaxLength = utteranceSec >= Self.maxSpeechSec
        if endedBySilence || hitMaxLength {
            flushCurrentUtterance()
        }
    }

    /// Closes the current utterance, resets the VAD, and either starts
    /// inference on it or queues it behind the call already in flight.
    private func flushCurrentUtterance() {
        let chunk = pcmBuffer
        // Reset state so we can start a new utterance immediately.
        pcmBuffer.removeAll(keepingCapacity: true)
        inSpeech = false
        speechRunSec = 0
        silenceRunSec = 0
        utteranceSec = 0

        let rmsIn = Self.chunkRMS(chunk)
        if rmsIn < Self.silenceRMSThreshold {
            recLog.info("VAD flush: utterance too quiet (rms=\(rmsIn, format: .fixed(precision: 4))) — dropping")
            return
        }

        let normalized = Self.preprocessForASR(chunk)
        let rmsOut = Self.chunkRMS(normalized)
        let durSec = Double(chunk.count / 2) / Self.sampleRate
        recLog.info("VAD flush: utterance \(durSec, format: .fixed(precision: 2))s, rms_in=\(rmsIn, format: .fixed(precision: 4)) rms_out=\(rmsOut, format: .fixed(precision: 4))")

        let pending = PendingUtterance(
            pcm: normalized, rmsIn: rmsIn, rmsOut: rmsOut, durationSec: durSec
        )

        if inFlight {
            while queue.count >= Self.maxQueueDepth {
                queue.removeFirst()
                recLog.info("Queue full — dropping oldest pending utterance")
            }
            queue.append(pending)
            return
        }

        inFlight = true
        Task { [weak self] in
            await self?.runInference(pending)
        }
    }

    /// Kick off the next queued utterance once the current call returns.
    private func drainQueue() {
        guard !queue.isEmpty, !inFlight, isRunning else { return }
        let next = queue.removeFirst()
        inFlight = true
        Task { [weak self] in
            await self?.runInference(next)
        }
    }

    // MARK: - Private

    /// Sends one utterance to the sidecar, records the outcome with
    /// `SessionLogger`, and forwards non-empty text to the callbacks.
    ///
    /// The class is main-actor isolated, so execution is back on the main
    /// actor after the `await`; no explicit hop is needed before touching
    /// state or invoking callbacks.
    private func runInference(_ pending: PendingUtterance) async {
        defer {
            inFlight = false
            drainQueue()
        }
        do {
            // Single-pass translate: half the inference time of
            // transcribe_translate because Gemma is only prompted once.
            // The overlay only ever shows translated text anyway.
            let result = try await service.translate(
                pcm16: pending.pcm,
                sampleRate: Self.sampleRate,
                targetLanguage: targetLanguageName
            )
            recLog.info("Sidecar returned: source=\"\(result.sourceText)\" translated=\"\(result.translatedText)\" latency=\(result.latencyMs)ms")
            SessionLogger.shared.record(
                pcm16: pending.pcm,
                sampleRate: 16_000,
                rmsIn: pending.rmsIn,
                rmsOut: pending.rmsOut,
                durationSec: pending.durationSec,
                latencyMs: result.latencyMs,
                sourceText: result.sourceText,
                translatedText: result.translatedText,
                targetLang: targetLanguageName,
                engine: "gemma_mlx"
            )
            guard isRunning else { return }
            let out = result.translatedText.isEmpty
                ? result.sourceText
                : result.translatedText
            guard !out.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
                return
            }
            onStableUpdate?(out, true, latestDetectedLanguage)
            onFinalUtterance?(out, latestDetectedLanguage)
        } catch {
            recLog.error("Sidecar call failed: \(String(describing: error))")
            SessionLogger.shared.record(
                pcm16: pending.pcm,
                sampleRate: 16_000,
                rmsIn: pending.rmsIn,
                rmsOut: pending.rmsOut,
                durationSec: pending.durationSec,
                latencyMs: nil,
                sourceText: nil,
                translatedText: nil,
                targetLang: targetLanguageName,
                engine: "gemma_mlx",
                error: String(describing: error)
            )
            // Mirror the success path: once stopped, stay silent towards the
            // client instead of surfacing failures of abandoned work.
            guard isRunning else { return }
            onError?(error)
        }
    }

    /// Minimal ASR preprocessing for Gemma 4: DC removal + peak normalize.
    ///
    /// Notes on what we deliberately DON'T do:
    /// - No pre-emphasis: Gemma 4's USM Conformer audio tower operates on
    ///   raw waveforms, not MFCC-style features. Pre-emphasis distorts the
    ///   waveform's character enough that the model's language detector
    ///   mis-identifies the input (e.g. English → "Korean").
    /// - No high-pass filter: would help classic ASR but here it sculpts
    ///   formants the model uses and hurts accuracy.
    /// The safe, minimal pipeline just centres the waveform and scales it
    /// up so the model sees usable dynamic range.
    ///
    /// - Parameter pcm: Little-endian int16 mono samples.
    /// - Returns: Processed samples, same byte count as the input. Inputs
    ///   shorter than two samples are returned unchanged.
    private static func preprocessForASR(_ pcm: Data) -> Data {
        guard pcm.count >= 4 else { return pcm }
        let sampleCount = pcm.count / 2

        // Decode to float in [-1, 1]
        var samples = [Float](repeating: 0, count: sampleCount)
        pcm.withUnsafeBytes { raw in
            let ins = raw.bindMemory(to: Int16.self)
            let scale = Float(1.0 / Float(Int16.max))
            for i in 0..<sampleCount {
                samples[i] = Float(ins[i]) * scale
            }
        }

        // DC offset removal (skipped when the offset is negligible)
        let mean = samples.reduce(0, +) / Float(sampleCount)
        if abs(mean) > 1e-5 {
            for i in 0..<sampleCount { samples[i] -= mean }
        }

        // Peak normalize (target 0.85, gain capped at 30x to avoid
        // amplifying noise; never attenuates)
        let peak = samples.reduce(Float(0)) { max($0, abs($1)) }
        if peak > 0 {
            let target: Float = 0.85
            let gain = min(target / peak, 30.0)
            if gain > 1.0 {
                for i in 0..<sampleCount { samples[i] *= gain }
            }
        }

        // Re-encode to int16, clamping anything that left the range
        var out = Data(count: pcm.count)
        out.withUnsafeMutableBytes { outRaw in
            let outs = outRaw.bindMemory(to: Int16.self)
            for i in 0..<sampleCount {
                let v = samples[i] * Float(Int16.max)
                outs[i] = Int16(clamping: Int(v.rounded()))
            }
        }
        return out
    }

    /// Peak-normalize int16 PCM so the loudest sample hits ~0.85 of full
    /// scale. Leaves a small headroom to avoid clipping on transients.
    /// Caps gain at 30x to prevent amplifying noise when there's no speech.
    /// Not called anywhere in this class at present; `preprocessForASR`
    /// performs its own normalization.
    private static func peakNormalize(_ pcm: Data) -> Data {
        guard pcm.count >= 2 else { return pcm }
        let sampleCount = pcm.count / 2
        var peak: Int16 = 0
        pcm.withUnsafeBytes { raw in
            let samples = raw.bindMemory(to: Int16.self)
            for i in 0..<sampleCount {
                // abs(Int16.min) overflows, so treat it as Int16.max.
                let a = samples[i] == Int16.min ? Int16.max : abs(samples[i])
                if a > peak { peak = a }
            }
        }
        if peak == 0 { return pcm }
        let targetPeak: Float = 0.85 * Float(Int16.max)
        let rawGain = targetPeak / Float(peak)
        let gain = min(rawGain, 30.0)      // cap to avoid blowing up noise
        if gain <= 1.05 { return pcm }     // already loud enough

        var out = Data(count: pcm.count)
        out.withUnsafeMutableBytes { outRaw in
            let outSamples = outRaw.bindMemory(to: Int16.self)
            pcm.withUnsafeBytes { raw in
                let inSamples = raw.bindMemory(to: Int16.self)
                for i in 0..<sampleCount {
                    let scaled = Float(inSamples[i]) * gain
                    outSamples[i] = Int16(clamping: Int(scaled.rounded()))
                }
            }
        }
        return out
    }

    /// RMS amplitude of a chunk in [0, 1] range (int16 normalized to float).
    /// Returns 0 for chunks shorter than one sample.
    private static func chunkRMS(_ pcm: Data) -> Float {
        guard pcm.count >= 2 else { return 0 }
        let sampleCount = pcm.count / 2
        var sumSq: Double = 0
        pcm.withUnsafeBytes { raw in
            let samples = raw.bindMemory(to: Int16.self)
            for i in 0..<sampleCount {
                let s = Double(samples[i]) / Double(Int16.max)
                sumSq += s * s
            }
        }
        return Float(sqrt(sumSq / Double(sampleCount)))
    }

    /// Map a language code (e.g. "vi", "zh-CN", "pt_BR") to the English
    /// language name used in the prompt. Matching is case-insensitive and
    /// accepts `_` as well as `-` as the region separator; a full-code match
    /// is tried first, then the primary subtag. Unknown codes fall back to
    /// "English".
    private static func targetLanguageName(from code: String) -> String {
        let normalized = code.lowercased().replacingOccurrences(of: "_", with: "-")
        if let name = languageNames[normalized] { return name }
        let base = normalized.split(separator: "-").first.map(String.init) ?? normalized
        return languageNames[base] ?? "English"
    }

    /// Converts an arbitrary-format buffer to 16 kHz int16 mono bytes.
    ///
    /// The converter is cached and rebuilt whenever the input format changes.
    /// - Returns: Raw little-endian int16 bytes, or `nil` if the converter or
    ///   output buffer could not be created or the conversion failed.
    private func convertToInt16Mono16k(_ buffer: AVAudioPCMBuffer) -> Data? {
        let inputFormat = buffer.format
        if converter == nil || converterInputFormat != inputFormat {
            converter = AVAudioConverter(from: inputFormat, to: desiredFormat)
            converterInputFormat = inputFormat
        }
        guard let converter else { return nil }

        // Expected output length after resampling, plus generous slack.
        let capacity = AVAudioFrameCount(
            Double(buffer.frameLength) *
                desiredFormat.sampleRate / inputFormat.sampleRate
        ) + 1024
        guard let out = AVAudioPCMBuffer(
            pcmFormat: desiredFormat, frameCapacity: capacity
        ) else { return nil }

        // Hand the input buffer to the converter exactly once; any further
        // request is answered with "no data right now".
        var submitted = false
        let inputBlock: AVAudioConverterInputBlock = { _, status in
            if submitted {
                status.pointee = .noDataNow
                return nil
            }
            submitted = true
            status.pointee = .haveData
            return buffer
        }
        var error: NSError?
        let s = converter.convert(to: out, error: &error, withInputFrom: inputBlock)
        if s == .error || error != nil { return nil }

        guard let data = out.int16ChannelData?.pointee else { return nil }
        let byteCount = Int(out.frameLength) * 2
        return Data(bytes: data, count: byteCount)
    }
}
|
apps/macos/Sources/BanhMi/GemmaMLXService.swift
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Foundation
|
| 2 |
+
import AVFoundation
|
| 3 |
+
|
| 4 |
+
/// Bridges the Swift app to the Python MLX sidecar process that runs
|
| 5 |
+
/// Gemma 4 E2B UD-MLX-4bit locally on Apple Silicon. The sidecar loads
|
| 6 |
+
/// the model once at startup; each request is a JSON line over stdin
|
| 7 |
+
/// and a JSON line back over stdout.
|
| 8 |
+
///
|
| 9 |
+
/// Not a `TranslationService` — the sidecar takes *audio* in and emits
|
| 10 |
+
/// transcript + translation, fully offline on-device.
|
| 11 |
+
final class GemmaMLXService {
|
| 12 |
+
struct Result: Sendable {
|
| 13 |
+
let sourceText: String
|
| 14 |
+
let translatedText: String
|
| 15 |
+
let latencyMs: Int
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
enum ServiceError: Error, CustomStringConvertible {
|
| 19 |
+
case pythonNotFound
|
| 20 |
+
case sidecarScriptMissing(String)
|
| 21 |
+
case processNotStarted
|
| 22 |
+
case sidecarFailed(String)
|
| 23 |
+
case invalidResponse(String)
|
| 24 |
+
|
| 25 |
+
var description: String {
|
| 26 |
+
switch self {
|
| 27 |
+
case .pythonNotFound: return "python3 not found on PATH"
|
| 28 |
+
case .sidecarScriptMissing(let p): return "sidecar script missing: \(p)"
|
| 29 |
+
case .processNotStarted: return "sidecar process failed to start"
|
| 30 |
+
case .sidecarFailed(let msg): return "sidecar error: \(msg)"
|
| 31 |
+
case .invalidResponse(let s): return "sidecar invalid response: \(s)"
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
private let process: Process
|
| 37 |
+
private let stdinPipe = Pipe()
|
| 38 |
+
private let stdoutPipe = Pipe()
|
| 39 |
+
private let stderrPipe = Pipe()
|
| 40 |
+
private let readQueue = DispatchQueue(label: "gemma-mlx-read")
|
| 41 |
+
private let writeQueue = DispatchQueue(label: "gemma-mlx-write")
|
| 42 |
+
private var buffer = Data()
|
| 43 |
+
private var pendingResponses: [CheckedContinuation<String, Error>] = []
|
| 44 |
+
private let pendingLock = NSLock()
|
| 45 |
+
|
| 46 |
+
/// Creates and starts the sidecar process. The model loads lazily
|
| 47 |
+
/// inside the Python process — callers should await `waitUntilReady()`
|
| 48 |
+
/// before sending real work.
|
| 49 |
+
init(
|
| 50 |
+
pythonPath: String,
|
| 51 |
+
sidecarScript: String,
|
| 52 |
+
modelID: String = "unsloth/gemma-4-E2B-it-UD-MLX-4bit"
|
| 53 |
+
) throws {
|
| 54 |
+
guard FileManager.default.fileExists(atPath: sidecarScript) else {
|
| 55 |
+
throw ServiceError.sidecarScriptMissing(sidecarScript)
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
process = Process()
|
| 59 |
+
process.executableURL = URL(fileURLWithPath: pythonPath)
|
| 60 |
+
if pythonPath.hasSuffix("/env") {
|
| 61 |
+
// `/usr/bin/env python3 <script>` — python3 must be first arg
|
| 62 |
+
process.arguments = ["python3", sidecarScript]
|
| 63 |
+
} else {
|
| 64 |
+
// Direct python3 binary — script is first arg
|
| 65 |
+
process.arguments = [sidecarScript]
|
| 66 |
+
}
|
| 67 |
+
process.standardInput = stdinPipe
|
| 68 |
+
process.standardOutput = stdoutPipe
|
| 69 |
+
process.standardError = stderrPipe
|
| 70 |
+
|
| 71 |
+
var env = ProcessInfo.processInfo.environment
|
| 72 |
+
env["GEMMA_MLX_MODEL"] = modelID
|
| 73 |
+
env["PYTHONUNBUFFERED"] = "1"
|
| 74 |
+
process.environment = env
|
| 75 |
+
|
| 76 |
+
try process.run()
|
| 77 |
+
|
| 78 |
+
// Read stdout continuously on a background queue
|
| 79 |
+
stdoutPipe.fileHandleForReading.readabilityHandler = { [weak self] h in
|
| 80 |
+
self?.handleStdoutChunk(h.availableData)
|
| 81 |
+
}
|
| 82 |
+
// Forward stderr to Console so we can see sidecar errors / progress
|
| 83 |
+
stderrPipe.fileHandleForReading.readabilityHandler = { h in
|
| 84 |
+
let data = h.availableData
|
| 85 |
+
guard !data.isEmpty,
|
| 86 |
+
let s = String(data: data, encoding: .utf8) else { return }
|
| 87 |
+
let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
|
| 88 |
+
if !trimmed.isEmpty {
|
| 89 |
+
NSLog("[gemma_sidecar] %@", trimmed)
|
| 90 |
+
}
|
| 91 |
+
}
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
deinit {
|
| 95 |
+
stop()
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
func stop() {
|
| 99 |
+
stdoutPipe.fileHandleForReading.readabilityHandler = nil
|
| 100 |
+
stderrPipe.fileHandleForReading.readabilityHandler = nil
|
| 101 |
+
if process.isRunning {
|
| 102 |
+
process.terminate()
|
| 103 |
+
}
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
/// Awaits the sidecar's first JSON line, which carries `{"event":"ready"}`.
|
| 107 |
+
func waitUntilReady() async throws {
|
| 108 |
+
let line = try await readNextLine()
|
| 109 |
+
guard let data = line.data(using: .utf8),
|
| 110 |
+
let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
|
| 111 |
+
obj["event"] as? String == "ready" else {
|
| 112 |
+
throw ServiceError.invalidResponse(line)
|
| 113 |
+
}
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
/// Writes an audio clip (int16 PCM or float32) to a temp WAV and
|
| 117 |
+
/// sends a transcribe+translate request. Returns both source and
|
| 118 |
+
/// translated text.
|
| 119 |
+
func transcribeAndTranslate(
|
| 120 |
+
audioFileURL: URL,
|
| 121 |
+
targetLanguage: String
|
| 122 |
+
) async throws -> Result {
|
| 123 |
+
let req: [String: Any] = [
|
| 124 |
+
"task": "transcribe_translate",
|
| 125 |
+
"audio_path": audioFileURL.path,
|
| 126 |
+
"target_lang": targetLanguage,
|
| 127 |
+
]
|
| 128 |
+
return try await send(req)
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
/// Convenience: writes an in-memory sample buffer to a temp WAV and
|
| 132 |
+
/// processes it. Caller owns the lifetime of the buffer data.
|
| 133 |
+
func transcribeAndTranslate(
|
| 134 |
+
pcm16: Data,
|
| 135 |
+
sampleRate: Double,
|
| 136 |
+
targetLanguage: String
|
| 137 |
+
) async throws -> Result {
|
| 138 |
+
let url = try writeTempWAV(pcm16: pcm16, sampleRate: sampleRate)
|
| 139 |
+
defer { try? FileManager.default.removeItem(at: url) }
|
| 140 |
+
return try await transcribeAndTranslate(
|
| 141 |
+
audioFileURL: url, targetLanguage: targetLanguage,
|
| 142 |
+
)
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
/// Single-pass translate. Half the inference cost of
|
| 146 |
+
/// transcribe_translate because Gemma is only prompted once. Returns
|
| 147 |
+
/// only translated text — sourceText is empty.
|
| 148 |
+
func translate(
|
| 149 |
+
pcm16: Data,
|
| 150 |
+
sampleRate: Double,
|
| 151 |
+
targetLanguage: String
|
| 152 |
+
) async throws -> Result {
|
| 153 |
+
let url = try writeTempWAV(pcm16: pcm16, sampleRate: sampleRate)
|
| 154 |
+
defer { try? FileManager.default.removeItem(at: url) }
|
| 155 |
+
let req: [String: Any] = [
|
| 156 |
+
"task": "translate",
|
| 157 |
+
"audio_path": url.path,
|
| 158 |
+
"target_lang": targetLanguage,
|
| 159 |
+
]
|
| 160 |
+
return try await send(req)
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
// MARK: - Private
|
| 164 |
+
|
| 165 |
+
private func send(_ request: [String: Any]) async throws -> Result {
|
| 166 |
+
let data = try JSONSerialization.data(withJSONObject: request, options: [])
|
| 167 |
+
// Write request + newline
|
| 168 |
+
writeQueue.async { [weak self] in
|
| 169 |
+
guard let self else { return }
|
| 170 |
+
self.stdinPipe.fileHandleForWriting.write(data)
|
| 171 |
+
self.stdinPipe.fileHandleForWriting.write("\n".data(using: .utf8)!)
|
| 172 |
+
}
|
| 173 |
+
let responseLine = try await readNextLine()
|
| 174 |
+
guard let respData = responseLine.data(using: .utf8),
|
| 175 |
+
let json = try? JSONSerialization.jsonObject(with: respData) as? [String: Any] else {
|
| 176 |
+
throw ServiceError.invalidResponse(responseLine)
|
| 177 |
+
}
|
| 178 |
+
if let ok = json["ok"] as? Bool, ok == false {
|
| 179 |
+
let msg = json["error"] as? String ?? "unknown"
|
| 180 |
+
throw ServiceError.sidecarFailed(msg)
|
| 181 |
+
}
|
| 182 |
+
let source = (json["source_text"] as? String) ?? ""
|
| 183 |
+
let translated = (json["translated_text"] as? String) ?? ""
|
| 184 |
+
let latency = (json["latency_ms"] as? Int) ?? 0
|
| 185 |
+
return Result(
|
| 186 |
+
sourceText: source,
|
| 187 |
+
translatedText: translated,
|
| 188 |
+
latencyMs: latency,
|
| 189 |
+
)
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
private func readNextLine() async throws -> String {
|
| 193 |
+
try await withCheckedThrowingContinuation { cont in
|
| 194 |
+
pendingLock.lock()
|
| 195 |
+
pendingResponses.append(cont)
|
| 196 |
+
pendingLock.unlock()
|
| 197 |
+
drainBuffer()
|
| 198 |
+
}
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
private func handleStdoutChunk(_ chunk: Data) {
|
| 202 |
+
guard !chunk.isEmpty else { return }
|
| 203 |
+
readQueue.async { [weak self] in
|
| 204 |
+
guard let self else { return }
|
| 205 |
+
self.buffer.append(chunk)
|
| 206 |
+
self.drainBuffer()
|
| 207 |
+
}
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
private func drainBuffer() {
|
| 211 |
+
readQueue.async { [weak self] in
|
| 212 |
+
guard let self else { return }
|
| 213 |
+
let newline: UInt8 = 0x0A
|
| 214 |
+
while let idx = self.buffer.firstIndex(of: newline) {
|
| 215 |
+
let lineData = self.buffer.prefix(upTo: idx)
|
| 216 |
+
self.buffer.removeSubrange(self.buffer.startIndex...idx)
|
| 217 |
+
let line = String(data: lineData, encoding: .utf8) ?? ""
|
| 218 |
+
|
| 219 |
+
self.pendingLock.lock()
|
| 220 |
+
let cont = self.pendingResponses.isEmpty
|
| 221 |
+
? nil
|
| 222 |
+
: self.pendingResponses.removeFirst()
|
| 223 |
+
self.pendingLock.unlock()
|
| 224 |
+
|
| 225 |
+
cont?.resume(returning: line)
|
| 226 |
+
}
|
| 227 |
+
}
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
/// Wraps 16-bit mono PCM in a 44-byte RIFF/WAVE header and writes it to a
/// uniquely named `banhmi-<UUID>.wav` file in the temporary directory.
///
/// - Parameters:
///   - pcm16: raw sample bytes, written verbatim after the header.
///   - sampleRate: sample rate stored in the header (truncated to `UInt32`).
/// - Returns: the URL of the file that was written.
/// - Throws: whatever `Data.write(to:)` throws.
private func writeTempWAV(pcm16: Data, sampleRate: Double) throws -> URL {
    let fileName = "banhmi-\(UUID().uuidString).wav"
    let destination = FileManager.default.temporaryDirectory
        .appendingPathComponent(fileName)

    let payloadSize = UInt32(pcm16.count)
    let bytesPerSecond = UInt32(sampleRate) * 2 // 16-bit mono
    let frameSize: UInt16 = 2

    func tag(_ text: String) -> Data { text.data(using: .ascii)! }

    // Header fields in on-disk order, followed by the sample payload.
    let chunks: [Data] = [
        tag("RIFF"),
        uint32LE(36 + payloadSize),
        tag("WAVE"),
        tag("fmt "),
        uint32LE(16),                   // Subchunk1Size
        uint16LE(1),                    // PCM
        uint16LE(1),                    // mono
        uint32LE(UInt32(sampleRate)),
        uint32LE(bytesPerSecond),
        uint16LE(frameSize),
        uint16LE(16),                   // bits per sample
        tag("data"),
        uint32LE(payloadSize),
        pcm16,
    ]

    var wav = Data()
    for chunk in chunks {
        wav.append(chunk)
    }
    try wav.write(to: destination)
    return destination
}
|
| 261 |
+
|
| 262 |
+
/// Encodes `v` as two bytes in little-endian order.
private func uint16LE(_ v: UInt16) -> Data {
    withUnsafeBytes(of: v.littleEndian) { raw in Data(raw) }
}
|
| 266 |
+
|
| 267 |
+
/// Encodes `v` as four bytes in little-endian order.
private func uint32LE(_ v: UInt32) -> Data {
    withUnsafeBytes(of: v.littleEndian) { raw in Data(raw) }
}
|
| 271 |
+
}
|
apps/macos/Sources/BanhMi/MicrophoneCapture.swift
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Foundation
|
| 2 |
+
import AVFoundation
|
| 3 |
+
|
| 4 |
+
/// Captures audio from the default input device (microphone).
///
/// Every buffer delivered to the input-node tap is forwarded to `onBuffer`
/// from inside the tap block. NOTE(review): the calling thread is chosen by
/// AVAudioEngine — do not assume the main thread.
final class MicrophoneCapture {
    private let engine = AVAudioEngine()

    /// `true` between a successful `start()` and the matching `stop()`.
    private(set) var isRunning = false

    /// Receives each captured buffer together with its capture time.
    var onBuffer: ((AVAudioPCMBuffer, AVAudioTime) -> Void)?

    /// Installs a tap on input bus 0 and starts the engine.
    ///
    /// Does nothing when capture is already running.
    /// - Throws: the error from `AVAudioEngine.start()`. The tap is removed
    ///   again before rethrowing so a later retry does not install a second
    ///   tap on the same bus.
    func start() throws {
        guard !isRunning else { return }

        let input = engine.inputNode
        let format = input.outputFormat(forBus: 0)

        input.installTap(onBus: 0, bufferSize: 1024, format: format) { [weak self] buffer, when in
            self?.onBuffer?(buffer, when)
        }

        engine.prepare()
        do {
            try engine.start()
        } catch {
            // Roll back: a bus accepts only one tap, so leaving this one in
            // place would make the next start() attempt fail hard.
            input.removeTap(onBus: 0)
            throw error
        }
        isRunning = true
    }

    /// Removes the tap and stops the engine. Does nothing when not running.
    func stop() {
        guard isRunning else { return }
        engine.inputNode.removeTap(onBus: 0)
        engine.stop()
        isRunning = false
    }

    /// Asks for audio capture access and returns whether it was granted.
    static func requestPermission() async -> Bool {
        await withCheckedContinuation { continuation in
            AVCaptureDevice.requestAccess(for: .audio) { granted in
                continuation.resume(returning: granted)
            }
        }
    }

    /// `true` when audio capture access is currently authorized.
    static var hasPermission: Bool {
        AVCaptureDevice.authorizationStatus(for: .audio) == .authorized
    }
}
|
apps/macos/Sources/BanhMi/SessionLogger.swift
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Foundation
|
| 2 |
+
import OSLog
|
| 3 |
+
|
| 4 |
+
/// Per-session logger. Each time the app launches we start a new session
/// directory under `~/Library/Logs/BanhMi/session-<timestamp>/` that holds:
///   - `session.jsonl`  — one line per utterance with timing + transcripts
///   - `utt-NNNN.wav`   — the exact audio we sent to Gemma (post-preprocess)
///
/// Call `record(...)` from the recognizer once per model round-trip.
/// Open the folder from the Settings popover to review a run.
///
/// All file writes are dispatched to a private serial queue and are
/// best-effort: failures are logged (or ignored for WAV files) and never
/// surfaced to the caller.
@MainActor
final class SessionLogger {
    static let shared = SessionLogger()

    private let log = Logger(subsystem: "vn.banhmi.chuyenngu", category: "session-log")
    private let queue = DispatchQueue(label: "session-logger", qos: .utility)
    private let fm = FileManager.default
    private let sessionDir: URL
    private let jsonlURL: URL
    private var utteranceIndex: Int = 0
    private let isoFormatter: ISO8601DateFormatter = {
        let f = ISO8601DateFormatter()
        f.formatOptions = [.withInternetDateTime, .withFractionalSeconds]
        return f
    }()

    private init() {
        // ~/Library/Logs/BanhMi/session-2026-04-22T00-05-12Z/
        let libraryLogs = fm.urls(for: .libraryDirectory, in: .userDomainMask)[0]
            .appendingPathComponent("Logs")
            .appendingPathComponent("BanhMi")
        let ts = Self.timestampSlug(Date())
        sessionDir = libraryLogs.appendingPathComponent("session-\(ts)")
        jsonlURL = sessionDir.appendingPathComponent("session.jsonl")

        do {
            try fm.createDirectory(at: sessionDir, withIntermediateDirectories: true)
        } catch {
            // Logging stays best-effort, but make the reason discoverable.
            log.error("Could not create session directory: \(error.localizedDescription, privacy: .public)")
        }
        // Write a tiny header line so jq can read it too
        let header: [String: Any] = [
            "event": "session_start",
            "ts": isoFormatter.string(from: Date()),
            "session_dir": sessionDir.path,
        ]
        appendJSON(header)
        log.info("Session log: \(self.sessionDir.path, privacy: .public)")
    }

    /// Open this session's folder in Finder. Safe to call from the UI.
    func revealInFinder() {
        NSWorkspace.shared.open(sessionDir)
    }

    var sessionDirectory: URL { sessionDir }

    /// Record one utterance round-trip.
    /// - Parameters:
    ///   - pcm16: the exact int16 PCM we sent to Gemma (post-preprocess)
    ///   - sampleRate: audio sample rate (typically 16000)
    ///   - rmsIn: RMS of the raw utterance before preprocessing
    ///   - rmsOut: RMS after preprocessing
    ///   - durationSec: utterance length
    ///   - latencyMs: sidecar round-trip time
    ///   - sourceText: transcript Gemma returned
    ///   - translatedText: translation Gemma returned (may equal source)
    ///   - targetLang: target language name
    ///   - engine: engine identifier, stored verbatim under the `engine` key
    ///   - error: non-nil if the call failed
    func record(
        pcm16: Data,
        sampleRate: Double,
        rmsIn: Float,
        rmsOut: Float,
        durationSec: Double,
        latencyMs: Int?,
        sourceText: String?,
        translatedText: String?,
        targetLang: String,
        engine: String,
        error: String? = nil
    ) {
        // Snapshot state that needs MainActor access before jumping queues
        let index = utteranceIndex
        utteranceIndex += 1

        let nowStr = isoFormatter.string(from: Date())
        let wavName = String(format: "utt-%04d.wav", index)
        let wavURL = sessionDir.appendingPathComponent(wavName)

        queue.async { [weak self] in
            guard let self else { return }
            // Write WAV
            Self.writeWAV(pcm16: pcm16, sampleRate: sampleRate, to: wavURL)

            // Write JSONL entry
            var entry: [String: Any] = [
                "event": "utterance",
                "ts": nowStr,
                "idx": index,
                "engine": engine,
                "audio": wavName,
                "dur_sec": Self.round2(durationSec),
                "rms_in": Self.round4(Double(rmsIn)),
                "rms_out": Self.round4(Double(rmsOut)),
                "target_lang": targetLang,
            ]
            // Optional fields are omitted entirely when absent or empty.
            if let latencyMs { entry["gemma_ms"] = latencyMs }
            if let sourceText, !sourceText.isEmpty { entry["source"] = sourceText }
            if let translatedText, !translatedText.isEmpty { entry["translated"] = translatedText }
            if let error { entry["error"] = error }

            self.appendJSONFromQueue(entry)
        }
    }

    // MARK: - File writers

    /// Schedules `obj` to be appended to the JSONL file on the writer queue.
    private func appendJSON(_ obj: [String: Any]) {
        queue.async { [weak self] in self?.appendJSONFromQueue(obj) }
    }

    /// Serializes `obj` as one JSON line and appends it to `session.jsonl`.
    /// Intended to run on `queue`. Objects that cannot be serialized are
    /// skipped; write failures are logged.
    private func appendJSONFromQueue(_ obj: [String: Any]) {
        guard let data = try? JSONSerialization.data(
            withJSONObject: obj,
            options: [.withoutEscapingSlashes, .sortedKeys]
        ) else { return }
        var line = data
        line.append(UInt8(ascii: "\n"))
        if let handle = try? FileHandle(forWritingTo: jsonlURL) {
            // Close on every path, including a failed write.
            defer { try? handle.close() }
            do {
                // The throwing variants report I/O failures as Swift errors;
                // the legacy seekToEndOfFile()/write(_:) raise Objective-C
                // exceptions that cannot be caught here.
                try handle.seekToEnd()
                try handle.write(contentsOf: line)
            } catch {
                log.error("Could not append to session log: \(error.localizedDescription, privacy: .public)")
            }
        } else {
            // No writable file yet: create it with this line as its content.
            try? line.write(to: jsonlURL, options: [.atomic])
        }
    }

    /// Writes `pcm16` as a 16-bit mono PCM WAV file at `url` (best-effort).
    private static func writeWAV(pcm16: Data, sampleRate: Double, to url: URL) {
        var out = Data()
        let dataSize = UInt32(pcm16.count)
        let sr = UInt32(sampleRate)
        let byteRate = sr * 2
        func u16(_ v: UInt16) -> Data {
            var le = v.littleEndian
            return Data(bytes: &le, count: 2)
        }
        func u32(_ v: UInt32) -> Data {
            var le = v.littleEndian
            return Data(bytes: &le, count: 4)
        }
        out.append("RIFF".data(using: .ascii)!)
        out.append(u32(36 + dataSize))
        out.append("WAVE".data(using: .ascii)!)
        out.append("fmt ".data(using: .ascii)!)
        out.append(u32(16))
        out.append(u16(1))    // PCM
        out.append(u16(1))    // mono
        out.append(u32(sr))
        out.append(u32(byteRate))
        out.append(u16(2))    // block align
        out.append(u16(16))   // bits per sample
        out.append("data".data(using: .ascii)!)
        out.append(u32(dataSize))
        out.append(pcm16)
        try? out.write(to: url, options: [.atomic])
    }

    // MARK: - Helpers

    /// Formats `date` in UTC as `yyyy-MM-dd'T'HH-mm-ss'Z'` for directory names.
    private static func timestampSlug(_ date: Date) -> String {
        let f = DateFormatter()
        f.locale = Locale(identifier: "en_US_POSIX")
        f.timeZone = TimeZone(secondsFromGMT: 0)
        f.dateFormat = "yyyy-MM-dd'T'HH-mm-ss'Z'"
        return f.string(from: date)
    }

    /// Rounds to two decimal places.
    private static func round2(_ v: Double) -> Double {
        (v * 100).rounded() / 100
    }

    /// Rounds to four decimal places.
    private static func round4(_ v: Double) -> Double {
        (v * 10_000).rounded() / 10_000
    }
}
|
| 183 |
+
|
| 184 |
+
#if canImport(AppKit)
|
| 185 |
+
import AppKit
|
| 186 |
+
#endif
|
apps/macos/Sources/BanhMi/Settings.swift
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Foundation
|
| 2 |
+
import CoreGraphics
|
| 3 |
+
|
| 4 |
+
/// Subtitle text size choices. The raw value is the one-letter code.
enum TextSize: String, CaseIterable, Identifiable {
    case small = "S", medium = "M", large = "L"

    var id: String { rawValue }

    /// Full name of the size.
    var displayName: String { traits.label }

    /// Font size in points.
    var pointSize: CGFloat { traits.points }

    /// Single lookup table backing both presentation properties.
    private var traits: (label: String, points: CGFloat) {
        switch self {
        case .small:
            return ("Small", 20)
        case .medium:
            return ("Medium", 28)
        case .large:
            return ("Large", 40)
        }
    }
}
|
| 27 |
+
|
| 28 |
+
/// Available speech engines.
enum ASREngine: String, CaseIterable, Identifiable {
    /// Fully offline Gemma 4 on Apple Silicon via MLX 4-bit
    case gemmaMLX

    var id: String { rawValue }

    /// Name shown for the engine.
    var displayName: String { traits.label }

    /// Symbol name used as the engine's icon.
    var iconName: String { traits.symbol }

    /// Single lookup table backing both presentation properties.
    private var traits: (label: String, symbol: String) {
        switch self {
        case .gemmaMLX:
            return ("Gemma 4 (offline)", "cpu.fill")
        }
    }
}
|
| 45 |
+
|
| 46 |
+
/// Where captured audio comes from.
enum AudioSource: String, CaseIterable, Identifiable {
    case systemAudio, microphone

    var id: String { rawValue }

    /// Name shown for the source.
    var displayName: String { traits.label }

    /// Symbol name used as the source's icon.
    var iconName: String { traits.symbol }

    /// Single lookup table backing both presentation properties.
    private var traits: (label: String, symbol: String) {
        switch self {
        case .systemAudio:
            return ("System audio", "speaker.wave.2.fill")
        case .microphone:
            return ("Microphone", "mic.fill")
        }
    }
}
|
| 66 |
+
|
| 67 |
+
/// One selectable language.
///
/// Field order is significant: it defines the memberwise initializer.
struct Language: Identifiable, Hashable {
    /// App-level tag used for storage (BCP-47 simple form).
    let id: String

    /// Language name in English.
    let name: String

    /// Language name as written in the language itself.
    let nativeName: String

    /// BCP-47 locale identifier used as a recognition hint.
    let speechLocale: String
}
|
| 73 |
+
|
| 74 |
+
/// Catalogue of languages the app offers, plus lookup helpers.
enum SupportedLanguages {
    static let all: [Language] = [
        Language(id: "en", name: "English", nativeName: "English", speechLocale: "en-US"),
        Language(id: "vi", name: "Vietnamese", nativeName: "Tiếng Việt", speechLocale: "vi-VN"),
        Language(id: "es", name: "Spanish", nativeName: "Español", speechLocale: "es-ES"),
        Language(id: "zh-CN", name: "Chinese (Simplified)", nativeName: "简体中文", speechLocale: "zh-CN"),
        Language(id: "ja", name: "Japanese", nativeName: "日本語", speechLocale: "ja-JP"),
        Language(id: "ko", name: "Korean", nativeName: "한국어", speechLocale: "ko-KR"),
        Language(id: "fr", name: "French", nativeName: "Français", speechLocale: "fr-FR"),
        Language(id: "de", name: "German", nativeName: "Deutsch", speechLocale: "de-DE"),
        Language(id: "pt", name: "Portuguese", nativeName: "Português", speechLocale: "pt-BR"),
        Language(id: "ru", name: "Russian", nativeName: "Русский", speechLocale: "ru-RU"),
        Language(id: "ar", name: "Arabic", nativeName: "العربية", speechLocale: "ar-SA"),
        Language(id: "hi", name: "Hindi", nativeName: "हिन्दी", speechLocale: "hi-IN"),
        Language(id: "id", name: "Indonesian", nativeName: "Bahasa Indonesia", speechLocale: "id-ID"),
        Language(id: "th", name: "Thai", nativeName: "ไทย", speechLocale: "th-TH"),
        Language(id: "it", name: "Italian", nativeName: "Italiano", speechLocale: "it-IT"),
        Language(id: "tr", name: "Turkish", nativeName: "Türkçe", speechLocale: "tr-TR"),
    ]

    /// Returns the entry whose `id` equals `id`, or `nil` when unsupported.
    static func named(_ id: String) -> Language? {
        all.first { $0.id == id }
    }

    /// Id of the supported language closest to the user's first preferred
    /// language, or `"en"` when nothing matches.
    ///
    /// Match order:
    ///   1. exact `language-REGION` tag (e.g. `zh-CN`)
    ///   2. bare language code (e.g. `vi`)
    ///   3. first supported regional variant of that language — so a system
    ///      set to `zh-Hans-US` or `zh-Hans` resolves to `zh-CN` instead of
    ///      falling through to English
    static var systemDefault: String {
        let preferred = Locale.preferredLanguages.first ?? "en"
        // Parse the identifier once and reuse its components.
        let language = Locale(identifier: preferred).language
        let code = language.languageCode?.identifier ?? "en"
        let tag = language.region.map { "\(code)-\($0.identifier)" } ?? code

        if named(tag) != nil { return tag }
        if named(code) != nil { return code }
        if let variant = all.first(where: { $0.id.hasPrefix("\(code)-") }) {
            return variant.id
        }
        return "en"
    }
}
|
| 108 |
+
|
| 109 |
+
/// String keys under which settings are stored. Each constant's value
/// equals its own name.
enum SettingsKey {
    // Transcription preferences
    static let outputLanguage = "outputLanguage"
    static let textSize = "textSize"
    static let audioSource = "audioSource"
    static let asrEngine = "asrEngine"
    static let isEnabled = "isEnabled"

    // Overlay geometry
    static let overlayWidth = "overlayWidth"
    static let overlayOriginX = "overlayOriginX"
    static let overlayOriginY = "overlayOriginY"
}
|
apps/macos/Sources/BanhMi/SettingsView.swift
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import SwiftUI
|
| 2 |
+
|
| 3 |
+
/// Settings popover content: status header, pause/resume button, pickers for
/// audio source, engine, language and subtitle size, and a footer with
/// Quit / Logs actions plus the current error or version label.
struct SettingsView: View {
    @ObservedObject var state: AppState
    let onToggleEnabled: () -> Void

    var body: some View {
        VStack(alignment: .leading, spacing: 18) {
            header
            Divider()
            transcriptionControl
            audioSourceSection
            engineSection
            languageSection
            textSizeSection
            Spacer()
            footer
        }
        .padding(20)
        .frame(width: 340, height: 520)
    }

    // MARK: - Shared building blocks

    /// Secondary-coloured section heading with an icon.
    /// Takes a `LocalizedStringKey` so literals keep their localization lookup.
    private func sectionTitle(_ title: LocalizedStringKey, systemImage: String) -> some View {
        Label(title, systemImage: systemImage)
            .font(.subheadline)
            .foregroundStyle(.secondary)
    }

    /// Small wrapping explanatory note shown under a picker.
    private func hint(_ text: LocalizedStringKey) -> some View {
        Text(text)
            .font(.caption2)
            .foregroundStyle(.secondary)
            .fixedSize(horizontal: false, vertical: true)
    }

    // MARK: - Sections

    private var header: some View {
        HStack(spacing: 10) {
            Image(systemName: "waveform.circle")
                .font(.system(size: 28))
                .foregroundStyle(.secondary)
            VStack(alignment: .leading, spacing: 2) {
                Text("Bánh mì chuyển ngữ")
                    .font(.headline)
                Text(state.statusMessage)
                    .font(.caption)
                    .foregroundStyle(.secondary)
                    .lineLimit(1)
            }
            Spacer()
        }
    }

    private var transcriptionControl: some View {
        // Orange "pause" look while enabled, green "resume" look while paused.
        let running = state.isEnabled
        let tint: Color = running ? .orange : .green
        let symbol = running ? "pause.circle.fill" : "play.circle.fill"
        let caption: LocalizedStringKey = running ? "Pause transcription" : "Resume transcription"

        return Button(action: onToggleEnabled) {
            HStack(spacing: 8) {
                Image(systemName: symbol)
                    .font(.system(size: 18))
                Text(caption)
                    .fontWeight(.medium)
                Spacer()
            }
            .frame(maxWidth: .infinity)
            .padding(.vertical, 10)
            .padding(.horizontal, 14)
            .background(
                RoundedRectangle(cornerRadius: 10, style: .continuous)
                    .fill(tint.opacity(running ? 0.15 : 0.18))
            )
            .foregroundStyle(tint)
        }
        .buttonStyle(.plain)
    }

    private var audioSourceSection: some View {
        VStack(alignment: .leading, spacing: 8) {
            sectionTitle("Audio source", systemImage: "waveform.badge.plus")
            Picker("", selection: $state.audioSource) {
                ForEach(AudioSource.allCases) { option in
                    Label(option.displayName, systemImage: option.iconName)
                        .tag(option)
                }
            }
            .labelsHidden()
            .pickerStyle(.segmented)
            if state.audioSource == .systemAudio {
                hint("System audio uses macOS Screen Recording. You'll be asked to approve once — the grant is remembered for this build.")
            }
        }
    }

    private var engineSection: some View {
        VStack(alignment: .leading, spacing: 8) {
            sectionTitle("Speech engine", systemImage: "cpu")
            Picker("", selection: $state.asrEngine) {
                ForEach(ASREngine.allCases) { option in
                    Label(option.displayName, systemImage: option.iconName)
                        .tag(option)
                }
            }
            .labelsHidden()
            .pickerStyle(.segmented)
            if state.asrEngine == .gemmaMLX {
                hint("Gemma 4 runs fully offline on Apple Silicon (~4.8 GB). First launch downloads the model.")
            }
        }
    }

    private var languageSection: some View {
        VStack(alignment: .leading, spacing: 8) {
            sectionTitle("Your language", systemImage: "globe")
            Picker("", selection: $state.languageID) {
                ForEach(SupportedLanguages.all) { entry in
                    Text(entry.nativeName).tag(entry.id)
                }
            }
            .labelsHidden()
            .pickerStyle(.menu)
        }
    }

    private var textSizeSection: some View {
        VStack(alignment: .leading, spacing: 8) {
            sectionTitle("Subtitle size", systemImage: "textformat.size")
            Picker("", selection: $state.textSize) {
                ForEach(TextSize.allCases) { option in
                    Text(option.rawValue).tag(option)
                }
            }
            .labelsHidden()
            .pickerStyle(.segmented)
            // Spell out the selected size under the one-letter segments.
            Text(state.textSize.displayName)
                .font(.caption)
                .foregroundStyle(.secondary)
        }
    }

    private var footer: some View {
        HStack {
            Button("Quit") {
                NSApplication.shared.terminate(nil)
            }
            .keyboardShortcut("q", modifiers: .command)

            Button {
                SessionLogger.shared.revealInFinder()
            } label: {
                Label("Logs", systemImage: "doc.text.magnifyingglass")
            }
            .buttonStyle(.borderless)
            .font(.caption)

            Spacer()

            // The error message, when present, takes the version label's place.
            if let problem = state.errorMessage {
                Text(problem)
                    .font(.caption2)
                    .foregroundStyle(.red)
                    .lineLimit(2)
                    .frame(maxWidth: 200, alignment: .trailing)
            } else {
                Text("v0.5")
                    .font(.caption2)
                    .foregroundStyle(.secondary)
            }
        }
    }
}
|
apps/macos/Sources/BanhMi/SubtitleOverlay.swift
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import AppKit
|
| 2 |
+
import SwiftUI
|
| 3 |
+
import Combine
|
| 4 |
+
|
| 5 |
+
/// Borderless, non-activating panel that hosts the subtitle bubble.
/// It is transparent, sits at screen-saver window level, joins all Spaces,
/// and never becomes key or main.
final class SubtitleOverlayPanel: NSPanel {
    init() {
        let placeholderFrame = NSRect(x: 0, y: 0, width: 900, height: 220)
        super.init(
            contentRect: placeholderFrame,
            styleMask: [.borderless, .nonactivatingPanel],
            backing: .buffered,
            defer: false
        )

        // Fully transparent chrome: only the hosted content is visible.
        backgroundColor = .clear
        isOpaque = false
        hasShadow = false

        // Stay above other windows, on every Space and beside full-screen apps.
        level = .screenSaver
        collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary, .stationary, .ignoresCycle]
        hidesOnDeactivate = false

        // Mouse handling is decided per-view by the hosting view's hit test.
        ignoresMouseEvents = false
        isMovableByWindowBackground = false

        isReleasedWhenClosed = false
    }

    override var canBecomeMain: Bool { false }
    override var canBecomeKey: Bool { false }
}
|
| 30 |
+
|
| 31 |
+
/// Hosting view whose hit test reports "no view" whenever the hit lands on
/// the host itself, so only hosted subviews receive mouse events.
final class ClickThroughHostingView<Content: View>: NSHostingView<Content> {
    override func hitTest(_ point: NSPoint) -> NSView? {
        let target = super.hitTest(point)
        // A hit on the root host means a transparent zone: pass it through.
        return target === self ? nil : target
    }
}
|
| 41 |
+
|
| 42 |
+
/// Owns the subtitle overlay panel: builds its SwiftUI content, shows and
/// hides it, applies the saved or default frame, moves it in response to
/// drag deltas, and keeps it inside the main screen's visible frame.
@MainActor
final class SubtitleOverlayController {
    static let defaultWidth: CGFloat = 900
    static let defaultHeight: CGFloat = 220
    static let minWidth: CGFloat = 360
    static let maxWidth: CGFloat = 1600

    private let panel: SubtitleOverlayPanel
    private let state: AppState
    /// Holds the screen-parameter subscription for the controller's lifetime.
    private var cancellables = Set<AnyCancellable>()

    init(state: AppState) {
        self.state = state
        self.panel = SubtitleOverlayPanel()

        let root = SubtitleView(
            state: state,
            onDrag: { [weak self] delta in self?.dragPanel(by: delta) }
        )
        let host = ClickThroughHostingView(rootView: root)
        host.translatesAutoresizingMaskIntoConstraints = true
        panel.contentView = host

        // Re-clamp when displays change. Subscribing through Combine ties
        // the observation to `cancellables`, so it is torn down with the
        // controller; the block-based observer used before discarded its
        // token and could never be removed.
        NotificationCenter.default
            .publisher(for: NSApplication.didChangeScreenParametersNotification)
            .sink { [weak self] _ in
                Task { @MainActor in self?.clampToScreen() }
            }
            .store(in: &cancellables)
    }

    /// Positions the panel and brings it to the front without activating the app.
    func show() {
        applyInitialFrame()
        panel.orderFrontRegardless()
    }

    /// Removes the panel from the screen.
    func hide() {
        panel.orderOut(nil)
    }

    /// Sets the panel frame from `state.overlayWidth` / `state.overlayOrigin`,
    /// defaulting to horizontally centred, 60 pt above the bottom of the
    /// visible frame, then clamps it on screen.
    private func applyInitialFrame() {
        guard let screen = NSScreen.main else { return }
        let visible = screen.visibleFrame
        let width = clampWidth(state.overlayWidth)
        let height = Self.defaultHeight

        let x: CGFloat
        let y: CGFloat
        if let saved = state.overlayOrigin {
            x = saved.x
            y = saved.y
        } else {
            x = visible.midX - width / 2
            y = visible.minY + 60
        }
        panel.setFrame(NSRect(x: x, y: y, width: width, height: height), display: true, animate: false)
        clampToScreen()
    }

    /// Moves the panel by an incremental drag delta and stores the resulting
    /// (clamped) origin in `state.overlayOrigin`.
    private func dragPanel(by delta: CGSize) {
        var frame = panel.frame
        // SwiftUI drag deltas: +x right, +y down. AppKit window coords: +y up.
        frame.origin.x += delta.width
        frame.origin.y -= delta.height
        panel.setFrame(frame, display: true)
        clampToScreen()
        state.overlayOrigin = CGPoint(x: panel.frame.origin.x, y: panel.frame.origin.y)
    }

    /// Limits `w` to the `minWidth...maxWidth` range.
    private func clampWidth(_ w: CGFloat) -> CGFloat {
        min(Self.maxWidth, max(Self.minWidth, w))
    }

    /// Pulls the panel's origin back inside `NSScreen.main`'s visible frame.
    private func clampToScreen() {
        guard let screen = NSScreen.main else { return }
        let visible = screen.visibleFrame
        var frame = panel.frame
        frame.origin.x = min(max(frame.origin.x, visible.minX), visible.maxX - frame.width)
        frame.origin.y = min(max(frame.origin.y, visible.minY), visible.maxY - frame.height)
        panel.setFrame(frame, display: true)
    }
}
|
| 125 |
+
|
| 126 |
+
// MARK: - SwiftUI content
|
| 127 |
+
|
| 128 |
+
/// SwiftUI content of the subtitle overlay. Shows an error bubble when
/// `state.errorMessage` is set, otherwise the stack of subtitle lines,
/// pinned to the bottom of the available area.
struct SubtitleView: View {
    @ObservedObject var state: AppState
    /// Called with the movement since the previous drag update.
    let onDrag: (CGSize) -> Void

    // Translation reported by the previous drag update; the gesture reports
    // cumulative translation, so the difference is what gets forwarded.
    @State private var dragLast: CGSize = .zero

    var body: some View {
        ZStack(alignment: .bottom) {
            Color.clear
            VStack {
                Spacer()
                content
            }
        }
    }

    /// Error bubble takes precedence; nothing is drawn when there is neither
    /// an error nor any line.
    @ViewBuilder
    private var content: some View {
        if let error = state.errorMessage {
            errorBubble(error)
        } else if !state.lines.isEmpty {
            lineStack
        }
    }

    /// Vertical stack of subtitle lines, newest last (bottom). Each line is
    /// dimmed according to how many newer lines follow it.
    private var lineStack: some View {
        VStack(spacing: 6) {
            ForEach(Array(state.lines.enumerated()), id: \.element.id) { position, entry in
                // age 0 is the newest line.
                lineBubble(text: entry.text, age: state.lines.count - 1 - position)
                    .transition(.asymmetric(
                        insertion: .move(edge: .bottom).combined(with: .opacity),
                        removal: .move(edge: .top).combined(with: .opacity)
                    ))
            }
        }
        .padding(.horizontal, 8)
        .animation(.easeInOut(duration: 0.35), value: state.lines.map(\.id))
        .gesture(moveGesture)
    }

    /// Drag gesture that forwards incremental deltas to `onDrag`.
    private var moveGesture: some Gesture {
        DragGesture(minimumDistance: 2)
            .onChanged { update in
                let previous = dragLast
                dragLast = update.translation
                onDrag(CGSize(
                    width: update.translation.width - previous.width,
                    height: update.translation.height - previous.height
                ))
            }
            .onEnded { _ in dragLast = .zero }
    }

    /// One subtitle line. Opacity drops by 0.25 per step of `age`, never
    /// below 0.45.
    private func lineBubble(text: String, age: Int) -> some View {
        Text(text)
            .font(.system(size: state.textSize.pointSize, weight: .semibold, design: .rounded))
            .foregroundStyle(.white)
            .multilineTextAlignment(.center)
            .lineLimit(3)
            .padding(.horizontal, 24)
            .padding(.vertical, 10)
            .background(
                RoundedRectangle(cornerRadius: 14, style: .continuous)
                    .fill(Color.black.opacity(0.72))
            )
            .shadow(color: .black.opacity(0.35), radius: 8, y: 3)
            .opacity(max(0.45, 1.0 - Double(age) * 0.25))
    }

    /// Red bubble used for error messages.
    private func errorBubble(_ message: String) -> some View {
        Text(message)
            .font(.system(size: 14, weight: .medium, design: .rounded))
            .foregroundStyle(.white)
            .multilineTextAlignment(.center)
            .padding(.horizontal, 20)
            .padding(.vertical, 10)
            .background(
                RoundedRectangle(cornerRadius: 12, style: .continuous)
                    .fill(Color.red.opacity(0.8))
            )
    }
}
|
| 215 |
+
|
apps/macos/Sources/BanhMi/SystemAudioCapture.swift
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Foundation
|
| 2 |
+
import AVFoundation
|
| 3 |
+
import ScreenCaptureKit
|
| 4 |
+
import CoreMedia
|
| 5 |
+
import CoreGraphics
|
| 6 |
+
|
| 7 |
+
/// Streams system audio (whatever macOS is playing) through ScreenCaptureKit
/// and hands it to `onBuffer` as `AVAudioPCMBuffer`s.
/// Requires Screen Recording permission.
final class SystemAudioCapture: NSObject, SCStreamOutput, SCStreamDelegate {
    private var stream: SCStream?
    private let audioQueue = DispatchQueue(label: "vn.banhmi.audioQueue")
    private(set) var isRunning = false

    /// Invoked on `audioQueue` for every converted audio buffer.
    var onBuffer: ((AVAudioPCMBuffer, AVAudioTime) -> Void)?
    /// Invoked when the stream stops with an error.
    var onError: ((Error) -> Void)?

    /// Starts capturing. Does nothing when capture is already running.
    ///
    /// - Throws: an `NSError` (code 403) when Screen Recording access is not
    ///   granted, an `NSError` (code 1) when no display is available, or any
    ///   error raised by ScreenCaptureKit.
    func start() async throws {
        if isRunning { return }

        try Self.ensureScreenCaptureAccess()

        // Any display will do — the audio of the whole display is captured.
        let shareable = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: true)
        guard let display = shareable.displays.first else {
            throw NSError(domain: "SystemAudioCapture", code: 1,
                          userInfo: [NSLocalizedDescriptionKey: "No display found."])
        }

        let newStream = SCStream(
            filter: SCContentFilter(display: display, excludingWindows: []),
            configuration: Self.makeConfiguration(),
            delegate: self
        )
        try newStream.addStreamOutput(self, type: .audio, sampleHandlerQueue: audioQueue)
        try await newStream.startCapture()

        stream = newStream
        isRunning = true
    }

    /// Stops capturing. Errors from ScreenCaptureKit are ignored because the
    /// stream is being torn down anyway.
    func stop() async {
        guard let active = stream else { return }
        try? await active.stopCapture()
        stream = nil
        isRunning = false
    }

    // MARK: - SCStreamOutput

    func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer, of type: SCStreamOutputType) {
        guard type == .audio,
              sampleBuffer.isValid,
              sampleBuffer.numSamples > 0,
              let converted = Self.pcmBuffer(from: sampleBuffer) else { return }

        // Buffers are stamped with the host time at delivery.
        onBuffer?(converted, AVAudioTime(hostTime: mach_absolute_time()))
    }

    // MARK: - SCStreamDelegate

    func stream(_ stream: SCStream, didStopWithError error: Error) {
        isRunning = false
        onError?(error)
    }

    // MARK: - Helpers

    /// Checks Screen Recording access and requests it when missing. The
    /// explicit request makes macOS show the prompt on first use; a refusal
    /// is surfaced as a descriptive error instead of a silent failure.
    private static func ensureScreenCaptureAccess() throws {
        // The request is only issued when the preflight check fails.
        guard CGPreflightScreenCaptureAccess() || CGRequestScreenCaptureAccess() else {
            throw NSError(
                domain: "SystemAudioCapture",
                code: 403,
                userInfo: [NSLocalizedDescriptionKey:
                    "Screen Recording permission is required for system audio. Open System Settings → Privacy & Security → Screen & System Audio Recording, enable Bánh mì chuyển ngữ, then relaunch the app."]
            )
        }
    }

    /// Stream configuration: 48 kHz stereo audio, excluding this process's own
    /// output. Video frames are not used, so the video side is kept minimal
    /// (2×2 pixels, one frame per second).
    private static func makeConfiguration() -> SCStreamConfiguration {
        let config = SCStreamConfiguration()
        config.capturesAudio = true
        config.sampleRate = 48_000
        config.channelCount = 2
        config.excludesCurrentProcessAudio = true
        config.width = 2
        config.height = 2
        config.minimumFrameInterval = CMTime(value: 1, timescale: 1)
        config.queueDepth = 5
        return config
    }

    /// Copies the PCM data of a ScreenCaptureKit `CMSampleBuffer` into a new
    /// `AVAudioPCMBuffer`. Returns `nil` when the format cannot be read, the
    /// buffer cannot be allocated, or the copy fails.
    private static func pcmBuffer(from sampleBuffer: CMSampleBuffer) -> AVAudioPCMBuffer? {
        guard let description = CMSampleBufferGetFormatDescription(sampleBuffer),
              let basicDescription = CMAudioFormatDescriptionGetStreamBasicDescription(description) else {
            return nil
        }

        var streamDescription = basicDescription.pointee
        guard let format = AVAudioFormat(streamDescription: &streamDescription) else { return nil }

        let frameCount = AVAudioFrameCount(CMSampleBufferGetNumSamples(sampleBuffer))
        guard let output = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
            return nil
        }
        output.frameLength = frameCount

        let copyStatus = CMSampleBufferCopyPCMDataIntoAudioBufferList(
            sampleBuffer,
            at: 0,
            frameCount: Int32(frameCount),
            into: output.mutableAudioBufferList
        )
        guard copyStatus == noErr else { return nil }
        return output
    }
}
|
apps/macos/Sources/BanhMi/TranscriptionController.swift
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Foundation
|
| 2 |
+
import AVFoundation
|
| 3 |
+
import AppKit
|
| 4 |
+
import Combine
|
| 5 |
+
import OSLog
|
| 6 |
+
|
| 7 |
+
/// Recognizer interface that `TranscriptionController` talks to, so the
/// controller needs no engine-specific branches. `GemmaMLXRecognizer`
/// conforms to it.
@MainActor
protocol AudioRecognizer: AnyObject {
    // MARK: Lifecycle

    /// Applies source locale, target language code and context before a task starts.
    func configure(sourceLocale: Locale, targetLanguageCode: String, context: String) throws
    /// Begins (or re-begins) a recognition task.
    func startTask() throws
    /// Feeds one captured audio buffer to the recognizer.
    func append(_ buffer: AVAudioPCMBuffer)
    /// Ends recognition.
    func stop()

    // MARK: Callbacks

    /// Stable text update: (text, isFinal, detected source language if any).
    var onStableUpdate: ((String, Bool, String?) -> Void)? { get set }
    /// Completed utterance; the second value is optional.
    /// NOTE(review): presumably the detected source language — confirm against the recognizer.
    var onFinalUtterance: ((String, String?) -> Void)? { get set }
    /// Completion callback without payload.
    var onFinal: (() -> Void)? { get set }
    /// Reports a recognizer error.
    var onError: ((Error) -> Void)? { get set }
}
|
| 20 |
+
|
| 21 |
+
extension GemmaMLXRecognizer: AudioRecognizer {}
|
| 22 |
+
|
| 23 |
+
/// Orchestrates audio capture + streaming recognition (Gemma 4 MLX
/// offline) and publishes stable updates to `AppState`. Optimized for
/// sub-second end-to-end latency.
///
/// The controller owns one microphone capture, one system-audio capture and
/// at most one recognizer. Changes to `audioSource`, `languageID` or
/// `asrEngine` on `AppState` trigger a full stop/start cycle.
@MainActor
final class TranscriptionController {
    private let state: AppState
    private let mic = MicrophoneCapture()
    private let system = SystemAudioCapture()

    private var recognizer: AudioRecognizer?
    private var cancellables = Set<AnyCancellable>()
    private var recognitionRestartWork: DispatchWorkItem?

    // What the running session was started with; nil while stopped. Compared
    // against `state` in `handleSettingsChange()` to detect real changes.
    private var activeSource: AudioSource?
    private var configuredLanguageID: String?
    private var configuredEngine: ASREngine?

    private let log = Logger(subsystem: "vn.banhmi.chuyenngu", category: "stt")

    /// Wires capture callbacks and subscribes to the settings that require a
    /// restart. Nothing is started here; call `bootstrap()`.
    init(state: AppState) {
        self.state = state

        system.onError = { [weak self] error in
            Task { @MainActor in
                guard let self else { return }
                self.state.errorMessage = "System audio stopped: \(error.localizedDescription)"
                await self.stopAll()
                self.state.isListening = false
                self.state.statusMessage = "Paused"
            }
        }

        // Both capture sources feed the current recognizer; the timestamp is unused.
        let forward: (AVAudioPCMBuffer, AVAudioTime) -> Void = { [weak self] buf, _ in
            self?.recognizer?.append(buf)
        }
        mic.onBuffer = forward
        system.onBuffer = forward

        // dropFirst() skips the value replayed on subscription, so only real
        // changes trigger a restart.
        state.$audioSource
            .dropFirst()
            .removeDuplicates()
            .sink { [weak self] _ in
                Task { await self?.handleSettingsChange() }
            }
            .store(in: &cancellables)

        state.$languageID
            .dropFirst()
            .removeDuplicates()
            .sink { [weak self] _ in
                Task { await self?.handleSettingsChange() }
            }
            .store(in: &cancellables)

        state.$asrEngine
            .dropFirst()
            .removeDuplicates()
            .sink { [weak self] _ in
                Task { await self?.handleSettingsChange() }
            }
            .store(in: &cancellables)
    }

    // MARK: - Public API

    /// Starts listening if the app is enabled, otherwise shows "Paused".
    func bootstrap() async {
        if state.isEnabled {
            await start()
        } else {
            state.statusMessage = "Paused"
        }
    }

    /// Marks the app enabled and starts listening.
    func resume() async {
        state.isEnabled = true
        await start()
    }

    /// Marks the app disabled, stops everything and clears transcript state.
    func pause() async {
        state.isEnabled = false
        await stopAll()
        state.isListening = false
        state.transcript = ""
        state.translatedTranscript = ""
        state.detectedSourceLanguage = nil
        state.clearLines()
        state.statusMessage = "Paused"
    }

    /// Stops capture and recognition without touching `AppState`.
    func shutdown() async {
        recognitionRestartWork?.cancel()
        await stopAll()
    }

    // MARK: - Start / stop

    /// Creates and configures a recognizer for the selected engine, then
    /// starts the selected capture source. Failures are reported through
    /// `state.statusMessage` / `state.errorMessage`.
    private func start() async {
        // A second start() (e.g. resume() while already listening) must not
        // overwrite a live recognizer or capture session without stopping it.
        if recognizer != nil || activeSource != nil {
            await stopAll()
        }

        state.errorMessage = nil
        state.transcript = ""
        state.translatedTranscript = ""
        state.detectedSourceLanguage = nil
        state.clearLines()
        state.statusMessage = "Starting…"

        let recognizer: AudioRecognizer
        switch state.asrEngine {
        case .gemmaMLX:
            guard let sidecarPath = Self.findSidecarScript() else {
                state.statusMessage = "Gemma sidecar not found"
                state.errorMessage = "Expected gemma_sidecar.py under apps/macos/scripts/. Rebuild with ./build.sh."
                state.isListening = false
                return
            }
            let pythonPath = Self.findPythonWithMLX()
                ?? "/usr/bin/env"  // last-ditch fallback
            do {
                recognizer = try GemmaMLXRecognizer(
                    sidecarScriptPath: sidecarPath,
                    pythonPath: pythonPath
                )
                state.statusMessage = "Loading Gemma 4 (first launch ~1 min)…"
            } catch {
                state.isListening = false
                state.statusMessage = "Could not start Gemma"
                state.errorMessage = error.localizedDescription
                return
            }
        }

        wireCallbacks(on: recognizer)
        self.recognizer = recognizer

        do {
            try recognizer.configure(
                sourceLocale: Locale(identifier: state.currentLanguage.speechLocale),
                targetLanguageCode: baseLanguageCode(state.languageID),
                context: ""
            )
            try recognizer.startTask()
        } catch {
            self.recognizer = nil
            state.isListening = false
            state.statusMessage = "Could not start \(state.asrEngine.displayName)"
            state.errorMessage = error.localizedDescription
            return
        }

        switch state.audioSource {
        case .microphone:
            await startMicrophone()
        case .systemAudio:
            await startSystemAudio()
        }
        configuredLanguageID = state.languageID
        configuredEngine = state.asrEngine
    }

    /// Locate a python3 interpreter that has `mlx-vlm` importable. Launched
    /// .app bundles don't inherit shell PATH, so we search known locations.
    ///
    /// Each existing candidate is probed by running `import mlx_vlm`; the
    /// first one that exits with status 0 is returned.
    ///
    /// - Returns: path of a suitable interpreter, or `nil` when none qualifies.
    private static func findPythonWithMLX() -> String? {
        let candidates: [String] = [
            // 1) User override
            ProcessInfo.processInfo.environment["BANHMI_PYTHON"] ?? "",
            // 2) Unsloth's installer venv (recommended path)
            (NSHomeDirectory() as NSString)
                .appendingPathComponent(".unsloth/unsloth_gemma4_mlx/bin/python3"),
            // 3) Project venv (dev machine)
            "/Users/duytran10/Documents/Others/Any2AnyModels/venv_local/bin/python3",
            // 4) Homebrew python3
            "/opt/homebrew/bin/python3",
            // 5) System python3 (unlikely to have mlx-vlm, but try)
            "/usr/bin/python3",
        ].filter { !$0.isEmpty }

        let fm = FileManager.default
        for candidate in candidates where fm.isExecutableFile(atPath: candidate) {
            // Quick check: does this python import mlx_vlm?
            let probe = Process()
            probe.executableURL = URL(fileURLWithPath: candidate)
            probe.arguments = ["-c", "import mlx_vlm"]
            // Discard output. An unread Pipe() can fill up and block the child,
            // which would make waitUntilExit() below hang forever.
            probe.standardOutput = FileHandle.nullDevice
            probe.standardError = FileHandle.nullDevice
            do {
                try probe.run()
                probe.waitUntilExit()
                if probe.terminationStatus == 0 {
                    return candidate
                }
            } catch {
                continue
            }
        }
        return nil
    }

    /// Locate the Gemma sidecar Python script. Checks, in order: the app
    /// bundle's Resources, a `scripts/` directory four levels above the
    /// executable (development layout), and an absolute development path.
    ///
    /// - Returns: path of the script, or `nil` when none of the locations has it.
    private static func findSidecarScript() -> String? {
        let fm = FileManager.default
        // 1) Bundled under Resources/scripts/gemma_sidecar.py
        if let bundled = Bundle.main.resourceURL?
            .appendingPathComponent("scripts/gemma_sidecar.py").path,
           fm.fileExists(atPath: bundled) {
            return bundled
        }
        // 2) Dev path: apps/macos/scripts/gemma_sidecar.py relative to binary
        if let exe = Bundle.main.executableURL {
            let dev = exe
                .deletingLastPathComponent()  // Contents/MacOS
                .deletingLastPathComponent()  // Contents
                .deletingLastPathComponent()  // <App>.app
                .deletingLastPathComponent()  // parent
                .appendingPathComponent("scripts/gemma_sidecar.py").path
            if fm.fileExists(atPath: dev) { return dev }
        }
        // 3) Absolute fallback during development
        let repoGuess = "/Users/duytran10/Documents/Others/Any2AnyModels/apps/macos/scripts/gemma_sidecar.py"
        if fm.fileExists(atPath: repoGuess) { return repoGuess }
        return nil
    }

    /// Requests microphone permission if needed and starts microphone
    /// capture. On refusal or failure the recognizer is stopped and dropped.
    private func startMicrophone() async {
        let granted = MicrophoneCapture.hasPermission
            ? true
            : await MicrophoneCapture.requestPermission()
        guard granted else {
            recognizer?.stop()
            recognizer = nil
            state.statusMessage = "Microphone not authorized"
            state.errorMessage = "Enable in System Settings → Privacy & Security → Microphone."
            state.isListening = false
            return
        }

        do {
            try mic.start()
            activeSource = .microphone
            state.isListening = true
            state.statusMessage = "Listening to microphone"
        } catch {
            recognizer?.stop()
            recognizer = nil
            state.isListening = false
            state.statusMessage = "Could not start microphone"
            state.errorMessage = error.localizedDescription
        }
    }

    /// Starts system-audio capture. On failure the recognizer is stopped and
    /// dropped; errors that look permission-related get a dedicated hint.
    private func startSystemAudio() async {
        do {
            try await system.start()
            activeSource = .systemAudio
            state.isListening = true
            state.statusMessage = "Listening to system audio"
        } catch {
            recognizer?.stop()
            recognizer = nil
            state.isListening = false
            state.statusMessage = "Could not capture system audio"
            // Heuristic: permission failures are recognized by message content.
            let message = error.localizedDescription.lowercased()
            if message.contains("tcc") || message.contains("declined") || message.contains("permission") {
                state.errorMessage = "Grant Screen Recording in System Settings → Privacy & Security → Screen Recording, then relaunch."
            } else {
                state.errorMessage = error.localizedDescription
            }
        }
    }

    /// Cancels any pending restart, stops both capture sources and the
    /// recognizer, and forgets the running configuration.
    private func stopAll() async {
        recognitionRestartWork?.cancel()
        recognitionRestartWork = nil
        mic.stop()
        await system.stop()
        recognizer?.stop()
        recognizer = nil
        activeSource = nil
        configuredLanguageID = nil
        configuredEngine = nil
    }

    // MARK: - Wiring

    /// Connects the recognizer's callbacks to `AppState`.
    private func wireCallbacks(on recognizer: AudioRecognizer) {
        recognizer.onStableUpdate = { [weak self] text, isFinal, detected in
            guard let self else { return }
            // A successful update clears any previously shown error.
            if self.state.errorMessage != nil {
                self.state.errorMessage = nil
            }
            if let detected { self.state.detectedSourceLanguage = detected }
            self.state.translatedTranscript = text
            self.state.showLive(text: text, isFinal: isFinal)
        }
        recognizer.onFinalUtterance = { _, _ in
            // No-op: onStableUpdate with isFinal=true already drove the UI.
        }
        recognizer.onFinal = {}
        recognizer.onError = { [weak self] error in
            guard let self else { return }
            self.state.errorMessage = error.localizedDescription
            self.scheduleRecognitionRestart(delay: 2.0)
        }
    }

    /// Currently returns the identifier unchanged.
    private func baseLanguageCode(_ id: String) -> String {
        return id
    }

    // MARK: - Settings-triggered updates

    /// Restarts the pipeline when source, language or engine differ from what
    /// the running session was started with. Ignored while disabled.
    private func handleSettingsChange() async {
        guard state.isEnabled else { return }
        let sourceChanged = state.audioSource != activeSource
        let languageChanged = state.languageID != configuredLanguageID
        let engineChanged = state.asrEngine != configuredEngine
        guard sourceChanged || languageChanged || engineChanged else { return }

        await stopAll()
        await start()
    }

    /// Schedules `startTask()` on the current recognizer after `delay`
    /// seconds, replacing any previously scheduled restart. The restart is
    /// skipped when the app is disabled or nothing is running by then.
    private func scheduleRecognitionRestart(delay: TimeInterval) {
        recognitionRestartWork?.cancel()
        let work = DispatchWorkItem { [weak self] in
            Task { @MainActor in
                guard let self,
                      self.state.isEnabled,
                      self.activeSource != nil,
                      let recognizer = self.recognizer else { return }
                do {
                    try recognizer.startTask()
                } catch {
                    self.state.errorMessage = error.localizedDescription
                }
            }
        }
        recognitionRestartWork = work
        DispatchQueue.main.asyncAfter(deadline: .now() + delay, execute: work)
    }
}
|
apps/macos/Sources/BanhMi/TranslationService.swift
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Foundation
|
| 2 |
+
|
| 3 |
+
/// A text translator. Conforming types may call a remote API or a local
/// Gemma 4 model.
protocol TranslationService: AnyObject {
    /// Translates `text` from language `from` into language `to`.
    ///
    /// Implementations should hand back `text` unchanged when `from == to`.
    ///
    /// - Returns: the translated text.
    /// - Throws: on transport/auth errors.
    func translate(_ text: String, from: String, to: String) async throws -> String
}
|
| 10 |
+
|
| 11 |
+
/// No-op translator for when no translation backend is configured: the input
/// comes back unchanged, so the UI simply shows the transcript as is.
final class PassthroughTranslator: TranslationService {
    /// Returns `text` untouched; `from` and `to` are ignored.
    func translate(_ text: String, from: String, to: String) async throws -> String {
        text
    }
}
|
| 18 |
+
|
| 19 |
+
/// Free, keyless translator backed by Google's public `translate_a/single`
/// endpoint. Rate-limited and unofficial — use as a zero-config fallback until
/// Gemma 4 / an authenticated API is wired in.
///
/// Non-2xx responses and unparseable payloads yield the original text so the
/// overlay keeps updating; transport errors from `URLSession` are thrown.
final class FreeGoogleTranslator: TranslationService {
    private let session: URLSession

    init(session: URLSession = .shared) {
        self.session = session
    }

    /// Translates `text`; `from` may be empty to let the endpoint auto-detect.
    ///
    /// - Returns: the translation, or `text` itself when it is empty, when
    ///   source and target reduce to the same code, or when the response is
    ///   unusable.
    /// - Throws: errors raised by `URLSession.data(for:)`.
    func translate(_ text: String, from: String, to: String) async throws -> String {
        guard !text.isEmpty else { return text }
        let src = Self.shortCode(from)
        let tgt = Self.shortCode(to)
        guard src != tgt else { return text }

        var components = URLComponents(string: "https://translate.googleapis.com/translate_a/single")!
        components.queryItems = [
            URLQueryItem(name: "client", value: "gtx"),
            URLQueryItem(name: "sl", value: src.isEmpty ? "auto" : src),
            URLQueryItem(name: "tl", value: tgt),
            URLQueryItem(name: "dt", value: "t"),
            URLQueryItem(name: "q", value: text)
        ]
        // URLComponents leaves "+" unescaped (valid per RFC 3986), but query
        // parsers commonly read it as a space, so "1+1" would arrive as "1 1".
        components.percentEncodedQuery = components.percentEncodedQuery?
            .replacingOccurrences(of: "+", with: "%2B")
        guard let url = components.url else { return text }

        var request = URLRequest(url: url)
        request.setValue("Mozilla/5.0", forHTTPHeaderField: "User-Agent")

        let (data, response) = try await session.data(for: request)
        guard let http = response as? HTTPURLResponse, (200..<300).contains(http.statusCode) else {
            return text
        }
        // Response shape: [[["translated","original",...],...],...]
        guard let outer = try? JSONSerialization.jsonObject(with: data) as? [Any],
              let sentences = outer.first as? [Any] else {
            return text
        }
        // Each sentence entry starts with its translated piece; concatenate them.
        let out = sentences
            .compactMap { ($0 as? [Any])?.first as? String }
            .joined()
        return out.isEmpty ? text : out
    }

    /// Reduce "en-US" → "en" for the translate endpoint (with a few exceptions).
    ///
    /// Underscore-separated identifiers ("en_US") are accepted as well, and
    /// Chinese script subtags map onto the regional codes Google uses.
    private static func shortCode(_ id: String) -> String {
        let lower = id.lowercased().replacingOccurrences(of: "_", with: "-")
        // Google expects these specific regional codes.
        if lower.hasPrefix("zh-cn") || lower.hasPrefix("zh-hans") { return "zh-CN" }
        if lower.hasPrefix("zh-tw") || lower.hasPrefix("zh-hant") { return "zh-TW" }
        if lower.hasPrefix("pt-br") { return "pt" }
        if let dash = lower.firstIndex(of: "-") {
            return String(lower[..<dash])
        }
        return lower
    }
}
|
| 79 |
+
|
| 80 |
+
/// Google Cloud Translation v2 — simple REST call. Returns the original
/// text when the API key is empty, the response is not 2xx, or the payload
/// cannot be parsed, so the overlay keeps updating. Transport errors from
/// `URLSession` are thrown to the caller.
final class GoogleTranslator: TranslationService {
    private let apiKey: String
    private let session: URLSession

    init(apiKey: String, session: URLSession = .shared) {
        self.apiKey = apiKey
        self.session = session
    }

    /// Translates `text` from `from` to `to` via the v2 endpoint.
    ///
    /// - Returns: the first translation in the response, or `text` itself in
    ///   the fallback cases described on the type.
    /// - Throws: errors raised by `URLSession.data(for:)`.
    func translate(_ text: String, from: String, to: String) async throws -> String {
        guard !apiKey.isEmpty else { return text }
        guard from != to else { return text }

        var components = URLComponents(string: "https://translation.googleapis.com/language/translate/v2")!
        components.queryItems = [
            URLQueryItem(name: "key", value: apiKey),
            URLQueryItem(name: "q", value: text),
            URLQueryItem(name: "source", value: from),
            URLQueryItem(name: "target", value: to),
            URLQueryItem(name: "format", value: "text")
        ]
        // URLComponents leaves "+" unescaped (valid per RFC 3986), but query
        // parsers commonly read it as a space, so "1+1" would arrive as "1 1".
        components.percentEncodedQuery = components.percentEncodedQuery?
            .replacingOccurrences(of: "+", with: "%2B")
        guard let url = components.url else { return text }

        var request = URLRequest(url: url)
        request.httpMethod = "POST"

        let (data, response) = try await session.data(for: request)
        guard let http = response as? HTTPURLResponse, (200..<300).contains(http.statusCode) else {
            return text
        }

        // Expected shape: {"data": {"translations": [{"translatedText": "..."}]}}
        guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
              let dataField = json["data"] as? [String: Any],
              let translations = dataField["translations"] as? [[String: Any]],
              let first = translations.first,
              let translated = first["translatedText"] as? String else {
            return text
        }
        return translated
    }
}
|
apps/macos/build.sh
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Builds the BanhMi Swift package, assembles and signs an .app bundle in the
# build directory, and installs it into $INSTALL_DIR.
#
# Environment overrides:
#   CONFIG       swift build configuration (default: release)
#   INSTALL_DIR  destination directory for the bundle (default: ~/Applications)
set -euo pipefail

cd "$(dirname "$0")"

APP_NAME="Bánh mì chuyển ngữ"
EXEC_NAME="BanhMi"
CONFIG="${CONFIG:-release}"
BUILD_DIR=".build/$CONFIG"
STAGE_BUNDLE="$BUILD_DIR/$APP_NAME.app"
SIDECAR_SRC="scripts/gemma_sidecar.py"

# The app is installed outside Dropbox so TCC permissions (Screen Recording,
# Microphone) survive across rebuilds. Dropbox rewrites xattrs/mtimes during
# sync and silently invalidates ad-hoc signatures — that's what was causing
# macOS to re-prompt for permissions.
INSTALL_DIR="${INSTALL_DIR:-$HOME/Applications}"
INSTALLED_BUNDLE="$INSTALL_DIR/$APP_NAME.app"

echo "▸ Building $CONFIG binary..."
swift build -c "$CONFIG"

echo "▸ Assembling app bundle in staging..."
rm -rf "$STAGE_BUNDLE"
mkdir -p "$STAGE_BUNDLE/Contents/MacOS"
mkdir -p "$STAGE_BUNDLE/Contents/Resources"

cp "$BUILD_DIR/$EXEC_NAME" "$STAGE_BUNDLE/Contents/MacOS/$EXEC_NAME"
cp Resources/Info.plist "$STAGE_BUNDLE/Contents/Info.plist"

# Ship the Gemma sidecar inside the bundle: the app looks for it under
# Resources/scripts/ first. Must happen before signing so the script is
# covered by the signature.
if [[ -f "$SIDECAR_SRC" ]]; then
    mkdir -p "$STAGE_BUNDLE/Contents/Resources/scripts"
    cp "$SIDECAR_SRC" "$STAGE_BUNDLE/Contents/Resources/scripts/"
else
    echo " ⚠️ $SIDECAR_SRC not found — bundle will ship without the Gemma sidecar."
fi

echo "▸ Signing staging bundle…"
# Use the path mktemp actually created (appending a suffix would leave that
# file behind) and remove it however the script exits.
ENTITLEMENTS_FILE="$(mktemp -t banhmi-entitlements)"
trap 'rm -f "$ENTITLEMENTS_FILE"' EXIT
cat > "$ENTITLEMENTS_FILE" <<'PLIST'
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>com.apple.security.device.audio-input</key>
<true/>
</dict>
</plist>
PLIST

# Prefer an "Apple Development" identity from Xcode — it gives the bundle a
# stable code-signing identity, so macOS TCC (Screen Recording, Microphone)
# grants persist across rebuilds. Falls back to ad-hoc if not found.
# `|| true`: a failing lookup must lead to the ad-hoc fallback below, not
# abort the script under `set -e` / `pipefail`.
SIGN_IDENTITY="$(security find-identity -v 2>/dev/null \
    | awk '/Apple Development/ {print $2; exit}' || true)"
if [[ -z "$SIGN_IDENTITY" ]]; then
    # Fallback: first valid identity of any kind (Xcode-managed dev certs
    # sometimes only show as UUID labels instead of the friendly name).
    SIGN_IDENTITY="$(security find-identity -v 2>/dev/null \
        | awk '/^ *[0-9]+\)/ {print $2; exit}' || true)"
fi
if [[ -z "$SIGN_IDENTITY" ]]; then
    echo " ⚠️ No code-signing identity found — falling back to ad-hoc."
    echo " TCC permissions will reset on every rebuild."
    SIGN_IDENTITY="-"
else
    echo " ↳ Using signing identity: $SIGN_IDENTITY"
fi

codesign --force --deep --sign "$SIGN_IDENTITY" \
    --options runtime \
    --entitlements "$ENTITLEMENTS_FILE" \
    "$STAGE_BUNDLE"
rm -f "$ENTITLEMENTS_FILE"

echo "▸ Installing to $INSTALLED_BUNDLE (outside Dropbox)…"
mkdir -p "$INSTALL_DIR"

# If the bundle already exists and the signatures match, skip copying so TCC
# keeps its grant for the exact same on-disk identity. If they differ, the
# installed bundle is removed and replaced by the staged one (the user will
# need to re-grant only when the executable actually changed).
STAGE_HASH="$(codesign -dvvv "$STAGE_BUNDLE" 2>&1 | awk -F'=' '/^CDHash=/{print $2}')"
INSTALLED_HASH=""
if [[ -d "$INSTALLED_BUNDLE" ]]; then
    INSTALLED_HASH="$(codesign -dvvv "$INSTALLED_BUNDLE" 2>&1 | awk -F'=' '/^CDHash=/{print $2}' || true)"
fi

if [[ -n "$INSTALLED_HASH" && "$STAGE_HASH" == "$INSTALLED_HASH" ]]; then
    echo " ↳ Signature unchanged — keeping existing install."
else
    rm -rf "$INSTALLED_BUNDLE"
    cp -R "$STAGE_BUNDLE" "$INSTALLED_BUNDLE"
    echo " ↳ Installed new build."
    if [[ "$SIGN_IDENTITY" == "-" ]]; then
        echo " ⚠️ Ad-hoc signed — macOS may re-prompt for Screen Recording / Microphone."
    fi
fi

echo ""
echo "✓ Installed: $INSTALLED_BUNDLE"
echo ""
echo "Run with:"
echo " open \"$INSTALLED_BUNDLE\""
|
apps/macos/scripts/gemma_sidecar.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Gemma 4 MLX sidecar for the macOS Bánh mì chuyển ngữ app.
|
| 4 |
+
|
| 5 |
+
Reads NDJSON requests from stdin, one per line. Each request describes an
|
| 6 |
+
audio clip (referenced by file path in "audio_path") and a task. Writes
|
| 7 |
+
NDJSON responses to stdout.
|
| 8 |
+
|
| 9 |
+
Request format (JSON, one per line):
|
| 10 |
+
{"task": "transcribe_translate",
|
| 11 |
+
"audio_path": "/tmp/clip.wav",
|
| 12 |
+
"target_lang": "Vietnamese"}
|
| 13 |
+
|
| 14 |
+
Response format (JSON, one per line):
|
| 15 |
+
{"ok": true,
|
| 16 |
+
"source_text": "...",
|
| 17 |
+
"translated_text": "...",
|
| 18 |
+
"latency_ms": 1234}
|
| 19 |
+
|
| 20 |
+
On error:
|
| 21 |
+
{"ok": false, "error": "..."}
|
| 22 |
+
|
| 23 |
+
The model is loaded once at startup so subsequent requests are fast.
|
| 24 |
+
|
| 25 |
+
Usage from Swift:
|
| 26 |
+
let proc = Process()
|
| 27 |
+
proc.executableURL = URL(fileURLWithPath: "/usr/bin/env")
|
| 28 |
+
proc.arguments = ["python3", "/path/to/gemma_sidecar.py"]
|
| 29 |
+
// Write one JSON object + newline per request, read back one JSON per line.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
import json
|
| 33 |
+
import os
|
| 34 |
+
import sys
|
| 35 |
+
import tempfile
|
| 36 |
+
import time
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
MODEL_ID_DEFAULT = "unsloth/gemma-4-E2B-it-UD-MLX-4bit"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def eprint(*args, **kwargs):
    """Print a diagnostic message, defaulting to stderr with an immediate flush.

    Accepts the same arguments as the built-in ``print``. ``file`` defaults to
    ``sys.stderr`` and ``flush`` defaults to ``True``; both may be overridden
    by the caller (previously passing either one raised ``TypeError`` because
    the keyword was supplied twice).
    """
    # Resolve sys.stderr at call time so stream redirection is honoured.
    kwargs.setdefault("file", sys.stderr)
    kwargs.setdefault("flush", True)
    print(*args, **kwargs)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def load_model(model_id: str):
    """Load ``model_id`` with ``mlx_vlm`` and bundle it with the helper callables.

    ``mlx_vlm`` is imported inside the function, so the dependency is only
    resolved when a model is actually loaded.

    Returns:
        A 5-tuple ``(model, processor, config, generate, apply_chat_template)``
        where ``config`` is ``model.config`` (``None`` if the model has no such
        attribute) and the last two items are the ``mlx_vlm`` functions.
    """
    from mlx_vlm import apply_chat_template as chat_template_fn
    from mlx_vlm import generate as generate_fn
    from mlx_vlm import load as load_fn

    loaded_model, loaded_processor = load_fn(model_id)
    return (
        loaded_model,
        loaded_processor,
        getattr(loaded_model, "config", None),
        generate_fn,
        chat_template_fn,
    )
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def run_prompt(bundle, audio_path: str, prompt_text: str, max_tokens: int = 256) -> str:
    """Run one audio-conditioned prompt through the model and return its text.

    Args:
        bundle: The 5-tuple ``(model, processor, config, generate,
            apply_chat_template)`` as returned by ``load_model``.
        audio_path: Path of the audio clip, passed to ``generate`` as ``audio``.
        prompt_text: Instruction text placed in the chat template.
        max_tokens: Generation limit forwarded to ``generate``.

    Returns:
        The generated text with surrounding whitespace stripped. If the
        generation result has a ``text`` attribute that is used, otherwise the
        result is converted with ``str()``.
    """
    mdl, proc, cfg, generate_fn, template_fn = bundle

    # The template is told that exactly one audio clip accompanies the prompt.
    chat_prompt = template_fn(proc, cfg, prompt_text, num_audios=1)

    generation_kwargs = {
        "audio": audio_path,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "verbose": False,
    }
    output = generate_fn(mdl, proc, chat_prompt, **generation_kwargs)

    if hasattr(output, "text"):
        raw_text = output.text
    else:
        raw_text = str(output)
    return raw_text.strip()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _translation_prompt(target_lang: str) -> str:
    """Build the instruction asking the model to translate the clip into ``target_lang``."""
    return (
        f"Translate the speech in this audio into {target_lang}. "
        f"The speaker may be using any language — detect it and translate. "
        f"If the speech is already in {target_lang}, output the speech as-is. "
        f"Reply with only the {target_lang} text, no explanations or quotes."
    )


def handle(req: dict, bundle) -> dict:
    """Execute one decoded request and return the response dictionary.

    Recognised request keys:
        task: ``"transcribe"`` (default), ``"translate"`` or
            ``"transcribe_translate"``.
        audio_path: Path of an existing audio file (required).
        target_lang: Target language name; required by both translate tasks.
        max_tokens: Generation limit per model call, default 256.

    Args:
        req: The parsed JSON request.
        bundle: Model bundle forwarded unchanged to ``run_prompt``.

    Returns:
        On success ``{"ok": True, "source_text", "translated_text",
        "latency_ms"}``; a field the task does not produce is ``None``.
        On a validation failure ``{"ok": False, "error": "..."}``.
        Exceptions raised by the model call itself are not caught here.

    Setting the ``BANHMI_SIDECAR_DEBUG`` environment variable to a non-empty
    value copies each clip to ``/tmp/banhmi_last_chunk.wav`` for inspection.
    This is opt-in so that audio is not written to a fixed, shared location
    on every request.
    """
    import shutil

    if not isinstance(req, dict):
        return {"ok": False, "error": f"request must be a JSON object, got {type(req).__name__}"}

    task = req.get("task", "transcribe")
    audio_path = req.get("audio_path")
    target_lang = req.get("target_lang")
    try:
        max_tokens = int(req.get("max_tokens", 256))
    except (TypeError, ValueError):
        return {"ok": False, "error": f"max_tokens must be an integer: {req.get('max_tokens')!r}"}

    # os.path.exists() would also accept directories and integer file
    # descriptors; only a string naming a regular file is a usable clip.
    if not isinstance(audio_path, str) or not os.path.isfile(audio_path):
        return {"ok": False, "error": f"audio_path missing or not found: {audio_path!r}"}

    if os.environ.get("BANHMI_SIDECAR_DEBUG"):
        # Best-effort debugging aid; a failure here must not fail the request.
        try:
            shutil.copy(audio_path, "/tmp/banhmi_last_chunk.wav")
            eprint(f"[sidecar] chunk {os.path.getsize(audio_path)} bytes -> /tmp/banhmi_last_chunk.wav")
        except OSError as exc:
            eprint(f"[sidecar] debug copy failed: {exc}")

    # perf_counter() is monotonic, so the latency cannot be skewed by
    # wall-clock adjustments that happen during generation.
    started = time.perf_counter()

    def elapsed_ms() -> int:
        return int((time.perf_counter() - started) * 1000)

    if task == "transcribe":
        text = run_prompt(bundle, audio_path, "Transcribe this audio", max_tokens)
        return {
            "ok": True,
            "source_text": text,
            "translated_text": None,
            "latency_ms": elapsed_ms(),
        }

    if task == "translate":
        if not target_lang:
            return {"ok": False, "error": "target_lang required for translate task"}
        translated = run_prompt(bundle, audio_path, _translation_prompt(target_lang), max_tokens)
        return {
            "ok": True,
            "source_text": None,
            "translated_text": translated,
            "latency_ms": elapsed_ms(),
        }

    if task == "transcribe_translate":
        if not target_lang:
            return {"ok": False, "error": "target_lang required"}
        # Two sequential model calls on the same clip: translation first,
        # then the transcription of the original speech.
        translated = run_prompt(bundle, audio_path, _translation_prompt(target_lang), max_tokens)
        source = run_prompt(bundle, audio_path, "Transcribe this audio", max_tokens)
        return {
            "ok": True,
            "source_text": source,
            "translated_text": translated,
            "latency_ms": elapsed_ms(),
        }

    return {"ok": False, "error": f"unknown task: {task}"}
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def main():
    """Load the model once, then answer NDJSON requests from stdin until EOF.

    The model id is read from the ``GEMMA_MLX_MODEL`` environment variable,
    falling back to ``MODEL_ID_DEFAULT``. Progress messages go to stderr.
    Stdout carries only JSON lines: first a ``{"event": "ready", ...}`` line
    once the model is loaded, then exactly one response line per non-blank
    input line. Every line is flushed immediately so the host process can read
    it without waiting for a buffer to fill.
    """
    model_id = os.environ.get("GEMMA_MLX_MODEL", MODEL_ID_DEFAULT)
    eprint(f"[sidecar] loading {model_id}")
    bundle = load_model(model_id)
    eprint("[sidecar] ready — awaiting requests on stdin")

    # Emit a "ready" message so the host knows the model is loaded.
    print(json.dumps({"event": "ready", "model": model_id}), flush=True)

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            req = json.loads(line)
        except json.JSONDecodeError as exc:
            resp = {"ok": False, "error": f"invalid JSON: {exc}"}
        else:
            # Top-level boundary: any failure while handling a request is
            # reported to the host as an error response instead of ending
            # the process.
            try:
                resp = handle(req, bundle)
            except Exception as exc:
                resp = {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
        print(json.dumps(resp, ensure_ascii=False), flush=True)


if __name__ == "__main__":
    main()
|