Add interactive edit mode with floating popup

This commit is contained in:
Thales Maciel 2026-02-26 15:11:06 -03:00
parent b42298b9b5
commit 99f07aef82
10 changed files with 1045 additions and 46 deletions

396
src/aman.py Executable file → Normal file
View file

@ -29,8 +29,19 @@ class State:
STT = "stt"
PROCESSING = "processing"
OUTPUTTING = "outputting"
EDIT_IDLE = "edit_idle"
EDIT_RECORDING = "edit_recording"
EDIT_STT = "edit_stt"
EDIT_PROCESSING = "edit_processing"
# Daemon states that only occur while an edit session is open. Session
# teardown checks membership here to decide whether to fall back to IDLE.
EDIT_STATES = {
    State.EDIT_IDLE,
    State.EDIT_RECORDING,
    State.EDIT_STT,
    State.EDIT_PROCESSING,
}
_LOCK_HANDLE = None
@ -60,12 +71,22 @@ class Daemon:
self.cfg = cfg
self.desktop = desktop
self.verbose = verbose
self.lock = threading.Lock()
self.lock = threading.RLock()
self._shutdown_requested = threading.Event()
self.state = State.IDLE
self.stream = None
self.record = None
self.timer: threading.Timer | None = None
self.edit_stream = None
self.edit_record = None
self.edit_timer: threading.Timer | None = None
self.edit_active = False
self.edit_text = ""
self.edit_instruction_history: list[str] = []
self.edit_session_token = 0
self.model = _build_whisper_model(
cfg.stt.model,
cfg.stt.device,
@ -77,6 +98,18 @@ class Daemon:
self.vocabulary = VocabularyEngine(cfg.vocabulary)
self._stt_hint_kwargs_cache: dict[str, Any] | None = None
def _arm_cancel_listener_for_recording(self):
    """Ask the desktop backend to call ``cancel_recording`` on cancel.

    Failures are logged and swallowed so that a missing cancel listener
    never prevents recording from starting.
    """

    def _on_cancel():
        # Resolve the method at call time, exactly like the lambda did.
        return self.cancel_recording()

    try:
        self.desktop.start_cancel_listener(_on_cancel)
    except Exception as err:
        logging.error("failed to arm cancel listener: %s", err)
def _disarm_cancel_listener_for_recording(self):
    """Ask the desktop backend to stop its cancel listener (best effort).

    Any error raised by the backend is logged at debug level and ignored.
    """
    try:
        self.desktop.stop_cancel_listener()
    except Exception as err:
        logging.debug("failed to disarm cancel listener: %s", err)
def set_state(self, state: str):
with self.lock:
prev = self.state
@ -99,6 +132,9 @@ class Daemon:
if self._shutdown_requested.is_set():
logging.info("shutdown in progress, trigger ignored")
return
if self.edit_active:
logging.info("edit session active, dictate trigger ignored")
return
if self.state == State.IDLE:
self._start_recording_locked()
return
@ -109,10 +145,60 @@ class Daemon:
if should_stop:
self.stop_recording(trigger="user")
def toggle_edit(self):
    """Handle the edit hotkey.

    With no edit session open (and the daemon idle) a new session is
    opened. With a session open, the hotkey starts an instruction
    recording from EDIT_IDLE and stops it from EDIT_RECORDING; in any other
    state the trigger is ignored. The decision is taken under the lock and
    the resulting action is carried out after the lock is released.
    """
    with self.lock:
        if self._shutdown_requested.is_set():
            logging.info("shutdown in progress, edit trigger ignored")
            return

        if not self.edit_active:
            if self.state != State.IDLE:
                logging.info("busy (%s), edit trigger ignored", self.state)
                return
            # Start a fresh session: bump the token so callbacks from any
            # earlier session can be recognised as stale.
            self.edit_active = True
            self.edit_session_token += 1
            session_token = self.edit_session_token
            self.edit_instruction_history = []
            self.edit_text = ""
            self.set_state(State.EDIT_IDLE)
            next_step = "open_session"
        else:
            session_token = self.edit_session_token
            if self.state == State.EDIT_IDLE:
                next_step = "start_recording"
            elif self.state == State.EDIT_RECORDING:
                next_step = "stop_recording"
            else:
                logging.info("edit session busy (%s), trigger ignored", self.state)
                return

    if next_step == "stop_recording":
        self.stop_edit_recording(trigger="user")
    elif next_step == "start_recording":
        self._start_edit_recording(token=session_token, trigger="user")
    else:
        self._open_edit_session(session_token)
def handle_cancel(self):
    """Route a cancel request to the edit session or the dictation recording.

    The flags are snapshotted under the lock; the cancel itself runs after
    the lock is released. An open edit session takes precedence.
    """
    with self.lock:
        session_open, current_state = self.edit_active, self.state

    if session_open:
        self.cancel_edit_session()
    elif current_state == State.RECORDING:
        self.cancel_recording()
def _start_recording_locked(self):
if self.state != State.IDLE:
logging.info("busy (%s), trigger ignored", self.state)
return
if self.edit_active:
logging.info("edit session active, dictate trigger ignored")
return
try:
stream, record = start_audio_recording(self.cfg.recording.input)
except Exception as exc:
@ -120,9 +206,8 @@ class Daemon:
return
self.stream = stream
self.record = record
prev = self.state
self.state = State.RECORDING
logging.debug("state: %s -> %s", prev, self.state)
self.set_state(State.RECORDING)
self._arm_cancel_listener_for_recording()
logging.info("recording started")
if self.timer:
self.timer.cancel()
@ -150,13 +235,12 @@ class Daemon:
if self.timer:
self.timer.cancel()
self.timer = None
prev = self.state
self.state = State.STT
logging.debug("state: %s -> %s", prev, self.state)
self._disarm_cancel_listener_for_recording()
self.set_state(State.STT)
if stream is None or record is None:
logging.warning("recording resources are unavailable during stop")
self.state = State.IDLE
self.set_state(State.IDLE)
return None
return stream, record
@ -254,8 +338,292 @@ class Daemon:
return
self.stop_recording(trigger="cancel", process_audio=False)
def _open_edit_session(self, token: int):
    """Seed a new edit session from the clipboard and show the popup.

    Does nothing when ``token`` no longer identifies the active session.
    If the popup cannot be opened the session is closed again; otherwise
    an instruction recording is started immediately.
    """
    try:
        seed_text = self.desktop.read_clipboard_text() or ""
    except Exception as err:
        logging.error("failed reading clipboard for edit session: %s", err)
        seed_text = ""

    with self.lock:
        if not self._edit_session_is_active_locked(token):
            return
        self.edit_text = seed_text

    try:
        self.desktop.open_edit_popup(
            seed_text,
            on_submit=self.finalize_edit_session_inject,
            on_copy=self.finalize_edit_session_copy,
            on_cancel=self.cancel_edit_session,
        )
        self._safe_set_edit_popup_status("Recording instruction...")
    except Exception as err:
        logging.error("failed opening edit popup: %s", err)
        # The popup never appeared, so there is nothing to close on screen.
        self._close_edit_session(close_popup=False)
        return

    recording = self._start_edit_recording(token=token, trigger="open")
    if not recording:
        self._safe_set_edit_popup_status("Ready. Press edit hotkey to record.")
def _start_edit_recording(self, *, token: int, trigger: str) -> bool:
    """Start capturing a spoken edit instruction for session ``token``.

    Returns True when recording started. Returns False when the token is
    stale, the session is not in EDIT_IDLE, or the audio backend raised.
    A timer is armed to stop the recording after RECORD_TIMEOUT_SEC.
    """
    with self.lock:
        if not self._edit_session_is_active_locked(token):
            return False
        if self.state != State.EDIT_IDLE:
            logging.info("edit session busy (%s), start ignored", self.state)
            return False

        try:
            new_stream, new_record = start_audio_recording(self.cfg.recording.input)
        except Exception as err:
            logging.error("edit record start failed: %s", err)
            return False
        self.edit_stream, self.edit_record = new_stream, new_record

        # Replace any leftover timeout timer with a fresh one.
        stale_timer = self.edit_timer
        if stale_timer:
            stale_timer.cancel()
        watchdog = threading.Timer(RECORD_TIMEOUT_SEC, self._timeout_edit_stop)
        self.edit_timer = watchdog
        watchdog.daemon = True
        watchdog.start()

        self.set_state(State.EDIT_RECORDING)
        self._safe_set_edit_popup_status("Recording instruction...")
        logging.info("edit recording started (%s)", trigger)
        return True
def _timeout_edit_stop(self):
    """Timer callback: stop the edit recording with trigger ``"timeout"``."""
    self.stop_edit_recording(trigger="timeout")
def stop_edit_recording(self, *, trigger: str = "user", process_audio: bool = True):
    """Stop the current edit-instruction recording and hand it off.

    Ignored unless an edit session is active and in EDIT_RECORDING. The
    recording handles are detached under the lock, the state moves to
    EDIT_STT, and the stop/transcribe/apply work runs on a daemon thread.

    Args:
        trigger: Label describing why the recording is being stopped.
        process_audio: Forwarded to the worker; when False the captured
            audio is discarded after the stream is stopped.
    """
    with self.lock:
        if not (self.edit_active and self.state == State.EDIT_RECORDING):
            return
        stream, record = self.edit_stream, self.edit_record
        token = self.edit_session_token
        self.edit_stream = None
        self.edit_record = None
        pending_timer = self.edit_timer
        if pending_timer:
            pending_timer.cancel()
        self.edit_timer = None
        self.set_state(State.EDIT_STT)
        self._safe_set_edit_popup_status("Transcribing instruction...")

    if stream is None or record is None:
        logging.warning("edit recording resources are unavailable during stop")
        with self.lock:
            if self._edit_session_is_active_locked(token):
                self.set_state(State.EDIT_IDLE)
                self._safe_set_edit_popup_status("Ready. Press edit hotkey to record.")
        return

    worker = threading.Thread(
        target=self._edit_stop_and_process,
        args=(stream, record, token, trigger, process_audio),
        daemon=True,
    )
    worker.start()
def _edit_back_to_idle(self, token: int, status: str):
    """Return session ``token`` to EDIT_IDLE and show ``status`` if still active."""
    with self.lock:
        if self._edit_session_is_active_locked(token):
            self.set_state(State.EDIT_IDLE)
            self._safe_set_edit_popup_status(status)


def _edit_stop_and_process(
    self,
    stream: Any,
    record: Any,
    token: int,
    trigger: str,
    process_audio: bool,
):
    """Stop the recording, transcribe the instruction and apply it.

    Runs on a worker thread. Every early exit puts the session back into
    EDIT_IDLE with an explanatory popup status, provided ``token`` still
    identifies the active session. On the success path the transcribed
    instruction is appended to the session history, the AI processor is
    asked to edit the popup text, deterministic vocabulary replacements are
    applied, and the popup is updated with the result.
    """
    logging.info("stopping edit recording (%s)", trigger)
    try:
        audio = stop_audio_recording(stream, record)
    except Exception as err:
        logging.error("edit record stop failed: %s", err)
        self._edit_back_to_idle(token, "Failed to stop recording.")
        return

    if not process_audio or self._shutdown_requested.is_set():
        self._edit_back_to_idle(token, "Ready. Press edit hotkey to record.")
        return

    if audio.size == 0:
        logging.error("no audio captured for edit instruction")
        self._edit_back_to_idle(token, "No audio captured. Record again.")
        return

    try:
        instruction = self._transcribe(audio).strip()
    except Exception as err:
        logging.error("edit stt failed: %s", err)
        self._edit_back_to_idle(token, "STT failed. Record again.")
        return

    if not instruction:
        self._edit_back_to_idle(token, "No instruction heard. Record again.")
        return

    if self.log_transcript:
        logging.debug("edit instruction: %s", instruction)
    else:
        logging.info("edit instruction length: %d", len(instruction))

    with self.lock:
        if not self._edit_session_is_active_locked(token):
            return
        self.edit_instruction_history.append(instruction)
        # Snapshot the history so the processor sees a stable list.
        history_snapshot = list(self.edit_instruction_history)
        self.set_state(State.EDIT_PROCESSING)
        self._safe_set_edit_popup_status("Applying instruction...")

    base_text = self._current_edit_text()
    result_text = base_text
    try:
        candidate = self._get_ai_processor().process_edit(
            base_text,
            instruction,
            history_snapshot,
            lang=STT_LANGUAGE,
            dictionary_context=self.vocabulary.build_ai_dictionary_context(),
        )
        # Keep the current text when the processor returns nothing useful.
        if candidate and candidate.strip():
            result_text = candidate
    except Exception as err:
        logging.error("edit process failed: %s", err)
    result_text = self.vocabulary.apply_deterministic_replacements(result_text).strip()

    with self.lock:
        if not self._edit_session_is_active_locked(token):
            return
        self.edit_text = result_text
        self.set_state(State.EDIT_IDLE)
        self._safe_set_edit_popup_text(result_text)
        self._safe_set_edit_popup_status("Ready. Press edit hotkey to record.")
def _current_edit_text(self) -> str:
    """Return the popup's text, falling back to the cached copy.

    When the popup can be read, its text is also stored in
    ``self.edit_text``; when reading raises, the cached value is returned.
    """
    try:
        live_text = self.desktop.get_edit_popup_text()
    except Exception:
        with self.lock:
            return self.edit_text
    else:
        with self.lock:
            self.edit_text = live_text
        return live_text
def finalize_edit_session_inject(self):
    """Run the inject-and-close worker on a daemon thread and return at once."""
    worker = threading.Thread(
        target=self._finalize_edit_session_inject_worker,
        daemon=True,
    )
    worker.start()
def _finalize_edit_session_inject_worker(self):
    """Close the edit session and inject the final text.

    The text is read before the session is closed. Nothing is injected when
    shutdown has been requested. A focus-restore failure is only logged;
    the state always returns to IDLE after the injection attempt.
    """
    final_text = self._current_edit_text()
    self._close_edit_session(close_popup=True)
    if self._shutdown_requested.is_set():
        return

    try:
        self.desktop.restore_previous_focus()
    except Exception as err:
        logging.warning("could not restore previous focus: %s", err)

    try:
        self.set_state(State.OUTPUTTING)
        injection_cfg = self.cfg.injection
        self.desktop.inject_text(
            final_text,
            injection_cfg.backend,
            remove_transcription_from_clipboard=(
                injection_cfg.remove_transcription_from_clipboard
            ),
        )
    except Exception as err:
        logging.error("edit output failed: %s", err)
    finally:
        self.set_state(State.IDLE)
def finalize_edit_session_copy(self):
    """Run the copy-and-close worker on a daemon thread and return at once."""
    worker = threading.Thread(
        target=self._finalize_edit_session_copy_worker,
        daemon=True,
    )
    worker.start()
def _finalize_edit_session_copy_worker(self):
    """Close the edit session and place the final text on the clipboard.

    The text is read before the session is closed; a clipboard write
    failure is logged and otherwise ignored.
    """
    final_text = self._current_edit_text()
    self._close_edit_session(close_popup=True)
    try:
        self.desktop.write_clipboard_text(final_text)
    except Exception as err:
        logging.error("failed to copy edited text to clipboard: %s", err)
def cancel_edit_session(self):
    """Run the cancel worker on a daemon thread and return at once."""
    worker = threading.Thread(
        target=self._cancel_edit_session_worker,
        daemon=True,
    )
    worker.start()
def _cancel_edit_session_worker(self):
    """Discard the edit session and close its popup."""
    self._close_edit_session(close_popup=True)
def _close_edit_session(self, *, close_popup: bool):
    """Tear down the edit session and release its resources.

    Under the lock: detaches the recording handles, cancels the timeout
    timer, marks the session inactive, bumps the session token (so that
    in-flight workers holding the old token see it as stale), clears the
    instruction history and text, and returns the state to IDLE when it was
    one of the edit states. After the lock is released the popup is
    optionally closed and any detached recording is stopped.

    Args:
        close_popup: When True, ask the desktop backend to close the popup.
    """
    with self.lock:
        stream = self.edit_stream
        record = self.edit_record
        self.edit_stream = None
        self.edit_record = None
        if self.edit_timer:
            self.edit_timer.cancel()
        self.edit_timer = None
        self.edit_active = False
        self.edit_session_token += 1
        self.edit_instruction_history = []
        self.edit_text = ""
        if self.state in EDIT_STATES:
            self.set_state(State.IDLE)

    if close_popup:
        try:
            self.desktop.close_edit_popup()
        except Exception as exc:
            logging.debug("failed closing edit popup: %s", exc)

    if stream is not None and record is not None:
        try:
            stop_audio_recording(stream, record)
        except Exception as exc:
            # Still best effort, but no longer silent: a stream that fails
            # to stop should leave a trace in the log.
            logging.warning("failed stopping edit recording during close: %s", exc)
def _edit_session_is_active_locked(self, token: int) -> bool:
    """Tell whether ``token`` identifies the currently active edit session.

    The ``_locked`` suffix suggests callers hold ``self.lock``; every
    caller visible in this file does.
    """
    session_open = self.edit_active
    return session_open and token == self.edit_session_token
def _safe_set_edit_popup_status(self, status: str):
    """Show ``status`` in the edit popup if a session is active.

    Backend errors are logged at debug level and never propagate.
    """
    with self.lock:
        session_open = self.edit_active
    if not session_open:
        return

    try:
        self.desktop.set_edit_popup_status(status)
    except Exception as err:
        logging.debug("failed setting edit popup status: %s", err)
def _safe_set_edit_popup_text(self, text: str):
    """Replace the edit popup's text if a session is active.

    Backend errors are logged at debug level and never propagate.
    """
    with self.lock:
        session_open = self.edit_active
    if not session_open:
        return

    try:
        self.desktop.set_edit_popup_text(text)
    except Exception as err:
        logging.debug("failed setting edit popup text: %s", err)
def shutdown(self, timeout: float = 5.0) -> bool:
    """Request shutdown, release active resources, and wait for idle.

    In order: flags the shutdown request, disarms the cancel listener,
    closes any edit session (including its popup), and stops a dictation
    recording without processing its audio.

    Args:
        timeout: Passed through to ``wait_for_idle``.

    Returns:
        Whatever ``wait_for_idle(timeout)`` returns.
    """
    self.request_shutdown()
    self._disarm_cancel_listener_for_recording()
    self._close_edit_session(close_popup=True)
    self.stop_recording(trigger="shutdown", process_audio=False)
    became_idle = self.wait_for_idle(timeout)
    return became_idle
@ -367,6 +735,7 @@ def main():
_LOCK_HANDLE = _lock_single_instance()
logging.info("hotkey: %s", cfg.daemon.hotkey)
logging.info("edit hotkey: %s", cfg.daemon.edit_hotkey)
logging.info(
"config (%s):\n%s",
args.config or str(Path.home() / ".config" / "aman" / "config.json"),
@ -400,9 +769,16 @@ def main():
try:
desktop.start_hotkey_listener(
cfg.daemon.hotkey,
lambda: logging.info("hotkey pressed (dry-run)") if args.dry_run else daemon.toggle(),
lambda: logging.info("dictate hotkey pressed (dry-run)")
if args.dry_run
else daemon.toggle(),
)
desktop.start_hotkey_listener(
cfg.daemon.edit_hotkey,
lambda: logging.info("edit hotkey pressed (dry-run)")
if args.dry_run
else daemon.toggle_edit(),
)
desktop.start_cancel_listener(lambda: daemon.cancel_recording())
except Exception as exc:
logging.error("hotkey setup failed: %s", exc)
raise SystemExit(1)