diff --git a/README.md b/README.md index a92102f..edef7a1 100644 --- a/README.md +++ b/README.md @@ -11,16 +11,16 @@ ## 빠른 시작 (개발) ```bash -uv sync # 코어 의존성 -uv run luke-scribe detect # 하드웨어 감지 → 능력등급/정밀도/워커수 -# 엔진(transcribe/bench)은 다음 증분: -# uv sync --extra engine +uv sync # 코어 의존성 +uv run luke-scribe detect # 하드웨어 감지 → 능력등급/정밀도/워커수 +uv sync --extra engine # 엔진(faster-whisper) +uv run luke-scribe transcribe FILE --model tiny # 단발 전사 ``` ## CLI | 명령 | 설명 | 상태 | |------|------|------| | `detect` | 하드웨어 감지·능력등급(T0~T3)·정밀도·워커수 | ✅ P1 | -| `transcribe ` | 단발 파일 전사 | ⏳ P1 | +| `transcribe ` | 단발 파일 전사 (faster-whisper, CPU/GPU) | ✅ P1 | | `bench` | turbo vs large-v3 도메인 벤치(게이트) | ⏳ P1 (샘플셋 필요) | | `serve` | API 서버 | ⏳ P2 | diff --git a/src/luke_scribe/audio/__init__.py b/src/luke_scribe/audio/__init__.py new file mode 100644 index 0000000..33d380a --- /dev/null +++ b/src/luke_scribe/audio/__init__.py @@ -0,0 +1,4 @@ +"""오디오/영상 입력 — ingest(probe·상한), VAD (스펙 §4-4).""" +from .ingest import MediaInfo, probe_media + +__all__ = ["MediaInfo", "probe_media"] diff --git a/src/luke_scribe/audio/ingest.py b/src/luke_scribe/audio/ingest.py new file mode 100644 index 0000000..f2ce1d8 --- /dev/null +++ b/src/luke_scribe/audio/ingest.py @@ -0,0 +1,41 @@ +"""미디어 입력 — duration/size probe + 상한 점검 (스펙 §4-4, AC-7). + +상한 초과는 호출측이 413으로 매핑(P2). 실제 디코딩은 엔진(faster-whisper/PyAV)이 수행. 
# File: src/luke_scribe/audio/ingest.py


@dataclass
class MediaInfo:
    """Basic facts about an input media file, gathered before transcription."""

    path: str  # path exactly as passed to probe_media
    duration_s: float  # duration in seconds from ffprobe; 0.0 means "unknown"
    size_bytes: int  # size in bytes from os.path.getsize


def probe_media(path: str) -> MediaInfo:
    """Collect duration and size for the media file at *path*.

    Args:
        path: Path to an audio/video file.

    Returns:
        A ``MediaInfo``. ``duration_s`` is 0.0 when ffprobe is unavailable or
        fails, so a caller comparing it against a maximum duration will not
        reject such a file.

    Raises:
        FileNotFoundError: If *path* does not exist or is a directory.
    """
    # A directory "exists" but can never be decoded; report it the same way as
    # a missing file so callers that catch FileNotFoundError keep working.
    if not os.path.exists(path) or os.path.isdir(path):
        raise FileNotFoundError(path)
    size_bytes = os.path.getsize(path)
    return MediaInfo(path=path, duration_s=_ffprobe_duration(path), size_bytes=size_bytes)


def _ffprobe_duration(path: str) -> float:
    """Return the container duration of *path* in seconds, or 0.0 if unknown.

    Best effort by design: a missing ffprobe binary, a non-zero exit status, a
    timeout, or unparsable output all yield 0.0 instead of raising.
    """
    ffprobe = shutil.which("ffprobe")
    if not ffprobe:
        return 0.0
    command = [
        ffprobe,
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        # Pass the input explicitly so a file name starting with "-" is not
        # parsed as an ffprobe option.
        "-i", path,
    ]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
            check=True,
        )
        payload = json.loads(completed.stdout)
        duration = float(payload.get("format", {}).get("duration") or 0.0)
    except (subprocess.SubprocessError, OSError, ValueError, TypeError, AttributeError):
        # SubprocessError: non-zero exit or timeout. OSError: binary could not
        # be executed. ValueError: bad JSON, bad number, or undecodable output.
        # TypeError/AttributeError: JSON with an unexpected shape.
        return 0.0
    # NaN and negative values fail this comparison and are reported as unknown.
    return duration if duration > 0 else 0.0


# File: src/luke_scribe/cli.py


# NOTE(review): the docstring below is presumably shown by Typer as the
# command's --help text, so it is user-facing and intentionally left in Korean.
# English summary: one-shot transcription of a single file with faster-whisper.
# The command probes the file, enforces the configured duration/size limits,
# picks a device, then prints each segment followed by a summary line.
# It exits with code 1 when the engine modules cannot be imported, when the
# file is missing, or when an input limit is exceeded.
@app.command()
def transcribe(
    file: str = typer.Argument(..., help="오디오/영상 파일"),
    model: str = typer.Option(None, help="모델 오버라이드(기본=실시간 모델). tiny|base|large-v3|large-v3-turbo"),
    language: str = typer.Option(None, help="언어(기본 설정값). 'auto' 가능"),
    device: str = typer.Option("auto", help="auto|cpu|cuda"),
    word_timestamps: bool = typer.Option(False, "--word-timestamps"),
    vad: bool = typer.Option(True, "--vad/--no-vad", help="무음 제거"),
    timestamps: bool = typer.Option(False, "--timestamps", help="세그먼트 [start–end] 표시"),
) -> None:
    """단발 파일 전사 (faster-whisper, CPU/GPU 자동, AC-4 일부)."""
    # NOTE(review): `console` is fed Rich markup throughout, so it is assumed
    # to be a rich Console; `escape` keeps untrusted text from being parsed as
    # markup (e.g. "[laughs]" vanishing or "[/]" raising MarkupError).
    from rich.markup import escape

    from .config import settings

    try:
        from .audio.ingest import probe_media
        from .engine.faster_whisper_engine import FasterWhisperEngine
    except ImportError as exc:
        console.print(f"[red]엔진 미설치:[/] {exc}\n→ `uv sync --extra engine` 후 다시 시도하세요.")
        raise typer.Exit(code=1) from exc

    try:
        info = probe_media(file)
    except FileNotFoundError:
        console.print(f"[red]파일 없음:[/] {escape(file)}")
        raise typer.Exit(code=1) from None

    # A duration of 0.0 means the probe could not determine it, so only the
    # size limit is effectively enforced for such files.
    if info.duration_s > settings.max_duration_s or info.size_bytes > settings.max_size_bytes:
        console.print(
            f"[red]입력 상한 초과(413):[/] {info.duration_s:.0f}s / {info.size_bytes}B "
            f"(상한 {settings.max_duration_s}s / {settings.max_size_bytes}B)"
        )
        raise typer.Exit(code=1)

    # "auto" lets DeviceManager decide; any other value is forced.
    profile = DeviceManager.detect(force_device=(None if device == "auto" else device))
    # Every non-CPU profile kind is handed to the engine as "cuda".
    dev = "cpu" if profile.kind == "cpu" else "cuda"
    model_name = model or settings.model_realtime
    lang = language or settings.language
    console.print(
        f"[dim]model={model_name} device={dev} compute={profile.compute_type} "
        f"lang={lang} dur={info.duration_s:.1f}s[/]"
    )

    engine = FasterWhisperEngine(model_name, dev, profile.compute_type, cache_dir=settings.model_cache_dir)
    segments, tinfo = engine.transcribe(file, language=lang, word_timestamps=word_timestamps, vad=vad)

    count = 0
    for count, seg in enumerate(segments, start=1):
        text = escape(seg.text.strip())
        if timestamps:
            console.print(f"[cyan][{seg.start:6.2f}–{seg.end:6.2f}][/] {text}")
        else:
            console.print(text)
    detected = getattr(tinfo, "language", None)
    console.print(f"[green]✓ {count} segments · detected_lang={detected} · model_used={model_name}[/]")
class FasterWhisperEngine:
    """Thin wrapper around a faster-whisper (CTranslate2) ``WhisperModel``.

    File paths (audio or video) are passed straight through; per the original
    module notes, faster-whisper decodes them internally with PyAV. The
    segments returned by ``transcribe`` are a generator that the caller
    consumes.
    """

    def __init__(
        self,
        model_name: str,
        device: str,
        compute_type: str,
        cache_dir: str | None = None,
    ) -> None:
        """Load the model.

        Args:
            model_name: Logical model name; translated by ``resolve_model``
                (imported from ``.model_registry``) before loading.
            device: Device string forwarded to ``WhisperModel``.
            compute_type: Compute type forwarded to ``WhisperModel``.
            cache_dir: Forwarded as ``download_root``; ``None`` keeps the
                library default.
        """
        # Imported here rather than at module level — presumably so importing
        # this module does not require the optional faster-whisper package.
        from faster_whisper import WhisperModel

        self.model_name = model_name
        self.device = device
        self.compute_type = compute_type
        self.model = WhisperModel(
            resolve_model(model_name),
            device=device,
            compute_type=compute_type,
            download_root=cache_dir,
        )

    def transcribe(
        self,
        audio: str,
        *,
        language: str | None = "ko",
        word_timestamps: bool = False,
        vad: bool = True,
        hotwords: list[str] | str | None = None,
        initial_prompt: str | None = None,
        beam_size: int = 5,
    ) -> tuple[Iterable[Any], Any]:
        """Transcribe *audio* and return ``self.model.transcribe``'s result.

        Args:
            audio: Path to the media file.
            language: Language code. ``None``, an empty string, or ``"auto"``
                (case-insensitive) is sent as ``None``; any other value is
                stripped and lower-cased first.
            word_timestamps: Forwarded unchanged.
            vad: Forwarded as ``vad_filter``.
            hotwords: Words joined with single spaces into one string. Blank
                entries are dropped; a plain string is treated as one entry.
                ``None`` is sent when nothing remains.
            initial_prompt: Forwarded unchanged.
            beam_size: Forwarded unchanged.

        Returns:
            Whatever ``self.model.transcribe`` returns; annotated as a
            ``(segments, info)`` pair.
        """
        lang = None
        if language is not None:
            lang = language.strip().lower()
            if lang in ("", "auto"):
                lang = None

        # Guard against a bare string: " ".join("abc") would yield "a b c".
        if isinstance(hotwords, str):
            words = [hotwords]
        else:
            words = hotwords or []
        joined = " ".join(w.strip() for w in words if w and w.strip()) or None

        return self.model.transcribe(
            audio,
            language=lang,
            word_timestamps=word_timestamps,
            vad_filter=vad,
            hotwords=joined,
            initial_prompt=initial_prompt,
            beam_size=beam_size,
        )
# CT2 conversion repo shared by every turbo alias.
_TURBO_CT2_REPO = "deepdml/faster-whisper-large-v3-turbo-ct2"

# Logical model name -> identifier handed to faster-whisper.
_MODEL_IDS: dict[str, str] = {
    "large-v3-turbo": _TURBO_CT2_REPO,
    "turbo": _TURBO_CT2_REPO,
    "large-v3": "large-v3",
}


def resolve_model(name: str) -> str:
    """Translate a logical model name into a faster-whisper identifier.

    Names registered in ``_MODEL_IDS`` are mapped to their entry; any other
    name (for example a standard size such as ``"tiny"``) is returned
    unchanged. The lookup is an exact, case-sensitive match.
    """
    if name in _MODEL_IDS:
        return _MODEL_IDS[name]
    return name