chore(omc): hotpaths (chunked correction)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
feat(api): chunk LLM correction for small context windows (+running glossary)
2026-06-09 07:09:51 +09:00 · 2026-06-09 07:09:51 +09:00
5 changed files with 139 additions and 46 deletions
@@ -30,7 +30,7 @@
  },
  "build": {
    "buildCommand": null,
-    "testCommand": "export PATH=\"$HOME/.local/bin:$HOME/.cargo/bin:$PATH\"\nclip=\"samples/ko_en/clips/GDG 인천 - EmbeddingGemma 200% 활용하기 - 지주영.m4a\"\nffmpeg -nostdin -ss 70 -t 12 -i \"$clip\" -ac 1 -ar 16000 -y /tmp/api_smoke.wav 2>/dev/null\nls -l /tmp/api_smoke.wav\necho \"=== pytest 재확인(413 수정 후) ===\"; uv run pytest -q 2>&1 | tail -3",
+    "testCommand": "export PATH=\"$HOME/.local/bin:$HOME/.cargo/bin:$PATH\"\necho \"=== ruff ===\"; uv run ruff check src/ tests/ && echo clean\necho \"=== pytest ===\"; uv run pytest -q 2>&1 | tail -6\necho \"=== 청크 분할 빠른 점검 ===\"; uv run python -c \"\nfrom luke_scribe.postprocess import llm\nt='. '.join(f'문장{i} EmbeddingGemma' for i in range(300))\nch=llm._chunk(t, 200)\nprint('total chars', len(t), '→ chunks', len(ch), '| max chunk', max(len(c) for c in ch))\nprint('all<=200:', all(len(c)<=200 for c in ch))\n\"",
    "lintCommand": "ruff check",
    "devCommand": null,
    "scripts": {}
@@ -129,6 +129,12 @@
      "lastAccessed": 1780928043613,
      "type": "file"
    },
    {
      "path": "src/luke_scribe/config.py",
      "accessCount": 4,
      "lastAccessed": 1780956547899,
      "type": "file"
    },
    {
      "path": "README.md",
      "accessCount": 3,
@@ -136,15 +142,21 @@
      "type": "file"
    },
    {
-      "path": "src/luke_scribe/config.py",
+      "path": "src/luke_scribe/postprocess/llm.py",
      "accessCount": 3,
-      "lastAccessed": 1780927884587,
+      "lastAccessed": 1780956524689,
      "type": "file"
    },
    {
      "path": "src/luke_scribe/api/routes/transcribe.py",
      "accessCount": 3,
      "lastAccessed": 1780956549345,
      "type": "file"
    },
    {
      "path": "tests/test_postprocess.py",
      "accessCount": 2,
-      "lastAccessed": 1780928097713,
+      "lastAccessed": 1780956556589,
      "type": "file"
    },
    {
@@ -267,12 +279,6 @@
      "lastAccessed": 1780927897308,
      "type": "file"
    },
    {
      "path": "src/luke_scribe/postprocess/llm.py",
      "accessCount": 1,
      "lastAccessed": 1780927908123,
      "type": "file"
    },
    {
      "path": "src/luke_scribe/api/__init__.py",
      "accessCount": 1,
@@ -327,12 +333,6 @@
      "lastAccessed": 1780928016400,
      "type": "file"
    },
    {
      "path": "tests/test_postprocess.py",
      "accessCount": 1,
      "lastAccessed": 1780928018944,
      "type": "file"
    },
    {
      "path": "tests/test_api.py",
      "accessCount": 1,
@@ -93,6 +93,7 @@ def transcribe_ep(  # noqa: PLR0913 — 요청 옵션 다수(스펙 options 스
                        base_url=settings.llm_base_url,
                        api_key=settings.llm_api_key,
                        model=settings.llm_model,
                        max_chars=settings.llm_max_chars,
                    )
                )
                corrected = True
@@ -43,6 +43,8 @@ class Settings(BaseSettings):
    llm_base_url: str | None = None      # 예: http://192.168.0.123:8080/v1 (allowlist=이 endpoint만)
    llm_api_key: str | None = None       # env SCRIBE_LLM_API_KEY 로만 주입
    llm_model: str = "copilot-gpt-4o"
    # 보정 청크 크기(글자) — 사내 LLM 컨텍스트 창에 맞춰 조정 (예: ~8k창→1500, ~16k→3000, ~30k→6000)
    llm_max_chars: int = 3000
 settings = Settings()
@@ -1,13 +1,17 @@
 """LLM 보정 (스펙 §7 stage 3 / §3.8) — 음차된 영문 용어를 문맥+지식으로 복원.
-OpenAI 호환 백엔드(사내/로컬). **opt-in**(요청 correct=true에서만 호출), **allowlist**(설정된
+작은 컨텍스트 창 대응(사내 GPT-4o < 30k 토큰): 긴 전사는 **문장 경계로 청크 분할**,
-base_url만), **감사로그**(호출 1줄). transient(연결 reset/timeout) 재시도.
+각 청크를 순차 보정하며 **이미 확정된 영문 표기(러닝 글로서리)** 를 다음 청크로 전달 →
-긴 입력 청크/러닝글로서리는 TODO — MVP는 단일 호출(짧은 클립엔 충분).
+큰 창 없이도 강연 전체 용어 일관성 유지.
 OpenAI 호환 백엔드(사내/로컬). **opt-in**(요청 correct=true) · **allowlist**(설정 base_url만) ·
 **감사로그**(호출 요약 1줄). transient(연결 reset/timeout) 재시도.
 """
 from __future__ import annotations
 import json
 import logging
 import re
 import time
 import urllib.error
 import urllib.request
@@ -20,47 +24,115 @@ SYSTEM = (
    "일반 한국어는 그대로 두고, 확실하지 않으면 바꾸지 마라. 설명 없이 교정된 전사문만 출력하라."
 )
 # Sentence boundary: whitespace that follows ., !, ?, 。, … or a newline.
 _SENT_RE = re.compile(r"(?<=[.!?。…\n])\s+")
 # English-looking token (a letter plus at least one more allowed character),
 # collected for the running glossary.
 _TERM_RE = re.compile(r"[A-Za-z][A-Za-z0-9.+/#-]{1,}")
 # Maximum number of glossary terms carried from one chunk to the next.
 _GLOSSARY_CAP = 60
 class LLMNotConfigured(RuntimeError):
    """Raised when ``llm_base_url`` or ``llm_api_key`` has not been configured."""
 def _chunk(text: str, max_chars: int) -> list[str]:
    """Split *text* into chunks of at most *max_chars* characters.

    Sentences (as delimited by ``_SENT_RE``) are packed greedily and joined by a
    single space. A sentence that is itself longer than *max_chars* (common for
    unpunctuated ASR output) is broken at the last whitespace that still fits,
    so words and English terms stay intact. Only a run with no whitespace inside
    the limit is cut at a raw character offset.

    Raises:
        ValueError: if *max_chars* is smaller than 1. A negative value would
            otherwise make the forced split yield nothing and drop the text.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if len(text) <= max_chars:
        return [text]

    packed: list[str] = []
    cur = ""
    for part in _SENT_RE.split(text):
        if not part:
            continue
        # The +1 accounts for the space used to join sentences.
        if cur and len(cur) + len(part) + 1 > max_chars:
            packed.append(cur)
            cur = part
        else:
            cur = f"{cur} {part}" if cur else part
    if cur:
        packed.append(cur)

    out: list[str] = []
    for piece in packed:
        rest = piece
        while len(rest) > max_chars:
            # Look one character past the limit so that a whitespace sitting
            # exactly at the boundary still counts as a valid break point.
            window = rest[: max_chars + 1]
            cut = next(
                (i for i in range(len(window) - 1, 0, -1) if window[i].isspace()),
                0,
            )
            if cut:
                head, rest = rest[:cut].rstrip(), rest[cut:].lstrip()
            else:
                # No whitespace within the limit: fall back to a raw cut.
                head, rest = rest[:max_chars], rest[max_chars:]
            if head:
                out.append(head)
        if rest:
            out.append(rest)
    return out
 def _terms(text: str) -> list[str]:
    """Return the distinct English-looking tokens in *text*, in first-seen order.

    ``_TERM_RE`` allows ``.``, ``-`` and ``/`` inside a token (``Node.js``,
    ``gpt-4o``, ``TCP/IP``), so a term that ends a sentence is matched together
    with its punctuation (``"Gemma."``). Those trailing characters are stripped
    so the same term is not recorded twice. ``+`` and ``#`` are kept because
    they are part of names such as ``C++`` and ``C#``.
    """
    seen: dict[str, None] = {}
    for m in _TERM_RE.finditer(text):
        term = m.group(0).rstrip(".-/")
        # Re-apply the pattern's two-character minimum after stripping.
        if len(term) >= 2:
            seen.setdefault(term, None)
    return list(seen)
 def _request(
    messages: list[dict],
    *,
    url: str,
    api_key: str,
    model: str,
    retries: int,
    timeout: float,
 ) -> str:
    """Send one chat-completion request and return the assistant message text.

    Args:
        messages: chat messages sent as the ``messages`` field of the payload.
        url: endpoint the JSON payload is sent to.
        api_key: sent as a ``Bearer`` token in the ``Authorization`` header.
        model: value of the ``model`` field.
        retries: total number of attempts; values below 1 are treated as 1.
        timeout: per-attempt timeout passed to ``urlopen``.

    Returns:
        ``choices[0]["message"]["content"]`` of the decoded JSON response.

    Raises:
        urllib.error.HTTPError: immediately for a non-transient status (e.g. 401),
            or after the last attempt for a transient one (429 / 5xx).
        urllib.error.URLError, OSError: connection failure or timeout that
            persists through the last attempt.
    """
    payload = {"model": model, "temperature": 0, "messages": messages}
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json", "Authorization": "Bearer " + api_key},
    )
    # Statuses that usually clear up on their own; anything else (401, 400, ...)
    # will not change on retry.
    transient_statuses = {429, 500, 502, 503, 504}
    attempts = max(1, retries)  # retries <= 0 still makes one attempt
    for attempt in range(1, attempts + 1):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                return json.loads(resp.read())["choices"][0]["message"]["content"]
        except urllib.error.HTTPError as err:
            # Must come before the URLError clause: HTTPError is a URLError subclass.
            if err.code not in transient_statuses or attempt == attempts:
                raise
        except (urllib.error.URLError, OSError):  # connection reset / timeout
            if attempt == attempts:
                raise
        # Only reached after a swallowed transient failure: linear back-off.
        time.sleep(1.0 * attempt)
    raise RuntimeError("unreachable")
 def correct(
    text: str,
    *,
    base_url: str | None,
    api_key: str | None,
    model: str = "copilot-gpt-4o",
    max_chars: int = 3000,
    retries: int = 4,
    timeout: float = 90.0,
 ) -> str:
    """음차 영문 용어 복원. max_chars로 청크 분할(작은 컨텍스트 창 대응)."""
    if not base_url or not api_key:
-        raise LLMNotConfigured("llm_base_url/llm_api_key 미설정 — correct를 쓰려면 SCRIBE_LLM_* 설정 필요")
+        raise LLMNotConfigured("llm_base_url/llm_api_key 미설정 — correct에 SCRIBE_LLM_* 필요")
    url = base_url.rstrip("/") + "/chat/completions"
-    payload = {
+    chunks = _chunk(text, max_chars)
-        "model": model,
+    logger.info(
-        "temperature": 0,
+        "llm-correct egress endpoint=%s model=%s chars=%d chunks=%d",
-        "messages": [
+        url, model, len(text), len(chunks),
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": text},
        ],
    }
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json", "Authorization": "Bearer " + api_key},
    )
-    # 감사로그 (allowlist=설정 endpoint, 호출 1줄)
+    glossary: dict[str, None] = {}
-    logger.info("llm-correct egress endpoint=%s model=%s chars=%d", url, model, len(text))
+    out: list[str] = []
-    for attempt in range(1, retries + 1):
+    for chunk in chunks:
-        try:
+        system = SYSTEM
-            with urllib.request.urlopen(req, timeout=timeout) as resp:
+        if glossary:
-                data = json.loads(resp.read())
+            system += (
-            return data["choices"][0]["message"]["content"]
+                "\n이미 이 전사에서 확정된 영문 표기: "
-        except urllib.error.HTTPError:
+                + ", ".join(glossary)
-            raise  # 실제 HTTP 응답(401/4xx) — 재시도 무의미
+                + ". 같은/유사 용어는 이 표기로 통일하라."
-        except (urllib.error.URLError, OSError):  # 연결 reset/timeout 등 transient
+            )
-            if attempt == retries:
+        corrected = _request(
-                raise
+            [{"role": "system", "content": system}, {"role": "user", "content": chunk}],
-            time.sleep(1.0 * attempt)
+            url=url,
-    raise RuntimeError("unreachable")
+            api_key=api_key,
            model=model,
            retries=retries,
            timeout=timeout,
        )
        out.append(corrected)
        for term in _terms(corrected):
            glossary.setdefault(term, None)
        if len(glossary) > _GLOSSARY_CAP:
            glossary = dict(list(glossary.items())[-_GLOSSARY_CAP:])
    return " ".join(out).strip()
@@ -39,3 +39,21 @@ def test_llm_correct_monkeypatched(monkeypatch):
    monkeypatch.setattr(llm.urllib.request, "urlopen", fake_urlopen)
    out = llm.correct("인베딩 점마", base_url="http://x/v1", api_key="k", model="m")
    assert out == "EmbeddingGemma 복원됨"
 def test_llm_chunking_and_glossary(monkeypatch):
    """Long input is split into chunks and the running glossary reaches later chunks."""
    recorded: list[list[dict]] = []

    def echo_request(messages, **_ignored):
        # Stand-in for llm._request: record the messages, return the user chunk unchanged.
        recorded.append(messages)
        user_message = messages[1]
        return user_message["content"]

    monkeypatch.setattr(llm, "_request", echo_request)
    sentences = [f"문장{i} EmbeddingGemma 설명" for i in range(400)]
    result = llm.correct(
        ". ".join(sentences),
        base_url="http://x/v1",
        api_key="k",
        max_chars=200,
    )

    # More than one request means the text was chunked.
    assert len(recorded) > 1
    # The per-chunk results were joined back into the output.
    assert "EmbeddingGemma" in result
    # From the second chunk on, the system prompt carries the already-confirmed terms.
    later_system_prompts = [msgs[0]["content"] for msgs in recorded[1:]]
    assert any("확정된 영문 표기" in prompt for prompt in later_system_prompts)