From 1ea96c36c8352b8c88144fa6c680a434033f934b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EB=B0=95=EC=83=81=ED=98=B8=20Sangho=20Park?= Date: Mon, 8 Jun 2026 23:20:01 +0900 Subject: [PATCH] chore(omc): record GPT-4o correction finding + P2 API progress (hotpaths) Co-Authored-By: Claude Opus 4.8 --- .omc/project-memory.json | 218 ++++++++++++++++++++++++++++++++++----- 1 file changed, 194 insertions(+), 24 deletions(-) diff --git a/.omc/project-memory.json b/.omc/project-memory.json index 75ab79f..c455222 100644 --- a/.omc/project-memory.json +++ b/.omc/project-memory.json @@ -1,22 +1,38 @@ { "version": "1.0.0", - "lastScanned": 1780794206309, + "lastScanned": 1780919472386, "projectRoot": "/root/luke_scribe", "techStack": { "languages": [ - "Python" + { + "name": "Python", + "version": null, + "confidence": "high", + "markers": [ + "pyproject.toml" + ] + } ], "frameworks": [ - "FastAPI · faster-whisper/CTranslate2 · Redis/RQ(no-fork) · pydantic v2 · ffmpeg · Silero VAD" + { + "name": "fastapi", + "version": null, + "category": "backend" + }, + { + "name": "pytest", + "version": null, + "category": "testing" + } ], - "packageManager": "uv", - "runtime": "Python 3.11+" + "packageManager": null, + "runtime": null }, "build": { - "buildCommand": "uv sync", - "testCommand": "export PATH=\"$HOME/.local/bin:$HOME/.cargo/bin:$PATH\"\nuv run pytest -q 2>&1 | tail -8\necho \"=== ruff ===\"; uv run ruff check src/ tests/ && echo \"clean\"", - "lintCommand": "uv run ruff check src/ tests/", - "devCommand": "uv run luke-scribe detect", + "buildCommand": null, + "testCommand": "export PATH=\"$HOME/.local/bin:$HOME/.cargo/bin:$PATH\"\nclip=\"samples/ko_en/clips/GDG 인천 - EmbeddingGemma 200% 활용하기 - 지주영.m4a\"\nffmpeg -nostdin -ss 70 -t 12 -i \"$clip\" -ac 1 -ar 16000 -y /tmp/api_smoke.wav 2>/dev/null\nls -l /tmp/api_smoke.wav\necho \"=== pytest 재확인(413 수정 후) ===\"; uv run pytest -q 2>&1 | tail -3", + "lintCommand": "ruff check", + "devCommand": null, "scripts": {} }, "conventions": { @@ -29,9 +45,10 @@ "isMonorepo": false, "workspaces": [], "mainDirectories": [ - "src/luke_scribe (계획, 미생성)" + "src", + "tests" ], - "gitBranches": "main" + "gitBranches": null }, "customNotes": [ { @@ -57,10 +74,61 @@ "source": "manual", "category": "status", "content": "P1 진행(2026-06-07): ✅ detect(능력등급 T0~T3, 1050→T0_CPU 명시강등) · ✅ transcribe(faster-whisper CPU 검증: JFK 11s 클립 정확 전사, model_used 출력) · 단위테스트 10개 통과. 코드 존재함(더 이상 0%). 남음: word-ts/format 출력옵션·Silero VAD 옵션화, VRAM 실측 probe(정적추정 대체), bench(라벨 KO+EN 샘플셋 필요), 상위 tier(T2/T3) Colab 검증, P2(API+Redis/RQ). 브랜치 feat/p1-core." + }, + { + "timestamp": 1780926195887, + "source": "manual", + "category": "finding", + "content": "검증된 발견(2026-06-07): KO+EN 혼용어 음차 문제의 open-vocab 해법 = 사내 GPT-4o 텍스트 후처리 보정. faster-whisper(turbo)가 음차로 망친 영문 용어를 hotwords 등록 없이 문맥+지식으로 복원. 실증(EmbeddingGemma 강연 90초 슬라이스): 인베딩 점마→Embedding Gemma, 재미나이→Gemini, 점마→Gemma, 랭기징→Language, 구글 포 디벨로퍼스→Google for Developers (5/5, 일반 한국어는 보존). 게이트=OpenAI 호환(baseURL http://192.168.0.123:8080/v1, model copilot-gpt-4o, API키 필요·키는 메모리에 저장 안 함; localhost:8080은 사용자 머신 터널이라 샌드박스선 미도달) → 사내 호출이라 외부 egress 0(프라이버시 OK). 함의: hotwords는 등록된 것만 잡아 불충분, LLM 문맥보정이 '모르는 용어'까지 커버. 단서: (1) 'Embedding Gemma' 띄어쓰기(공식 EmbeddingGemma)→rules/glossary 정규화 병행 필요, (2) LLM이 아는/추론가능 용어만·초신조어는 confidence 플래그→휴먼, (3) 샘플1개라 과교정 추가검증, (4) 게이트 경로 불안정(401→timeout→reset)→재시도 필요(스크립트에 반영). 작은 컨텍스트는 청크+러닝글로서리로 우회. PoC=scripts/llm_correct.py → 승격 대상 postprocess/llm.py(confidence-gated·청크·backend=internal·감사로그) + transcribe --correct 플래그." } ], - "directoryMap": {}, + "directoryMap": { + "samples": { + "path": "samples", + "purpose": null, + "fileCount": 1, + "lastAccessed": 1780919472362, + "keyFiles": [ + "README.md" + ] + }, + "src": { + "path": "src", + "purpose": "Source code", + "fileCount": 0, + "lastAccessed": 1780919472371, + "keyFiles": [] + }, + "tests": { + "path": "tests", + "purpose": "Test files", + "fileCount": 2, + "lastAccessed": 1780919472373, + "keyFiles": [ + "test_device_manager.py", + "test_engine_audio.py" + ] + } + }, "hotPaths": [ + { + "path": "scripts/llm_correct.py", + "accessCount": 4, + "lastAccessed": 1780925584647, + "type": "file" + }, + { + "path": "src/luke_scribe/cli.py", + "accessCount": 4, + "lastAccessed": 1780927984393, + "type": "file" + }, + { + "path": "pyproject.toml", + "accessCount": 4, + "lastAccessed": 1780928043613, + "type": "file" + }, { "path": "README.md", "accessCount": 3, @@ -68,15 +136,15 @@ "type": "file" }, { - "path": "src/luke_scribe/cli.py", - "accessCount": 2, - "lastAccessed": 1780812315014, + "path": "src/luke_scribe/config.py", + "accessCount": 3, + "lastAccessed": 1780927884587, "type": "file" }, { - "path": "pyproject.toml", - "accessCount": 1, - "lastAccessed": 1780804235420, + "path": "src/luke_scribe/api/routes/transcribe.py", + "accessCount": 2, + "lastAccessed": 1780928097713, "type": "file" }, { @@ -85,12 +153,6 @@ "lastAccessed": 1780804261889, "type": "file" }, - { - "path": "src/luke_scribe/config.py", - "accessCount": 1, - "lastAccessed": 1780804262703, - "type": "file" - }, { "path": "src/luke_scribe/devices/__init__.py", "accessCount": 1, @@ -168,6 +230,114 @@ "accessCount": 1, "lastAccessed": 1780812413312, "type": "file" + }, + { + "path": "samples/README.md", + "accessCount": 1, + "lastAccessed": 1780812722445, + "type": "file" + }, + { + "path": "samples/ko_en/manifest.jsonl.example", + "accessCount": 1, + "lastAccessed": 1780812854083, + "type": "file" + }, + { + "path": "src/luke_scribe/results/__init__.py", + "accessCount": 1, + "lastAccessed": 1780927886298, + "type": "file" + }, + { + "path": "src/luke_scribe/results/formats.py", + "accessCount": 1, + "lastAccessed": 1780927892282, + "type": "file" + }, + { + "path": "src/luke_scribe/postprocess/__init__.py", + "accessCount": 1, + "lastAccessed": 1780927894092, + "type": "file" + }, + { + "path": "src/luke_scribe/postprocess/rules.py", + "accessCount": 1, + "lastAccessed": 1780927897308, + "type": "file" + }, + { + "path": "src/luke_scribe/postprocess/llm.py", + "accessCount": 1, + "lastAccessed": 1780927908123, + "type": "file" + }, + { + "path": "src/luke_scribe/api/__init__.py", + "accessCount": 1, + "lastAccessed": 1780927952439, + "type": "file" + }, + { + "path": "src/luke_scribe/api/schemas.py", + "accessCount": 1, + "lastAccessed": 1780927953308, + "type": "file" + }, + { + "path": "src/luke_scribe/api/engine_pool.py", + "accessCount": 1, + "lastAccessed": 1780927954191, + "type": "file" + }, + { + "path": "src/luke_scribe/api/deps.py", + "accessCount": 1, + "lastAccessed": 1780927955218, + "type": "file" + }, + { + "path": "src/luke_scribe/api/app.py", + "accessCount": 1, + "lastAccessed": 1780927956175, + "type": "file" + }, + { + "path": "src/luke_scribe/api/routes/__init__.py", + "accessCount": 1, + "lastAccessed": 1780927957095, + "type": "file" + }, + { + "path": "src/luke_scribe/connectivity/__init__.py", + "accessCount": 1, + "lastAccessed": 1780927962648, + "type": "file" + }, + { + "path": "src/luke_scribe/connectivity/tunnel.py", + "accessCount": 1, + "lastAccessed": 1780927971385, + "type": "file" + }, + { + "path": "tests/test_formats.py", + "accessCount": 1, + "lastAccessed": 1780928016400, + "type": "file" + }, + { + "path": "tests/test_postprocess.py", + "accessCount": 1, + "lastAccessed": 1780928018944, + "type": "file" + }, + { + "path": "tests/test_api.py", + "accessCount": 1, + "lastAccessed": 1780928028187, + "type": "file" } ], "userDirectives": [