diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..c269e49 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "chatgpt.openOnStartup": false +} \ No newline at end of file diff --git a/AI_Web_Scraper/README.md b/AI_Web_Scraper/README.md index 566e346..810e861 100644 --- a/AI_Web_Scraper/README.md +++ b/AI_Web_Scraper/README.md @@ -39,6 +39,13 @@ pip install -r requirements.txt 자세한 실행 방법은 `run_guide.md` 파일을 참고하세요. +## 로그 + +- 실행 시 `./logs/run_*.jsonl`에 구조화된 이벤트 로그가 저장됩니다. +- LLM 내부 추론(Thought) 로그는 기본 비활성화입니다. 필요 시 환경변수로 활성화할 수 있습니다: + - `AIWS_SHOW_THOUGHTS=1` + - 저장 파일 미리보기 로그: `AIWS_LOG_FILE_PREVIEW=1` + ## 파일 구조 ``` diff --git a/AI_Web_Scraper/ai_agent.py b/AI_Web_Scraper/ai_agent.py index 59314c7..f79ffa3 100644 --- a/AI_Web_Scraper/ai_agent.py +++ b/AI_Web_Scraper/ai_agent.py @@ -9,6 +9,7 @@ from langchain.tools import Tool from langchain.memory import ConversationBufferMemory from web_scraper import WebScraper from google_drive_uploader import GoogleDriveUploader, SimpleDriveSaver +from event_logger import get_logger, LangChainEventsHandler class AIAgent: def __init__(self, config_path='./config.json'): @@ -85,6 +86,9 @@ class AIAgent: memory=self.memory, verbose=True ) + # 콜백 핸들러 구성 (이벤트 로깅) + logger = get_logger() + self.callback_handler = LangChainEventsHandler(logger) if logger else None def load_model(self): """ @@ -349,9 +353,15 @@ class AIAgent: 주제별로 웹 검색 → 스크래핑 → 요약 → 저장까지 수행 반환: [{ topic, response }] """ + from event_logger import get_logger + logger = get_logger() results = [] for topic in topics: + if logger: + logger.log_event("topic_start", topic=topic) urls = self._search_urls(topic, k=5) + if logger: + logger.log_event("search_done", topic=topic, url_count=len(urls)) collected = [] for u in urls[:5]: data = self.web_scraper.scrape_website(u) @@ -372,11 +382,18 @@ class AIAgent: 자료: {snippet} """ - summary = self.llm(prompt) + # LangChain 0.1+: __call__ deprecated → use invoke + if logger: + logger.log_event("llm_summary_start", topic=topic) + summary = self.llm.invoke(prompt) + if logger: + logger.log_event("llm_summary_end", topic=topic) except Exception as e: summary = f"요약 실패: {e}" results.append({"topic": topic, "response": summary}) + if logger: + logger.log_event("topic_done", topic=topic) return results @@ -432,7 +449,10 @@ class AIAgent: AI 에이전트를 실행합니다. """ try: - response = self.agent.run(task_description) + if self.callback_handler: + response = self.agent.run(task_description, callbacks=[self.callback_handler]) + else: + response = self.agent.run(task_description) return response except Exception as e: print(f"에이전트 실행 실패: {e}") @@ -459,7 +479,14 @@ class AIAgent: """ try: - response = self.llm(prompt) + # LangChain 0.1+: __call__ deprecated → use invoke + from event_logger import get_logger + logger = get_logger() + if logger: + logger.log_event("llm_topics_start", count=num_topics) + response = self.llm.invoke(prompt) + if logger: + logger.log_event("llm_topics_end", count=num_topics) # 응답에서 주제들을 추출 (줄 단위로 분리) topics = [line.strip() for line in response.split('\n') if line.strip() and not line.startswith(('1.', '2.', '3.', '-'))] # 최대 num_topics개 반환 diff --git a/AI_Web_Scraper/event_logger.py b/AI_Web_Scraper/event_logger.py new file mode 100644 index 0000000..876670e --- /dev/null +++ b/AI_Web_Scraper/event_logger.py @@ -0,0 +1,181 @@ +import os +import json +import uuid +import datetime as _dt +from typing import Any, Dict, Optional + +_LOGGER_INSTANCE = None + + +class EventLogger: + """ + Lightweight JSONL + console event logger for the app. + Avoids exposing LLM chain-of-thought by default; can be opted-in via config/env. + """ + + def __init__(self, log_dir: str = "./logs", enable_file: bool = True, + show_llm_thoughts: bool = False, console_level: str = "INFO", + preview_saved_files: bool = False, preview_limit: int = 500): + self.log_dir = log_dir + self.enable_file = enable_file + self.show_llm_thoughts = show_llm_thoughts + self.console_level = console_level.upper() + self.preview_saved_files = preview_saved_files + self.preview_limit = preview_limit + self.run_id = _dt.datetime.now().strftime("%Y%m%d_%H%M%S") + "_" + uuid.uuid4().hex[:6] + self.file_path = None + if enable_file: + os.makedirs(log_dir, exist_ok=True) + self.file_path = os.path.join(log_dir, f"run_{self.run_id}.jsonl") + + def _now(self) -> str: + return _dt.datetime.now().isoformat(timespec='seconds') + + def _console_enabled(self, level: str) -> bool: + order = ["DEBUG", "INFO", "WARN", "ERROR"] + try: + return order.index(level) >= order.index(self.console_level) + except Exception: + return True + + def log_event(self, event: str, message: Optional[str] = None, **fields: Any) -> None: + rec: Dict[str, Any] = { + "ts": self._now(), + "run_id": self.run_id, + "event": event, + } + if message: + rec["message"] = message + if fields: + rec.update(fields) + + # console (pretty one-liner) + if self._console_enabled("INFO"): + kv = " ".join( + f"{k}={str(v)[:120]}" for k, v in fields.items() if v is not None + ) + line = f"[{rec['ts']}] {event}" + if message: + line += f" | {message}" + if kv: + line += f" | {kv}" + print(line) + + # JSONL file + if self.enable_file and self.file_path: + try: + with open(self.file_path, "a", encoding="utf-8") as f: + f.write(json.dumps(rec, ensure_ascii=False) + "\n") + except Exception: + # Do not crash on logging errors + pass + + +def init_from_config(config: Dict[str, Any]) -> EventLogger: + global _LOGGER_INSTANCE + lg = config.get("logging", {}) if isinstance(config, dict) else {} + log_dir = lg.get("log_dir", "./logs") + enable_file = bool(lg.get("log_to_file", True)) + console_level = str(lg.get("console_level", "INFO")).upper() + # env override for showing LLM thoughts + env_flag = os.environ.get("AIWS_SHOW_THOUGHTS") + show_llm_thoughts = bool(lg.get("show_thoughts", False)) or (str(env_flag).lower() in ("1", "true", "yes")) + from os import environ + preview_files = bool(lg.get("preview_saved_files", False)) or (str(environ.get("AIWS_LOG_FILE_PREVIEW")).lower() in ("1", "true", "yes")) + preview_limit = int(lg.get("preview_limit", 500)) if str(lg.get("preview_limit", "")).isdigit() else 500 + _LOGGER_INSTANCE = EventLogger( + log_dir=log_dir, + enable_file=enable_file, + show_llm_thoughts=show_llm_thoughts, + console_level=console_level, + preview_saved_files=preview_files, + preview_limit=preview_limit, + ) + _LOGGER_INSTANCE.log_event("run_start", message="Application run started") + return _LOGGER_INSTANCE + + +def get_logger() -> Optional[EventLogger]: + return _LOGGER_INSTANCE + + +# LangChain callback handler +try: + from langchain.callbacks.base import BaseCallbackHandler +except Exception: # pragma: no cover - fallback for newer versions + try: + from langchain_core.callbacks.base import BaseCallbackHandler # type: ignore + except Exception: + BaseCallbackHandler = object # minimal fallback + + +class LangChainEventsHandler(BaseCallbackHandler): + def __init__(self, logger: EventLogger): + super().__init__() + self.logger = logger + + # Chains + def on_chain_start(self, serialized, inputs, **kwargs): + name = serialized.get("name") if isinstance(serialized, dict) else str(serialized) + self.logger.log_event("chain_start", name=name, inputs=_truncate(inputs)) + + def on_chain_end(self, outputs, **kwargs): + self.logger.log_event("chain_end", outputs=_truncate(outputs)) + + # LLMs + def on_llm_start(self, serialized, prompts, **kwargs): + if self.logger.show_llm_thoughts: + self.logger.log_event("llm_start", prompts=_truncate(prompts)) + else: + self.logger.log_event("llm_start", message="prompt issued", prompt_count=len(prompts) if prompts else 0) + + def on_llm_end(self, response, **kwargs): + try: + if self.logger.show_llm_thoughts: + texts = [g[0].text for g in response.generations] # type: ignore[attr-defined] + self.logger.log_event("llm_end", outputs=_truncate(texts)) + else: + # token/length only when possible + usage = getattr(response, 'llm_output', None) or {} + self.logger.log_event("llm_end", message="llm completed", meta=_truncate(usage)) + except Exception: + self.logger.log_event("llm_end") + + # Tools + def on_tool_start(self, serialized, input_str, **kwargs): + name = serialized.get("name") if isinstance(serialized, dict) else str(serialized) + self.logger.log_event("tool_start", name=name, input=_truncate(input_str)) + + def on_tool_end(self, output, **kwargs): + self.logger.log_event("tool_end", output=_truncate(output)) + + # Agent actions + def on_agent_action(self, action, **kwargs): + try: + self.logger.log_event( + "agent_action", + tool=getattr(action, 'tool', None), + tool_input=_truncate(getattr(action, 'tool_input', None)), + log=_truncate(getattr(action, 'log', None)) if self.logger.show_llm_thoughts else None, + ) + except Exception: + self.logger.log_event("agent_action") + + def on_agent_finish(self, finish, **kwargs): + try: + out = getattr(finish, 'return_values', {}).get('output') + self.logger.log_event("agent_finish", output=_truncate(out)) + except Exception: + self.logger.log_event("agent_finish") + + +def _truncate(obj: Any, limit: int = 800) -> Any: + try: + s = obj if isinstance(obj, str) else json.dumps(obj, ensure_ascii=False) + return s if len(s) <= limit else (s[:limit] + "…") + except Exception: + try: + s = str(obj) + return s if len(s) <= limit else (s[:limit] + "…") + except Exception: + return None diff --git a/AI_Web_Scraper/google_drive_uploader.py b/AI_Web_Scraper/google_drive_uploader.py index b240379..fbff897 100644 --- a/AI_Web_Scraper/google_drive_uploader.py +++ b/AI_Web_Scraper/google_drive_uploader.py @@ -5,6 +5,7 @@ from googleapiclient.http import MediaFileUpload from google.oauth2.credentials import Credentials from google_auth_oauthlib.flow import InstalledAppFlow from google.auth.transport.requests import Request +from event_logger import get_logger class GoogleDriveUploader: def __init__(self, config_path='./config.json'): @@ -52,6 +53,7 @@ class GoogleDriveUploader: """ 파일을 Google Drive에 업로드 """ + logger = get_logger() if self.service is None: raise RuntimeError('Google Drive API가 초기화되지 않았습니다. credentials.json과 folder_id를 설정하세요.') @@ -71,11 +73,15 @@ class GoogleDriveUploader: media_body=media, fields='id' ).execute() - - print(f'파일 업로드 완료: {file_name} (ID: {file.get("id")})') - return file.get('id') + fid = file.get('id') + print(f'파일 업로드 완료: {file_name} (ID: {fid})') + if logger: + logger.log_event("drive_upload", name=file_name, id=fid) + return fid except Exception as e: print(f'업로드 실패: {e}') + if logger: + logger.log_event("drive_upload_error", name=file_name, error=str(e)) return None def upload_data_as_json(self, data, filename='collected_data.json'): @@ -83,6 +89,7 @@ class GoogleDriveUploader: 데이터를 JSON 파일로 변환하여 업로드 """ import tempfile + logger = get_logger() with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: json.dump(data, f, ensure_ascii=False, indent=2) @@ -90,9 +97,19 @@ class GoogleDriveUploader: try: file_id = self.upload_file(temp_path, filename) + logger = get_logger() + if logger and logger.preview_saved_files: + try: + with open(temp_path, 'r', encoding='utf-8') as rf: + content = rf.read(logger.preview_limit) + logger.log_event("file_preview", name=filename, preview=content) + except Exception: + pass return file_id finally: os.unlink(temp_path) + if logger: + logger.log_event("tempfile_cleanup", path=temp_path) def list_files(self): """ @@ -109,9 +126,15 @@ class GoogleDriveUploader: ).execute() items = results.get('files', []) + logger = get_logger() + if logger: + logger.log_event("drive_list", count=len(items)) return items except Exception as e: print(f'파일 목록 조회 실패: {e}') + logger = get_logger() + if logger: + logger.log_event("drive_list_error", error=str(e)) return [] class SimpleDriveSaver: @@ -132,9 +155,22 @@ class SimpleDriveSaver: with open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) print(f'데이터 저장 완료: {filepath}') + logger = get_logger() + if logger: + logger.log_event("file_saved", path=filepath, bytes=os.path.getsize(filepath)) + if logger.preview_saved_files: + try: + with open(filepath, 'r', encoding='utf-8') as rf: + preview = rf.read(logger.preview_limit) + logger.log_event("file_preview", path=filepath, preview=preview) + except Exception: + pass return filepath except Exception as e: print(f'저장 실패: {e}') + logger = get_logger() + if logger: + logger.log_event("file_save_error", path=filepath, error=str(e)) return None def save_text_data(self, data, filename='collected_data.txt'): @@ -150,9 +186,15 @@ class SimpleDriveSaver: else: f.write(str(data)) print(f'텍스트 데이터 저장 완료: {filepath}') + logger = get_logger() + if logger: + logger.log_event("file_saved", path=filepath, bytes=os.path.getsize(filepath)) return filepath except Exception as e: print(f'저장 실패: {e}') + logger = get_logger() + if logger: + logger.log_event("file_save_error", path=filepath, error=str(e)) return None def save_to_drive_simple(data, filename='collected_data.json', mount_path='/content/drive/MyDrive/AI_Data'): diff --git a/AI_Web_Scraper/main.py b/AI_Web_Scraper/main.py index 6bd88f3..7a782b8 100644 --- a/AI_Web_Scraper/main.py +++ b/AI_Web_Scraper/main.py @@ -3,6 +3,7 @@ import json from model_downloader import download_model from ai_agent import AIAgent import argparse +from event_logger import init_from_config, get_logger def main(): parser = argparse.ArgumentParser(description='AI 웹 정보 수집 시스템') @@ -19,8 +20,10 @@ def main(): with open(args.config, 'r') as f: config = json.load(f) - # 2. 모델 다운로드 (필요한 경우) + # 2. 로깅 초기화 및 모델 다운로드 (필요한 경우) + logger = init_from_config(config) print("모델 확인 중...") + logger.log_event("model_check") model, tokenizer = download_model(args.config) # 모델 로딩은 AIAgent에서 수행하므로, 다운로드만 성공해도 계속 진행 @@ -42,22 +45,28 @@ def main(): with open(args.config, 'w') as f: json.dump(config, f, indent=2) print(f"저장 경로 설정됨: {args.save_path}") + logger.log_event("save_path_set", path=args.save_path) # 3. AI 에이전트 초기화 print("AI 에이전트 초기화 중...") + logger.log_event("agent_init_start") agent = AIAgent(args.config) + logger.log_event("agent_init_done") # 3. 주제 결정 if args.auto_topics or args.topics is None: print("AI가 스스로 주제를 선정합니다...") topics = agent.generate_topics(num_topics=3) print(f"선정된 주제: {topics}") + logger.log_event("auto_topics", topics=topics) else: topics = args.topics + logger.log_event("user_topics", topics=topics) # 4. 정보 수집 실행 print(f"다음 주제들에 대해 정보를 수집합니다: {topics}") results = agent.collect_information(topics) + logger.log_event("collect_done", topic_count=len(topics)) # 5. 결과 출력 print("\n=== 수집 결과 ===") @@ -68,6 +77,7 @@ def main(): # 6. 정리 agent.close() + logger.log_event("run_complete", message="Program finished") print("프로그램 완료") if __name__ == "__main__": diff --git a/AI_Web_Scraper/model_downloader.py b/AI_Web_Scraper/model_downloader.py index 39c1569..899577b 100644 --- a/AI_Web_Scraper/model_downloader.py +++ b/AI_Web_Scraper/model_downloader.py @@ -3,6 +3,7 @@ import json from typing import Tuple, Optional from transformers import AutoTokenizer from huggingface_hub import snapshot_download +from event_logger import get_logger def download_model(config_path: str = './config.json') -> Tuple[Optional[object], Optional[AutoTokenizer]]: """ @@ -22,6 +23,9 @@ def download_model(config_path: str = './config.json') -> Tuple[Optional[object] os.makedirs(local_path, exist_ok=True) print(f"모델 {model_name}을 {local_path}에 다운로드 중...") + logger = get_logger() + if logger: + logger.log_event("model_download_start", model=model_name, path=local_path) try: # 인증 토큰(필요 시) 지원: 환경변수 HF_TOKEN 사용 @@ -45,9 +49,13 @@ def download_model(config_path: str = './config.json') -> Tuple[Optional[object] print("토크나이저 확인 실패(계속 진행): 로컬 경로에 tokenizer 파일이 없을 수 있습니다.") print(f"모델 다운로드 완료: {local_path}") + if logger: + logger.log_event("model_download_done", path=local_path) return None, tokenizer except Exception as e: print(f"모델 다운로드 실패: {e}") + if logger: + logger.log_event("model_download_error", error=str(e)) return None, None if __name__ == "__main__": diff --git a/AI_Web_Scraper/run_guide.md b/AI_Web_Scraper/run_guide.md index 7a8cfe0..17037fe 100644 --- a/AI_Web_Scraper/run_guide.md +++ b/AI_Web_Scraper/run_guide.md @@ -139,6 +139,19 @@ os.environ["HF_TOKEN"] = "hf_********************************" !free -h ``` +### 5.4 이벤트 로그(JSONL) + +- 실행 시 `./logs/` 폴더에 `run_YYYYMMDD_HHMMSS_xxxxxx.jsonl` 파일이 생성됩니다. +- 각 단계(모델 다운로드, 에이전트/LLM 호출, 도구 실행, 스크래핑 시작/완료, 파일 저장 등)가 구조화된 레코드로 기록됩니다. +- 실시간 확인 예시: + ```bash + tail -f logs/run_*.jsonl + ``` +- LLM의 내부 추론(Thought) 노출은 기본 비활성화입니다. 필요 시 다음 환경변수로 활성화할 수 있습니다(권장하지 않음): + ```bash + export AIWS_SHOW_THOUGHTS=1 + ``` + ## 6. 문제 해결 ### 6.1 모델 다운로드 실패 diff --git a/AI_Web_Scraper/web_scraper.py b/AI_Web_Scraper/web_scraper.py index 0f1c152..a0dac46 100644 --- a/AI_Web_Scraper/web_scraper.py +++ b/AI_Web_Scraper/web_scraper.py @@ -3,6 +3,7 @@ from bs4 import BeautifulSoup import json import time import os +from event_logger import get_logger try: from selenium import webdriver @@ -64,7 +65,10 @@ class WebScraper: """ 웹사이트에서 정보를 수집합니다. """ + logger = get_logger() try: + if logger: + logger.log_event("scrape_start", url=url) if self.use_selenium and self.driver is not None: self.driver.get(url) time.sleep(self.delay) @@ -88,9 +92,13 @@ class WebScraper: 'content': text_content[:5000], 'timestamp': time.time() } + if logger: + logger.log_event("scrape_done", url=url, title=title, size=len(data['content'])) return data except Exception as e: print(f"스크래핑 실패: {url} - {e}") + if logger: + logger.log_event("scrape_error", url=url, error=str(e)) return None def crawl_multiple_pages(self, start_urls, keywords=None):