feat: AI 웹 스크래퍼의 핵심 컴포넌트 및 실행 가이드 구현

2025-08-28 11:01:04 +09:00
parent 50a6c3d407
commit 0206e42780
4 changed files with 187 additions and 150 deletions
--- a/AI_Web_Scraper/ai_agent.py
+++ b/AI_Web_Scraper/ai_agent.py
@@ -1,6 +1,7 @@
 import json
 import os
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from typing import List, Dict
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, pipeline
 from langchain_community.llms import HuggingFacePipeline
 from langchain.agents import initialize_agent, AgentType
 from langchain.tools import Tool
@@ -10,6 +11,7 @@ from google_drive_uploader import GoogleDriveUploader, SimpleDriveSaver

 class AIAgent:
    def __init__(self, config_path='./config.json'):
+        self.config_path = config_path
        with open(config_path, 'r') as f:
            self.config = json.load(f)

@@ -61,140 +63,142 @@ class AIAgent:

    def load_model(self):
        """
-        Hugging Face 모델을 로드합니다. 없으면 다운로드 후 로드.
-        GPU와 CPU 메모리를 함께 활용.
+        Hugging Face 모델을 로드합니다.
+        - model_downloader가 가져온 로컬 스냅샷을 우선 사용
+        - 양자화/디바이스 맵은 가능한 한 보수적으로 설정하고, 실패 시 단계적 폴백
        """
-        import os
        # GPU 메모리 최적화 설정
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

-        try:
-            print(f"모델 로드 시도: {self.model_path}")
+        model_settings = self.config.get('model_settings', {})
+        use_quantization = bool(model_settings.get('use_quantization', False))
+        torch_dtype_cfg = str(model_settings.get('torch_dtype', 'auto')).lower()

-            # 모델 로드 시도
-            from transformers import BitsAndBytesConfig
-            from accelerate import infer_auto_device_map, init_empty_weights
+        # dtype 파싱
+        import torch
+        dtype = None
+        if torch_dtype_cfg in ("float16", "fp16", "half"):
+            dtype = torch.float16
+        elif torch_dtype_cfg in ("bfloat16", "bf16"):
+            dtype = torch.bfloat16
+        elif torch_dtype_cfg in ("float32", "fp32"):
+            dtype = torch.float32
+        else:
+            dtype = None  # auto

-            model_settings = self.config.get('model_settings', {})
-            use_quantization = model_settings.get('use_quantization', False)
-            max_memory_config = model_settings.get('max_memory', {})
+        # 로컬 스냅샷이 있으면 우선 사용, 없으면 모델 이름 사용
+        model_source = self.model_path if os.path.isdir(self.model_path) else self.config.get('model_name')
+        if not model_source:
+            raise RuntimeError("모델 경로/이름이 설정되지 않았습니다.")

-            # 메모리 제한 설정
-            max_memory = {}
-            if 'gpu' in max_memory_config:
-                max_memory[0] = max_memory_config['gpu']
-            if 'cpu' in max_memory_config:
-                max_memory['cpu'] = max_memory_config['cpu']
-
-            if use_quantization:
-                print("8bit 양자화 적용")
-                quantization_config = BitsAndBytesConfig(
+        # quantization 설정 (가능한 경우에만)
+        quant_args = {}
+        if use_quantization:
+            try:
+                from transformers import BitsAndBytesConfig
+                quant_args["quantization_config"] = BitsAndBytesConfig(
                    load_in_8bit=True,
                    llm_int8_enable_fp32_cpu_offload=True
                )
-            else:
-                quantization_config = None
+                print("8bit 양자화 적용")
+            except Exception as _:
+                # transformers/bitsandbytes 호환 문제 시 양자화 비활성화
+                print("bitsandbytes/transformers 호환 문제로 양자화를 비활성화합니다.")
+                quant_args = {}

-            # 최적의 device_map 계산
-            if max_memory:
-                print(f"GPU/CPU 메모리 분배 적용: {max_memory}")
-                with init_empty_weights():
-                    empty_model = AutoModelForCausalLM.from_config(
-                        AutoConfig.from_pretrained(self.model_path)
-                    )
-                device_map = infer_auto_device_map(
-                    empty_model,
-                    max_memory=max_memory,
-                    no_split_module_classes=["GPTNeoXLayer"]
-                )
-                print(f"계산된 device_map: {device_map}")
-            else:
-                device_map = "auto"
-
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+        # 1차 시도: device_map="auto" 로 로드
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_source)
            self.model = AutoModelForCausalLM.from_pretrained(
-                self.model_path,
-                quantization_config=quantization_config,
-                device_map=device_map,
-                torch_dtype="auto"
+                model_source,
+                device_map="auto",
+                torch_dtype=dtype if dtype is not None else None,
+                low_cpu_mem_usage=True,
+                **quant_args
+            )
+        except Exception as e1:
+            print(f"device_map=auto 로드 실패: {e1}\nCPU로 폴백합니다.")
+            # 2차 시도: CPU 강제 로드
+            self.tokenizer = AutoTokenizer.from_pretrained(model_source)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_source,
+                device_map={"": "cpu"},
+                torch_dtype=torch.float32,
+                low_cpu_mem_usage=False
            )

-            # 파이프라인 생성
-            pipe = pipeline(
-                "text-generation",
-                model=self.model,
-                tokenizer=self.tokenizer,
-                max_new_tokens=self.max_tokens,
-                temperature=self.temperature,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id
-            )
-
-            self.llm = HuggingFacePipeline(pipeline=pipe)
-            print("모델 로드 완료")
+        # 파이프라인 생성
+        pad_id = self.tokenizer.eos_token_id if getattr(self.tokenizer, 'eos_token_id', None) is not None else None
+        pipe = pipeline(
+            "text-generation",
+            model=self.model,
+            tokenizer=self.tokenizer,
+            max_new_tokens=self.max_tokens,
+            temperature=self.temperature,
+            do_sample=True,
+            pad_token_id=pad_id
+        )
+        self.llm = HuggingFacePipeline(pipeline=pipe)
+        print("모델 로드 완료")

+    # 간단 검색: DuckDuckGo HTML 결과 파싱 (외부 API 불필요)
+    def _search_urls(self, query: str, k: int = 5) -> List[str]:
+        import requests
+        from bs4 import BeautifulSoup
+        q = query.strip().replace(' ', '+')
+        url = f"https://duckduckgo.com/html/?q={q}"
+        headers = {"User-Agent": self.config['web_scraping']['user_agent']}
+        try:
+            r = requests.get(url, headers=headers, timeout=20)
+            r.raise_for_status()
+            soup = BeautifulSoup(r.text, 'html.parser')
+            links = []
+            for a in soup.select('a.result__a'):
+                href = a.get('href')
+                if href and href.startswith('http'):
+                    links.append(href)
+                if len(links) >= k:
+                    break
+            return links
        except Exception as e:
-            print(f"모델 로드 실패: {e}")
-            print("모델을 다운로드합니다...")
+            print(f"검색 실패({query}): {e}")
+            return []

-            # 모델 다운로드
-            from model_downloader import download_model as dl_model
-            success = dl_model(self.config_path.replace('config.json', ''))
+    def collect_information(self, topics: List[str]) -> List[Dict[str, str]]:
+        """
+        주제별로 웹 검색 → 스크래핑 → 요약 → 저장까지 수행
+        반환: [{ topic, response }]
+        """
+        results = []
+        for topic in topics:
+            urls = self._search_urls(topic, k=5)
+            collected = []
+            for u in urls[:5]:
+                data = self.web_scraper.scrape_website(u)
+                if data:
+                    collected.append(data)

-            if success[0] is None:
-                raise Exception("모델 다운로드 실패")
+            # 저장 (간단 저장 도구)
+            filename = f"{topic[:50].replace(' ', '_')}.json"
+            self.simple_saver.save_data_as_json(collected, filename)

-            # 다운로드 후 다시 로드 시도
+            # 간단 요약 생성
            try:
-                print("다운로드 완료, 모델 재로드 시도...")
-                from transformers import BitsAndBytesConfig
-                from accelerate import infer_auto_device_map, init_empty_weights
+                snippet = "\n\n".join([d.get('title', '') + ": " + d.get('description', '') for d in collected[:3]])
+                prompt = f"""
+                다음 자료를 간결히 요약하고 핵심 포인트 3가지를 bullet로 정리하세요.
+                주제: {topic}

-                if use_quantization:
-                    quantization_config = BitsAndBytesConfig(
-                        load_in_8bit=True,
-                        llm_int8_enable_fp32_cpu_offload=True
-                    )
-                else:
-                    quantization_config = None
+                자료:
+                {snippet}
+                """
+                summary = self.llm(prompt)
+            except Exception as e:
+                summary = f"요약 실패: {e}"

-                if max_memory:
-                    with init_empty_weights():
-                        empty_model = AutoModelForCausalLM.from_config(
-                            AutoConfig.from_pretrained(self.model_path)
-                        )
-                    device_map = infer_auto_device_map(
-                        empty_model,
-                        max_memory=max_memory,
-                        no_split_module_classes=["GPTNeoXLayer"]
-                    )
-                else:
-                    device_map = "auto"
+            results.append({"topic": topic, "response": summary})

-                self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    self.model_path,
-                    quantization_config=quantization_config,
-                    device_map=device_map,
-                    torch_dtype="auto"
-                )
-
-                pipe = pipeline(
-                    "text-generation",
-                    model=self.model,
-                    tokenizer=self.tokenizer,
-                    max_new_tokens=self.max_tokens,
-                    temperature=self.temperature,
-                    do_sample=True,
-                    pad_token_id=self.tokenizer.eos_token_id
-                )
-
-                self.llm = HuggingFacePipeline(pipeline=pipe)
-                print("모델 로드 완료")
-
-            except Exception as e2:
-                print(f"모델 재로드 실패: {e2}")
-                raise Exception("모델 로드에 실패했습니다")
+        return results

    def scrape_web(self, url):
        """