feat: AI 웹 스크래퍼의 핵심 컴포넌트 및 실행 가이드 구현

2025-08-28 11:01:04 +09:00
parent 50a6c3d407
commit 0206e42780
4 changed files with 187 additions and 150 deletions
--- a/AI_Web_Scraper/ai_agent.py
+++ b/AI_Web_Scraper/ai_agent.py
@@ -1,6 +1,7 @@
 import json
 import os
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from typing import List, Dict
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, pipeline
 from langchain_community.llms import HuggingFacePipeline
 from langchain.agents import initialize_agent, AgentType
 from langchain.tools import Tool
@@ -10,6 +11,7 @@ from google_drive_uploader import GoogleDriveUploader, SimpleDriveSaver

 class AIAgent:
    def __init__(self, config_path='./config.json'):
+        self.config_path = config_path
        with open(config_path, 'r') as f:
            self.config = json.load(f)

@@ -61,140 +63,142 @@ class AIAgent:

    def load_model(self):
        """
-        Hugging Face 모델을 로드합니다. 없으면 다운로드 후 로드.
-        GPU와 CPU 메모리를 함께 활용.
+        Hugging Face 모델을 로드합니다.
+        - model_downloader가 가져온 로컬 스냅샷을 우선 사용
+        - 양자화/디바이스 맵은 가능한 한 보수적으로 설정하고, 실패 시 단계적 폴백
        """
-        import os
        # GPU 메모리 최적화 설정
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

-        try:
-            print(f"모델 로드 시도: {self.model_path}")
+        model_settings = self.config.get('model_settings', {})
+        use_quantization = bool(model_settings.get('use_quantization', False))
+        torch_dtype_cfg = str(model_settings.get('torch_dtype', 'auto')).lower()

-            # 모델 로드 시도
-            from transformers import BitsAndBytesConfig
-            from accelerate import infer_auto_device_map, init_empty_weights
+        # dtype 파싱
+        import torch
+        dtype = None
+        if torch_dtype_cfg in ("float16", "fp16", "half"):
+            dtype = torch.float16
+        elif torch_dtype_cfg in ("bfloat16", "bf16"):
+            dtype = torch.bfloat16
+        elif torch_dtype_cfg in ("float32", "fp32"):
+            dtype = torch.float32
+        else:
+            dtype = None  # auto

-            model_settings = self.config.get('model_settings', {})
-            use_quantization = model_settings.get('use_quantization', False)
-            max_memory_config = model_settings.get('max_memory', {})
+        # 로컬 스냅샷이 있으면 우선 사용, 없으면 모델 이름 사용
+        model_source = self.model_path if os.path.isdir(self.model_path) else self.config.get('model_name')
+        if not model_source:
+            raise RuntimeError("모델 경로/이름이 설정되지 않았습니다.")

-            # 메모리 제한 설정
-            max_memory = {}
-            if 'gpu' in max_memory_config:
-                max_memory[0] = max_memory_config['gpu']
-            if 'cpu' in max_memory_config:
-                max_memory['cpu'] = max_memory_config['cpu']
-
-            if use_quantization:
-                print("8bit 양자화 적용")
-                quantization_config = BitsAndBytesConfig(
+        # quantization 설정 (가능한 경우에만)
+        quant_args = {}
+        if use_quantization:
+            try:
+                from transformers import BitsAndBytesConfig
+                quant_args["quantization_config"] = BitsAndBytesConfig(
                    load_in_8bit=True,
                    llm_int8_enable_fp32_cpu_offload=True
                )
-            else:
-                quantization_config = None
+                print("8bit 양자화 적용")
+            except Exception as _:
+                # transformers/bitsandbytes 호환 문제 시 양자화 비활성화
+                print("bitsandbytes/transformers 호환 문제로 양자화를 비활성화합니다.")
+                quant_args = {}

-            # 최적의 device_map 계산
-            if max_memory:
-                print(f"GPU/CPU 메모리 분배 적용: {max_memory}")
-                with init_empty_weights():
-                    empty_model = AutoModelForCausalLM.from_config(
-                        AutoConfig.from_pretrained(self.model_path)
-                    )
-                device_map = infer_auto_device_map(
-                    empty_model,
-                    max_memory=max_memory,
-                    no_split_module_classes=["GPTNeoXLayer"]
-                )
-                print(f"계산된 device_map: {device_map}")
-            else:
-                device_map = "auto"
-
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+        # 1차 시도: device_map="auto" 로 로드
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_source)
            self.model = AutoModelForCausalLM.from_pretrained(
-                self.model_path,
-                quantization_config=quantization_config,
-                device_map=device_map,
-                torch_dtype="auto"
+                model_source,
+                device_map="auto",
+                torch_dtype=dtype if dtype is not None else None,
+                low_cpu_mem_usage=True,
+                **quant_args
+            )
+        except Exception as e1:
+            print(f"device_map=auto 로드 실패: {e1}\nCPU로 폴백합니다.")
+            # 2차 시도: CPU 강제 로드
+            self.tokenizer = AutoTokenizer.from_pretrained(model_source)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_source,
+                device_map={"": "cpu"},
+                torch_dtype=torch.float32,
+                low_cpu_mem_usage=False
            )

-            # 파이프라인 생성
-            pipe = pipeline(
-                "text-generation",
-                model=self.model,
-                tokenizer=self.tokenizer,
-                max_new_tokens=self.max_tokens,
-                temperature=self.temperature,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id
-            )
-
-            self.llm = HuggingFacePipeline(pipeline=pipe)
-            print("모델 로드 완료")
+        # 파이프라인 생성
+        pad_id = self.tokenizer.eos_token_id if getattr(self.tokenizer, 'eos_token_id', None) is not None else None
+        pipe = pipeline(
+            "text-generation",
+            model=self.model,
+            tokenizer=self.tokenizer,
+            max_new_tokens=self.max_tokens,
+            temperature=self.temperature,
+            do_sample=True,
+            pad_token_id=pad_id
+        )
+        self.llm = HuggingFacePipeline(pipeline=pipe)
+        print("모델 로드 완료")

+    # 간단 검색: DuckDuckGo HTML 결과 파싱 (외부 API 불필요)
+    def _search_urls(self, query: str, k: int = 5) -> List[str]:
+        import requests
+        from bs4 import BeautifulSoup
+        q = query.strip().replace(' ', '+')
+        url = f"https://duckduckgo.com/html/?q={q}"
+        headers = {"User-Agent": self.config['web_scraping']['user_agent']}
+        try:
+            r = requests.get(url, headers=headers, timeout=20)
+            r.raise_for_status()
+            soup = BeautifulSoup(r.text, 'html.parser')
+            links = []
+            for a in soup.select('a.result__a'):
+                href = a.get('href')
+                if href and href.startswith('http'):
+                    links.append(href)
+                if len(links) >= k:
+                    break
+            return links
        except Exception as e:
-            print(f"모델 로드 실패: {e}")
-            print("모델을 다운로드합니다...")
+            print(f"검색 실패({query}): {e}")
+            return []

-            # 모델 다운로드
-            from model_downloader import download_model as dl_model
-            success = dl_model(self.config_path.replace('config.json', ''))
+    def collect_information(self, topics: List[str]) -> List[Dict[str, str]]:
+        """
+        주제별로 웹 검색 → 스크래핑 → 요약 → 저장까지 수행
+        반환: [{ topic, response }]
+        """
+        results = []
+        for topic in topics:
+            urls = self._search_urls(topic, k=5)
+            collected = []
+            for u in urls[:5]:
+                data = self.web_scraper.scrape_website(u)
+                if data:
+                    collected.append(data)

-            if success[0] is None:
-                raise Exception("모델 다운로드 실패")
+            # 저장 (간단 저장 도구)
+            filename = f"{topic[:50].replace(' ', '_')}.json"
+            self.simple_saver.save_data_as_json(collected, filename)

-            # 다운로드 후 다시 로드 시도
+            # 간단 요약 생성
            try:
-                print("다운로드 완료, 모델 재로드 시도...")
-                from transformers import BitsAndBytesConfig
-                from accelerate import infer_auto_device_map, init_empty_weights
+                snippet = "\n\n".join([d.get('title', '') + ": " + d.get('description', '') for d in collected[:3]])
+                prompt = f"""
+                다음 자료를 간결히 요약하고 핵심 포인트 3가지를 bullet로 정리하세요.
+                주제: {topic}

-                if use_quantization:
-                    quantization_config = BitsAndBytesConfig(
-                        load_in_8bit=True,
-                        llm_int8_enable_fp32_cpu_offload=True
-                    )
-                else:
-                    quantization_config = None
+                자료:
+                {snippet}
+                """
+                summary = self.llm(prompt)
+            except Exception as e:
+                summary = f"요약 실패: {e}"

-                if max_memory:
-                    with init_empty_weights():
-                        empty_model = AutoModelForCausalLM.from_config(
-                            AutoConfig.from_pretrained(self.model_path)
-                        )
-                    device_map = infer_auto_device_map(
-                        empty_model,
-                        max_memory=max_memory,
-                        no_split_module_classes=["GPTNeoXLayer"]
-                    )
-                else:
-                    device_map = "auto"
+            results.append({"topic": topic, "response": summary})

-                self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    self.model_path,
-                    quantization_config=quantization_config,
-                    device_map=device_map,
-                    torch_dtype="auto"
-                )
-
-                pipe = pipeline(
-                    "text-generation",
-                    model=self.model,
-                    tokenizer=self.tokenizer,
-                    max_new_tokens=self.max_tokens,
-                    temperature=self.temperature,
-                    do_sample=True,
-                    pad_token_id=self.tokenizer.eos_token_id
-                )
-
-                self.llm = HuggingFacePipeline(pipeline=pipe)
-                print("모델 로드 완료")
-
-            except Exception as e2:
-                print(f"모델 재로드 실패: {e2}")
-                raise Exception("모델 로드에 실패했습니다")
+        return results

    def scrape_web(self, url):
        """
--- a/AI_Web_Scraper/main.py
+++ b/AI_Web_Scraper/main.py
@@ -15,18 +15,27 @@ def main():

    print("AI 웹 정보 수집 시스템 시작")

-    # 1. 모델 다운로드 (필요한 경우)
+    # 1. 설정 파일 로드
+    with open(args.config, 'r') as f:
+        config = json.load(f)
+
+    # 2. 모델 다운로드 (필요한 경우)
    print("모델 확인 중...")
    model, tokenizer = download_model(args.config)

-    if model is None:
+    # 모델 로딩은 AIAgent에서 수행하므로, 다운로드만 성공해도 계속 진행
+    # 로컬 경로에 스냅샷 파일 유무로 성공 여부 확인
+    local_model_path = config.get('model_local_path', './models/model')
+    has_files = False
+    try:
+        has_files = any(True for _ in __import__('os').scandir(local_model_path))
+    except Exception:
+        has_files = False
+
+    if (model is None and not has_files):
        print("모델 다운로드 실패. 프로그램을 종료합니다.")
        sys.exit(1)

-    # 2. 설정 파일 로드 및 수정
-    with open(args.config, 'r') as f:
-        config = json.load(f)
-
    if args.save_path:
        config['data_storage']['drive_mount_path'] = args.save_path
        # 수정된 config 저장
--- a/AI_Web_Scraper/model_downloader.py
+++ b/AI_Web_Scraper/model_downloader.py
@@ -1,54 +1,51 @@
 import os
 import json
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-from huggingface_hub import HfApi
+from typing import Tuple, Optional
+from transformers import AutoTokenizer
+from huggingface_hub import snapshot_download

-def download_model(config_path='./config.json'):
+def download_model(config_path: str = './config.json') -> Tuple[Optional[object], Optional[AutoTokenizer]]:
    """
-    Hugging Face에서 모델을 다운로드합니다.
+    Hugging Face에서 모델 아티팩트만 안전하게 다운로드합니다.
+    - 모델 로딩(메모리에 적재) 없이 파일만 받도록 snapshot_download 사용
+    - Colab에서 대형 모델의 초기 로딩 문제(양자화/가속기 버전 차이 등)를 회피
+
+    반환: (model, tokenizer)
+      - 이 함수는 모델을 메모리에 로드하지 않으므로 model은 항상 None을 반환합니다.
+      - tokenizer는 로컬 경로에서 로드에 성공하면 반환, 실패 시 None
    """
    with open(config_path, 'r') as f:
        config = json.load(f)

    model_name = config['model_name']
    local_path = config['model_local_path']
-    model_settings = config.get('model_settings', {})
-    use_quantization = model_settings.get('use_quantization', False)
-
-    if not os.path.exists(local_path):
-        os.makedirs(local_path)

+    os.makedirs(local_path, exist_ok=True)
    print(f"모델 {model_name}을 {local_path}에 다운로드 중...")

    try:
-        # 양자화 설정 적용
-        if use_quantization:
-            print("8bit 양자화 적용")
-            quantization_config = BitsAndBytesConfig(
-                load_in_8bit=True,
-                llm_int8_enable_fp32_cpu_offload=True
-            )
-        else:
-            quantization_config = None
+        # 인증 토큰(필요 시) 지원: 환경변수 HF_TOKEN 사용
+        hf_token = os.environ.get('HF_TOKEN', None)

-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            cache_dir=local_path,
-            quantization_config=quantization_config,
-            device_map="auto" if quantization_config else "cpu",  # 양자화 시 auto, 아니면 cpu
-            torch_dtype="auto"
-        )
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_name,
-            cache_dir=local_path
+        # 전체 리포 스냅샷을 로컬 디렉토리에 동기화
+        snapshot_download(
+            repo_id=model_name,
+            local_dir=local_path,
+            local_dir_use_symlinks=False,
+            resume_download=True,
+            token=hf_token
        )

-        # 모델과 토크나이저 저장
-        model.save_pretrained(local_path)
-        tokenizer.save_pretrained(local_path)
+        # 토크나이저 로드 가능 여부만 확인 (모델은 나중에 로드)
+        tokenizer = None
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(local_path)
+        except Exception:
+            # 토크나이저 파일이 없을 수도 있으므로 경고만 출력
+            print("토크나이저 확인 실패(계속 진행): 로컬 경로에 tokenizer 파일이 없을 수 있습니다.")

        print(f"모델 다운로드 완료: {local_path}")
-        return model, tokenizer
+        return None, tokenizer
    except Exception as e:
        print(f"모델 다운로드 실패: {e}")
        return None, None
--- a/AI_Web_Scraper/run_guide.md
+++ b/AI_Web_Scraper/run_guide.md
@@ -50,6 +50,11 @@ drive.mount('/content/drive')
 !pip install -r requirements.txt
 ```

+최신 버전으로 업데이트(권장):
+```bash
+!pip install -U transformers accelerate bitsandbytes huggingface-hub
+```
+
 ### 2.2 설정 파일 수정 (선택사항)
 기본적으로 데이터는 `/content/drive/MyDrive/AI_Data`에 저장됩니다. 다른 경로를 원하시면 `config.json`의 `drive_mount_path`를 수정하세요:

@@ -92,6 +97,17 @@ python main.py --save-path "/content/drive/MyDrive/MyCustomFolder"
 ```
 이렇게 하면 데이터를 지정한 폴더에 저장합니다.

+### 3.6 Hugging Face 토큰(필요 시)
+일부 모델(비공개 또는 접근 승인이 필요한 모델)은 접근 토큰이 필요할 수 있습니다. `model_downloader.py`는 다운로드 시 환경 변수 `HF_TOKEN`을 읽어 사용하므로, 필요하다면 `main.py`를 실행하기 전에 다음과 같이 설정하세요.
+```bash
+export HF_TOKEN=hf_********************************
+```
+Colab 셀에서 설정 예:
+```python
+import os
+os.environ["HF_TOKEN"] = "hf_********************************"
+```
+
 ## 4. 실행 과정 설명

 1. **모델 다운로드**: Hugging Face에서 `jxm/gpt-oss-20b-base` 모델을 다운로드
@@ -119,6 +135,8 @@ python main.py --save-path "/content/drive/MyDrive/MyCustomFolder"
 ### 6.1 모델 다운로드 실패
 - Colab의 디스크 공간 확인
 - 모델 크기가 크므로 충분한 공간 확보
+- 모델 접근 권한(토큰) 필요 여부 확인: 필요 시 `HF_TOKEN` 설정
+- 네트워크 일시 오류일 수 있으므로 먼저 같은 명령을 다시 실행해 보세요(중단된 다운로드는 이어받습니다). 계속 실패하면 런타임 재시작 후 재시도

 ### 6.2 메모리 부족 오류 해결
 모델이 클 경우 GPU 메모리가 부족할 수 있습니다. 다음 방법으로 해결하세요:
@@ -157,6 +175,7 @@ python main.py --save-path "/content/drive/MyDrive/MyCustomFolder"
  "model_name": "microsoft/DialoGPT-medium"
 }
 ```
+또는 양자화된/경량화된 공개 모델을 사용하면 메모리 요구량이 크게 줄어듭니다.

 ## 7. 확장 및 커스터마이징

@@ -175,6 +194,14 @@ python main.py --save-path "/content/drive/MyDrive/MyCustomFolder"
 - Google Drive API 사용량 제한에 유의하세요.
 - 대량의 데이터를 수집할 경우 Colab 세션 시간 제한을 고려하세요.
 - 개인정보 보호 및 저작권을 준수하세요.
+- 일부 웹사이트는 자동화된 크롤링을 차단할 수 있습니다. 과도한 요청을 피하고 robots.txt를 준수하세요.
+- Selenium 실행에 문제가 있으면 Chrome/ChromeDriver 설치 상태를 확인하세요.
+
+## 10. 참고 사항 (동작 방식)
+
+- 모델 다운로드는 메모리에 올리지 않고 Hugging Face의 `snapshot_download`로 파일만 동기화합니다.
+- 실제 모델 로딩은 실행 시점에 `AIAgent`가 수행합니다. 먼저 GPU/CPU 자동 분산(`device_map="auto"`)으로 로드를 시도하고, 실패하면 양자화 없이 CPU(float32)로 폴백하므로 이 경우 메모리 사용량과 실행 시간이 크게 늘어날 수 있습니다.
+- 간단한 웹 검색은 DuckDuckGo HTML 결과를 파싱하여 링크를 추출합니다(외부 유료 API 불필요).

 ## 9. 지원