From 59d213ab4a4b5c9ae1b8b87f42a6fbb227d793e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EB=B0=95=EC=83=81=ED=98=B8=20Sangho=20Park?=
 <ps040211@gmail.com>
Date: Thu, 28 Aug 2025 11:40:12 +0900
Subject: [PATCH] =?UTF-8?q?feat:=20=EC=85=80=EB=A0=88=EB=8B=88=EC=9B=80=20?=
 =?UTF-8?q?=EC=84=A0=ED=83=9D=EC=A0=81=20=EC=82=AC=EC=9A=A9=20=EB=B0=8F=20?=
 =?UTF-8?q?=ED=8F=B4=EB=B0=B1=20=EB=A9=94=EC=BB=A4=EB=8B=88=EC=A6=98=20?=
 =?UTF-8?q?=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 AI_Web_Scraper/ai_agent.py    |  75 ++++++++++++++---------
 AI_Web_Scraper/config.json    |   6 +-
 AI_Web_Scraper/run_guide.md   |  19 +++++-
 AI_Web_Scraper/web_scraper.py | 109 +++++++++++++++++++++++-----------
 4 files changed, 144 insertions(+), 65 deletions(-)

diff --git a/AI_Web_Scraper/ai_agent.py b/AI_Web_Scraper/ai_agent.py
index 5e9e598..986ed02 100644
--- a/AI_Web_Scraper/ai_agent.py
+++ b/AI_Web_Scraper/ai_agent.py
@@ -95,6 +95,12 @@ class AIAgent:
 
         model_settings = self.config.get('model_settings', {})
         use_quantization = bool(model_settings.get('use_quantization', False))
+        # 양자화 비트/오프로딩 옵션
+        try:
+            quant_bits = int(model_settings.get('quantization_bits', 8))
+        except Exception:
+            quant_bits = 8
+        cpu_offload = bool(model_settings.get('cpu_offload', False))
         torch_dtype_cfg = str(model_settings.get('torch_dtype', 'auto')).lower()
 
         # dtype 파싱
@@ -114,20 +120,7 @@ class AIAgent:
         if not model_source:
             raise RuntimeError("모델 경로/이름이 설정되지 않았습니다.")
 
-        # quantization 설정 (가능한 경우에만)
-        quant_args = {}
-        if use_quantization:
-            try:
-                from transformers import BitsAndBytesConfig
-                quant_args["quantization_config"] = BitsAndBytesConfig(
-                    load_in_8bit=True,
-                    llm_int8_enable_fp32_cpu_offload=True
-                )
-                print("8bit 양자화 적용")
-            except Exception as _:
-                # transformers/bitsandbytes 호환 문제 시 양자화 비활성화
-                print("bitsandbytes/transformers 호환 문제로 양자화를 비활성화합니다.")
-                quant_args = {}
+        # (이전) quant_args 경로 제거: load_kwargs에서 직접 처리
 
         # 메모리 제한/오프로딩 설정
         mm_cfg = model_settings.get('max_memory', {}) if isinstance(model_settings.get('max_memory', {}), dict) else {}
@@ -167,11 +160,28 @@ class AIAgent:
             if max_memory:
                 load_kwargs["max_memory"] = max_memory
 
-            # use_quantization=True면 8bit 우선 시도 (항상 레거시 플래그 사용)
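+            # When use_quantization is true: quantization_bits == 4 selects 4-bit NF4 (falling back to 8-bit if that setup fails); any other value uses the legacy 8-bit flags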
             if use_quantization:
-                load_kwargs["load_in_8bit"] = True
-                load_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
-                print("8bit 양자화 적용 (레거시 플래그)")
+                if quant_bits == 4:
+                    try:
+                        from transformers import BitsAndBytesConfig
+                        load_kwargs["quantization_config"] = BitsAndBytesConfig(
+                            load_in_4bit=True,
+                            bnb_4bit_quant_type="nf4",
+                            bnb_4bit_use_double_quant=True,
+                            bnb_4bit_compute_dtype=__import__('torch').bfloat16
+                        )
+                        print("4bit 양자화 적용 (bnb nf4)")
+                    except Exception as _:
+                        load_kwargs["load_in_8bit"] = True
+                        if cpu_offload:
+                            load_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
+                        print("4bit 미지원 → 8bit(레거시)로 폴백")
+                else:
+                    load_kwargs["load_in_8bit"] = True
+                    if cpu_offload:
+                        load_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
+                    print("8bit 양자화 적용 (레거시 플래그)")
 
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_source,
@@ -206,11 +216,11 @@ class AIAgent:
             except Exception as e_noq:
                 print(f"비양자화 재시도 실패: {e_noq}")
 
-                # 2b. 8-bit 양자화로 재시도 (가능 시)
+                # 2b. Retry with quantization (4-bit when quantization_bits == 4, otherwise 8-bit)
+                loaded = False
                 try:
-                    print("8bit 양자화로 재시도합니다...")
+                    print("양자화로 재시도합니다...")
                     self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
-                    # config 재생성 및 quantization_config 제거
                     cfg = AutoConfig.from_pretrained(model_source, trust_remote_code=True)
                     if hasattr(cfg, 'quantization_config'):
                         try:
@@ -224,20 +234,31 @@ class AIAgent:
                         offload_state_dict=True,
                         trust_remote_code=True,
                         config=cfg,
-                        load_in_8bit=True,
-                        llm_int8_enable_fp32_cpu_offload=True,
                     )
                     if dtype is not None:
                         retry_kwargs["torch_dtype"] = dtype
                     if max_memory:
                         retry_kwargs["max_memory"] = max_memory
+                    if quant_bits == 4:
+                        from transformers import BitsAndBytesConfig
+                        retry_kwargs["quantization_config"] = BitsAndBytesConfig(
+                            load_in_4bit=True,
+                            bnb_4bit_quant_type="nf4",
+                            bnb_4bit_use_double_quant=True,
+                            bnb_4bit_compute_dtype=__import__('torch').bfloat16
+                        )
+                    else:
+                        retry_kwargs["load_in_8bit"] = True
+                        if cpu_offload:
+                            retry_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
 
                     self.model = AutoModelForCausalLM.from_pretrained(model_source, **retry_kwargs)
-                except Exception as e_int8:
-                    print(f"8bit 재시도 실패: {e_int8}")
+                    loaded = True
+                except Exception as e_q:
+                    print(f"양자화 재시도 실패: {e_q}")
 
-            if not tried_int8:
-                print("CPU로 폴백합니다.")
+                if not loaded:
+                    print("CPU로 폴백합니다.")
                 try:
                     import torch, gc
                     torch.cuda.empty_cache()
diff --git a/AI_Web_Scraper/config.json b/AI_Web_Scraper/config.json
index e794cce..11510cb 100644
--- a/AI_Web_Scraper/config.json
+++ b/AI_Web_Scraper/config.json
@@ -8,7 +8,8 @@
   "web_scraping": {
     "max_pages": 100,
     "delay_between_requests": 2,
-    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+    "use_selenium": false
   },
   "data_storage": {
     "local_storage_path": "./collected_data",
@@ -17,8 +18,9 @@
   },
   "model_settings": {
     "use_quantization": true,
-    "quantization_bits": 8,
+    "quantization_bits": 4,
     "torch_dtype": "auto",
+    "cpu_offload": false,
     "max_memory": {
       "gpu": "20GB",
       "cpu": "60GB"
diff --git a/AI_Web_Scraper/run_guide.md b/AI_Web_Scraper/run_guide.md
index 1b1a669..877a4f5 100644
--- a/AI_Web_Scraper/run_guide.md
+++ b/AI_Web_Scraper/run_guide.md
@@ -68,6 +68,15 @@ drive.mount('/content/drive')
 
 또는 실행 시 `--save-path` 옵션으로 지정할 수 있습니다.
 
+웹 스크래핑은 기본으로 Requests+BeautifulSoup 모드로 동작합니다(`use_selenium=false`).
+Selenium을 사용하려면 `web_scraping.use_selenium`을 `true`로 바꾸고, Colab에 Chrome/ChromeDriver를 설치해야 합니다:
+
+```bash
+sudo apt-get update && sudo apt-get install -y google-chrome-stable || true
+pip install selenium webdriver-manager
+```
+설치가 어려우면 기본 Requests 모드를 유지하세요.
+
 ## 3. 시스템 실행
 
 ### 3.1 기본 실행 (AI가 스스로 주제 선정)
@@ -110,7 +119,7 @@ os.environ["HF_TOKEN"] = "hf_********************************"
 
 ## 4. 실행 과정 설명
 
-1. **모델 다운로드**: Hugging Face에서 `jxm/gpt-oss-20b-base` 모델을 다운로드
+1. **모델 다운로드**: Hugging Face에서 `jxm/gpt-oss-20b-base` 모델 파일을 동기화(snapshot)
 2. **AI 에이전트 초기화**: 모델을 로드하고 도구들을 설정
 3. **정보 수집**: 각 주제에 대해 AI가 스스로 웹을 탐색하며 정보 수집
 4. **데이터 저장**: 수집된 데이터를 마운트된 Google Drive의 지정된 폴더에 자동 저장
@@ -138,6 +147,14 @@ os.environ["HF_TOKEN"] = "hf_********************************"
 - 모델 접근 권한(토큰) 필요 여부 확인: 필요 시 `HF_TOKEN` 설정
 - 네트워크 일시 오류일 수 있으므로 런타임 재시작 후 재시도
 
+### 6.1.1 모델 로딩 시 GPU 사용이 0%로 보이는 경우
+- 기본 설정은 4bit 양자화와 `max_memory`(GPU/CPU) 제한을 사용하며, `cpu_offload`는 기본값이 `false`입니다. 로딩 초기에는 RAM이 먼저 오르고 GPU 사용이 0%일 수 있습니다.
+- 실행 중에도 GPU가 계속 0%라면 bitsandbytes가 GPU 커널을 잡지 못한 것입니다. 아래를 확인하세요:
+  - `pip install -U transformers accelerate bitsandbytes`
+  - `import torch, bitsandbytes as bnb; print(torch.cuda.is_available())`
+  - `python -m bitsandbytes` (bitsandbytes 내장 진단 실행: CUDA 설정 및 라이브러리 탐지 결과 출력)
+- 여전히 문제가 있으면 `model_settings.max_memory.gpu`를 소폭 올리거나(예: 24GB), `cpu_offload`를 false로 유지하세요.
+
 ### 6.2 메모리 부족 오류 해결
 모델이 클 경우 GPU 메모리가 부족할 수 있습니다. 다음 방법으로 해결하세요:
 
diff --git a/AI_Web_Scraper/web_scraper.py b/AI_Web_Scraper/web_scraper.py
index 57daa28..0f1c152 100644
--- a/AI_Web_Scraper/web_scraper.py
+++ b/AI_Web_Scraper/web_scraper.py
@@ -3,63 +3,92 @@ from bs4 import BeautifulSoup
 import json
 import time
 import os
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from webdriver_manager.chrome import ChromeDriverManager
-from selenium.webdriver.chrome.service import Service
+
+try:
+    from selenium import webdriver
+    from selenium.webdriver.chrome.options import Options
+    from selenium.webdriver.common.by import By
+    from webdriver_manager.chrome import ChromeDriverManager
+    from selenium.webdriver.chrome.service import Service
+    _SELENIUM_AVAILABLE = True
+except Exception:
+    _SELENIUM_AVAILABLE = False
 
 class WebScraper:
     def __init__(self, config_path='./config.json'):
         with open(config_path, 'r') as f:
             self.config = json.load(f)
 
-        self.max_pages = self.config['web_scraping']['max_pages']
-        self.delay = self.config['web_scraping']['delay_between_requests']
-        self.user_agent = self.config['web_scraping']['user_agent']
+        ws_conf = self.config.get('web_scraping', {})
+        self.max_pages = ws_conf.get('max_pages', 100)
+        self.delay = ws_conf.get('delay_between_requests', 2)
+        self.user_agent = ws_conf.get('user_agent', 'Mozilla/5.0')
+        self.use_selenium = bool(ws_conf.get('use_selenium', False))
 
-        # Selenium 설정
-        chrome_options = Options()
-        chrome_options.add_argument("--headless")  # Colab에서는 headless 모드
-        chrome_options.add_argument("--no-sandbox")
-        chrome_options.add_argument("--disable-dev-shm-usage")
-        chrome_options.add_argument(f"user-agent={self.user_agent}")
+        self.driver = None
+        if self.use_selenium and _SELENIUM_AVAILABLE:
+            try:
+                chrome_options = Options()
+                chrome_options.add_argument("--headless=new")
+                chrome_options.add_argument("--no-sandbox")
+                chrome_options.add_argument("--disable-dev-shm-usage")
+                chrome_options.add_argument(f"user-agent={self.user_agent}")
 
-        self.driver = webdriver.Chrome(
-            service=Service(ChromeDriverManager().install()),
-            options=chrome_options
-        )
+                # Chrome 바이너리 탐색 (Colab/리눅스 일반 경로)
+                chrome_bin_candidates = [
+                    os.environ.get('GOOGLE_CHROME_BIN'),
+                    os.environ.get('CHROME_BIN'),
+                    '/usr/bin/google-chrome',
+                    '/usr/bin/chromium-browser',
+                    '/usr/bin/chromium'
+                ]
+                chrome_bin = next((p for p in chrome_bin_candidates if p and os.path.exists(p)), None)
+                if chrome_bin:
+                    chrome_options.binary_location = chrome_bin
+
+                self.driver = webdriver.Chrome(
+                    service=Service(ChromeDriverManager().install()),
+                    options=chrome_options
+                )
+                print("Selenium 모드 활성화")
+            except Exception as e:
+                print(f"Selenium 초기화 실패, Requests 모드로 폴백: {e}")
+                self.driver = None
+                self.use_selenium = False
+        else:
+            if self.use_selenium and not _SELENIUM_AVAILABLE:
+                print("Selenium 패키지 미설치, Requests 모드로 폴백합니다.")
+            self.use_selenium = False
 
     def scrape_website(self, url, keywords=None):
         """
         웹사이트에서 정보를 수집합니다.
         """
         try:
-            self.driver.get(url)
-            time.sleep(self.delay)
+            if self.use_selenium and self.driver is not None:
+                self.driver.get(url)
+                time.sleep(self.delay)
+                page_source = self.driver.page_source
+            else:
+                headers = {"User-Agent": self.user_agent}
+                resp = requests.get(url, headers=headers, timeout=20)
+                resp.raise_for_status()
+                page_source = resp.text
 
-            # 페이지 내용 추출
-            page_source = self.driver.page_source
             soup = BeautifulSoup(page_source, 'html.parser')
-
-            # 텍스트 내용 추출
             text_content = soup.get_text(separator=' ', strip=True)
-
-            # 메타데이터 추출
             title = soup.title.string if soup.title else "No Title"
             meta_description = soup.find('meta', attrs={'name': 'description'})
-            description = meta_description['content'] if meta_description else "No Description"
+            description = meta_description['content'] if (meta_description and meta_description.has_attr('content')) else "No Description"
 
             data = {
                 'url': url,
                 'title': title,
                 'description': description,
-                'content': text_content[:5000],  # 내용 제한
+                'content': text_content[:5000],
                 'timestamp': time.time()
             }
-
             return data
-
         except Exception as e:
             print(f"스크래핑 실패: {url} - {e}")
             return None
@@ -83,16 +112,25 @@ class WebScraper:
 
                 # 추가 링크 찾기 (단순히 현재 페이지의 링크들)
                 try:
-                    links = self.driver.find_elements(By.TAG_NAME, "a")
-                    for link in links[:10]:  # 최대 10개 링크만
-                        href = link.get_attribute("href")
+                    if self.use_selenium and self.driver is not None:
+                        links = self.driver.find_elements(By.TAG_NAME, "a")
+                        hrefs = [link.get_attribute("href") for link in links[:20]]
+                    else:
+                        # Requests 모드일 때는 현재 페이지를 다시 받아서 링크 파싱
+                        headers = {"User-Agent": self.user_agent}
+                        resp = requests.get(url, headers=headers, timeout=20)
+                        resp.raise_for_status()
+                        soup = BeautifulSoup(resp.text, 'html.parser')
+                        hrefs = [a.get('href') for a in soup.find_all('a', href=True)][:20]
+
+                    for href in hrefs:
                         if href and href.startswith("http") and href not in visited_urls:
                             if len(collected_data) < self.max_pages:
                                 data = self.scrape_website(href, keywords)
                                 if data:
                                     collected_data.append(data)
                                     visited_urls.add(href)
-                except:
+                except Exception:
                     pass
 
         return collected_data
@@ -112,7 +150,8 @@ class WebScraper:
         print(f"데이터 저장 완료: {filepath}")
 
     def close(self):
-        self.driver.quit()
+        if self.driver is not None:
+            self.driver.quit()
 
 if __name__ == "__main__":
     scraper = WebScraper()