feat: 모델 설정에 양자화 및 메모리 제한 구성 추가

2025-08-28 10:43:17 +09:00
parent 8b4e5bb29c
commit 1eb2347886
4 changed files with 156 additions and 11 deletions
--- a/AI_Web_Scraper/ai_agent.py
+++ b/AI_Web_Scraper/ai_agent.py
@@ -1,7 +1,7 @@
 import json
 import os
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-from langchain.llms import HuggingFacePipeline
+from langchain_community.llms import HuggingFacePipeline
 from langchain.agents import initialize_agent, AgentType
 from langchain.tools import Tool
 from langchain.memory import ConversationBufferMemory
@@ -61,15 +61,61 @@ class AIAgent:
    def load_model(self):
        """
-        Hugging Face 모델을 로드합니다.
+        Hugging Face 모델을 로드합니다. 없으면 다운로드 후 로드.
        GPU와 CPU 메모리를 함께 활용.
        """
        import os
        # GPU 메모리 최적화 설정
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
        try:
-            print(f"모델 로드 중: {self.model_path}")
+            print(f"모델 로드 시도: {self.model_path}")
            # 모델 로드 시도
            from transformers import BitsAndBytesConfig
            from accelerate import infer_auto_device_map, init_empty_weights
            model_settings = self.config.get('model_settings', {})
            use_quantization = model_settings.get('use_quantization', False)
            max_memory_config = model_settings.get('max_memory', {})
            # 메모리 제한 설정
            max_memory = {}
            if 'gpu' in max_memory_config:
                max_memory[0] = max_memory_config['gpu']
            if 'cpu' in max_memory_config:
                max_memory['cpu'] = max_memory_config['cpu']
            if use_quantization:
                print("8bit 양자화 적용")
                quantization_config = BitsAndBytesConfig(
                    load_in_8bit=True,
                    llm_int8_enable_fp32_cpu_offload=True
                )
            else:
                quantization_config = None
            # 최적의 device_map 계산
            if max_memory:
                print(f"GPU/CPU 메모리 분배 적용: {max_memory}")
                with init_empty_weights():
                    empty_model = AutoModelForCausalLM.from_config(
                        AutoConfig.from_pretrained(self.model_path)
                    )
                device_map = infer_auto_device_map(
                    empty_model,
                    max_memory=max_memory,
                    no_split_module_classes=["GPTNeoXLayer"]
                )
                print(f"계산된 device_map: {device_map}")
            else:
                device_map = "auto"
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
-                device_map="auto",
+                quantization_config=quantization_config,
                device_map=device_map,
                torch_dtype="auto"
            )
@@ -89,7 +135,66 @@ class AIAgent:
        except Exception as e:
            print(f"모델 로드 실패: {e}")
-            raise
+            print("모델을 다운로드합니다...")
            # 모델 다운로드
            from model_downloader import download_model as dl_model
            success = dl_model(self.config_path.replace('config.json', ''))
            if success[0] is None:
                raise Exception("모델 다운로드 실패")
            # 다운로드 후 다시 로드 시도
            try:
                print("다운로드 완료, 모델 재로드 시도...")
                from transformers import BitsAndBytesConfig
                from accelerate import infer_auto_device_map, init_empty_weights
                if use_quantization:
                    quantization_config = BitsAndBytesConfig(
                        load_in_8bit=True,
                        llm_int8_enable_fp32_cpu_offload=True
                    )
                else:
                    quantization_config = None
                if max_memory:
                    with init_empty_weights():
                        empty_model = AutoModelForCausalLM.from_config(
                            AutoConfig.from_pretrained(self.model_path)
                        )
                    device_map = infer_auto_device_map(
                        empty_model,
                        max_memory=max_memory,
                        no_split_module_classes=["GPTNeoXLayer"]
                    )
                else:
                    device_map = "auto"
                self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_path,
                    quantization_config=quantization_config,
                    device_map=device_map,
                    torch_dtype="auto"
                )
                pipe = pipeline(
                    "text-generation",
                    model=self.model,
                    tokenizer=self.tokenizer,
                    max_new_tokens=self.max_tokens,
                    temperature=self.temperature,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )
                self.llm = HuggingFacePipeline(pipeline=pipe)
                print("모델 로드 완료")
            except Exception as e2:
                print(f"모델 재로드 실패: {e2}")
                raise Exception("모델 로드에 실패했습니다")
    def scrape_web(self, url):
        """
--- a/AI_Web_Scraper/config.json
+++ b/AI_Web_Scraper/config.json
@@ -14,5 +14,14 @@
    "local_storage_path": "./collected_data",
    "file_format": "json",
    "drive_mount_path": "/content/drive/MyDrive/model_Dev/data"
  },
  "model_settings": {
    "use_quantization": true,
    "quantization_bits": 8,
    "torch_dtype": "auto",
    "max_memory": {
      "gpu": "20GB",
      "cpu": "60GB"
    }
  }
 }
--- a/AI_Web_Scraper/requirements.txt
+++ b/AI_Web_Scraper/requirements.txt
@@ -13,3 +13,4 @@ langchain-community>=0.0.20
 huggingface-hub>=0.15.0
 pandas>=1.3.0
 openpyxl>=3.0.0
 bitsandbytes>=0.41.0
--- a/AI_Web_Scraper/run_guide.md
+++ b/AI_Web_Scraper/run_guide.md
@@ -120,13 +120,43 @@ python main.py --save-path "/content/drive/MyDrive/MyCustomFolder"
 - Colab의 디스크 공간 확인
 - 모델 크기가 크므로 충분한 공간 확보
-### 6.2 Google Drive 마운트 실패
+### 6.2 메모리 부족 오류 해결
- 브라우저 팝업에서 권한 허용을 확인
+모델이 클 경우 GPU 메모리가 부족할 수 있습니다. 다음 방법으로 해결하세요:
 - 마운트 코드 재실행: `drive.mount('/content/drive', force_remount=True)`
 - `/content/drive/MyDrive` 경로가 존재하는지 확인
-### 6.3 메모리 부족 오류
+#### 6.2.1 GPU/CPU 메모리 공동 활용
- 배치 크기 조정 또는 더 작은 모델 사용 고려
+시스템이 `config.json`의 `model_settings.max_memory` 설정값(기본: GPU 20GB, CPU 60GB)에 따라 GPU와 CPU 메모리를 함께 사용하여 모델을 자동으로 분산 적재합니다.
 #### 6.2.2 메모리 설정 커스터마이징
 `config.json`에서 메모리 할당을 조정할 수 있습니다:
 ```json
 {
  "model_settings": {
    "max_memory": {
      "gpu": "25GB",
      "cpu": "50GB"
    }
  }
 }
 ```
 #### 6.2.3 양자화 수동 설정
 `config.json`에서 양자화 설정을 조정할 수 있습니다:
 ```json
 {
  "model_settings": {
    "use_quantization": true,
    "quantization_bits": 8
  }
 }
 ```
 #### 6.2.4 더 작은 모델 사용
 메모리가 여전히 부족하다면 `config.json`에서 모델을 더 작은 것으로 변경:
 ```json
 {
  "model_name": "microsoft/DialoGPT-medium"
 }
 ```
 ## 7. 확장 및 커스터마이징