feat: 모델 설정에 양자화 및 메모리 제한 구성 추가

2025-08-28 10:43:17 +09:00
parent 8b4e5bb29c
commit 1eb2347886
4 changed files with 156 additions and 11 deletions
--- a/AI_Web_Scraper/ai_agent.py
+++ b/AI_Web_Scraper/ai_agent.py
@@ -1,7 +1,7 @@
 import json
 import os
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-from langchain.llms import HuggingFacePipeline
+from langchain_community.llms import HuggingFacePipeline
 from langchain.agents import initialize_agent, AgentType
 from langchain.tools import Tool
 from langchain.memory import ConversationBufferMemory
@@ -61,15 +61,61 @@ class AIAgent:

    def load_model(self):
        """
-        Hugging Face 모델을 로드합니다.
+        Hugging Face 모델을 로드합니다. 없으면 다운로드 후 로드.
+        GPU와 CPU 메모리를 함께 활용.
        """
+        import os
+        # GPU 메모리 최적화 설정
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
        try:
-            print(f"모델 로드 중: {self.model_path}")
+            print(f"모델 로드 시도: {self.model_path}")
+
+            # 모델 로드 시도
+            from transformers import BitsAndBytesConfig
+            from accelerate import infer_auto_device_map, init_empty_weights
+
+            model_settings = self.config.get('model_settings', {})
+            use_quantization = model_settings.get('use_quantization', False)
+            max_memory_config = model_settings.get('max_memory', {})
+
+            # 메모리 제한 설정
+            max_memory = {}
+            if 'gpu' in max_memory_config:
+                max_memory[0] = max_memory_config['gpu']
+            if 'cpu' in max_memory_config:
+                max_memory['cpu'] = max_memory_config['cpu']
+
+            if use_quantization:
+                print("8bit 양자화 적용")
+                quantization_config = BitsAndBytesConfig(
+                    load_in_8bit=True,
+                    llm_int8_enable_fp32_cpu_offload=True
+                )
+            else:
+                quantization_config = None
+
+            # 최적의 device_map 계산
+            if max_memory:
+                print(f"GPU/CPU 메모리 분배 적용: {max_memory}")
+                with init_empty_weights():
+                    empty_model = AutoModelForCausalLM.from_config(
+                        AutoConfig.from_pretrained(self.model_path)
+                    )
+                device_map = infer_auto_device_map(
+                    empty_model,
+                    max_memory=max_memory,
+                    no_split_module_classes=["GPTNeoXLayer"]
+                )
+                print(f"계산된 device_map: {device_map}")
+            else:
+                device_map = "auto"

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
-                device_map="auto",
+                quantization_config=quantization_config,
+                device_map=device_map,
                torch_dtype="auto"
            )

@@ -89,7 +135,66 @@ class AIAgent:

        except Exception as e:
            print(f"모델 로드 실패: {e}")
-            raise
+            print("모델을 다운로드합니다...")
+
+            # 모델 다운로드
+            from model_downloader import download_model as dl_model
+            success = dl_model(self.config_path.replace('config.json', ''))
+
+            if success[0] is None:
+                raise Exception("모델 다운로드 실패")
+
+            # 다운로드 후 다시 로드 시도
+            try:
+                print("다운로드 완료, 모델 재로드 시도...")
+                from transformers import BitsAndBytesConfig
+                from accelerate import infer_auto_device_map, init_empty_weights
+
+                if use_quantization:
+                    quantization_config = BitsAndBytesConfig(
+                        load_in_8bit=True,
+                        llm_int8_enable_fp32_cpu_offload=True
+                    )
+                else:
+                    quantization_config = None
+
+                if max_memory:
+                    with init_empty_weights():
+                        empty_model = AutoModelForCausalLM.from_config(
+                            AutoConfig.from_pretrained(self.model_path)
+                        )
+                    device_map = infer_auto_device_map(
+                        empty_model,
+                        max_memory=max_memory,
+                        no_split_module_classes=["GPTNeoXLayer"]
+                    )
+                else:
+                    device_map = "auto"
+
+                self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_path,
+                    quantization_config=quantization_config,
+                    device_map=device_map,
+                    torch_dtype="auto"
+                )
+
+                pipe = pipeline(
+                    "text-generation",
+                    model=self.model,
+                    tokenizer=self.tokenizer,
+                    max_new_tokens=self.max_tokens,
+                    temperature=self.temperature,
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.eos_token_id
+                )
+
+                self.llm = HuggingFacePipeline(pipeline=pipe)
+                print("모델 로드 완료")
+
+            except Exception as e2:
+                print(f"모델 재로드 실패: {e2}")
+                raise Exception("모델 로드에 실패했습니다")

    def scrape_web(self, url):
        """