feat: enable quantization settings to improve model performance

2025-08-28 11:09:55 +09:00
parent 0206e42780
commit f67f9a18aa
2 changed files with 95 additions and 16 deletions


@@ -106,25 +106,104 @@ class AIAgent:
print("bitsandbytes/transformers 호환 문제로 양자화를 비활성화합니다.") print("bitsandbytes/transformers 호환 문제로 양자화를 비활성화합니다.")
quant_args = {} quant_args = {}
# 1차 시도: device_map="auto" 로 로드 # 메모리 제한/오프로딩 설정
mm_cfg = model_settings.get('max_memory', {}) if isinstance(model_settings.get('max_memory', {}), dict) else {}
# normalize memory strings to GiB (accelerate accepts both, but unify)
def _norm_mem(v):
if not isinstance(v, str):
return v
return v.replace('GB', 'GiB').replace('gb', 'GiB')
max_memory = {}
if 0 in mm_cfg or 'gpu' in mm_cfg:
max_memory[0] = _norm_mem(mm_cfg.get(0, mm_cfg.get('gpu', '30GiB')))
if 'cpu' in mm_cfg:
max_memory['cpu'] = _norm_mem(mm_cfg.get('cpu', '60GiB'))
offload_folder = os.path.join(os.path.dirname(self.config_path), 'offload')
os.makedirs(offload_folder, exist_ok=True)
# 1차 시도: device_map="auto" + max_memory 로 로드
try: try:
self.tokenizer = AutoTokenizer.from_pretrained(model_source) self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
load_kwargs = dict(
device_map="auto",
low_cpu_mem_usage=True,
offload_folder=offload_folder,
offload_state_dict=True,
trust_remote_code=True,
)
if dtype is not None:
load_kwargs["torch_dtype"] = dtype
if max_memory:
load_kwargs["max_memory"] = max_memory
# use_quantization=True면 8bit 우선 시도 (repo의 다른 양자화 경로 우회)
if use_quantization:
try:
from transformers import BitsAndBytesConfig
load_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_8bit=True,
llm_int8_enable_fp32_cpu_offload=True
)
print("8bit 양자화 적용 (1차 시도)")
except Exception as _:
# bitsandbytes 사용 불가 시 양자화 미적용으로 진행
print("bitsandbytes 사용 불가: 비양자화로 1차 시도 진행")
self.model = AutoModelForCausalLM.from_pretrained( self.model = AutoModelForCausalLM.from_pretrained(
model_source, model_source,
device_map="auto", **load_kwargs
torch_dtype=dtype if dtype is not None else None,
low_cpu_mem_usage=True,
**quant_args
) )
except Exception as e1: except Exception as e1:
print(f"device_map=auto 로드 실패: {e1}\nCPU로 폴백합니다.") print(f"device_map=auto 로드 실패: {e1}")
# 2차 시도: CPU 강제 로드 # 2차 시도: 8-bit 양자화로 재시도 (가능 시, 1차에서 적용 안된 경우)
tried_int8 = False
if not use_quantization:
try:
from transformers import BitsAndBytesConfig
print("8bit 양자화로 재시도합니다...")
self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
retry_kwargs = dict(
device_map="auto",
low_cpu_mem_usage=True,
offload_folder=offload_folder,
offload_state_dict=True,
trust_remote_code=True,
quantization_config=BitsAndBytesConfig(
load_in_8bit=True,
llm_int8_enable_fp32_cpu_offload=True
)
)
if dtype is not None:
retry_kwargs["torch_dtype"] = dtype
if max_memory:
retry_kwargs["max_memory"] = max_memory
self.model = AutoModelForCausalLM.from_pretrained(
model_source,
**retry_kwargs
)
tried_int8 = True
except Exception as e_int8:
print(f"8bit 재시도 실패: {e_int8}")
if not tried_int8:
print("CPU로 폴백합니다.")
try:
import torch, gc
torch.cuda.empty_cache()
gc.collect()
except Exception:
pass
# CPU 강제 로드
self.tokenizer = AutoTokenizer.from_pretrained(model_source) self.tokenizer = AutoTokenizer.from_pretrained(model_source)
self.model = AutoModelForCausalLM.from_pretrained( self.model = AutoModelForCausalLM.from_pretrained(
model_source, model_source,
device_map={"": "cpu"}, device_map={"": "cpu"},
torch_dtype=torch.float32, torch_dtype=torch.float32,
low_cpu_mem_usage=False low_cpu_mem_usage=False,
quantization_config=None,
trust_remote_code=True
) )
# 파이프라인 생성 # 파이프라인 생성
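For reference, a minimal standalone sketch of the loading pattern this hunk introduces: device_map="auto" with an explicit max_memory budget, a disk offload folder, and 8-bit weights via bitsandbytes. The model id, memory figures, and offload path below are placeholders, not values taken from this repository.

import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder model id
offload_dir = "./offload"                # spill folder for weights that fit on neither GPU nor CPU
os.makedirs(offload_dir, exist_ok=True)

# Per-device budget; accelerate accepts size strings such as "30GiB".
max_memory = {0: "30GiB", "cpu": "60GiB"}

# 8-bit weights via bitsandbytes; modules dispatched to the CPU stay in fp32,
# since the int8 kernels only run on the GPU.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",            # let accelerate place layers across GPU, CPU and disk
    max_memory=max_memory,
    offload_folder=offload_dir,
    low_cpu_mem_usage=True,
    quantization_config=quant_config,
)

If loading still fails, the hunk above degrades in two stages: an 8-bit retry when the first attempt was not quantized, then a plain fp32 CPU load.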


@@ -16,7 +16,7 @@
"drive_mount_path": "/content/drive/MyDrive/model_Dev/data" "drive_mount_path": "/content/drive/MyDrive/model_Dev/data"
}, },
"model_settings": { "model_settings": {
"use_quantization": false, "use_quantization": true,
"quantization_bits": 8, "quantization_bits": 8,
"torch_dtype": "auto", "torch_dtype": "auto",
"max_memory": { "max_memory": {