diff --git a/AI_Web_Scraper/ai_agent.py b/AI_Web_Scraper/ai_agent.py
index efa6952..a952882 100644
--- a/AI_Web_Scraper/ai_agent.py
+++ b/AI_Web_Scraper/ai_agent.py
@@ -106,26 +106,105 @@ class AIAgent:
             print("Disabling quantization due to a bitsandbytes/transformers compatibility issue.")
             quant_args = {}
 
-        # First attempt: load with device_map="auto"
+        # Memory limit / offloading settings
+        mm_cfg = model_settings.get('max_memory', {}) if isinstance(model_settings.get('max_memory', {}), dict) else {}
+        # normalize memory strings to GiB (accelerate accepts both, but unify)
+        def _norm_mem(v):
+            if not isinstance(v, str):
+                return v
+            return v.replace('GB', 'GiB').replace('gb', 'GiB')
+        max_memory = {}
+        if 0 in mm_cfg or 'gpu' in mm_cfg:
+            max_memory[0] = _norm_mem(mm_cfg.get(0, mm_cfg.get('gpu', '30GiB')))
+        if 'cpu' in mm_cfg:
+            max_memory['cpu'] = _norm_mem(mm_cfg.get('cpu', '60GiB'))
+        offload_folder = os.path.join(os.path.dirname(self.config_path), 'offload')
+        os.makedirs(offload_folder, exist_ok=True)
+
+        # First attempt: load with device_map="auto" + max_memory
         try:
-            self.tokenizer = AutoTokenizer.from_pretrained(model_source)
+            self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
+            load_kwargs = dict(
+                device_map="auto",
+                low_cpu_mem_usage=True,
+                offload_folder=offload_folder,
+                offload_state_dict=True,
+                trust_remote_code=True,
+            )
+            if dtype is not None:
+                load_kwargs["torch_dtype"] = dtype
+            if max_memory:
+                load_kwargs["max_memory"] = max_memory
+
+            # If use_quantization=True, try 8-bit first (bypasses the repo's other quantization path)
+            if use_quantization:
+                try:
+                    from transformers import BitsAndBytesConfig
+                    load_kwargs["quantization_config"] = BitsAndBytesConfig(
+                        load_in_8bit=True,
+                        llm_int8_enable_fp32_cpu_offload=True
+                    )
+                    print("Applying 8-bit quantization (first attempt)")
+                except Exception as _:
+                    # if bitsandbytes is unavailable, proceed without quantization
+                    print("bitsandbytes unavailable: proceeding with the first attempt unquantized")
+
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_source,
-                device_map="auto",
-                torch_dtype=dtype if dtype is not None else None,
-                low_cpu_mem_usage=True,
-                **quant_args
+                **load_kwargs
             )
         except Exception as e1:
-            print(f"device_map=auto load failed: {e1}\nFalling back to CPU.")
-            # Second attempt: force CPU load
-            self.tokenizer = AutoTokenizer.from_pretrained(model_source)
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_source,
-                device_map={"": "cpu"},
-                torch_dtype=torch.float32,
-                low_cpu_mem_usage=False
-            )
+            print(f"device_map=auto load failed: {e1}")
+            # Second attempt: retry with 8-bit quantization (when available and not already applied in the first attempt)
+            tried_int8 = False
+            if not use_quantization:
+                try:
+                    from transformers import BitsAndBytesConfig
+                    print("Retrying with 8-bit quantization...")
+                    self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
+                    retry_kwargs = dict(
+                        device_map="auto",
+                        low_cpu_mem_usage=True,
+                        offload_folder=offload_folder,
+                        offload_state_dict=True,
+                        trust_remote_code=True,
+                        quantization_config=BitsAndBytesConfig(
+                            load_in_8bit=True,
+                            llm_int8_enable_fp32_cpu_offload=True
+                        )
+                    )
+                    if dtype is not None:
+                        retry_kwargs["torch_dtype"] = dtype
+                    if max_memory:
+                        retry_kwargs["max_memory"] = max_memory
+
+                    self.model = AutoModelForCausalLM.from_pretrained(
+                        model_source,
+                        **retry_kwargs
+                    )
+                    tried_int8 = True
+                except Exception as e_int8:
+                    print(f"8-bit retry failed: {e_int8}")
+
+            if not tried_int8:
+                print("Falling back to CPU.")
+                try:
+                    import torch, gc
+                    torch.cuda.empty_cache()
+                    gc.collect()
+                except Exception:
+                    pass
+
+                # Force CPU load
+                self.tokenizer = AutoTokenizer.from_pretrained(model_source)
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    model_source,
+                    device_map={"": "cpu"},
+                    torch_dtype=torch.float32,
+                    low_cpu_mem_usage=False,
+                    quantization_config=None,
+                    trust_remote_code=True
+                )
 
         # Create the pipeline
         pad_id = self.tokenizer.eos_token_id if getattr(self.tokenizer, 'eos_token_id', None) is not None else None
diff --git a/AI_Web_Scraper/config.json b/AI_Web_Scraper/config.json
index 3dc8347..4f597bf 100644
--- a/AI_Web_Scraper/config.json
+++ b/AI_Web_Scraper/config.json
@@ -16,7 +16,7 @@
         "drive_mount_path": "/content/drive/MyDrive/model_Dev/data"
     },
     "model_settings": {
-        "use_quantization": false,
+        "use_quantization": true,
         "quantization_bits": 8,
         "torch_dtype": "auto",
         "max_memory": {
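Reviewer note: the `max_memory` derivation in this patch is easy to sanity-check in isolation. Below is a minimal standalone sketch of the same logic; `build_max_memory` and the sample values are illustrative stand-ins, not names from the repo.

```python
# Standalone sketch of the patch's max_memory derivation.
# build_max_memory and the sample values are illustrative only.

def _norm_mem(v):
    # accelerate accepts both "GB" and "GiB"; the patch unifies on "GiB"
    if not isinstance(v, str):
        return v
    return v.replace('GB', 'GiB').replace('gb', 'GiB')

def build_max_memory(mm_cfg):
    mm_cfg = mm_cfg if isinstance(mm_cfg, dict) else {}
    max_memory = {}
    if 0 in mm_cfg or 'gpu' in mm_cfg:
        max_memory[0] = _norm_mem(mm_cfg.get(0, mm_cfg.get('gpu', '30GiB')))
    if 'cpu' in mm_cfg:
        max_memory['cpu'] = _norm_mem(mm_cfg.get('cpu', '60GiB'))
    return max_memory

print(build_max_memory({'gpu': '30GB', 'cpu': '60GiB'}))
# -> {0: '30GiB', 'cpu': '60GiB'}
```

One caveat worth noting: the normalization only matches `'GB'` and `'gb'`, so a mixed-case value like `"30Gb"` would pass through unchanged.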
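The loader's control flow is easier to review as a skeleton. The sketch below reduces the patch to its three-tier fallback, assuming bitsandbytes is importable; `load_model` and its parameters are hypothetical names rather than the `AIAgent` API, and tokenizer loading, logging, and cache cleanup are omitted.

```python
# Sketch of the three-tier fallback this patch introduces:
# 1) device_map="auto" (+ 8-bit if requested), 2) 8-bit retry, 3) CPU.
# load_model and its parameters are hypothetical, not repo API.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def load_model(model_source, use_quantization, max_memory, offload_folder):
    common = dict(
        device_map="auto",
        low_cpu_mem_usage=True,
        offload_folder=offload_folder,
        offload_state_dict=True,
        trust_remote_code=True,
    )
    if max_memory:
        common["max_memory"] = max_memory
    int8 = BitsAndBytesConfig(load_in_8bit=True,
                              llm_int8_enable_fp32_cpu_offload=True)
    try:
        # tier 1: let accelerate shard/offload; quantize up front if asked
        kwargs = dict(common, quantization_config=int8) if use_quantization else common
        return AutoModelForCausalLM.from_pretrained(model_source, **kwargs)
    except Exception:
        if not use_quantization:
            try:
                # tier 2: retry in 8-bit to shrink the GPU footprint
                return AutoModelForCausalLM.from_pretrained(
                    model_source, quantization_config=int8, **common)
            except Exception:
                pass
        # tier 3: last resort, full-precision CPU load
        return AutoModelForCausalLM.from_pretrained(
            model_source, device_map={"": "cpu"},
            torch_dtype=torch.float32, trust_remote_code=True)
```

Ordering the 8-bit retry ahead of the CPU fallback is the point of the change: on Colab-class GPUs an 8-bit reload often fits where the fp16 auto load runs out of memory, and the CPU path is far slower.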