From f67f9a18aa530f1a156f3470cb2e757de2beea2d Mon Sep 17 00:00:00 2001
From: 박상호 Sangho Park
Date: Thu, 28 Aug 2025 11:09:55 +0900
Subject: [PATCH] feat: Enable quantization settings to improve model performance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 AI_Web_Scraper/ai_agent.py | 109 ++++++++++++++++++++++++++++++++-----
 AI_Web_Scraper/config.json |   2 +-
 2 files changed, 95 insertions(+), 16 deletions(-)

diff --git a/AI_Web_Scraper/ai_agent.py b/AI_Web_Scraper/ai_agent.py
index efa6952..a952882 100644
--- a/AI_Web_Scraper/ai_agent.py
+++ b/AI_Web_Scraper/ai_agent.py
@@ -106,26 +106,105 @@ class AIAgent:
             print("Disabling quantization due to a bitsandbytes/transformers compatibility issue.")
             quant_args = {}
 
-        # First attempt: load with device_map="auto"
+        # Memory limit / offloading settings
+        mm_cfg = model_settings.get('max_memory', {}) if isinstance(model_settings.get('max_memory', {}), dict) else {}
+        # Normalize memory strings to GiB (accelerate accepts both, but unify)
+        def _norm_mem(v):
+            if not isinstance(v, str):
+                return v
+            return v.replace('GB', 'GiB').replace('gb', 'GiB')
+        max_memory = {}
+        if 0 in mm_cfg or 'gpu' in mm_cfg:
+            max_memory[0] = _norm_mem(mm_cfg.get(0, mm_cfg.get('gpu', '30GiB')))
+        if 'cpu' in mm_cfg:
+            max_memory['cpu'] = _norm_mem(mm_cfg.get('cpu', '60GiB'))
+        offload_folder = os.path.join(os.path.dirname(self.config_path), 'offload')
+        os.makedirs(offload_folder, exist_ok=True)
+
+        # First attempt: load with device_map="auto" + max_memory
         try:
-            self.tokenizer = AutoTokenizer.from_pretrained(model_source)
+            self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
+            load_kwargs = dict(
+                device_map="auto",
+                low_cpu_mem_usage=True,
+                offload_folder=offload_folder,
+                offload_state_dict=True,
+                trust_remote_code=True,
+            )
+            if dtype is not None:
+                load_kwargs["torch_dtype"] = dtype
+            if max_memory:
+                load_kwargs["max_memory"] = max_memory
+
+            # If use_quantization=True, try 8-bit first (bypasses the repo's other quantization path)
+            if use_quantization:
+                try:
+                    from transformers import BitsAndBytesConfig
+                    load_kwargs["quantization_config"] = BitsAndBytesConfig(
+                        load_in_8bit=True,
+                        llm_int8_enable_fp32_cpu_offload=True
+                    )
+                    print("Applying 8-bit quantization (first attempt)")
+                except Exception as _:
+                    # If bitsandbytes is unavailable, proceed without quantization
+                    print("bitsandbytes unavailable: first attempt proceeds without quantization")
+
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_source,
-                device_map="auto",
-                torch_dtype=dtype if dtype is not None else None,
-                low_cpu_mem_usage=True,
-                **quant_args
+                **load_kwargs
             )
         except Exception as e1:
-            print(f"device_map=auto load failed: {e1}\nFalling back to CPU.")
-            # Second attempt: force CPU load
-            self.tokenizer = AutoTokenizer.from_pretrained(model_source)
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_source,
-                device_map={"": "cpu"},
-                torch_dtype=torch.float32,
-                low_cpu_mem_usage=False
-            )
+            print(f"device_map=auto load failed: {e1}")
+            # Second attempt: retry with 8-bit quantization (if available and not already applied in the first attempt)
+            tried_int8 = False
+            if not use_quantization:
+                try:
+                    from transformers import BitsAndBytesConfig
+                    print("Retrying with 8-bit quantization...")
+                    self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
+                    retry_kwargs = dict(
+                        device_map="auto",
+                        low_cpu_mem_usage=True,
+                        offload_folder=offload_folder,
+                        offload_state_dict=True,
+                        trust_remote_code=True,
+                        quantization_config=BitsAndBytesConfig(
+                            load_in_8bit=True,
+                            llm_int8_enable_fp32_cpu_offload=True
+                        )
+                    )
+                    if dtype is not None:
+                        retry_kwargs["torch_dtype"] = dtype
+                    if max_memory:
+                        retry_kwargs["max_memory"] = max_memory
+
+                    self.model = AutoModelForCausalLM.from_pretrained(
+                        model_source,
+                        **retry_kwargs
+                    )
+                    tried_int8 = True
+                except Exception as e_int8:
+                    print(f"8-bit retry failed: {e_int8}")
+
+            if not tried_int8:
+                print("Falling back to CPU.")
+                try:
+                    import torch, gc
+                    torch.cuda.empty_cache()
+                    gc.collect()
+                except Exception:
+                    pass
+
+                # Force CPU load
+                self.tokenizer = AutoTokenizer.from_pretrained(model_source)
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    model_source,
+                    device_map={"": "cpu"},
+                    torch_dtype=torch.float32,
+                    low_cpu_mem_usage=False,
+                    quantization_config=None,
+                    trust_remote_code=True
+                )
 
         # Create the pipeline
         pad_id = self.tokenizer.eos_token_id if getattr(self.tokenizer, 'eos_token_id', None) is not None else None
diff --git a/AI_Web_Scraper/config.json b/AI_Web_Scraper/config.json
index 3dc8347..4f597bf 100644
--- a/AI_Web_Scraper/config.json
+++ b/AI_Web_Scraper/config.json
@@ -16,7 +16,7 @@
     "drive_mount_path": "/content/drive/MyDrive/model_Dev/data"
   },
   "model_settings": {
-    "use_quantization": false,
+    "use_quantization": true,
     "quantization_bits": 8,
     "torch_dtype": "auto",
     "max_memory": {
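
Note for reviewers (not part of the patch): the sketch below reproduces the core of the new loading path with standalone transformers calls, so it can be sanity-checked outside the agent. The checkpoint id and memory caps are placeholders, not values taken from this repo; the BitsAndBytesConfig with load_in_8bit plus llm_int8_enable_fp32_cpu_offload, the max_memory caps, and the disk offload folder mirror what the patch enables.

    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    model_id = "facebook/opt-1.3b"  # placeholder checkpoint, not the repo's model_source

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",                        # let accelerate place layers on GPU/CPU/disk
        max_memory={0: "30GiB", "cpu": "60GiB"},  # per-device caps, like config.json's max_memory
        offload_folder="offload",                 # spill weights that fit nowhere else to disk
        low_cpu_mem_usage=True,
        quantization_config=BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_enable_fp32_cpu_offload=True,  # modules offloaded to CPU stay in fp32
        ),
    )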