feat: enable quantization settings to improve model performance

2025-08-28 11:09:55 +09:00
parent 0206e42780
commit f67f9a18aa
2 changed files with 95 additions and 16 deletions


@@ -106,25 +106,104 @@ class AIAgent:
print("bitsandbytes/transformers 호환 문제로 양자화를 비활성화합니다.") print("bitsandbytes/transformers 호환 문제로 양자화를 비활성화합니다.")
quant_args = {} quant_args = {}
# 1차 시도: device_map="auto" 로 로드 # 메모리 제한/오프로딩 설정
mm_cfg = model_settings.get('max_memory', {}) if isinstance(model_settings.get('max_memory', {}), dict) else {}
# normalize memory strings to GiB (accelerate accepts both, but unify)
def _norm_mem(v):
if not isinstance(v, str):
return v
return v.replace('GB', 'GiB').replace('gb', 'GiB')
max_memory = {}
if 0 in mm_cfg or 'gpu' in mm_cfg:
max_memory[0] = _norm_mem(mm_cfg.get(0, mm_cfg.get('gpu', '30GiB')))
if 'cpu' in mm_cfg:
max_memory['cpu'] = _norm_mem(mm_cfg.get('cpu', '60GiB'))
offload_folder = os.path.join(os.path.dirname(self.config_path), 'offload')
os.makedirs(offload_folder, exist_ok=True)
# 1차 시도: device_map="auto" + max_memory 로 로드
try: try:
self.tokenizer = AutoTokenizer.from_pretrained(model_source) self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
load_kwargs = dict(
device_map="auto",
low_cpu_mem_usage=True,
offload_folder=offload_folder,
offload_state_dict=True,
trust_remote_code=True,
)
if dtype is not None:
load_kwargs["torch_dtype"] = dtype
if max_memory:
load_kwargs["max_memory"] = max_memory
# use_quantization=True면 8bit 우선 시도 (repo의 다른 양자화 경로 우회)
if use_quantization:
try:
from transformers import BitsAndBytesConfig
load_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_8bit=True,
llm_int8_enable_fp32_cpu_offload=True
)
print("8bit 양자화 적용 (1차 시도)")
except Exception as _:
# bitsandbytes 사용 불가 시 양자화 미적용으로 진행
print("bitsandbytes 사용 불가: 비양자화로 1차 시도 진행")
self.model = AutoModelForCausalLM.from_pretrained( self.model = AutoModelForCausalLM.from_pretrained(
model_source, model_source,
device_map="auto", **load_kwargs
torch_dtype=dtype if dtype is not None else None,
low_cpu_mem_usage=True,
**quant_args
) )
except Exception as e1: except Exception as e1:
print(f"device_map=auto 로드 실패: {e1}\nCPU로 폴백합니다.") print(f"device_map=auto 로드 실패: {e1}")
# 2차 시도: CPU 강제 로드 # 2차 시도: 8-bit 양자화로 재시도 (가능 시, 1차에서 적용 안된 경우)
tried_int8 = False
if not use_quantization:
try:
from transformers import BitsAndBytesConfig
print("8bit 양자화로 재시도합니다...")
self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
retry_kwargs = dict(
device_map="auto",
low_cpu_mem_usage=True,
offload_folder=offload_folder,
offload_state_dict=True,
trust_remote_code=True,
quantization_config=BitsAndBytesConfig(
load_in_8bit=True,
llm_int8_enable_fp32_cpu_offload=True
)
)
if dtype is not None:
retry_kwargs["torch_dtype"] = dtype
if max_memory:
retry_kwargs["max_memory"] = max_memory
self.model = AutoModelForCausalLM.from_pretrained(
model_source,
**retry_kwargs
)
tried_int8 = True
except Exception as e_int8:
print(f"8bit 재시도 실패: {e_int8}")
if not tried_int8:
print("CPU로 폴백합니다.")
try:
import torch, gc
torch.cuda.empty_cache()
gc.collect()
except Exception:
pass
# CPU 강제 로드
self.tokenizer = AutoTokenizer.from_pretrained(model_source) self.tokenizer = AutoTokenizer.from_pretrained(model_source)
self.model = AutoModelForCausalLM.from_pretrained( self.model = AutoModelForCausalLM.from_pretrained(
model_source, model_source,
device_map={"": "cpu"}, device_map={"": "cpu"},
torch_dtype=torch.float32, torch_dtype=torch.float32,
low_cpu_mem_usage=False low_cpu_mem_usage=False,
quantization_config=None,
trust_remote_code=True
) )
# 파이프라인 생성 # 파이프라인 생성
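For reference, a minimal standalone sketch of the loading pattern this hunk introduces: device_map="auto" with an explicit max_memory budget, a disk offload folder, and 8-bit weights via bitsandbytes. The model id, memory figures, and offload path below are placeholders, not values taken from this repository.

import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder model id
offload_dir = "./offload"                # spill folder for weights that fit on neither GPU nor CPU
os.makedirs(offload_dir, exist_ok=True)

# Per-device budget; accelerate accepts size strings such as "30GiB".
max_memory = {0: "30GiB", "cpu": "60GiB"}

# 8-bit weights via bitsandbytes; modules dispatched to the CPU stay in fp32,
# since the int8 kernels only run on the GPU.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",            # let accelerate place layers across GPU, CPU and disk
    max_memory=max_memory,
    offload_folder=offload_dir,
    low_cpu_mem_usage=True,
    quantization_config=quant_config,
)

If loading still fails, the hunk above degrades in two stages: an 8-bit retry when the first attempt was not quantized, then a plain fp32 CPU load.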


@@ -16,7 +16,7 @@
"drive_mount_path": "/content/drive/MyDrive/model_Dev/data" "drive_mount_path": "/content/drive/MyDrive/model_Dev/data"
}, },
"model_settings": { "model_settings": {
"use_quantization": false, "use_quantization": true,
"quantization_bits": 8, "quantization_bits": 8,
"torch_dtype": "auto", "torch_dtype": "auto",
"max_memory": { "max_memory": {