feat: increase max_tokens to 131072 to extend the token limit

2025-08-28 11:15:09 +09:00
parent f67f9a18aa
commit 57f9bba80e
3 changed files with 84 additions and 28 deletions

View File

@@ -2,6 +2,7 @@ import json
 import os
 from typing import List, Dict
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, pipeline
+from transformers.utils import logging as hf_logging
 from langchain_community.llms import HuggingFacePipeline
 from langchain.agents import initialize_agent, AgentType
 from langchain.tools import Tool
@@ -69,6 +70,11 @@ class AIAgent:
""" """
# GPU 메모리 최적화 설정 # GPU 메모리 최적화 설정
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Transformers 로깅 레벨을 낮춰 config __repr__ 경로로 인한 예외를 피함
try:
hf_logging.set_verbosity_error()
except Exception:
pass
model_settings = self.config.get('model_settings', {}) model_settings = self.config.get('model_settings', {})
use_quantization = bool(model_settings.get('use_quantization', False)) use_quantization = bool(model_settings.get('use_quantization', False))
@@ -124,12 +130,20 @@ class AIAgent:
         # First attempt: load with device_map="auto" + max_memory
         try:
             self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
+            # Pre-load the config, then drop the repo's quantization_config key (avoids MXFP4 and the like)
+            cfg = AutoConfig.from_pretrained(model_source, trust_remote_code=True)
+            if hasattr(cfg, 'quantization_config'):
+                try:
+                    delattr(cfg, 'quantization_config')
+                except Exception:
+                    setattr(cfg, 'quantization_config', None)
             load_kwargs = dict(
                 device_map="auto",
                 low_cpu_mem_usage=True,
                 offload_folder=offload_folder,
                 offload_state_dict=True,
                 trust_remote_code=True,
+                config=cfg,
             )
             if dtype is not None:
                 load_kwargs["torch_dtype"] = dtype
@@ -140,14 +154,17 @@ class AIAgent:
             if use_quantization:
                 try:
                     from transformers import BitsAndBytesConfig
-                    load_kwargs["quantization_config"] = BitsAndBytesConfig(
-                        load_in_8bit=True,
-                        llm_int8_enable_fp32_cpu_offload=True
-                    )
-                    print("Applying 8-bit quantization (first attempt)")
+                    tmp = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
+                    if hasattr(tmp, 'get_loading_attributes'):
+                        load_kwargs["quantization_config"] = tmp
+                        print("Applying 8-bit quantization (first attempt, new bnb API)")
+                    else:
+                        # Try the legacy API
+                        load_kwargs["load_in_8bit"] = True
+                        load_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
+                        print("Applying 8-bit quantization (first attempt, legacy API)")
                 except Exception as _:
-                    # If bitsandbytes is unavailable, proceed without quantization
-                    print("bitsandbytes unavailable: proceeding with the first attempt without quantization")
+                    print("bitsandbytes not detected: proceeding with the first attempt without quantization")

             self.model = AutoModelForCausalLM.from_pretrained(
                 model_source,
@@ -155,33 +172,66 @@ class AIAgent:
             )
         except Exception as e1:
             print(f"device_map=auto load failed: {e1}")
-            # Second attempt: retry with 8-bit quantization (when possible and not applied in the first attempt)
-            tried_int8 = False
-            if not use_quantization:
+            # 2a. Retry auto+offload without quantization (this path can succeed when the failure was bnb/version related)
+            try:
+                self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
+                cfg = AutoConfig.from_pretrained(model_source, trust_remote_code=True)
+                if hasattr(cfg, 'quantization_config'):
+                    try:
+                        delattr(cfg, 'quantization_config')
+                    except Exception:
+                        setattr(cfg, 'quantization_config', None)
+                retry_no_quant = dict(
+                    device_map="auto",
+                    low_cpu_mem_usage=True,
+                    offload_folder=offload_folder,
+                    offload_state_dict=True,
+                    trust_remote_code=True,
+                    config=cfg,
+                )
+                if dtype is not None:
+                    retry_no_quant["torch_dtype"] = dtype
+                if max_memory:
+                    retry_no_quant["max_memory"] = max_memory
+                self.model = AutoModelForCausalLM.from_pretrained(model_source, **retry_no_quant)
+                print("Non-quantized retry succeeded")
+            except Exception as e_noq:
+                print(f"Non-quantized retry failed: {e_noq}")
+
+                # 2b. Retry with 8-bit quantization (when possible)
+                tried_int8 = False
                 try:
                     from transformers import BitsAndBytesConfig
                     print("Retrying with 8-bit quantization...")
                     self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
+                    # Recreate the config and remove quantization_config
+                    cfg = AutoConfig.from_pretrained(model_source, trust_remote_code=True)
+                    if hasattr(cfg, 'quantization_config'):
+                        try:
+                            delattr(cfg, 'quantization_config')
+                        except Exception:
+                            setattr(cfg, 'quantization_config', None)
                     retry_kwargs = dict(
                         device_map="auto",
                         low_cpu_mem_usage=True,
                         offload_folder=offload_folder,
                         offload_state_dict=True,
                         trust_remote_code=True,
-                        quantization_config=BitsAndBytesConfig(
-                            load_in_8bit=True,
-                            llm_int8_enable_fp32_cpu_offload=True
-                        )
+                        config=cfg,
                     )
                     if dtype is not None:
                         retry_kwargs["torch_dtype"] = dtype
                     if max_memory:
                         retry_kwargs["max_memory"] = max_memory
-                    self.model = AutoModelForCausalLM.from_pretrained(
-                        model_source,
-                        **retry_kwargs
-                    )
+                    tmp = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
+                    if hasattr(tmp, 'get_loading_attributes'):
+                        retry_kwargs["quantization_config"] = tmp
+                    else:
+                        retry_kwargs["load_in_8bit"] = True
+                        retry_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
+                    self.model = AutoModelForCausalLM.from_pretrained(model_source, **retry_kwargs)
                     tried_int8 = True
                 except Exception as e_int8:
                     print(f"8-bit retry failed: {e_int8}")
@@ -195,15 +245,21 @@ class AIAgent:
                     except Exception:
                         pass

-                    # Force CPU load
-                    self.tokenizer = AutoTokenizer.from_pretrained(model_source)
+                    # Force CPU load (quantization_config removed from the config)
+                    self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
+                    cfg = AutoConfig.from_pretrained(model_source, trust_remote_code=True)
+                    if hasattr(cfg, 'quantization_config'):
+                        try:
+                            delattr(cfg, 'quantization_config')
+                        except Exception:
+                            setattr(cfg, 'quantization_config', None)
                     self.model = AutoModelForCausalLM.from_pretrained(
                         model_source,
                         device_map={"": "cpu"},
                         torch_dtype=torch.float32,
                         low_cpu_mem_usage=False,
-                        quantization_config=None,
-                        trust_remote_code=True
+                        trust_remote_code=True,
+                        config=cfg
                     )

         # Create the pipeline
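Note: the hunks above all implement one loading strategy: pre-load the AutoConfig and strip the repo-supplied quantization_config (e.g. MXFP4) so it cannot clash with bitsandbytes, probe whether the installed BitsAndBytesConfig exposes the newer get_loading_attributes API, and fall back to a plain CPU load if everything else fails. A condensed, standalone sketch of that order follows; the model path and fp16 dtype are illustrative stand-ins, and the offload_folder/max_memory handling from the real method is omitted for brevity.

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_source = "./models/gpt-oss-20b-base"   # illustrative; the real path comes from config.json

tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)

# Drop the repo's own quantization block so it cannot conflict with bitsandbytes.
cfg = AutoConfig.from_pretrained(model_source, trust_remote_code=True)
if hasattr(cfg, "quantization_config"):
    try:
        delattr(cfg, "quantization_config")
    except Exception:
        setattr(cfg, "quantization_config", None)

kwargs = dict(device_map="auto", low_cpu_mem_usage=True, trust_remote_code=True,
              torch_dtype=torch.float16, config=cfg)
try:
    bnb = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
    if hasattr(bnb, "get_loading_attributes"):   # newer transformers/bnb integration
        kwargs["quantization_config"] = bnb
    else:                                        # legacy keyword arguments
        kwargs["load_in_8bit"] = True
        kwargs["llm_int8_enable_fp32_cpu_offload"] = True
    model = AutoModelForCausalLM.from_pretrained(model_source, **kwargs)
except Exception:
    # Last resort, mirroring the diff: plain CPU load without quantization.
    for k in ("quantization_config", "load_in_8bit", "llm_int8_enable_fp32_cpu_offload"):
        kwargs.pop(k, None)
    kwargs.update(device_map={"": "cpu"}, torch_dtype=torch.float32, low_cpu_mem_usage=False)
    model = AutoModelForCausalLM.from_pretrained(model_source, **kwargs)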

View File

@@ -3,7 +3,7 @@
"model_local_path": "./models/gpt-oss-20b-base", "model_local_path": "./models/gpt-oss-20b-base",
"google_drive_folder_id": "YOUR_GOOGLE_DRIVE_FOLDER_ID", "google_drive_folder_id": "YOUR_GOOGLE_DRIVE_FOLDER_ID",
"google_credentials_path": "./credentials.json", "google_credentials_path": "./credentials.json",
"max_tokens": 2048, "max_tokens": 131072,
"temperature": 0.7, "temperature": 0.7,
"web_scraping": { "web_scraping": {
"max_pages": 100, "max_pages": 100,

View File

@@ -1,6 +1,6 @@
-transformers>=4.20.0
-torch>=1.12.0
-accelerate>=0.20.0
+transformers>=4.44.0
+torch>=2.1.0
+accelerate>=0.33.0
 requests==2.32.4
 beautifulsoup4>=4.10.0
 selenium>=4.0.0
@@ -10,7 +10,7 @@ google-auth-oauthlib>=1.0.0
 google-auth-httplib2>=0.1.0
 langchain>=0.0.200
 langchain-community>=0.0.20
-huggingface-hub>=0.15.0
+huggingface-hub>=0.23.0
 pandas>=1.3.0
 openpyxl>=3.0.0
-bitsandbytes>=0.41.0
+bitsandbytes>=0.43.1
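Note: the raised floors (transformers 4.44, torch 2.1, accelerate 0.33, huggingface-hub 0.23, bitsandbytes 0.43.1) line up with the newer BitsAndBytesConfig and offload code paths used in the loader above; an existing environment can be brought up to date with: pip install -U -r requirements.txt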