feat: AI 웹 스크래퍼 프로젝트의 초기 구조와 핵심 기능 구현

2025-08-28 10:23:24 +09:00
commit fdf330143f
10 changed files with 797 additions and 0 deletions
--- a/.windsurf/rules/mcp.md
+++ b/.windsurf/rules/mcp.md
@@ -0,0 +1,4 @@
 ---
 trigger: manual
 ---
--- a/AI_Web_Scraper/README.md
+++ b/AI_Web_Scraper/README.md
@@ -0,0 +1,52 @@
 # AI 웹 정보 수집 시스템
 이 프로젝트는 AI 모델을 사용하여 인터넷에서 정보를 자동으로 수집하고 Google Drive에 저장하는 시스템입니다.
 ## 주요 기능
 - Hugging Face 모델 (jxm/gpt-oss-20b-base)을 사용하여 AI 에이전트 실행
 - 웹 크롤링을 통한 정보 수집
 - 수집된 데이터의 Google Drive 저장
 - Colab Pro 환경에서 A100 GPU 활용
 ## 요구사항
 - Python 3.8 이상
 - Google Drive API 인증 파일 (credentials.json)
 - Colab Pro 계정 (A100 GPU 지원)
 ## 설치 방법
 ```bash
 pip install -r requirements.txt
 ```
 ## 설정
 1. `config.json` 파일에서 다음 항목들을 설정하세요:
   - `google_drive_folder_id`: 데이터를 저장할 Google Drive 폴더 ID
   - `google_credentials_path`: Google API 인증 파일 경로
 2. Google Drive API 설정:
   - Google Cloud Console에서 Drive API 활성화
   - OAuth 2.0 클라이언트 ID 생성
   - credentials.json 파일 다운로드
 ## 실행 방법
 자세한 실행 방법은 `run_guide.md` 파일을 참고하세요.
 ## 파일 구조
 ```
 AI_Web_Scraper/
 ├── main.py                 # 메인 실행 파일
 ├── model_downloader.py     # 모델 다운로드
 ├── web_scraper.py          # 웹 크롤링 도구
 ├── google_drive_uploader.py # Google Drive 업로드
 ├── ai_agent.py            # AI 에이전트
 ├── config.json            # 설정 파일
 ├── requirements.txt       # 의존성 파일
 ├── README.md              # 프로젝트 설명
 └── run_guide.md           # 실행 가이드
 ```
--- a/AI_Web_Scraper/ai_agent.py
+++ b/AI_Web_Scraper/ai_agent.py
@@ -0,0 +1,192 @@
 import json
 import os
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from langchain.llms import HuggingFacePipeline
 from langchain.agents import initialize_agent, AgentType
 from langchain.tools import Tool
 from langchain.memory import ConversationBufferMemory
 from web_scraper import WebScraper
 from google_drive_uploader import GoogleDriveUploader, SimpleDriveSaver
 class AIAgent:
    """
    LangChain conversational ReAct agent backed by a locally stored Hugging Face model.
    The agent is given three tools: web scraping, upload through the Google Drive
    API, and saving into a mounted Google Drive directory. The two storage tools
    take a single string of the form ``<json>|<filename>``.
    """
    def __init__(self, config_path='./config.json'):
        """
        Load the configuration, the model, the tools and the agent.
        Args:
            config_path: Path to the JSON configuration file. It must provide
                ``model_local_path``, ``max_tokens``, ``temperature`` and
                ``data_storage.drive_mount_path``.
        Raises:
            Exception: Whatever ``load_model`` or the tool constructors raise.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = json.load(f)
        self.model_path = self.config['model_local_path']
        self.max_tokens = self.config['max_tokens']
        self.temperature = self.config['temperature']
        # Model state is filled in by load_model().
        self.model = None
        self.tokenizer = None
        self.llm = None
        self.load_model()
        # Helper objects that the tools delegate to.
        self.web_scraper = WebScraper(config_path)
        self.drive_uploader = GoogleDriveUploader(config_path)
        self.simple_saver = SimpleDriveSaver(self.config['data_storage']['drive_mount_path'])
        # Tool definitions exposed to the LangChain agent.
        self.tools = [
            Tool(
                name="WebScraper",
                func=self.scrape_web,
                description="웹사이트에서 정보를 수집합니다. URL을 입력하세요."
            ),
            Tool(
                name="GoogleDriveUploader",
                func=self.upload_to_drive_api,
                description="Google Drive API를 사용하여 데이터를 업로드합니다. 데이터와 파일명을 입력하세요."
            ),
            Tool(
                name="SimpleDriveSaver",
                func=self.save_to_drive_simple,
                description="마운트된 Google Drive에 데이터를 저장합니다. 데이터와 파일명을 입력하세요."
            )
        ]
        # Conversation memory shared across agent runs.
        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        self.agent = initialize_agent(
            tools=self.tools,
            llm=self.llm,
            agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
            memory=self.memory,
            verbose=True
        )
    def load_model(self):
        """
        Load the tokenizer and model from ``self.model_path`` and wrap them in a
        text-generation pipeline exposed as ``self.llm``.
        Raises:
            Exception: Re-raised after logging when any loading step fails.
        """
        try:
            print(f"모델 로드 중: {self.model_path}")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                device_map="auto",
                torch_dtype="auto"
            )
            pipe = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                max_new_tokens=self.max_tokens,
                temperature=self.temperature,
                do_sample=True,
                # Reuse the EOS token id as the padding id for generation.
                pad_token_id=self.tokenizer.eos_token_id
            )
            self.llm = HuggingFacePipeline(pipeline=pipe)
            print("모델 로드 완료")
        except Exception as e:
            print(f"모델 로드 실패: {e}")
            raise
    def scrape_web(self, url):
        """
        Tool function: scrape ``url`` and return a short status string.
        Returns:
            A summary with the page title and the first 200 characters of its
            description, or a failure message when the scraper returns nothing.
        """
        data = self.web_scraper.scrape_website(url)
        if data:
            return f"수집 완료: {data['title']} - {data['description'][:200]}..."
        else:
            return "수집 실패"
    @staticmethod
    def _parse_payload(data_and_filename):
        """
        Split a ``<json>|<filename>`` tool input into ``(data, filename)``.
        The split happens at the LAST ``|`` so that the JSON part may itself
        contain ``|`` characters. Input without any ``|`` yields an error
        record and the file name ``error.json``.
        Raises:
            json.JSONDecodeError: When the part before the separator is not valid JSON.
        """
        payload, separator, filename = data_and_filename.rpartition('|')
        if not separator:
            return {"error": "잘못된 형식"}, "error.json"
        # Surrounding whitespace (e.g. a trailing newline) is not part of the name.
        return json.loads(payload), filename.strip()
    def upload_to_drive_api(self, data_and_filename):
        """
        Tool function: upload JSON data through the Google Drive API.
        Args:
            data_and_filename: String of the form ``<json>|<filename>``.
        Returns:
            A success message containing the file id, or a failure message.
        """
        try:
            data, filename = self._parse_payload(data_and_filename)
            file_id = self.drive_uploader.upload_data_as_json(data, filename)
        except Exception as e:
            return f"업로드 실패: {e}"
        # The uploader signals failure by returning None instead of raising.
        if file_id is None:
            return "업로드 실패"
        return f"업로드 완료: {file_id}"
    def save_to_drive_simple(self, data_and_filename):
        """
        Tool function: save JSON data into the mounted Drive directory.
        Args:
            data_and_filename: String of the form ``<json>|<filename>``.
        Returns:
            A success message containing the file path, or a failure message.
        """
        try:
            data, filename = self._parse_payload(data_and_filename)
            filepath = self.simple_saver.save_data_as_json(data, filename)
        except Exception as e:
            return f"저장 실패: {e}"
        # The saver signals failure by returning None instead of raising.
        if filepath is None:
            return "저장 실패"
        return f"저장 완료: {filepath}"
    def run_agent(self, task_description):
        """
        Run the agent on one task.
        Returns:
            The agent's response, or None when the run raised an exception.
        """
        try:
            response = self.agent.run(task_description)
            return response
        except Exception as e:
            print(f"에이전트 실행 실패: {e}")
            return None
    def collect_information(self, topics):
        """
        Run the agent once per topic and gather the responses.
        Args:
            topics: Iterable of topic strings.
        Returns:
            A list of ``{'topic': ..., 'response': ...}`` dicts in input order.
            ``response`` is None for a topic whose agent run failed.
        """
        results = []
        for topic in topics:
            task = f"다음 주제에 대한 최신 정보를 웹에서 수집하고 핵심 내용을 요약하세요: {topic}"
            results.append({'topic': topic, 'response': self.run_agent(task)})
        return results
    def generate_topics(self, num_topics=3):
        """
        Ask the model to propose research topics.
        Args:
            num_topics: Number of topics requested and the maximum returned.
        Returns:
            A list of topic strings. A fixed default list is returned when the
            model call fails or its reply contains no usable line.
        """
        fallback_topics = ["AI 기술 동향", "머신러닝 응용", "딥러닝 최신 연구"]
        prompt = f"""
        당신은 AI 연구원입니다. 현재 세계에서 가장 흥미롭고 조사할 가치가 있는 기술 및 과학 분야의 주제 {num_topics}개를 선정해주세요.
        다음 기준을 고려하세요:
        1. 최근 트렌드나 미래 지향적인 주제
        2. 사회적 영향이 큰 주제
        3. 기술 발전이 빠른 분야
        4. AI와 관련된 주제 우선
        각 주제는 구체적이고 조사하기 쉬운 형태로 제시해주세요.
        예시: "양자 컴퓨팅의 최근 발전", "생성형 AI의 윤리적 문제"
        주제 목록만 출력하고, 다른 설명은 하지 마세요.
        형식: 각 줄에 하나의 주제
        """
        try:
            response = self.llm(prompt)
            # One topic per non-empty line; lines starting with a list marker are skipped.
            topics = [line.strip() for line in response.split('\n') if line.strip() and not line.startswith(('1.', '2.', '3.', '-'))]
        except Exception as e:
            print(f"주제 생성 실패: {e}")
            return fallback_topics
        if not topics:
            # An empty reply would otherwise make the caller collect nothing at all.
            print("주제 생성 실패: 응답에서 주제를 찾지 못했습니다")
            return fallback_topics
        return topics[:num_topics]
    def close(self):
        """Release the web scraper's resources."""
        self.web_scraper.close()
 if __name__ == "__main__":
    # Manual smoke test: collect information for two fixed topics.
    agent = AIAgent()
    try:
        topics = ["인공지능 최신 트렌드", "머신러닝 기초"]
        results = agent.collect_information(topics)
        print("수집 결과:", results)
    finally:
        # Release the agent's resources even when collection raises.
        agent.close()
--- a/AI_Web_Scraper/config.json
+++ b/AI_Web_Scraper/config.json
@@ -0,0 +1,18 @@
 {
  "model_name": "jxm/gpt-oss-20b-base",
  "model_local_path": "./models/gpt-oss-20b-base",
  "google_drive_folder_id": "YOUR_GOOGLE_DRIVE_FOLDER_ID",
  "google_credentials_path": "./credentials.json",
  "max_tokens": 2048,
  "temperature": 0.7,
  "web_scraping": {
    "max_pages": 100,
    "delay_between_requests": 2,
    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
  },
  "data_storage": {
    "local_storage_path": "./collected_data",
    "file_format": "json",
    "drive_mount_path": "/content/drive/MyDrive/AI_Data"
  }
 }
--- a/AI_Web_Scraper/google_drive_uploader.py
+++ b/AI_Web_Scraper/google_drive_uploader.py
@@ -0,0 +1,149 @@
 import os
 import json
 from googleapiclient.discovery import build
 from googleapiclient.http import MediaFileUpload
 from google.oauth2.credentials import Credentials
 from google_auth_oauthlib.flow import InstalledAppFlow
 from google.auth.transport.requests import Request
 class GoogleDriveUploader:
    """
    Uploads files into one Google Drive folder through the Drive v3 API.
    Authentication happens in the constructor; the authorized token is cached
    in ``token.json`` in the current working directory.
    """
    def __init__(self, config_path='./config.json'):
        """
        Read the configuration and authenticate.
        Args:
            config_path: Path to the JSON configuration file. It must provide
                ``google_drive_folder_id`` and ``google_credentials_path``.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = json.load(f)
        self.folder_id = self.config['google_drive_folder_id']
        self.creds_path = self.config['google_credentials_path']
        self.scopes = ['https://www.googleapis.com/auth/drive.file']
        self.service = None
        self.authenticate()
    def authenticate(self):
        """
        Obtain credentials and build the Drive service client in ``self.service``.
        A cached ``token.json`` is reused when valid and refreshed when expired;
        otherwise the OAuth flow is started from the client secrets file.
        """
        creds = None
        if os.path.exists('token.json'):
            creds = Credentials.from_authorized_user_file('token.json', self.scopes)
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                # NOTE(review): run_local_server presumably needs a browser on the
                # same machine; confirm it works in the hosted notebook setup.
                flow = InstalledAppFlow.from_client_secrets_file(
                    self.creds_path, self.scopes)
                creds = flow.run_local_server(port=0)
            # Persist the new or refreshed token for the next run.
            with open('token.json', 'w') as token:
                token.write(creds.to_json())
        self.service = build('drive', 'v3', credentials=creds)
    def upload_file(self, file_path, file_name=None):
        """
        Upload a local file to the configured folder.
        Args:
            file_path: Path of the file to upload.
            file_name: Name to give the file on Drive; defaults to the base
                name of ``file_path``.
        Returns:
            The id of the created file, or None when the API call fails.
        """
        if file_name is None:
            file_name = os.path.basename(file_path)
        file_metadata = {
            'name': file_name,
            # No parent is set when no folder id is configured.
            'parents': [self.folder_id] if self.folder_id else []
        }
        media = MediaFileUpload(file_path, resumable=True)
        try:
            file = self.service.files().create(
                body=file_metadata,
                media_body=media,
                fields='id'
            ).execute()
            print(f'파일 업로드 완료: {file_name} (ID: {file.get("id")})')
            return file.get('id')
        except Exception as e:
            print(f'업로드 실패: {e}')
            return None
    def upload_data_as_json(self, data, filename='collected_data.json'):
        """
        Serialize ``data`` to a temporary JSON file and upload it.
        Args:
            data: JSON-serializable object.
            filename: Name to give the file on Drive.
        Returns:
            The id returned by ``upload_file`` (None on upload failure).
        """
        import tempfile
        # ensure_ascii=False emits raw non-ASCII characters, so the file must be
        # opened as UTF-8 explicitly instead of with the platform default encoding.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
            temp_path = f.name
        try:
            file_id = self.upload_file(temp_path, filename)
            return file_id
        finally:
            # The temporary file is removed whether or not the upload succeeded.
            os.unlink(temp_path)
    def list_files(self):
        """
        List files whose parent is the configured folder.
        Returns:
            A list of ``{'id', 'name'}`` entries from a single request with a
            page size of 10, or an empty list when the API call fails.
        """
        try:
            results = self.service.files().list(
                q=f"'{self.folder_id}' in parents",
                pageSize=10,
                fields="nextPageToken, files(id, name)"
            ).execute()
            items = results.get('files', [])
            return items
        except Exception as e:
            print(f'파일 목록 조회 실패: {e}')
            return []
 class SimpleDriveSaver:
    """
    Stores data as plain files under a directory of a mounted Google Drive
    (the approach based on Colab's drive.mount()).
    """
    def __init__(self, mount_path='/content/drive/MyDrive/AI_Data'):
        """Remember the target directory and create it when it does not exist yet."""
        self.mount_path = mount_path
        if os.path.exists(mount_path):
            return
        os.makedirs(mount_path, exist_ok=True)
    def save_data_as_json(self, data, filename='collected_data.json'):
        """
        Write ``data`` as indented JSON to ``filename`` inside the target directory.
        Returns:
            The path of the written file, or None when writing fails.
        """
        target = os.path.join(self.mount_path, filename)
        try:
            with open(target, 'w', encoding='utf-8') as out:
                json.dump(data, out, ensure_ascii=False, indent=2)
            print(f'데이터 저장 완료: {target}')
            return target
        except Exception as err:
            print(f'저장 실패: {err}')
            return None
    def save_text_data(self, data, filename='collected_data.txt'):
        """
        Write ``data`` as text to ``filename`` inside the target directory.
        A list is written as one JSON document per line; anything else is
        written as its ``str()`` representation.
        Returns:
            The path of the written file, or None when writing fails.
        """
        target = os.path.join(self.mount_path, filename)
        try:
            with open(target, 'w', encoding='utf-8') as out:
                if not isinstance(data, list):
                    out.write(str(data))
                else:
                    out.writelines(
                        json.dumps(entry, ensure_ascii=False) + '\n' for entry in data
                    )
            print(f'텍스트 데이터 저장 완료: {target}')
            return target
        except Exception as err:
            print(f'저장 실패: {err}')
            return None
 def save_to_drive_simple(data, filename='collected_data.json', mount_path='/content/drive/MyDrive/AI_Data'):
    """
    Convenience wrapper: save ``data`` as JSON under ``mount_path`` in one call.
    Returns whatever ``SimpleDriveSaver.save_data_as_json`` returns.
    """
    return SimpleDriveSaver(mount_path).save_data_as_json(data, filename)
--- a/AI_Web_Scraper/main.py
+++ b/AI_Web_Scraper/main.py
@@ -0,0 +1,53 @@
 import sys
 import json
 from model_downloader import download_model
 from ai_agent import AIAgent
 import argparse
 def main():
    """
    Command-line entry point: make sure the model is available, start the
    agent, choose topics, collect information for each and print the results.
    Exits with status 1 when the model cannot be downloaded.
    """
    parser = argparse.ArgumentParser(description='AI 웹 정보 수집 시스템')
    parser.add_argument('--topics', nargs='+', help='수집할 주제 목록', default=None)
    parser.add_argument('--config', default='./config.json', help='설정 파일 경로')
    parser.add_argument('--auto-topics', action='store_true', help='AI가 스스로 주제를 선정하여 조사')
    args = parser.parse_args()
    print("AI 웹 정보 수집 시스템 시작")
    # 1. Download the model (if needed).
    print("모델 확인 중...")
    model, tokenizer = download_model(args.config)
    if model is None:
        print("모델 다운로드 실패. 프로그램을 종료합니다.")
        sys.exit(1)
    # AIAgent loads the model again from the local path, so drop these
    # references first rather than keeping two loaded copies alive.
    del model, tokenizer
    # 2. Initialize the AI agent.
    print("AI 에이전트 초기화 중...")
    agent = AIAgent(args.config)
    try:
        # 3. Decide the topics: generated by the model unless given explicitly.
        if args.auto_topics or args.topics is None:
            print("AI가 스스로 주제를 선정합니다...")
            topics = agent.generate_topics(num_topics=3)
            print(f"선정된 주제: {topics}")
        else:
            topics = args.topics
        # 4. Collect information.
        print(f"다음 주제들에 대해 정보를 수집합니다: {topics}")
        results = agent.collect_information(topics)
        # 5. Print the results.
        print("\n=== 수집 결과 ===")
        for result in results:
            print(f"주제: {result['topic']}")
            print(f"응답: {result['response']}")
            print("-" * 50)
    finally:
        # 6. Clean up even when a step above raises.
        agent.close()
    print("프로그램 완료")
 # Run the command-line entry point only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/AI_Web_Scraper/model_downloader.py
+++ b/AI_Web_Scraper/model_downloader.py
@@ -0,0 +1,44 @@
 import os
 import json
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import HfApi
 def download_model(config_path='./config.json'):
    """
    Load the configured Hugging Face model and tokenizer and save both locally.
    Args:
        config_path: Path to the JSON configuration file; ``model_name`` and
            ``model_local_path`` are read from it.
    Returns:
        ``(model, tokenizer)`` on success, ``(None, None)`` when loading or
        saving raises.
    """
    with open(config_path, 'r') as cfg_file:
        settings = json.load(cfg_file)
    model_name = settings['model_name']
    local_path = settings['model_local_path']
    target_missing = not os.path.exists(local_path)
    if target_missing:
        os.makedirs(local_path)
    print(f"모델 {model_name}을 {local_path}에 다운로드 중...")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            cache_dir=local_path,
            device_map="auto",  # automatic device placement
            torch_dtype="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_path)
        # Save the model first, then the tokenizer, into the same directory.
        for artifact in (model, tokenizer):
            artifact.save_pretrained(local_path)
        print(f"모델 다운로드 완료: {local_path}")
        return model, tokenizer
    except Exception as err:
        print(f"모델 다운로드 실패: {err}")
        return None, None
 # Allow running this module directly to download the model with the default config path.
 if __name__ == "__main__":
    download_model()
--- a/AI_Web_Scraper/requirements.txt
+++ b/AI_Web_Scraper/requirements.txt
@@ -0,0 +1,15 @@
 transformers>=4.20.0
 torch>=1.12.0
 accelerate>=0.20.0
 requests>=2.25.1
 beautifulsoup4>=4.10.0
 selenium>=4.0.0
 webdriver-manager>=4.0.0
 google-api-python-client>=2.0.0
 google-auth-oauthlib>=1.0.0
 google-auth-httplib2>=0.1.0
 langchain>=0.0.200
 langchain-community>=0.0.20
 huggingface-hub>=0.15.0
 pandas>=1.3.0
 openpyxl>=3.0.0
--- a/AI_Web_Scraper/run_guide.md
+++ b/AI_Web_Scraper/run_guide.md
@@ -0,0 +1,148 @@
 # AI 웹 정보 수집 시스템 실행 가이드
 이 가이드는 Google Colab Pro 환경에서 시스템을 실행하는 방법을 설명합니다.
 ## 1. 사전 준비
 ### 1.1 Google Colab Pro 설정
 - Google Colab Pro 계정으로 로그인
 - 런타임 유형을 "GPU"로 설정 (A100 권장)
 - Python 3.8 이상 사용
 ### 1.2 Google Drive 설정 (간단한 방법 - 권장)
 Google Colab에서는 다음 코드로 쉽게 Google Drive를 마운트할 수 있습니다:
 ```python
 from google.colab import drive
 drive.mount('/content/drive')
 ```
 이 방법은 별도의 API 설정 없이 데이터를 저장할 수 있어 간단합니다. 데이터를 `/content/drive/MyDrive/AI_Data` 폴더에 자동으로 저장합니다.
 ### 1.3 Google Drive API 설정 (고급 방법)
 더 정교한 제어가 필요한 경우 Google Drive API를 사용할 수 있습니다:
 1. Google Cloud Console (https://console.cloud.google.com/) 접속
 2. 새 프로젝트 생성 또는 기존 프로젝트 선택
 3. "Google Drive API" 활성화
 4. "OAuth 2.0 클라이언트 ID" 생성
 5. 클라이언트 ID를 다운로드하여 `credentials.json`으로 이름 변경
 6. 데이터를 저장할 Google Drive 폴더 생성 및 폴더 ID 확인
   - 폴더 URL에서 ID 추출: `https://drive.google.com/drive/folders/[FOLDER_ID]`
 ### 1.4 프로젝트 파일 업로드
 Colab에 프로젝트 파일들을 업로드하거나 GitHub에서 클론:
 ```bash
 !git clone https://github.com/your-repo/AI_Web_Scraper.git
 %cd AI_Web_Scraper
 ```
 ### 1.5 Google Drive 마운트
 시스템 실행 전에 다음 코드를 실행하여 Google Drive를 마운트하세요:
 ```python
 from google.colab import drive
 drive.mount('/content/drive')
 ```
 ## 2. 환경 설정
 ### 2.1 필요한 패키지 설치
 ```bash
 !pip install -r requirements.txt
 ```
 ### 2.2 설정 파일 수정
 `config.json` 파일을 열어서 다음 항목들을 수정:
 ```json
 {
  "google_drive_folder_id": "YOUR_ACTUAL_FOLDER_ID",
  "google_credentials_path": "./credentials.json"
 }
 ```
 ### 2.3 인증 파일 업로드
 - `credentials.json` 파일을 Colab에 업로드
 - Google Drive 인증 시 브라우저 팝업이 나타나면 허용
 ## 3. 시스템 실행
 ### 3.1 기본 실행 (AI가 스스로 주제 선정)
 ```bash
 !python main.py
 ```
 이 경우 AI가 현재 흥미로운 기술 트렌드 3개를 스스로 선정하여 조사합니다.
 ### 3.2 AI가 스스로 주제 선정하도록 명시적 실행
 ```bash
 !python main.py --auto-topics
 ```
 ### 3.3 특정 주제로 실행
 ```bash
 !python main.py --topics "인공지능" "머신러닝" "딥러닝"
 ```
 ### 3.4 설정 파일 지정
 ```bash
 !python main.py --config ./custom_config.json
 ```
 ## 4. 실행 과정 설명
 1. **모델 다운로드**: Hugging Face에서 `jxm/gpt-oss-20b-base` 모델을 다운로드
 2. **AI 에이전트 초기화**: 모델을 로드하고 도구들을 설정
 3. **정보 수집**: 각 주제에 대해 AI가 스스로 웹을 탐색하며 정보 수집
 4. **데이터 저장**: 수집된 데이터를 마운트된 Google Drive의 `/content/drive/MyDrive/AI_Data` 폴더에 자동 저장
 ## 5. 모니터링 및 디버깅
 ### 5.1 로그 확인
 실행 중 출력되는 로그를 통해 진행 상황을 확인할 수 있습니다.
 ### 5.2 Colab GPU 모니터링
 ```bash
 !nvidia-smi
 ```
 ### 5.3 메모리 사용량 확인
 ```bash
 !free -h
 ```
 ## 6. 문제 해결
 ### 6.1 모델 다운로드 실패
 - Colab의 디스크 공간 확인
 - 모델 크기가 크므로 충분한 공간 확보
 ### 6.2 Google Drive 마운트 실패
 - 브라우저 팝업에서 권한 허용을 확인
 - 마운트 코드 재실행: `drive.mount('/content/drive', force_remount=True)`
 - `/content/drive/MyDrive` 경로가 존재하는지 확인
 ### 6.3 메모리 부족 오류
 - 배치 크기 조정 또는 더 작은 모델 사용 고려
 ## 7. 확장 및 커스터마이징
 ### 7.1 새로운 도구 추가
 `ai_agent.py`의 `tools` 리스트에 새로운 도구를 추가할 수 있습니다.
 ### 7.2 모델 변경
 `config.json`에서 `model_name`을 다른 모델로 변경 가능합니다.
 ### 7.3 크롤링 전략 수정
 `web_scraper.py`에서 크롤링 로직을 커스터마이징할 수 있습니다.
 ## 8. 주의사항
 - 모델 다운로드에 시간이 오래 걸릴 수 있습니다.
 - Google Drive API 사용량 제한에 유의하세요.
 - 대량의 데이터를 수집할 경우 Colab 세션 시간 제한을 고려하세요.
 - 개인정보 보호 및 저작권을 준수하세요.
 ## 9. 지원
 문제가 발생하거나 추가 기능이 필요한 경우 다음 정보를 포함하여 문의하세요:
 - 오류 메시지
 - 실행 환경 (Colab Pro, GPU 유형)
 - 재현 단계
--- a/AI_Web_Scraper/web_scraper.py
+++ b/AI_Web_Scraper/web_scraper.py
@@ -0,0 +1,122 @@
 import requests
 from bs4 import BeautifulSoup
 import json
 import time
 import os
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from webdriver_manager.chrome import ChromeDriverManager
 from selenium.webdriver.chrome.service import Service
 class WebScraper:
    """
    Page scraper that drives a headless Chrome through Selenium and parses
    the rendered HTML with BeautifulSoup. Limits and the user agent come
    from the ``web_scraping`` section of the JSON configuration file.
    """
    def __init__(self, config_path='./config.json'):
        """
        Read the configuration and start the browser.
        Args:
            config_path: Path to the JSON configuration file.
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = json.load(f)
        self.max_pages = self.config['web_scraping']['max_pages']
        self.delay = self.config['web_scraping']['delay_between_requests']
        self.user_agent = self.config['web_scraping']['user_agent']
        # Selenium setup: headless Chrome with the configured user agent.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument(f"user-agent={self.user_agent}")
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )
    def scrape_website(self, url, keywords=None):
        """
        Load ``url`` and extract its title, meta description and text.
        Args:
            url: Address of the page to load.
            keywords: Accepted for interface compatibility; currently unused.
        Returns:
            A dict with ``url``, ``title``, ``description``, ``content``
            (first 5000 characters of the page text) and ``timestamp``,
            or None when loading or parsing raises.
        """
        try:
            self.driver.get(url)
            # Wait the configured delay between requests before reading the page.
            time.sleep(self.delay)
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            text_content = soup.get_text(separator=' ', strip=True)
            # The <title> tag may be absent, or present without a single string.
            title = (soup.title.string if soup.title else None) or "No Title"
            meta_description = soup.find('meta', attrs={'name': 'description'})
            # A description tag without a content attribute must not abort the scrape.
            description = (meta_description.get('content') if meta_description else None) or "No Description"
            return {
                'url': url,
                'title': title,
                'description': description,
                'content': text_content[:5000],  # cap the stored text
                'timestamp': time.time()
            }
        except Exception as e:
            print(f"스크래핑 실패: {url} - {e}")
            return None
    def _collect_links(self, limit=10):
        """
        Return the http(s) hrefs among the first ``limit`` anchors of the page
        currently loaded in the driver, or an empty list when lookup fails.
        """
        try:
            anchors = self.driver.find_elements(By.TAG_NAME, "a")
            # Read every href BEFORE navigating anywhere: the element handles
            # belong to the current page and cannot be used after a page change.
            hrefs = [anchor.get_attribute("href") for anchor in anchors[:limit]]
        except Exception as e:
            print(f"링크 추출 실패: {e}")
            return []
        return [href for href in hrefs if href and href.startswith("http")]
    def crawl_multiple_pages(self, start_urls, keywords=None):
        """
        Scrape each start URL and up to ten links found on it.
        Args:
            start_urls: Iterable of URLs to start from.
            keywords: Passed through to ``scrape_website``.
        Returns:
            A list of page dicts, at most ``max_pages`` long. Every URL is
            attempted at most once, whether or not it succeeds.
        """
        collected_data = []
        visited_urls = set()
        for url in start_urls:
            if len(collected_data) >= self.max_pages:
                break
            if url in visited_urls:
                continue
            # Mark before scraping so a failing URL is not retried later.
            visited_urls.add(url)
            data = self.scrape_website(url, keywords)
            if not data:
                # Without a loaded start page there are no links to follow.
                continue
            collected_data.append(data)
            if len(collected_data) >= self.max_pages:
                break
            for href in self._collect_links():
                if len(collected_data) >= self.max_pages:
                    break
                if href in visited_urls:
                    continue
                visited_urls.add(href)
                data = self.scrape_website(href, keywords)
                if data:
                    collected_data.append(data)
        return collected_data
    def save_data(self, data, filename='scraped_data.json'):
        """
        Write ``data`` as indented JSON into the configured local storage
        directory, creating the directory when needed.
        """
        storage_path = self.config['data_storage']['local_storage_path']
        os.makedirs(storage_path, exist_ok=True)
        filepath = os.path.join(storage_path, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"데이터 저장 완료: {filepath}")
    def close(self):
        """Quit the browser driver."""
        self.driver.quit()
 if __name__ == "__main__":
    # Manual smoke test: crawl one start page and save the result locally.
    scraper = WebScraper()
    try:
        data = scraper.crawl_multiple_pages(["https://www.google.com"])
        scraper.save_data(data)
    finally:
        # Quit the browser even when crawling or saving raises.
        scraper.close()