feat: AI 웹 스크래퍼 프로젝트의 초기 구조와 핵심 기능 구현

2025-08-28 10:23:24 +09:00
commit fdf330143f
10 changed files with 797 additions and 0 deletions
--- a/.windsurf/rules/mcp.md
+++ b/.windsurf/rules/mcp.md
@@ -0,0 +1,4 @@
+---
+trigger: manual
+---
+
--- a/AI_Web_Scraper/README.md
+++ b/AI_Web_Scraper/README.md
@@ -0,0 +1,52 @@
+# AI 웹 정보 수집 시스템
+
+이 프로젝트는 AI 모델을 사용하여 인터넷에서 정보를 자동으로 수집하고 Google Drive에 저장하는 시스템입니다.
+
+## 주요 기능
+
+- Hugging Face 모델 (jxm/gpt-oss-20b-base)을 사용하여 AI 에이전트 실행
+- 웹 크롤링을 통한 정보 수집
+- 수집된 데이터의 Google Drive 저장
+- Colab Pro 환경에서 A100 GPU 활용
+
+## 요구사항
+
+- Python 3.8 이상
+- Google Drive API 인증 파일 (credentials.json)
+- Colab Pro 계정 (A100 GPU 지원)
+
+## 설치 방법
+
+```bash
+pip install -r requirements.txt
+```
+
+## 설정
+
+1. `config.json` 파일에서 다음 항목들을 설정하세요:
+   - `google_drive_folder_id`: 데이터를 저장할 Google Drive 폴더 ID
+   - `google_credentials_path`: Google API 인증 파일 경로
+
+2. Google Drive API 설정:
+   - Google Cloud Console에서 Drive API 활성화
+   - OAuth 2.0 클라이언트 ID 생성
+   - credentials.json 파일 다운로드
+
+## 실행 방법
+
+자세한 실행 방법은 `run_guide.md` 파일을 참고하세요.
+
+## 파일 구조
+
+```
+AI_Web_Scraper/
+├── main.py                 # 메인 실행 파일
+├── model_downloader.py     # 모델 다운로드
+├── web_scraper.py          # 웹 크롤링 도구
+├── google_drive_uploader.py # Google Drive 업로드
+├── ai_agent.py            # AI 에이전트
+├── config.json            # 설정 파일
+├── requirements.txt       # 의존성 파일
+├── README.md              # 프로젝트 설명
+└── run_guide.md           # 실행 가이드
+```
--- a/AI_Web_Scraper/ai_agent.py
+++ b/AI_Web_Scraper/ai_agent.py
@@ -0,0 +1,192 @@
+import json
+import os
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from langchain.llms import HuggingFacePipeline
+from langchain.agents import initialize_agent, AgentType
+from langchain.tools import Tool
+from langchain.memory import ConversationBufferMemory
+from web_scraper import WebScraper
+from google_drive_uploader import GoogleDriveUploader, SimpleDriveSaver
+
+class AIAgent:
+    """LangChain conversational ReAct agent backed by a local Hugging Face model.
+
+    The agent is given three tools: a web scraper, a Google Drive API
+    uploader, and a saver that writes into a mounted Drive folder.
+    """
+
+    # Separator between the JSON payload and the file name in tool input.
+    PAYLOAD_SEPARATOR = '|'
+
+    def __init__(self, config_path='./config.json'):
+        """Load the config and the model, then wire up tools, memory and agent.
+
+        Args:
+            config_path: Path to the JSON configuration file.
+
+        Raises:
+            Exception: Whatever ``load_model`` re-raises when loading fails.
+        """
+        # JSON is UTF-8 by specification; do not depend on the locale default.
+        with open(config_path, 'r', encoding='utf-8') as f:
+            self.config = json.load(f)
+
+        self.model_path = self.config['model_local_path']
+        self.max_tokens = self.config['max_tokens']
+        self.temperature = self.config['temperature']
+
+        # Populated by load_model().
+        self.model = None
+        self.tokenizer = None
+        self.llm = None
+        self.load_model()
+
+        # Tool backends. The API uploader is optional (see helper below).
+        self.web_scraper = WebScraper(config_path)
+        self.drive_uploader = self._create_drive_uploader(config_path)
+        self.simple_saver = SimpleDriveSaver(self.config['data_storage']['drive_mount_path'])
+
+        # LangChain tool definitions (descriptions are read by the model).
+        self.tools = [
+            Tool(
+                name="WebScraper",
+                func=self.scrape_web,
+                description="웹사이트에서 정보를 수집합니다. URL을 입력하세요."
+            ),
+            Tool(
+                name="GoogleDriveUploader",
+                func=self.upload_to_drive_api,
+                description="Google Drive API를 사용하여 데이터를 업로드합니다. 데이터와 파일명을 입력하세요."
+            ),
+            Tool(
+                name="SimpleDriveSaver",
+                func=self.save_to_drive_simple,
+                description="마운트된 Google Drive에 데이터를 저장합니다. 데이터와 파일명을 입력하세요."
+            )
+        ]
+
+        # Conversation memory shared across agent runs.
+        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+
+        self.agent = initialize_agent(
+            tools=self.tools,
+            llm=self.llm,
+            agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
+            memory=self.memory,
+            verbose=True
+        )
+
+    @staticmethod
+    def _create_drive_uploader(config_path):
+        """Return a GoogleDriveUploader, or None when it cannot be created.
+
+        The uploader authenticates in its constructor, so a missing
+        credentials file would otherwise abort the whole agent even when
+        only the mounted-Drive saver is going to be used.
+        """
+        try:
+            return GoogleDriveUploader(config_path)
+        except Exception as e:
+            print(f"Google Drive API 초기화 실패 (마운트 저장만 사용): {e}")
+            return None
+
+    def load_model(self):
+        """Load tokenizer and model from ``self.model_path`` and build ``self.llm``.
+
+        Raises:
+            Exception: Re-raised after logging when any loading step fails.
+        """
+        try:
+            print(f"모델 로드 중: {self.model_path}")
+
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_path,
+                device_map="auto",
+                torch_dtype="auto"
+            )
+
+            # Text-generation pipeline wrapped for LangChain.
+            pipe = pipeline(
+                "text-generation",
+                model=self.model,
+                tokenizer=self.tokenizer,
+                max_new_tokens=self.max_tokens,
+                temperature=self.temperature,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+
+            self.llm = HuggingFacePipeline(pipeline=pipe)
+            print("모델 로드 완료")
+
+        except Exception as e:
+            print(f"모델 로드 실패: {e}")
+            raise
+
+    def scrape_web(self, url):
+        """Tool function: scrape ``url`` and return a short status string."""
+        data = self.web_scraper.scrape_website(url)
+        if not data:
+            return "수집 실패"
+        # Guard against a missing/None description so slicing cannot raise.
+        description = data.get('description') or ''
+        return f"수집 완료: {data['title']} - {description[:200]}..."
+
+    @classmethod
+    def _parse_payload(cls, data_and_filename):
+        """Split ``"<json>|<filename>"`` tool input into ``(data, filename)``.
+
+        The split happens on the LAST separator so JSON text that itself
+        contains ``|`` stays intact. Input without any separator yields the
+        error payload. Invalid JSON raises ``json.JSONDecodeError``, which
+        the calling tool functions turn into a failure message.
+        """
+        payload, sep, filename = data_and_filename.rpartition(cls.PAYLOAD_SEPARATOR)
+        if not sep:
+            return {"error": "잘못된 형식"}, "error.json"
+        # Model-written input often carries stray whitespace around the name.
+        return json.loads(payload), filename.strip()
+
+    def upload_to_drive_api(self, data_and_filename):
+        """Tool function: upload ``"<json>|<filename>"`` through the Drive API.
+
+        Returns:
+            A status string; never raises.
+        """
+        if self.drive_uploader is None:
+            return "업로드 실패: Google Drive API 인증이 설정되지 않았습니다."
+        try:
+            data, filename = self._parse_payload(data_and_filename)
+            file_id = self.drive_uploader.upload_data_as_json(data, filename)
+        except Exception as e:
+            return f"업로드 실패: {e}"
+        # The uploader reports a failed upload by returning None.
+        if file_id is None:
+            return "업로드 실패"
+        return f"업로드 완료: {file_id}"
+
+    def save_to_drive_simple(self, data_and_filename):
+        """Tool function: save ``"<json>|<filename>"`` into the mounted Drive.
+
+        Returns:
+            A status string; never raises.
+        """
+        try:
+            data, filename = self._parse_payload(data_and_filename)
+            filepath = self.simple_saver.save_data_as_json(data, filename)
+        except Exception as e:
+            return f"저장 실패: {e}"
+        # The saver reports a failed write by returning None.
+        if filepath is None:
+            return "저장 실패"
+        return f"저장 완료: {filepath}"
+
+    def run_agent(self, task_description):
+        """Run the agent on one task.
+
+        Returns:
+            The agent's response, or None when the run raised.
+        """
+        try:
+            response = self.agent.run(task_description)
+            return response
+        except Exception as e:
+            print(f"에이전트 실행 실패: {e}")
+            return None
+
+    def collect_information(self, topics):
+        """Run the agent once per topic and gather the responses.
+
+        Args:
+            topics: Iterable of topic strings.
+
+        Returns:
+            A list of ``{'topic': ..., 'response': ...}`` dicts in input
+            order; ``response`` is None for topics where the agent failed.
+        """
+        results = []
+        for topic in topics:
+            task = f"'{topic}'에 대한 최신 정보를 웹에서 수집하고, 핵심 내용을 요약해서 저장하세요."
+            results.append({'topic': topic, 'response': self.run_agent(task)})
+        return results
+
+    @staticmethod
+    def _parse_topics(response, num_topics):
+        """Extract up to ``num_topics`` topics, one per non-empty line.
+
+        Leading list markers ("1.", "2)", "-", "*", "•") are removed rather
+        than causing the line to be discarded, because models commonly
+        answer with a numbered or bulleted list.
+        """
+        import re
+
+        topics = []
+        for line in response.splitlines():
+            # (?!\d) keeps decimals such as "3.5" from being read as a marker.
+            topic = re.sub(r'^\s*(?:\d+[.)](?!\d)|[-*•])\s*', '', line).strip()
+            if topic:
+                topics.append(topic)
+        return topics[:num_topics]
+
+    def generate_topics(self, num_topics=3):
+        """Ask the model for ``num_topics`` research topics.
+
+        Returns:
+            A list of topic strings. When generation fails or yields no
+            usable line, a fixed list of three default topics is returned.
+        """
+        prompt = f"""
+        당신은 AI 연구원입니다. 현재 세계에서 가장 흥미롭고 조사할 가치가 있는 기술 및 과학 분야의 주제 {num_topics}개를 선정해주세요.
+
+        다음 기준을 고려하세요:
+        1. 최근 트렌드나 미래 지향적인 주제
+        2. 사회적 영향이 큰 주제
+        3. 기술 발전이 빠른 분야
+        4. AI와 관련된 주제 우선
+
+        각 주제는 구체적이고 조사하기 쉬운 형태로 제시해주세요.
+        예시: "양자 컴퓨팅의 최근 발전", "생성형 AI의 윤리적 문제"
+
+        주제 목록만 출력하고, 다른 설명은 하지 마세요.
+        형식: 각 줄에 하나의 주제
+        """
+        default_topics = ["AI 기술 동향", "머신러닝 응용", "딥러닝 최신 연구"]
+
+        try:
+            response = self.llm(prompt)
+            topics = self._parse_topics(response, num_topics)
+        except Exception as e:
+            print(f"주제 생성 실패: {e}")
+            return default_topics
+        # An empty answer would leave the caller with nothing to research.
+        return topics or default_topics
+
+    def close(self):
+        """Release the resources held by the web scraper."""
+        self.web_scraper.close()
+
+if __name__ == "__main__":
+    # Manual smoke test: run the agent on two fixed topics.
+    agent = AIAgent()
+    try:
+        topics = ["인공지능 최신 트렌드", "머신러닝 기초"]
+        # run_agent() returns None for a topic on which the agent failed.
+        results = [{'topic': topic, 'response': agent.run_agent(topic)} for topic in topics]
+        print("수집 결과:", results)
+    finally:
+        # Always release the scraper's resources, even after an error.
+        agent.close()
--- a/AI_Web_Scraper/config.json
+++ b/AI_Web_Scraper/config.json
@@ -0,0 +1,18 @@
+{
+  "model_name": "jxm/gpt-oss-20b-base",
+  "model_local_path": "./models/gpt-oss-20b-base",
+  "google_drive_folder_id": "YOUR_GOOGLE_DRIVE_FOLDER_ID",
+  "google_credentials_path": "./credentials.json",
+  "max_tokens": 2048,
+  "temperature": 0.7,
+  "web_scraping": {
+    "max_pages": 100,
+    "delay_between_requests": 2,
+    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+  },
+  "data_storage": {
+    "local_storage_path": "./collected_data",
+    "file_format": "json",
+    "drive_mount_path": "/content/drive/MyDrive/AI_Data"
+  }
+}
--- a/AI_Web_Scraper/google_drive_uploader.py
+++ b/AI_Web_Scraper/google_drive_uploader.py
@@ -0,0 +1,149 @@
+import os
+import json
+from googleapiclient.discovery import build
+from googleapiclient.http import MediaFileUpload
+from google.oauth2.credentials import Credentials
+from google_auth_oauthlib.flow import InstalledAppFlow
+from google.auth.transport.requests import Request
+
+class GoogleDriveUploader:
+    """Uploads files into one Google Drive folder through the Drive v3 API."""
+
+    # Cached OAuth token; resolved against the current working directory.
+    TOKEN_PATH = 'token.json'
+
+    def __init__(self, config_path='./config.json'):
+        """Read folder ID and credentials path from the config, then authenticate.
+
+        Args:
+            config_path: Path to the JSON configuration file.
+        """
+        # JSON is UTF-8 by specification; do not depend on the locale default.
+        with open(config_path, 'r', encoding='utf-8') as f:
+            self.config = json.load(f)
+
+        self.folder_id = self.config['google_drive_folder_id']
+        self.creds_path = self.config['google_credentials_path']
+        self.scopes = ['https://www.googleapis.com/auth/drive.file']
+
+        self.service = None
+        self.authenticate()
+
+    def authenticate(self):
+        """Obtain credentials and build ``self.service``.
+
+        Reuses the cached token when it is valid, refreshes it when it is
+        expired and refreshable, and otherwise runs the installed-app OAuth
+        flow. New or refreshed credentials are written back to the token file.
+        """
+        creds = None
+
+        if os.path.exists(self.TOKEN_PATH):
+            creds = Credentials.from_authorized_user_file(self.TOKEN_PATH, self.scopes)
+
+        if not creds or not creds.valid:
+            if creds and creds.expired and creds.refresh_token:
+                creds.refresh(Request())
+            else:
+                flow = InstalledAppFlow.from_client_secrets_file(
+                    self.creds_path, self.scopes)
+                # NOTE(review): run_local_server presumably needs a browser on
+                # the same machine; confirm it works in the target notebook.
+                creds = flow.run_local_server(port=0)
+
+            with open(self.TOKEN_PATH, 'w') as token:
+                token.write(creds.to_json())
+
+        self.service = build('drive', 'v3', credentials=creds)
+
+    def upload_file(self, file_path, file_name=None):
+        """Upload a local file into the configured folder.
+
+        Args:
+            file_path: Path of the local file.
+            file_name: Name to use in Drive; defaults to the local base name.
+
+        Returns:
+            The created file's ID, or None when the upload raised.
+        """
+        if file_name is None:
+            file_name = os.path.basename(file_path)
+
+        file_metadata = {
+            'name': file_name,
+            'parents': [self.folder_id] if self.folder_id else []
+        }
+
+        media = MediaFileUpload(file_path, resumable=True)
+
+        try:
+            file = self.service.files().create(
+                body=file_metadata,
+                media_body=media,
+                fields='id'
+            ).execute()
+
+            print(f'파일 업로드 완료: {file_name} (ID: {file.get("id")})')
+            return file.get('id')
+        except Exception as e:
+            print(f'업로드 실패: {e}')
+            return None
+
+    def upload_data_as_json(self, data, filename='collected_data.json'):
+        """Serialize ``data`` to a temporary JSON file and upload it.
+
+        Returns:
+            Whatever ``upload_file`` returns. The temporary file is removed
+            afterwards in every case.
+        """
+        import tempfile
+
+        # ensure_ascii=False emits raw non-ASCII text, so the file must be
+        # opened as UTF-8 explicitly instead of with the locale default.
+        with tempfile.NamedTemporaryFile(
+            mode='w', suffix='.json', delete=False, encoding='utf-8'
+        ) as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+            temp_path = f.name
+
+        try:
+            return self.upload_file(temp_path, filename)
+        finally:
+            os.unlink(temp_path)
+
+    def list_files(self):
+        """List the files directly inside the configured folder.
+
+        Returns:
+            A list of ``{'id': ..., 'name': ...}`` dicts gathered across all
+            result pages, or an empty list when no folder is configured or
+            the request fails.
+        """
+        # Without a folder ID the parents query below would be invalid.
+        if not self.folder_id:
+            return []
+
+        items = []
+        page_token = None
+        try:
+            while True:
+                results = self.service.files().list(
+                    q=f"'{self.folder_id}' in parents",
+                    pageSize=100,
+                    pageToken=page_token,
+                    fields="nextPageToken, files(id, name)"
+                ).execute()
+
+                items.extend(results.get('files', []))
+                # Follow nextPageToken so large folders are not truncated.
+                page_token = results.get('nextPageToken')
+                if not page_token:
+                    return items
+        except Exception as e:
+            print(f'파일 목록 조회 실패: {e}')
+            return []
+
+class SimpleDriveSaver:
+    """Writes collected data into a directory such as a mounted Drive folder."""
+
+    def __init__(self, mount_path='/content/drive/MyDrive/AI_Data'):
+        """Remember the target directory and create it when it is missing."""
+        self.mount_path = mount_path
+        already_there = os.path.exists(self.mount_path)
+        if not already_there:
+            os.makedirs(self.mount_path, exist_ok=True)
+
+    def _write(self, filename, emit, success_label):
+        """Open ``filename`` under the target directory and let ``emit`` fill it.
+
+        Returns the written path, or None after logging when anything raised.
+        """
+        destination = os.path.join(self.mount_path, filename)
+        try:
+            with open(destination, 'w', encoding='utf-8') as handle:
+                emit(handle)
+            print(f'{success_label}: {destination}')
+            return destination
+        except Exception as problem:
+            print(f'저장 실패: {problem}')
+            return None
+
+    def save_data_as_json(self, data, filename='collected_data.json'):
+        """Store ``data`` as indented, non-ASCII-preserving JSON.
+
+        Returns the written path, or None on failure.
+        """
+        def emit(handle):
+            json.dump(data, handle, ensure_ascii=False, indent=2)
+
+        return self._write(filename, emit, '데이터 저장 완료')
+
+    def save_text_data(self, data, filename='collected_data.txt'):
+        """Store ``data`` as text: one JSON document per line for a list,
+        otherwise its ``str()`` form.
+
+        Returns the written path, or None on failure.
+        """
+        def emit(handle):
+            if not isinstance(data, list):
+                handle.write(str(data))
+                return
+            for entry in data:
+                handle.write(json.dumps(entry, ensure_ascii=False) + '\n')
+
+        return self._write(filename, emit, '텍스트 데이터 저장 완료')
+
+def save_to_drive_simple(data, filename='collected_data.json', mount_path='/content/drive/MyDrive/AI_Data'):
+    """Write ``data`` as JSON into ``mount_path`` using a one-off SimpleDriveSaver.
+
+    Returns whatever ``SimpleDriveSaver.save_data_as_json`` returns.
+    """
+    return SimpleDriveSaver(mount_path).save_data_as_json(data, filename)
--- a/AI_Web_Scraper/main.py
+++ b/AI_Web_Scraper/main.py
@@ -0,0 +1,53 @@
+import sys
+import json
+from model_downloader import download_model
+from ai_agent import AIAgent
+import argparse
+
+def main():
+    """Command-line entry point: prepare the model, pick topics, run the agent.
+
+    Exits with status 1 when the model cannot be prepared.
+    """
+    import gc
+
+    parser = argparse.ArgumentParser(description='AI 웹 정보 수집 시스템')
+    parser.add_argument('--topics', nargs='+', help='수집할 주제 목록', default=None)
+    parser.add_argument('--config', default='./config.json', help='설정 파일 경로')
+    parser.add_argument('--auto-topics', action='store_true', help='AI가 스스로 주제를 선정하여 조사')
+
+    args = parser.parse_args()
+
+    print("AI 웹 정보 수집 시스템 시작")
+
+    # 1. Make sure the model is available locally.
+    print("모델 확인 중...")
+    model, tokenizer = download_model(args.config)
+
+    if model is None:
+        print("모델 다운로드 실패. 프로그램을 종료합니다.")
+        sys.exit(1)
+
+    # AIAgent loads its own copy of the model from disk. Drop this one first;
+    # otherwise two copies of the weights are alive at the same time.
+    del model, tokenizer
+    gc.collect()
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except ImportError:
+        # Releasing the CUDA cache is best-effort only.
+        pass
+
+    # 2. Initialise the agent.
+    print("AI 에이전트 초기화 중...")
+    agent = AIAgent(args.config)
+
+    try:
+        # 3. Decide on the topics.
+        if args.auto_topics or args.topics is None:
+            print("AI가 스스로 주제를 선정합니다...")
+            topics = agent.generate_topics(num_topics=3)
+            print(f"선정된 주제: {topics}")
+        else:
+            topics = args.topics
+
+        # 4. Run the agent once per topic; run_agent() returns None on failure.
+        print(f"다음 주제들에 대해 정보를 수집합니다: {topics}")
+        results = []
+        for topic in topics:
+            task = f"'{topic}'에 대한 최신 정보를 웹에서 수집하고, 핵심 내용을 요약해서 저장하세요."
+            results.append({'topic': topic, 'response': agent.run_agent(task)})
+
+        # 5. Print the results.
+        print("\n=== 수집 결과 ===")
+        for result in results:
+            print(f"주제: {result['topic']}")
+            print(f"응답: {result['response']}")
+            print("-" * 50)
+    finally:
+        # 6. Always release the agent's resources, even after an error.
+        agent.close()
+
+    print("프로그램 완료")
+
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/AI_Web_Scraper/model_downloader.py
+++ b/AI_Web_Scraper/model_downloader.py
@@ -0,0 +1,44 @@
+import os
+import json
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import HfApi
+
+def _load_model_and_tokenizer(source, **hub_kwargs):
+    """Load the causal LM and its tokenizer from ``source`` (hub ID or directory)."""
+    model = AutoModelForCausalLM.from_pretrained(
+        source,
+        device_map="auto",  # let accelerate place the weights on available devices
+        torch_dtype="auto",
+        **hub_kwargs
+    )
+    tokenizer = AutoTokenizer.from_pretrained(source, **hub_kwargs)
+    return model, tokenizer
+
+
+def download_model(config_path='./config.json'):
+    """Make the configured model available under ``model_local_path``.
+
+    A copy already saved in the local directory is loaded as-is. Otherwise
+    the model is fetched from the Hugging Face Hub and saved there.
+
+    Args:
+        config_path: Path to the JSON configuration file providing
+            ``model_name`` and ``model_local_path``.
+
+    Returns:
+        ``(model, tokenizer)`` on success, ``(None, None)`` on failure.
+    """
+    with open(config_path, 'r', encoding='utf-8') as f:
+        config = json.load(f)
+
+    model_name = config['model_name']
+    local_path = config['model_local_path']
+
+    # exist_ok avoids the check-then-create race of a separate exists() test.
+    os.makedirs(local_path, exist_ok=True)
+
+    # A config.json in the directory indicates an earlier save_pretrained().
+    if os.path.isfile(os.path.join(local_path, 'config.json')):
+        try:
+            print(f"로컬 모델 사용: {local_path}")
+            return _load_model_and_tokenizer(local_path)
+        except Exception as e:
+            # Possibly an interrupted earlier save; fall through and re-download.
+            print(f"로컬 모델 로드 실패, 다시 다운로드합니다: {e}")
+
+    print(f"모델 {model_name}을 {local_path}에 다운로드 중...")
+
+    try:
+        # NOTE(review): cache_dir keeps the hub cache inside local_path next
+        # to the saved copy, so the weights end up on disk twice — confirm
+        # whether that is intended.
+        model, tokenizer = _load_model_and_tokenizer(model_name, cache_dir=local_path)
+
+        # Save a plain copy so later runs can load directly from local_path.
+        model.save_pretrained(local_path)
+        tokenizer.save_pretrained(local_path)
+
+        print(f"모델 다운로드 완료: {local_path}")
+        return model, tokenizer
+    except Exception as e:
+        print(f"모델 다운로드 실패: {e}")
+        return None, None
+
+# Allow running this module directly to fetch the model with the default config.
+if __name__ == "__main__":
+    download_model()
--- a/AI_Web_Scraper/requirements.txt
+++ b/AI_Web_Scraper/requirements.txt
@@ -0,0 +1,15 @@
+transformers>=4.20.0
+torch>=1.12.0
+accelerate>=0.20.0
+requests>=2.25.1
+beautifulsoup4>=4.10.0
+selenium>=4.0.0
+webdriver-manager>=4.0.0
+google-api-python-client>=2.0.0
+google-auth-oauthlib>=1.0.0
+google-auth-httplib2>=0.1.0
+langchain>=0.0.200
+langchain-community>=0.0.20
+huggingface-hub>=0.15.0
+pandas>=1.3.0
+openpyxl>=3.0.0
--- a/AI_Web_Scraper/run_guide.md
+++ b/AI_Web_Scraper/run_guide.md
@@ -0,0 +1,148 @@
+# AI 웹 정보 수집 시스템 실행 가이드
+
+이 가이드는 Google Colab Pro 환경에서 시스템을 실행하는 방법을 설명합니다.
+
+## 1. 사전 준비
+
+### 1.1 Google Colab Pro 설정
+- Google Colab Pro 계정으로 로그인
+- 런타임 유형을 "GPU"로 설정 (A100 권장)
+- Python 3.8 이상 사용
+
+### 1.2 Google Drive 설정 (간단한 방법 - 권장)
+Google Colab에서는 다음 코드로 쉽게 Google Drive를 마운트할 수 있습니다:
+
+```python
+from google.colab import drive
+drive.mount('/content/drive')
+```
+
+이 방법은 별도의 API 설정 없이 데이터를 저장할 수 있어 간단합니다. 데이터를 `/content/drive/MyDrive/AI_Data` 폴더에 자동으로 저장합니다.
+
+### 1.3 Google Drive API 설정 (고급 방법)
+더 정교한 제어가 필요한 경우 Google Drive API를 사용할 수 있습니다:
+1. Google Cloud Console (https://console.cloud.google.com/) 접속
+2. 새 프로젝트 생성 또는 기존 프로젝트 선택
+3. "Google Drive API" 활성화
+4. "OAuth 2.0 클라이언트 ID" 생성
+5. 클라이언트 ID를 다운로드하여 `credentials.json`으로 이름 변경
+6. 데이터를 저장할 Google Drive 폴더 생성 및 폴더 ID 확인
+   - 폴더 URL에서 ID 추출: `https://drive.google.com/drive/folders/[FOLDER_ID]`
+
+### 1.4 프로젝트 파일 업로드
+Colab에 프로젝트 파일들을 업로드하거나 GitHub에서 클론:
+```bash
+!git clone https://github.com/your-repo/AI_Web_Scraper.git
+%cd AI_Web_Scraper
+```
+
+### 1.5 Google Drive 마운트
+시스템 실행 전에 다음 코드를 실행하여 Google Drive를 마운트하세요 (1.2에서 이미 마운트했다면 다시 실행할 필요가 없습니다):
+```python
+from google.colab import drive
+drive.mount('/content/drive')
+```
+
+## 2. 환경 설정
+
+### 2.1 필요한 패키지 설치
+```bash
+!pip install -r requirements.txt
+```
+
+### 2.2 설정 파일 수정
+`config.json` 파일을 열어서 다음 항목들을 수정:
+```json
+{
+  "google_drive_folder_id": "YOUR_ACTUAL_FOLDER_ID",
+  "google_credentials_path": "./credentials.json"
+}
+```
+
+### 2.3 인증 파일 업로드
+- `credentials.json` 파일을 Colab에 업로드
+- Google Drive 인증 시 브라우저 팝업이 나타나면 허용
+
+## 3. 시스템 실행
+
+### 3.1 기본 실행 (AI가 스스로 주제 선정)
+```bash
+!python main.py
+```
+이 경우 AI가 현재 흥미로운 기술 트렌드 3개를 스스로 선정하여 조사합니다.
+
+### 3.2 AI가 스스로 주제 선정하도록 명시적 실행
+```bash
+!python main.py --auto-topics
+```
+
+### 3.3 특정 주제로 실행
+```bash
+!python main.py --topics "인공지능" "머신러닝" "딥러닝"
+```
+
+### 3.4 설정 파일 지정
+```bash
+!python main.py --config ./custom_config.json
+```
+
+## 4. 실행 과정 설명
+
+1. **모델 다운로드**: Hugging Face에서 `jxm/gpt-oss-20b-base` 모델을 다운로드
+2. **AI 에이전트 초기화**: 모델을 로드하고 도구들을 설정
+3. **정보 수집**: 각 주제에 대해 AI가 스스로 웹을 탐색하며 정보 수집
+4. **데이터 저장**: 수집된 데이터를 마운트된 Google Drive의 `/content/drive/MyDrive/AI_Data` 폴더에 자동 저장
+
+## 5. 모니터링 및 디버깅
+
+### 5.1 로그 확인
+실행 중 출력되는 로그를 통해 진행 상황을 확인할 수 있습니다.
+
+### 5.2 Colab GPU 모니터링
+```bash
+!nvidia-smi
+```
+
+### 5.3 메모리 사용량 확인
+```bash
+!free -h
+```
+
+## 6. 문제 해결
+
+### 6.1 모델 다운로드 실패
+- Colab의 디스크 공간 확인
+- 모델 크기가 크므로 충분한 공간 확보
+
+### 6.2 Google Drive 마운트 실패
+- 브라우저 팝업에서 권한 허용을 확인
+- 마운트 코드 재실행: `drive.mount('/content/drive', force_remount=True)`
+- `/content/drive/MyDrive` 경로가 존재하는지 확인
+
+### 6.3 메모리 부족 오류
+- 배치 크기 조정 또는 더 작은 모델 사용 고려
+
+## 7. 확장 및 커스터마이징
+
+### 7.1 새로운 도구 추가
+`ai_agent.py`의 `tools` 리스트에 새로운 도구를 추가할 수 있습니다.
+
+### 7.2 모델 변경
+`config.json`에서 `model_name`을 다른 모델로 변경 가능합니다.
+
+### 7.3 크롤링 전략 수정
+`web_scraper.py`에서 크롤링 로직을 커스터마이징할 수 있습니다.
+
+## 8. 주의사항
+
+- 모델 다운로드에 시간이 오래 걸릴 수 있습니다.
+- Google Drive API 사용량 제한에 유의하세요.
+- 대량의 데이터를 수집할 경우 Colab 세션 시간 제한을 고려하세요.
+- 개인정보 보호 및 저작권을 준수하세요.
+
+## 9. 지원
+
+문제가 발생하거나 추가 기능이 필요한 경우 다음 정보를 포함하여 문의하세요:
+- 오류 메시지
+- 실행 환경 (Colab Pro, GPU 유형)
+- 재현 단계
--- a/AI_Web_Scraper/web_scraper.py
+++ b/AI_Web_Scraper/web_scraper.py
@@ -0,0 +1,122 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+import time
+import os
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.service import Service
+
+class WebScraper:
+    """Headless-Chrome scraper that extracts title, description and text."""
+
+    # Only the first N anchors of a page are considered for follow-up scraping.
+    MAX_LINKS_PER_PAGE = 10
+
+    def __init__(self, config_path='./config.json'):
+        """Read the scraping settings and start a headless Chrome driver.
+
+        Args:
+            config_path: Path to the JSON configuration file.
+        """
+        # JSON is UTF-8 by specification; do not depend on the locale default.
+        with open(config_path, 'r', encoding='utf-8') as f:
+            self.config = json.load(f)
+
+        self.max_pages = self.config['web_scraping']['max_pages']
+        self.delay = self.config['web_scraping']['delay_between_requests']
+        self.user_agent = self.config['web_scraping']['user_agent']
+
+        # Selenium options.
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument(f"user-agent={self.user_agent}")
+
+        self.driver = webdriver.Chrome(
+            service=Service(ChromeDriverManager().install()),
+            options=chrome_options
+        )
+
+    def scrape_website(self, url, keywords=None):
+        """Load ``url`` and extract its metadata and text.
+
+        Args:
+            url: Page to load.
+            keywords: Accepted for interface compatibility; not used here.
+
+        Returns:
+            A dict with ``url``, ``title``, ``description``, ``content``
+            (first 5000 characters of the page text) and ``timestamp``,
+            or None when anything raised.
+        """
+        try:
+            self.driver.get(url)
+            # Fixed pause after each page load, as configured.
+            time.sleep(self.delay)
+
+            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
+
+            text_content = soup.get_text(separator=' ', strip=True)
+
+            # Tag.string can be None even when <title> exists, so fall back
+            # on the placeholder in that case as well.
+            title_tag = soup.title
+            title = (title_tag.string if title_tag else None) or "No Title"
+
+            # A description meta tag without a content attribute must not
+            # discard the whole page, hence .get() instead of indexing.
+            meta_description = soup.find('meta', attrs={'name': 'description'})
+            description = (
+                meta_description.get('content') if meta_description else None
+            ) or "No Description"
+
+            return {
+                'url': url,
+                'title': title,
+                'description': description,
+                'content': text_content[:5000],  # cap the stored text
+                'timestamp': time.time()
+            }
+
+        except Exception as e:
+            print(f"스크래핑 실패: {url} - {e}")
+            return None
+
+    def _collect_links(self):
+        """Return the http(s) hrefs among the current page's first anchors.
+
+        The hrefs are read into plain strings BEFORE any navigation happens:
+        anchor elements become stale once the driver loads another page.
+        """
+        try:
+            anchors = self.driver.find_elements(By.TAG_NAME, "a")[:self.MAX_LINKS_PER_PAGE]
+            hrefs = [anchor.get_attribute("href") for anchor in anchors]
+        except Exception as e:
+            print(f"링크 수집 실패: {e}")
+            return []
+        return [href for href in hrefs if href and href.startswith("http")]
+
+    def crawl_multiple_pages(self, start_urls, keywords=None):
+        """Scrape each start URL plus a few pages it links to.
+
+        Args:
+            start_urls: Iterable of URLs to start from.
+            keywords: Passed through to ``scrape_website``.
+
+        Returns:
+            A list of page dicts, at most ``max_pages`` long.
+        """
+        collected_data = []
+        visited_urls = set()
+
+        for url in start_urls:
+            if len(collected_data) >= self.max_pages:
+                break
+            if url in visited_urls:
+                continue
+            # Mark before scraping so a failing URL is not retried.
+            visited_urls.add(url)
+
+            data = self.scrape_website(url, keywords)
+            if not data:
+                # The browser may still show an earlier page; do not harvest
+                # links from it.
+                continue
+            collected_data.append(data)
+
+            for href in self._collect_links():
+                if len(collected_data) >= self.max_pages:
+                    break
+                if href in visited_urls:
+                    continue
+                visited_urls.add(href)
+                data = self.scrape_website(href, keywords)
+                if data:
+                    collected_data.append(data)
+
+        return collected_data
+
+    def save_data(self, data, filename='scraped_data.json'):
+        """Write ``data`` as JSON into the configured local storage directory."""
+        storage_path = self.config['data_storage']['local_storage_path']
+        os.makedirs(storage_path, exist_ok=True)
+
+        filepath = os.path.join(storage_path, filename)
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+
+        print(f"데이터 저장 완료: {filepath}")
+
+    def close(self):
+        """Quit the browser driver."""
+        self.driver.quit()
+
+if __name__ == "__main__":
+    # Manual smoke test: crawl one start page and store the result locally.
+    scraper = WebScraper()
+    try:
+        data = scraper.crawl_multiple_pages(["https://www.google.com"])
+        scraper.save_data(data)
+    finally:
+        # Always quit the browser, even when crawling or saving fails.
+        scraper.close()