feat: AI 웹 스크래퍼 프로젝트의 초기 구조와 핵심 기능 구현

2025-08-28 10:23:24 +09:00
commit fdf330143f
10 changed files with 797 additions and 0 deletions
--- a/AI_Web_Scraper/ai_agent.py
+++ b/AI_Web_Scraper/ai_agent.py
@@ -0,0 +1,192 @@
+import json
+import os
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from langchain.llms import HuggingFacePipeline
+from langchain.agents import initialize_agent, AgentType
+from langchain.tools import Tool
+from langchain.memory import ConversationBufferMemory
+from web_scraper import WebScraper
+from google_drive_uploader import GoogleDriveUploader, SimpleDriveSaver
+
+class AIAgent:
+    """LangChain conversational agent backed by a local Hugging Face model.
+
+    The agent is wired with three tools: a web scraper, a Google Drive API
+    uploader, and a saver that writes to the configured Drive mount path.
+    All settings come from a JSON config file.
+    """
+
+    # Topics returned when the model yields no usable topic list.
+    DEFAULT_TOPICS = ("AI 기술 동향", "머신러닝 응용", "딥러닝 최신 연구")
+
+    def __init__(self, config_path='./config.json'):
+        """Read the config, load the model, and build the tools and agent.
+
+        Args:
+            config_path: Path to the JSON config. It must provide
+                ``model_local_path``, ``max_tokens``, ``temperature`` and
+                ``data_storage.drive_mount_path``. The same path is also
+                passed to ``WebScraper`` and ``GoogleDriveUploader``.
+
+        Raises:
+            OSError: If the config file cannot be opened.
+            KeyError: If a required config key is missing.
+            Exception: Anything ``load_model`` re-raises.
+        """
+        # Explicit UTF-8 so parsing does not depend on the platform's
+        # default encoding.
+        with open(config_path, 'r', encoding='utf-8') as f:
+            self.config = json.load(f)
+
+        self.model_path = self.config['model_local_path']
+        self.max_tokens = self.config['max_tokens']
+        self.temperature = self.config['temperature']
+
+        # Model state; populated by load_model().
+        self.model = None
+        self.tokenizer = None
+        self.llm = None
+        self.load_model()
+
+        # Backends used by the tool functions below.
+        self.web_scraper = WebScraper(config_path)
+        self.drive_uploader = GoogleDriveUploader(config_path)
+        self.simple_saver = SimpleDriveSaver(self.config['data_storage']['drive_mount_path'])
+
+        # LangChain tool definitions exposed to the agent.
+        self.tools = [
+            Tool(
+                name="WebScraper",
+                func=self.scrape_web,
+                description="웹사이트에서 정보를 수집합니다. URL을 입력하세요."
+            ),
+            Tool(
+                name="GoogleDriveUploader",
+                func=self.upload_to_drive_api,
+                description="Google Drive API를 사용하여 데이터를 업로드합니다. 데이터와 파일명을 입력하세요."
+            ),
+            Tool(
+                name="SimpleDriveSaver",
+                func=self.save_to_drive_simple,
+                description="마운트된 Google Drive에 데이터를 저장합니다. 데이터와 파일명을 입력하세요."
+            )
+        ]
+
+        # Conversation memory shared across agent runs.
+        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+
+        # Agent that ties the LLM, tools and memory together.
+        self.agent = initialize_agent(
+            tools=self.tools,
+            llm=self.llm,
+            agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
+            memory=self.memory,
+            verbose=True
+        )
+
+    def load_model(self):
+        """Load the Hugging Face model and wrap it as a LangChain LLM.
+
+        Sets ``self.tokenizer``, ``self.model`` and ``self.llm`` from
+        ``self.model_path``, using ``self.max_tokens`` and
+        ``self.temperature`` for generation.
+
+        Raises:
+            Exception: Any loading error is printed and then re-raised.
+        """
+        try:
+            print(f"모델 로드 중: {self.model_path}")
+
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_path,
+                device_map="auto",
+                torch_dtype="auto"
+            )
+
+            # Text-generation pipeline; EOS doubles as the pad token.
+            pipe = pipeline(
+                "text-generation",
+                model=self.model,
+                tokenizer=self.tokenizer,
+                max_new_tokens=self.max_tokens,
+                temperature=self.temperature,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+
+            self.llm = HuggingFacePipeline(pipeline=pipe)
+            print("모델 로드 완료")
+
+        except Exception as e:
+            print(f"모델 로드 실패: {e}")
+            raise
+
+    def scrape_web(self, url):
+        """Tool function: scrape ``url`` and return a short summary string.
+
+        Args:
+            url: Address passed to ``WebScraper.scrape_website``.
+
+        Returns:
+            A success message with the title and the first 200 characters
+            of the description, or the failure message when the scraper
+            returns nothing.
+        """
+        data = self.web_scraper.scrape_website(url)
+        if not data:
+            return "수집 실패"
+
+        # Missing or null fields become empty text rather than raising
+        # out of the tool call.
+        title = data.get('title') or ''
+        description = data.get('description') or ''
+        return f"수집 완료: {title} - {description[:200]}..."
+
+    @staticmethod
+    def _parse_payload(data_and_filename):
+        """Split a ``"<json>|<filename>"`` tool input into ``(data, filename)``.
+
+        The split happens on the LAST ``|`` so JSON payloads that contain
+        ``|`` themselves still parse. Input without any ``|`` yields the
+        placeholder error document and ``error.json``.
+
+        Raises:
+            json.JSONDecodeError: If the data part is not valid JSON.
+        """
+        payload, separator, filename = data_and_filename.rpartition('|')
+        if not separator:
+            return {"error": "잘못된 형식"}, "error.json"
+        # Tool input often carries stray whitespace around the name.
+        return json.loads(payload), filename.strip()
+
+    def upload_to_drive_api(self, data_and_filename):
+        """Tool function: upload JSON data through the Google Drive API.
+
+        Args:
+            data_and_filename: String of the form ``"<json>|<filename>"``.
+
+        Returns:
+            A success message with the uploaded file id, or a failure
+            message containing the error text.
+        """
+        try:
+            data, filename = self._parse_payload(data_and_filename)
+            file_id = self.drive_uploader.upload_data_as_json(data, filename)
+            return f"업로드 완료: {file_id}"
+        except Exception as e:
+            # Report the failure as text so the agent can react to it.
+            return f"업로드 실패: {e}"
+
+    def save_to_drive_simple(self, data_and_filename):
+        """Tool function: save JSON data via the mounted-Drive saver.
+
+        Args:
+            data_and_filename: String of the form ``"<json>|<filename>"``.
+
+        Returns:
+            A success message with the saved file path, or a failure
+            message containing the error text.
+        """
+        try:
+            data, filename = self._parse_payload(data_and_filename)
+            filepath = self.simple_saver.save_data_as_json(data, filename)
+            return f"저장 완료: {filepath}"
+        except Exception as e:
+            # Report the failure as text so the agent can react to it.
+            return f"저장 실패: {e}"
+
+    def run_agent(self, task_description):
+        """Run the agent on ``task_description``.
+
+        Returns:
+            The agent's response, or ``None`` if the run raised (the error
+            is printed).
+        """
+        try:
+            response = self.agent.run(task_description)
+            return response
+        except Exception as e:
+            print(f"에이전트 실행 실패: {e}")
+            return None
+
+    def generate_topics(self, num_topics=3):
+        """Ask the LLM for research topics and parse them into a list.
+
+        Args:
+            num_topics: Number of topics requested in the prompt and the
+                maximum number of parsed topics returned.
+
+        Returns:
+            Up to ``num_topics`` topic strings with list markers removed.
+            The default topics are returned when the LLM call fails or
+            produces no usable line.
+        """
+        import re  # local import: only this method needs it
+
+        prompt = f"""
+        당신은 AI 연구원입니다. 현재 세계에서 가장 흥미롭고 조사할 가치가 있는 기술 및 과학 분야의 주제 {num_topics}개를 선정해주세요.
+
+        다음 기준을 고려하세요:
+        1. 최근 트렌드나 미래 지향적인 주제
+        2. 사회적 영향이 큰 주제
+        3. 기술 발전이 빠른 분야
+        4. AI와 관련된 주제 우선
+
+        각 주제는 구체적이고 조사하기 쉬운 형태로 제시해주세요.
+        예시: "양자 컴퓨팅의 최근 발전", "생성형 AI의 윤리적 문제"
+
+        주제 목록만 출력하고, 다른 설명은 하지 마세요.
+        형식: 각 줄에 하나의 주제
+        """
+
+        # Models tend to number or bullet their lists; strip the marker
+        # and keep the topic instead of discarding the whole line.
+        list_marker = re.compile(r'^\s*(?:\d+[.)]|[-*•])\s+')
+
+        try:
+            response = self.llm(prompt)
+            cleaned = (list_marker.sub('', line).strip() for line in response.splitlines())
+            topics = [topic for topic in cleaned if topic]
+            if not topics:
+                # Nothing usable came back; behave like the failure path.
+                return list(self.DEFAULT_TOPICS)
+            return topics[:num_topics]
+        except Exception as e:
+            print(f"주제 생성 실패: {e}")
+            # Fall back to the default topics.
+            return list(self.DEFAULT_TOPICS)
+
+    def close(self):
+        """Close the web scraper held by this agent."""
+        self.web_scraper.close()
+
+if __name__ == "__main__":
+    # Manual smoke test: run the agent once per sample topic.
+    agent = AIAgent()
+    try:
+        topics = ["인공지능 최신 트렌드", "머신러닝 기초"]
+        # AIAgent has no collect_information() method, so each topic is
+        # routed through run_agent(), which returns None on failure.
+        results = {
+            topic: agent.run_agent(f"다음 주제에 대한 정보를 수집하세요: {topic}")
+            for topic in topics
+        }
+        print("수집 결과:", results)
+    finally:
+        # Release the scraper even if a run raises.
+        agent.close()