feat: Implement the initial structure and core features of the AI web scraper project

2025-08-28 10:23:24 +09:00
commit fdf330143f
10 changed files with 797 additions and 0 deletions


@@ -0,0 +1,122 @@
import requests
from bs4 import BeautifulSoup
import json
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
class WebScraper:
    def __init__(self, config_path='./config.json'):
        with open(config_path, 'r') as f:
            self.config = json.load(f)

        self.max_pages = self.config['web_scraping']['max_pages']
        self.delay = self.config['web_scraping']['delay_between_requests']
        self.user_agent = self.config['web_scraping']['user_agent']

        # Selenium setup
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # headless mode for Colab
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument(f"user-agent={self.user_agent}")

        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )
    def scrape_website(self, url, keywords=None):
        """
        Collect information from a single web page.
        """
        try:
            self.driver.get(url)
            time.sleep(self.delay)

            # Extract the rendered page source
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Extract text content
            text_content = soup.get_text(separator=' ', strip=True)

            # Extract metadata
            title = soup.title.string if soup.title else "No Title"
            meta_description = soup.find('meta', attrs={'name': 'description'})
            description = meta_description.get('content', "No Description") if meta_description else "No Description"

            data = {
                'url': url,
                'title': title,
                'description': description,
                'content': text_content[:5000],  # cap stored content length
                'timestamp': time.time()
            }
            return data
        except Exception as e:
            print(f"Scraping failed: {url} - {e}")
            return None
    def crawl_multiple_pages(self, start_urls, keywords=None):
        """
        Crawl multiple pages, starting from the given URLs.
        """
        collected_data = []
        visited_urls = set()

        for url in start_urls:
            if len(collected_data) >= self.max_pages:
                break
            if url not in visited_urls:
                data = self.scrape_website(url, keywords)
                if data:
                    collected_data.append(data)
                visited_urls.add(url)

                # Find additional links (simply the links on the current page).
                # Collect the hrefs first: navigating away inside scrape_website()
                # would make the link elements stale.
                try:
                    links = self.driver.find_elements(By.TAG_NAME, "a")
                    hrefs = [link.get_attribute("href") for link in links[:10]]  # at most 10 links
                except Exception as e:
                    print(f"Failed to collect links from {url} - {e}")
                    hrefs = []

                for href in hrefs:
                    if href and href.startswith("http") and href not in visited_urls:
                        if len(collected_data) < self.max_pages:
                            data = self.scrape_website(href, keywords)
                            if data:
                                collected_data.append(data)
                            visited_urls.add(href)

        return collected_data
    def save_data(self, data, filename='scraped_data.json'):
        """
        Save the collected data to a JSON file.
        """
        storage_path = self.config['data_storage']['local_storage_path']
        if not os.path.exists(storage_path):
            os.makedirs(storage_path)

        filepath = os.path.join(storage_path, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"Data saved: {filepath}")

    def close(self):
        self.driver.quit()
if __name__ == "__main__":
    scraper = WebScraper()
    try:
        # For testing
        data = scraper.crawl_multiple_pages(["https://www.google.com"])
        scraper.save_data(data)
    finally:
        scraper.close()
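
The scraper expects a config.json that is not part of the diff shown above. The sketch below is only a guess at its shape, inferred from the keys read in WebScraper.__init__ and save_data; every value (page limit, delay, user agent, storage path) is an assumption, not taken from the repository.

# Minimal config.json sketch, assuming the key names read by WebScraper.
# All values below are placeholders chosen for illustration.
import json

example_config = {
    "web_scraping": {
        "max_pages": 10,                      # assumed crawl limit
        "delay_between_requests": 2,          # assumed delay in seconds
        "user_agent": "Mozilla/5.0 (compatible; ExampleScraper/0.1)"  # assumed UA string
    },
    "data_storage": {
        "local_storage_path": "./data"        # assumed output directory
    }
}

with open("config.json", "w", encoding="utf-8") as f:
    json.dump(example_config, f, ensure_ascii=False, indent=2)

Running this once produces a config.json that lets the WebScraper class above be instantiated without a KeyError, after which the values can be tuned to the real crawl targets.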