import requests
from bs4 import BeautifulSoup
import json
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


class WebScraper:
    def __init__(self, config_path='./config.json'):
        with open(config_path, 'r') as f:
            self.config = json.load(f)

        self.max_pages = self.config['web_scraping']['max_pages']
        self.delay = self.config['web_scraping']['delay_between_requests']
        self.user_agent = self.config['web_scraping']['user_agent']

        # Selenium setup
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # headless mode (required on Colab)
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument(f"user-agent={self.user_agent}")

        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )

    def scrape_website(self, url, keywords=None):
        """Collect information from a single web page."""
        try:
            self.driver.get(url)
            time.sleep(self.delay)

            # Extract the rendered page source
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Extract text content
            text_content = soup.get_text(separator=' ', strip=True)

            # Extract metadata
            title = soup.title.string if soup.title and soup.title.string else "No Title"
            meta_description = soup.find('meta', attrs={'name': 'description'})
            description = meta_description.get('content', "No Description") if meta_description else "No Description"

            data = {
                'url': url,
                'title': title,
                'description': description,
                'content': text_content[:5000],  # limit content length
                'timestamp': time.time()
            }
            return data

        except Exception as e:
            print(f"Scraping failed: {url} - {e}")
            return None

    def crawl_multiple_pages(self, start_urls, keywords=None):
        """Crawl multiple pages starting from the given URLs."""
        collected_data = []
        visited_urls = set()

        for url in start_urls:
            if len(collected_data) >= self.max_pages:
                break
            if url not in visited_urls:
                data = self.scrape_website(url, keywords)
                if data:
                    collected_data.append(data)
                visited_urls.add(url)

                # Find additional links on the current page. Collect the hrefs first,
                # because navigating away invalidates the WebElement references.
                try:
                    links = self.driver.find_elements(By.TAG_NAME, "a")
                    hrefs = [link.get_attribute("href") for link in links[:10]]  # at most 10 links
                except Exception:
                    hrefs = []

                for href in hrefs:
                    if href and href.startswith("http") and href not in visited_urls:
                        if len(collected_data) < self.max_pages:
                            data = self.scrape_website(href, keywords)
                            if data:
                                collected_data.append(data)
                            visited_urls.add(href)

        return collected_data

    def save_data(self, data, filename='scraped_data.json'):
        """Save the collected data to a JSON file."""
        storage_path = self.config['data_storage']['local_storage_path']
        os.makedirs(storage_path, exist_ok=True)

        filepath = os.path.join(storage_path, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"Data saved to: {filepath}")

    def close(self):
        self.driver.quit()


if __name__ == "__main__":
    scraper = WebScraper()
    # Quick test run
    data = scraper.crawl_multiple_pages(["https://www.google.com"])
    scraper.save_data(data)
    scraper.close()
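
# A minimal sketch of the expected config.json, based only on the keys this class
# reads (web_scraping.max_pages, web_scraping.delay_between_requests,
# web_scraping.user_agent, data_storage.local_storage_path). The values below are
# illustrative assumptions, not values taken from the project:
#
# {
#   "web_scraping": {
#     "max_pages": 10,
#     "delay_between_requests": 2,
#     "user_agent": "Mozilla/5.0 (compatible; ExampleScraper/1.0)"
#   },
#   "data_storage": {
#     "local_storage_path": "./data"
#   }
# }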