import json
import os
import time

import requests
from bs4 import BeautifulSoup

try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By
    from webdriver_manager.chrome import ChromeDriverManager
    _SELENIUM_AVAILABLE = True
except Exception:
    _SELENIUM_AVAILABLE = False


class WebScraper:
    def __init__(self, config_path='./config.json'):
        with open(config_path, 'r') as f:
            self.config = json.load(f)

        ws_conf = self.config.get('web_scraping', {})
        self.max_pages = ws_conf.get('max_pages', 100)
        self.delay = ws_conf.get('delay_between_requests', 2)
        self.user_agent = ws_conf.get('user_agent', 'Mozilla/5.0')
        self.use_selenium = bool(ws_conf.get('use_selenium', False))
        self.driver = None

        if self.use_selenium and _SELENIUM_AVAILABLE:
            try:
                chrome_options = Options()
                chrome_options.add_argument("--headless=new")
                chrome_options.add_argument("--no-sandbox")
                chrome_options.add_argument("--disable-dev-shm-usage")
                chrome_options.add_argument(f"user-agent={self.user_agent}")

                # Locate the Chrome binary (common Colab/Linux paths).
                chrome_bin_candidates = [
                    os.environ.get('GOOGLE_CHROME_BIN'),
                    os.environ.get('CHROME_BIN'),
                    '/usr/bin/google-chrome',
                    '/usr/bin/chromium-browser',
                    '/usr/bin/chromium',
                ]
                chrome_bin = next(
                    (p for p in chrome_bin_candidates if p and os.path.exists(p)), None
                )
                if chrome_bin:
                    chrome_options.binary_location = chrome_bin

                self.driver = webdriver.Chrome(
                    service=Service(ChromeDriverManager().install()),
                    options=chrome_options,
                )
                print("Selenium mode enabled")
            except Exception as e:
                print(f"Selenium initialization failed, falling back to requests mode: {e}")
                self.driver = None
                self.use_selenium = False
        elif self.use_selenium and not _SELENIUM_AVAILABLE:
            print("Selenium packages are not installed, falling back to requests mode.")
            self.use_selenium = False

    def scrape_website(self, url, keywords=None):
        """Collect the title, meta description, and text content of a single page.

        The `keywords` argument is accepted for API compatibility but is not
        used for filtering yet.
        """
        try:
            if self.use_selenium and self.driver is not None:
                self.driver.get(url)
                time.sleep(self.delay)
                page_source = self.driver.page_source
            else:
                headers = {"User-Agent": self.user_agent}
                resp = requests.get(url, headers=headers, timeout=20)
                resp.raise_for_status()
                page_source = resp.text

            soup = BeautifulSoup(page_source, 'html.parser')
            text_content = soup.get_text(separator=' ', strip=True)
            title = soup.title.string if soup.title else "No Title"
            meta_description = soup.find('meta', attrs={'name': 'description'})
            description = (
                meta_description['content']
                if meta_description and meta_description.has_attr('content')
                else "No Description"
            )

            data = {
                'url': url,
                'title': title,
                'description': description,
                'content': text_content[:5000],
                'timestamp': time.time(),
            }
            return data
        except Exception as e:
            print(f"Scraping failed: {url} - {e}")
            return None

    def crawl_multiple_pages(self, start_urls, keywords=None):
        """Crawl multiple pages, following up to 20 links from each start URL."""
        collected_data = []
        visited_urls = set()

        for url in start_urls:
            if len(collected_data) >= self.max_pages:
                break
            if url in visited_urls:
                continue

            data = self.scrape_website(url, keywords)
            if data:
                collected_data.append(data)
            visited_urls.add(url)

            # Follow additional links (simply the links found on the current page).
            try:
                if self.use_selenium and self.driver is not None:
                    links = self.driver.find_elements(By.TAG_NAME, "a")
                    hrefs = [link.get_attribute("href") for link in links[:20]]
                else:
                    # In requests mode, fetch the current page again and parse its links.
                    headers = {"User-Agent": self.user_agent}
                    resp = requests.get(url, headers=headers, timeout=20)
                    resp.raise_for_status()
                    soup = BeautifulSoup(resp.text, 'html.parser')
                    hrefs = [a.get('href') for a in soup.find_all('a', href=True)][:20]

                for href in hrefs:
                    if href and href.startswith("http") and href not in visited_urls:
                        if len(collected_data) < self.max_pages:
                            data = self.scrape_website(href, keywords)
                            if data:
                                collected_data.append(data)
                            visited_urls.add(href)
            except Exception:
                pass

        return collected_data

    def save_data(self, data, filename='scraped_data.json'):
        """Save the collected data as JSON under the configured storage path."""
        storage_path = self.config['data_storage']['local_storage_path']
        os.makedirs(storage_path, exist_ok=True)

        filepath = os.path.join(storage_path, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"Data saved to: {filepath}")

    def close(self):
        """Shut down the Selenium driver if one was started."""
        if self.driver is not None:
            self.driver.quit()


if __name__ == "__main__":
    # Quick smoke test.
    scraper = WebScraper()
    data = scraper.crawl_multiple_pages(["https://www.google.com"])
    scraper.save_data(data)
    scraper.close()