import requests
from bs4 import BeautifulSoup
import json
import time
import os

# Selenium is optional: if it (or webdriver_manager) is not installed,
# the scraper falls back to plain requests.
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium.webdriver.chrome.service import Service
    _SELENIUM_AVAILABLE = True
except Exception:
    _SELENIUM_AVAILABLE = False

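# For reference, the config keys read by the class below imply a config.json
# shaped roughly like the following. This is an illustrative sketch only; the
# concrete values (and the "./data" path) are assumptions, not part of this module:
#
# {
#     "web_scraping": {
#         "max_pages": 100,
#         "delay_between_requests": 2,
#         "user_agent": "Mozilla/5.0",
#         "use_selenium": false
#     },
#     "data_storage": {
#         "local_storage_path": "./data"
#     }
# }
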
class WebScraper:
    def __init__(self, config_path='./config.json'):
        with open(config_path, 'r') as f:
            self.config = json.load(f)

        ws_conf = self.config.get('web_scraping', {})
        self.max_pages = ws_conf.get('max_pages', 100)
        self.delay = ws_conf.get('delay_between_requests', 2)
        self.user_agent = ws_conf.get('user_agent', 'Mozilla/5.0')
        self.use_selenium = bool(ws_conf.get('use_selenium', False))

        self.driver = None
        if self.use_selenium and _SELENIUM_AVAILABLE:
            try:
                chrome_options = Options()
                chrome_options.add_argument("--headless=new")
                chrome_options.add_argument("--no-sandbox")
                chrome_options.add_argument("--disable-dev-shm-usage")
                chrome_options.add_argument(f"user-agent={self.user_agent}")

                # Locate the Chrome binary (common Colab/Linux paths).
                chrome_bin_candidates = [
                    os.environ.get('GOOGLE_CHROME_BIN'),
                    os.environ.get('CHROME_BIN'),
                    '/usr/bin/google-chrome',
                    '/usr/bin/chromium-browser',
                    '/usr/bin/chromium'
                ]
                chrome_bin = next((p for p in chrome_bin_candidates if p and os.path.exists(p)), None)
                if chrome_bin:
                    chrome_options.binary_location = chrome_bin

                self.driver = webdriver.Chrome(
                    service=Service(ChromeDriverManager().install()),
                    options=chrome_options
                )
                print("Selenium mode enabled")
            except Exception as e:
                print(f"Selenium initialization failed, falling back to requests mode: {e}")
                self.driver = None
                self.use_selenium = False
        else:
            if self.use_selenium and not _SELENIUM_AVAILABLE:
                print("Selenium packages are not installed; falling back to requests mode.")
                self.use_selenium = False

    def scrape_website(self, url, keywords=None):
        """
        Collect information from a single web page.

        The `keywords` argument is accepted for interface compatibility but is
        not used for filtering in the current implementation.
        """
        try:
            if self.use_selenium and self.driver is not None:
                self.driver.get(url)
                time.sleep(self.delay)
                page_source = self.driver.page_source
            else:
                headers = {"User-Agent": self.user_agent}
                resp = requests.get(url, headers=headers, timeout=20)
                resp.raise_for_status()
                page_source = resp.text

            soup = BeautifulSoup(page_source, 'html.parser')
            text_content = soup.get_text(separator=' ', strip=True)
            title = soup.title.string if soup.title else "No Title"
            meta_description = soup.find('meta', attrs={'name': 'description'})
            description = meta_description['content'] if (meta_description and meta_description.has_attr('content')) else "No Description"

            data = {
                'url': url,
                'title': title,
                'description': description,
                'content': text_content[:5000],
                'timestamp': time.time()
            }
            return data
        except Exception as e:
            print(f"Scraping failed: {url} - {e}")
            return None

    def crawl_multiple_pages(self, start_urls, keywords=None):
        """
        Crawl multiple pages, starting from `start_urls` and following a few
        links found on each page, up to `max_pages` collected results.
        """
        collected_data = []
        visited_urls = set()

        for url in start_urls:
            if len(collected_data) >= self.max_pages:
                break

            if url not in visited_urls:
                data = self.scrape_website(url, keywords)
                if data:
                    collected_data.append(data)
                visited_urls.add(url)

            # Find additional links (simply the links on the current page).
            try:
                if self.use_selenium and self.driver is not None:
                    links = self.driver.find_elements(By.TAG_NAME, "a")
                    hrefs = [link.get_attribute("href") for link in links[:20]]
                else:
                    # In requests mode, fetch the current page again and parse its links.
                    headers = {"User-Agent": self.user_agent}
                    resp = requests.get(url, headers=headers, timeout=20)
                    resp.raise_for_status()
                    soup = BeautifulSoup(resp.text, 'html.parser')
                    hrefs = [a.get('href') for a in soup.find_all('a', href=True)][:20]

                for href in hrefs:
                    if href and href.startswith("http") and href not in visited_urls:
                        if len(collected_data) < self.max_pages:
                            data = self.scrape_website(href, keywords)
                            if data:
                                collected_data.append(data)
                            visited_urls.add(href)
            except Exception:
                pass

        return collected_data
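
    # Note: in requests mode only absolute links (those starting with "http")
    # are followed; relative hrefs are skipped. A minimal sketch of resolving
    # relative links with the standard library instead (illustrative only, not
    # wired into crawl_multiple_pages above):
    #
    #     from urllib.parse import urljoin
    #     hrefs = [urljoin(url, a.get('href')) for a in soup.find_all('a', href=True)][:20]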

    def save_data(self, data, filename='scraped_data.json'):
        """
        Save the collected data as JSON under the configured storage path.
        """
        storage_path = self.config['data_storage']['local_storage_path']
        if not os.path.exists(storage_path):
            os.makedirs(storage_path)

        filepath = os.path.join(storage_path, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"Data saved to: {filepath}")

    def close(self):
        if self.driver is not None:
            self.driver.quit()


if __name__ == "__main__":
    scraper = WebScraper()
    # For testing
    data = scraper.crawl_multiple_pages(["https://www.google.com"])
    scraper.save_data(data)
    scraper.close()
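
# A slightly fuller usage sketch, with explicit driver cleanup. The URLs,
# keywords, and output filename below are placeholders chosen for illustration,
# not values defined by this module:
#
#     scraper = WebScraper(config_path='./config.json')
#     try:
#         pages = scraper.crawl_multiple_pages(
#             ["https://example.com", "https://example.org"],
#             keywords=["python", "scraping"],
#         )
#         scraper.save_data(pages, filename='example_crawl.json')
#     finally:
#         scraper.close()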