AI_Devlop/AI_Web_Scraper/web_scraper.py

import requests
from bs4 import BeautifulSoup
import json
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

class WebScraper:
    def __init__(self, config_path='./config.json'):
        with open(config_path, 'r') as f:
            self.config = json.load(f)

        self.max_pages = self.config['web_scraping']['max_pages']
        self.delay = self.config['web_scraping']['delay_between_requests']
        self.user_agent = self.config['web_scraping']['user_agent']

        # Selenium setup
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Headless mode for Colab
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument(f"user-agent={self.user_agent}")

        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )
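
    # __init__ assumes a config.json containing the keys read above.
    # A minimal sketch of that file (values are illustrative assumptions,
    # not taken from the original project):
    #
    # {
    #   "web_scraping": {
    #     "max_pages": 10,
    #     "delay_between_requests": 2,
    #     "user_agent": "Mozilla/5.0 (compatible; ExampleScraper/1.0)"
    #   },
    #   "data_storage": {
    #     "local_storage_path": "./data"
    #   }
    # }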

    def scrape_website(self, url, keywords=None):
        """
        Collect information from a website.
        """
        try:
            self.driver.get(url)
            time.sleep(self.delay)

            # Extract the page content
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Extract the text content
            text_content = soup.get_text(separator=' ', strip=True)

            # Extract metadata
            title = soup.title.string if soup.title else "No Title"
            meta_description = soup.find('meta', attrs={'name': 'description'})
            description = meta_description.get('content', "No Description") if meta_description else "No Description"

            data = {
                'url': url,
                'title': title,
                'description': description,
                'content': text_content[:5000],  # Limit the content length
                'timestamp': time.time()
            }
            return data
        except Exception as e:
            print(f"Scraping failed: {url} - {e}")
            return None

    def crawl_multiple_pages(self, start_urls, keywords=None):
        """
        Crawl multiple pages.
        """
        collected_data = []
        visited_urls = set()

        for url in start_urls:
            if len(collected_data) >= self.max_pages:
                break
            if url not in visited_urls:
                data = self.scrape_website(url, keywords)
                if data:
                    collected_data.append(data)
                visited_urls.add(url)

                # Find additional links (simply the links on the current page).
                # Read the hrefs before scraping them, so the elements do not
                # go stale once the driver navigates away.
                try:
                    links = self.driver.find_elements(By.TAG_NAME, "a")
                    hrefs = [link.get_attribute("href") for link in links[:10]]  # At most 10 links
                    for href in hrefs:
                        if href and href.startswith("http") and href not in visited_urls:
                            if len(collected_data) < self.max_pages:
                                data = self.scrape_website(href, keywords)
                                if data:
                                    collected_data.append(data)
                                visited_urls.add(href)
                except Exception:
                    pass

        return collected_data

    def save_data(self, data, filename='scraped_data.json'):
        """
        Save the collected data to a file.
        """
        storage_path = self.config['data_storage']['local_storage_path']
        if not os.path.exists(storage_path):
            os.makedirs(storage_path)

        filepath = os.path.join(storage_path, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"Data saved to: {filepath}")

    def close(self):
        self.driver.quit()


if __name__ == "__main__":
    scraper = WebScraper()

    # Quick test run
    data = scraper.crawl_multiple_pages(["https://www.google.com"])
    scraper.save_data(data)
    scraper.close()
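
Note: based on the imports at the top of the file, the script needs these packages installed (standard PyPI distribution names; the original does not pin versions):

pip install requests beautifulsoup4 selenium webdriver-manager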