
์›น ์Šคํฌ๋ž˜ํ•‘

BeautifulSoup์„ ์‚ฌ์šฉํ•˜์—ฌ ์›น ํŽ˜์ด์ง€์—์„œ ์›ํ•˜๋Š” ๋ฐ์ดํ„ฐ๋ฅผ ์ถ”์ถœํ•˜๋Š” ๋ฐฉ๋ฒ•์„ ๋ฐฐ์›Œ๋ด…์‹œ๋‹ค.

์›น ์Šคํฌ๋ž˜ํ•‘์ด๋ž€?โ€‹

์›น ์Šคํฌ๋ž˜ํ•‘์€ ์›น ํŽ˜์ด์ง€์—์„œ ์ž๋™์œผ๋กœ ๋ฐ์ดํ„ฐ๋ฅผ ์ˆ˜์ง‘ํ•˜๋Š” ๊ธฐ์ˆ ์ž…๋‹ˆ๋‹ค. ๋‰ด์Šค ๊ธฐ์‚ฌ, ์ƒํ’ˆ ๊ฐ€๊ฒฉ, ๋‚ ์”จ ์ •๋ณด ๋“ฑ ์›น์— ์žˆ๋Š” ๋‹ค์–‘ํ•œ ๋ฐ์ดํ„ฐ๋ฅผ ํ”„๋กœ๊ทธ๋ž˜๋ฐ์ ์œผ๋กœ ์ถ”์ถœํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

์ฃผ์š” ์šฉ๋„โ€‹

  • ๊ฐ€๊ฒฉ ๋น„๊ต ๋ฐ ๋ชจ๋‹ˆํ„ฐ๋ง
  • ๋‰ด์Šค ๋ฐ ์ฝ˜ํ…์ธ  ์ˆ˜์ง‘
  • ๋ฐ์ดํ„ฐ ๋ถ„์„ ๋ฐ ์—ฐ๊ตฌ
  • ๊ฒ€์ƒ‰ ์—”์ง„ ์ธ๋ฑ์‹ฑ
  • ์‹œ์žฅ ์กฐ์‚ฌ ๋ฐ ๊ฒฝ์Ÿ์‚ฌ ๋ถ„์„

์ฃผ์˜์‚ฌํ•ญโ€‹

  • robots.txt ํ™•์ธ: ์›น์‚ฌ์ดํŠธ์˜ ํฌ๋กค๋ง ์ •์ฑ… ์ค€์ˆ˜
  • ์ด์šฉ ์•ฝ๊ด€ ๊ฒ€ํ† : ๋ฒ•์  ๋ฌธ์ œ ๋ฐฉ์ง€
  • ์„œ๋ฒ„ ๋ถ€ํ•˜ ๊ณ ๋ ค: ์š”์ฒญ ๊ฐ„๊ฒฉ ์กฐ์ ˆ
  • ๊ฐœ์ธ์ •๋ณด ๋ณดํ˜ธ: ๋ฏผ๊ฐํ•œ ์ •๋ณด ์ˆ˜์ง‘ ๊ธˆ์ง€

Installation

# BeautifulSoup 4
pip install beautifulsoup4

# HTML parser (lxml is faster)
pip install lxml

# Or html5lib (more lenient parsing)
pip install html5lib

# requests (for HTTP requests)
pip install requests

Basic Usage

Fetching and Parsing HTML

import requests
from bs4 import BeautifulSoup

# Fetch the web page
url = 'https://example.com'
response = requests.get(url)
html = response.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html, 'lxml')

# Print the full HTML (nicely formatted)
print(soup.prettify())

# Get the page title
title = soup.title
print(title.string)  # title text

# Get all of the text
text = soup.get_text()
print(text)

Parsers

# lxml (fast and lenient, recommended)
soup = BeautifulSoup(html, 'lxml')

# html.parser (built in, no extra install required)
soup = BeautifulSoup(html, 'html.parser')

# html5lib (most lenient, slow)
soup = BeautifulSoup(html, 'html5lib')

# XML parsing (requires lxml)
soup = BeautifulSoup(xml, 'xml')

Selecting Tags

Basic Selection Methods

from bs4 import BeautifulSoup

html = '''
<html>
<head><title>My Page</title></head>
<body>
<h1 id="main-title">Welcome</h1>
<div class="content">
<p class="intro">First paragraph</p>
<p class="text">Second paragraph</p>
<a href="/page1">Link 1</a>
<a href="/page2">Link 2</a>
</div>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')

# Find the first matching tag
h1 = soup.find('h1')
print(h1.string)  # Welcome

# Find all matching tags
paragraphs = soup.find_all('p')
for p in paragraphs:
    print(p.string)

# Find by ID
main_title = soup.find(id='main-title')
print(main_title.string)

# Find by class
intro = soup.find(class_='intro')
print(intro.string)

# Multiple conditions
link = soup.find('a', href='/page1')
print(link.string)

CSS ์„ ํƒ์ž ์‚ฌ์šฉโ€‹

# CSS ์„ ํƒ์ž๋กœ ์ฐพ๊ธฐ (๋” ์ง๊ด€์ )
h1 = soup.select_one('h1#main-title')
print(h1.string)

# ํด๋ž˜์Šค ์„ ํƒ
intro = soup.select_one('.intro')
paragraphs = soup.select('.content p')

# ์ž์‹ ์„ ํƒ์ž
content_links = soup.select('div.content > a')

# ์†์„ฑ ์„ ํƒ์ž
external_links = soup.select('a[href^="http"]')

# ์—ฌ๋Ÿฌ ๊ฐœ ์„ ํƒ
items = soup.select('.item')
for item in items:
print(item.text)

๋ฐ์ดํ„ฐ ์ถ”์ถœํ•˜๊ธฐโ€‹

ํ…์ŠคํŠธ ์ถ”์ถœโ€‹

# ํƒœ๊ทธ์˜ ํ…์ŠคํŠธ
text = tag.string # ์ง์ ‘ ์ž์‹ ํ…์ŠคํŠธ๋งŒ
text = tag.get_text() # ๋ชจ๋“  ํ•˜์œ„ ํ…์ŠคํŠธ
text = tag.text # get_text()์™€ ๋™์ผ

# ๊ณต๋ฐฑ ์ฒ˜๋ฆฌ
text = tag.get_text(strip=True) # ์•ž๋’ค ๊ณต๋ฐฑ ์ œ๊ฑฐ

# ๊ตฌ๋ถ„์ž ์ง€์ •
text = tag.get_text(separator=' | ') # ํƒœ๊ทธ ์‚ฌ์ด์— ๊ตฌ๋ถ„์ž

# ์˜ˆ์ œ
html = '<div> <p>Hello</p> <p>World</p> </div>'
soup = BeautifulSoup(html, 'lxml')
div = soup.find('div')

print(div.get_text()) # " Hello World "
print(div.get_text(strip=True)) # "HelloWorld"
print(div.get_text(separator=' | ', strip=True)) # "Hello | World"

Extracting Attributes

# Get an attribute
link = soup.find('a')

# Dictionary-style access
href = link['href']
title = link.get('title', 'No title')  # with a default value

# All attributes
attrs = link.attrs
print(attrs)  # {'href': '/page1', 'class': ['link'], 'id': 'first'}

# Multi-valued attributes (class, rel, etc.)
classes = link.get('class')  # returns a list
print(classes)  # ['link', 'external']

# Example: extract every image URL
images = soup.find_all('img')
for img in images:
    src = img.get('src')
    alt = img.get('alt', 'No description')
    print(f"Image: {src} - {alt}")

ํƒ์ƒ‰ ๋ฐ ์ˆœํšŒโ€‹

# ๋ถ€๋ชจ ํƒ์ƒ‰
tag = soup.find('p')
parent = tag.parent
print(parent.name)

# ๋ชจ๋“  ๋ถ€๋ชจ
for parent in tag.parents:
print(parent.name)

# ํ˜•์ œ ํƒ์ƒ‰
next_sibling = tag.next_sibling # ๋‹ค์Œ ํ˜•์ œ (ํ…์ŠคํŠธ ํฌํ•จ)
next_tag = tag.find_next_sibling() # ๋‹ค์Œ ํƒœ๊ทธ
prev_tag = tag.find_previous_sibling() # ์ด์ „ ํƒœ๊ทธ

# ๋ชจ๋“  ํ˜•์ œ
for sibling in tag.next_siblings:
print(sibling)

# ์ž์‹ ํƒ์ƒ‰
children = tag.children # ์ง์ ‘ ์ž์‹ (์ดํ„ฐ๋ ˆ์ดํ„ฐ)
descendants = tag.descendants # ๋ชจ๋“  ํ•˜์œ„ ์š”์†Œ

# ์˜ˆ์ œ: ํ…Œ์ด๋ธ” ํƒ์ƒ‰
table = soup.find('table')
for row in table.find_all('tr'):
cells = row.find_all(['td', 'th'])
data = [cell.get_text(strip=True) for cell in cells]
print(data)

Practical Examples

Example 1: Collecting News Headlines

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_news_headlines(url):
    """Collect headlines from a news site."""
    try:
        # Set headers (helps avoid naive bot blocking)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')

        # Find the news articles (markup differs from site to site)
        articles = soup.select('.news-item')  # adjust the CSS selector to the actual site

        news_list = []
        for article in articles:
            # Title
            title_tag = article.select_one('.title')
            title = title_tag.get_text(strip=True) if title_tag else 'No title'

            # Link
            link_tag = article.select_one('a')
            link = link_tag['href'] if link_tag else ''

            # Convert to an absolute URL
            if link and not link.startswith('http'):
                link = urljoin(url, link)

            # Summary
            summary_tag = article.select_one('.summary')
            summary = summary_tag.get_text(strip=True) if summary_tag else ''

            # Timestamp
            time_tag = article.select_one('.time')
            timestamp = time_tag.get_text(strip=True) if time_tag else ''

            news_list.append({
                'title': title,
                'link': link,
                'summary': summary,
                'timestamp': timestamp
            })

        return news_list

    except Exception as e:
        print(f"Scraping error: {e}")
        return []

# Usage example
url = 'https://news.example.com'
news = scrape_news_headlines(url)

for i, article in enumerate(news[:10], 1):
    print(f"\n{i}. {article['title']}")
    print(f"   {article['link']}")
    print(f"   {article['summary'][:100]}...")
    print(f"   Time: {article['timestamp']}")

Example 2: Comparing Prices Across Shopping Sites

import requests
from bs4 import BeautifulSoup
import re
from typing import List, Dict

class PriceScraper:
    """Compares prices across several shopping sites."""

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def extract_price(self, price_text: str) -> int:
        """Extract the numeric price from text."""
        # Keep digits only
        numbers = re.findall(r'\d+', price_text.replace(',', ''))
        return int(''.join(numbers)) if numbers else 0

    def scrape_site_a(self, product_name: str) -> List[Dict]:
        """Search for the product on site A."""
        url = f"https://site-a.com/search?q={product_name}"

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            soup = BeautifulSoup(response.text, 'lxml')

            products = []
            items = soup.select('.product-item')

            for item in items[:5]:  # top 5 only
                title = item.select_one('.product-title').get_text(strip=True)
                price_text = item.select_one('.price').get_text(strip=True)
                price = self.extract_price(price_text)
                link = item.select_one('a')['href']

                products.append({
                    'site': 'Site A',
                    'title': title,
                    'price': price,
                    'link': link
                })

            return products

        except Exception as e:
            print(f"Site A scraping error: {e}")
            return []

    def scrape_site_b(self, product_name: str) -> List[Dict]:
        """Search for the product on site B."""
        url = f"https://site-b.com/products?keyword={product_name}"

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            soup = BeautifulSoup(response.text, 'lxml')

            products = []
            items = soup.select('.item')

            for item in items[:5]:
                title = item.select_one('h3').get_text(strip=True)
                price_text = item.select_one('.price-now').get_text(strip=True)
                price = self.extract_price(price_text)
                link = item.select_one('a')['href']

                products.append({
                    'site': 'Site B',
                    'title': title,
                    'price': price,
                    'link': link
                })

            return products

        except Exception as e:
            print(f"Site B scraping error: {e}")
            return []

    def compare_prices(self, product_name: str) -> List[Dict]:
        """Compare prices from several sites."""
        all_products = []

        # Search each site
        all_products.extend(self.scrape_site_a(product_name))
        all_products.extend(self.scrape_site_b(product_name))

        # Sort by price
        all_products.sort(key=lambda x: x['price'])

        return all_products

# Usage example
scraper = PriceScraper()
product_name = 'wireless earphones'
products = scraper.compare_prices(product_name)

print(f"Search results for '{product_name}':\n")
for i, product in enumerate(products, 1):
    print(f"{i}. [{product['site']}] {product['title']}")
    print(f"   Price: {product['price']:,} KRW")
    print(f"   Link: {product['link']}\n")

if products:
    cheapest = products[0]
    print(f"Lowest price: {cheapest['site']} - {cheapest['price']:,} KRW")

Example 3: Extracting Table Data

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_table_to_dataframe(url: str, table_index: int = 0) -> pd.DataFrame:
    """Convert a table on a web page into a DataFrame."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Select the requested table
    tables = soup.find_all('table')
    if not tables or table_index >= len(tables):
        raise ValueError("Table not found.")

    table = tables[table_index]

    # Extract the header
    headers = []
    header_row = table.find('thead')
    if header_row:
        headers = [th.get_text(strip=True) for th in header_row.find_all('th')]
    else:
        # If there is no thead, treat the first tr as the header
        first_row = table.find('tr')
        headers = [th.get_text(strip=True) for th in first_row.find_all(['th', 'td'])]

    # Extract the data rows
    rows = []
    for tr in table.find_all('tr')[1:]:  # skip the first (header) row
        cells = tr.find_all(['td', 'th'])
        row = [cell.get_text(strip=True) for cell in cells]
        if row:  # skip empty rows
            rows.append(row)

    # Build the DataFrame
    df = pd.DataFrame(rows, columns=headers)
    return df

# Usage example
url = 'https://example.com/data-table'
df = scrape_table_to_dataframe(url)

print(df.head())
print(f"\n{len(df)} rows in total")

# Save to CSV
df.to_csv('scraped_data.csv', index=False, encoding='utf-8-sig')

Example 4: Downloading Images

import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse

def download_images(url: str, save_dir: str = 'images', limit: int = 10):
    """Download images from a web page."""
    # Create the save directory
    os.makedirs(save_dir, exist_ok=True)

    # Fetch the page
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Find all image tags
    images = soup.find_all('img')

    downloaded = 0
    for img in images:
        if downloaded >= limit:
            break

        # Image URL
        img_url = img.get('src') or img.get('data-src')
        if not img_url:
            continue

        # Convert to an absolute URL
        img_url = urljoin(url, img_url)

        # Skip images hosted on other domains (optional)
        if urlparse(img_url).netloc != urlparse(url).netloc:
            continue

        try:
            # Download the image
            img_response = requests.get(img_url, timeout=10)
            img_response.raise_for_status()

            # Build the file name
            filename = os.path.basename(urlparse(img_url).path)
            if not filename:
                filename = f"image_{downloaded + 1}.jpg"

            # Save to disk
            filepath = os.path.join(save_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(img_response.content)

            print(f"Downloaded: {filename}")
            downloaded += 1

        except Exception as e:
            print(f"Download failed ({img_url}): {e}")

    print(f"\nDownloaded {downloaded} images in total")

# Usage example
url = 'https://example.com/gallery'
download_images(url, save_dir='downloaded_images', limit=20)

Example 5: Pagination (Crawling Multiple Pages)

import requests
from bs4 import BeautifulSoup
import time
from typing import List, Dict

def scrape_multiple_pages(base_url: str, max_pages: int = 5) -> List[Dict]:
    """Iterate over several pages and collect data."""
    all_items = []

    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")

        # Page URL (the pattern differs from site to site)
        url = f"{base_url}?page={page}"

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'lxml')

            # Extract the items
            items = soup.select('.item')

            if not items:
                print("No more data.")
                break

            for item in items:
                title = item.select_one('.title').get_text(strip=True)
                link = item.select_one('a')['href']

                all_items.append({
                    'page': page,
                    'title': title,
                    'link': link
                })

            print(f"  - collected {len(items)} items")

            # Moving to the next page (method 1: page number)
            # is already handled by the loop above

            # Move to the next page (method 2: "next" button)
            next_button = soup.select_one('.pagination .next')
            if not next_button:
                print("Reached the last page.")
                break

            # Pause to avoid overloading the server
            time.sleep(1)

        except Exception as e:
            print(f"Error scraping page {page}: {e}")
            break

    print(f"\nCollected {len(all_items)} items in total")
    return all_items

# Usage example
base_url = 'https://example.com/items'
items = scrape_multiple_pages(base_url, max_pages=10)

# Save the results
import json
with open('scraped_items.json', 'w', encoding='utf-8') as f:
    json.dump(items, f, ensure_ascii=False, indent=2)

Example 6: Scraping Dynamic Content (with Selenium)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

def scrape_dynamic_content(url: str):
    """Scrape dynamic content that is loaded with JavaScript."""

    # Install Selenium: pip install selenium
    # A ChromeDriver is also required

    # Browser options
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run in the background
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # Initialize the driver
    driver = webdriver.Chrome(options=options)

    try:
        # Open the page
        driver.get(url)

        # Wait until a specific element has loaded
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'content')))

        # Scroll down (for infinite-scroll pages)
        SCROLL_PAUSE_TIME = 2
        last_height = driver.execute_script("return document.body.scrollHeight")

        while True:
            # Scroll to the bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)

            # Compute the new height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Get the page source
        html = driver.page_source

        # Parse it with BeautifulSoup
        soup = BeautifulSoup(html, 'lxml')

        # Extract the data
        items = soup.select('.item')
        results = []

        for item in items:
            title = item.select_one('.title').get_text(strip=True)
            results.append({'title': title})

        return results

    finally:
        # Quit the browser
        driver.quit()

# Usage example
url = 'https://example.com/dynamic-content'
results = scrape_dynamic_content(url)
print(f"Collected {len(results)} items")

Advanced Techniques

User-Agent Rotation

import random
import requests
from bs4 import BeautifulSoup

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
]

def get_random_user_agent():
    return random.choice(USER_AGENTS)

def scrape_with_rotation(url):
    headers = {'User-Agent': get_random_user_agent()}
    response = requests.get(url, headers=headers)
    return BeautifulSoup(response.text, 'lxml')

Error Handling and Retries

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session_with_retry():
    """Create a session with retry logic."""
    session = requests.Session()

    retry = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504]
    )

    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    return session

# Usage
session = create_session_with_retry()
response = session.get('https://example.com')

Rate Limiting

import time
from functools import wraps

import requests
from bs4 import BeautifulSoup

def rate_limit(max_per_minute):
    """Decorator that limits the number of calls per minute."""
    min_interval = 60.0 / max_per_minute

    def decorator(func):
        last_called = [0.0]

        @wraps(func)
        def wrapper(*args, **kwargs):
            elapsed = time.time() - last_called[0]
            left_to_wait = min_interval - elapsed

            if left_to_wait > 0:
                time.sleep(left_to_wait)

            result = func(*args, **kwargs)
            last_called[0] = time.time()
            return result

        return wrapper
    return decorator

@rate_limit(max_per_minute=30)
def scrape_page(url):
    response = requests.get(url)
    return BeautifulSoup(response.text, 'lxml')

์ž์ฃผ ๋ฌป๋Š” ์งˆ๋ฌธโ€‹

Q1. ์›น ์Šคํฌ๋ž˜ํ•‘์€ ํ•ฉ๋ฒ•์ธ๊ฐ€์š”?โ€‹

A: ์›น ์Šคํฌ๋ž˜ํ•‘ ์ž์ฒด๋Š” ํ•ฉ๋ฒ•์ด์ง€๋งŒ, ๋‹ค์Œ์„ ์ค€์ˆ˜ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค:

  • robots.txt ํ™•์ธ ๋ฐ ์ค€์ˆ˜
  • ์‚ฌ์ดํŠธ์˜ ์ด์šฉ ์•ฝ๊ด€ ๊ฒ€ํ† 
  • ๊ฐœ์ธ์ •๋ณด ๋ฐ ์ €์ž‘๊ถŒ ์กด์ค‘
  • ์„œ๋ฒ„์— ๊ณผ๋„ํ•œ ๋ถ€ํ•˜ ์ฃผ์ง€ ์•Š๊ธฐ
# robots.txt ํ™•์ธ
import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

url = "https://example.com/page"
user_agent = "MyBot"

if rp.can_fetch(user_agent, url):
print("์Šคํฌ๋ž˜ํ•‘ ํ—ˆ์šฉ")
else:
print("์Šคํฌ๋ž˜ํ•‘ ๊ธˆ์ง€")

Q2. How do I scrape pages that are rendered with JavaScript?

A: Use Selenium or Playwright.

# Selenium
pip install selenium

# Playwright (more modern)
pip install playwright
playwright install
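
For reference, here is a minimal Playwright sketch (the target URL and waiting strategy are placeholders); the rendered HTML is then handed to BeautifulSoup as usual:

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

# Render the page in a headless browser, then parse the resulting HTML.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://example.com/dynamic-content')  # placeholder URL
    page.wait_for_load_state('networkidle')           # wait for JS-driven requests to settle
    html = page.content()
    browser.close()

soup = BeautifulSoup(html, 'lxml')
print(soup.title.string)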

Q3. I'm being detected as a bot and blocked.

A: Try the following:

import random
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Add a random delay between requests
time.sleep(random.uniform(1, 3))

Q4. I'm running into encoding problems.

A: Set the response encoding explicitly.

response = requests.get(url)
response.encoding = 'utf-8'  # or 'euc-kr', 'cp949', etc.

# Automatic detection
import chardet
encoding = chardet.detect(response.content)['encoding']
response.encoding = encoding

Q5. How can I make scraping faster?

A: Use asynchronous requests.

import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def scrape_multiple(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        htmls = await asyncio.gather(*tasks)

    results = []
    for html in htmls:
        soup = BeautifulSoup(html, 'lxml')
        # Extract the data you need
        results.append(soup.title.string)

    return results

# Run
urls = ['https://example.com/page1', 'https://example.com/page2']
results = asyncio.run(scrape_multiple(urls))

๋‹ค์Œ ๋‹จ๊ณ„โ€‹

์›น ์Šคํฌ๋ž˜ํ•‘์„ ๋งˆ์Šคํ„ฐํ–ˆ๋‹ค๋ฉด, ๋‹ค์Œ ์ฃผ์ œ๋ฅผ ํ•™์Šตํ•ด๋ณด์„ธ์š”:

  1. Selenium/Playwright - ๋™์  ์›น ํŽ˜์ด์ง€ ์Šคํฌ๋ž˜ํ•‘
  2. Scrapy - ๋Œ€๊ทœ๋ชจ ์›น ํฌ๋กค๋ง ํ”„๋ ˆ์ž„์›Œํฌ
  3. ๋ฐ์ดํ„ฐ ๋ถ„์„ - pandas๋กœ ์ˆ˜์ง‘ํ•œ ๋ฐ์ดํ„ฐ ๋ถ„์„ํ•˜๊ธฐ
  4. API ๊ฐœ๋ฐœ - ์ˆ˜์ง‘ํ•œ ๋ฐ์ดํ„ฐ๋ฅผ API๋กœ ์ œ๊ณตํ•˜๊ธฐ

์ฐธ๊ณ  ์ž๋ฃŒโ€‹