🕸️ Web Scraping with BeautifulSoup
BeautifulSoup helps you parse HTML and extract the data you need from websites.
Mastering this concept will significantly boost your Python data science skills!
💻 Code Example:
from bs4 import BeautifulSoup
import requests


def scrape_pynfinity_page(url: str) -> dict:
    """Scrape a web page and extract structured data.

    Args:
        url: Address of the page to fetch.

    Returns:
        On success, a dict with these keys:

        * ``title`` - text of the ``<title>`` tag, or ``"N/A"`` if absent.
        * ``description`` - ``content`` of ``<meta name="description">``,
          or ``"No description"`` if the tag or its ``content`` attribute
          is missing.
        * ``headings`` - text of the first 5 ``h1``/``h2``/``h3`` tags.
        * ``links`` - up to 10 ``{"text": ..., "href": ...}`` dicts for
          anchors that have an ``href`` and non-empty text.
        * ``paragraphs`` - the first 3 ``<p>`` texts longer than 30
          characters.

        If the request fails (connection error, timeout, HTTP error
        status), ``{"error": "<message>"}`` is returned instead.
    """
    headers = {"User-Agent": "Mozilla/5.0 (pynfinity-bot/1.0)"}
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as e:
        return {"error": str(e)}

    soup = BeautifulSoup(resp.text, "html.parser")

    # 1. Title
    title = soup.title.get_text(strip=True) if soup.title else "N/A"

    # 2. All headings
    headings = [
        h.get_text(strip=True) for h in soup.find_all(["h1", "h2", "h3"])
    ]

    # 3. All links with text (anchors whose text is empty are skipped)
    links = []
    for a in soup.find_all("a", href=True):
        text = a.get_text(strip=True)
        if text:
            links.append({"text": text, "href": a.get("href", "")})
    links = links[:10]

    # 4. Meta description
    # A <meta name="description"> tag may lack a content attribute, so use
    # .get() with a fallback instead of indexing (which would raise KeyError).
    meta = soup.find("meta", attrs={"name": "description"})
    description = (
        meta.get("content", "No description") if meta else "No description"
    )

    # 5. All paragraph text (only paragraphs longer than 30 characters)
    paragraph_texts = (p.get_text(strip=True) for p in soup.find_all("p"))
    paragraphs = [text for text in paragraph_texts if len(text) > 30]

    return {
        "title": title,
        "description": description,
        "headings": headings[:5],
        "links": links,
        "paragraphs": paragraphs[:3],
    }


# Demo — httpbin's /html endpoint serves a small sample HTML page.
data = scrape_pynfinity_page("https://httpbin.org/html")
if "error" in data:
    # On request failure the scraper returns only {"error": ...}; the
    # regular keys are absent, so report the problem instead of indexing.
    print("Request failed:", data["error"])
else:
    print("Title:", data["title"])
    print("Paragraphs found:", len(data["paragraphs"]))
    print("Links found:", len(data["links"]))
Keep exploring and happy coding! 💻