-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextractcontentfromhtml.py
More file actions
77 lines (67 loc) · 2.98 KB
/
extractcontentfromhtml.py
File metadata and controls
77 lines (67 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
def extract_article(url, output_filename="extracted_article.html"):
    """
    Extract the main article content and images from a webpage and save
    them to a simplified, self-contained HTML file.

    Args:
        url: The URL of the article or blog post.
        output_filename: Path of the HTML file to write the extracted
            content to (overwritten if it already exists).

    Side effects:
        Writes ``output_filename`` and prints status/error messages.
        Network and parsing failures are reported, not raised.
    """
    try:
        # A timeout is essential: requests has NO default timeout, so
        # without one a stalled server hangs this call forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        soup = BeautifulSoup(response.content, 'html.parser')

        # --- Identify Main Content Area ---
        # This is site-specific; adjust the selector for the target site.
        # Strategy, in order:
        #   1. A known container class (here: <div class="article-content">).
        #   2. The <div> containing the most text (crude heuristic).
        #   3. The whole <body> (or the full document) as last resort.
        main_content = soup.find('div', class_='article-content')
        if main_content is None:
            # default=None avoids the ValueError the original max(...) raised
            # on pages with no <div> at all, which would otherwise be
            # swallowed by the generic handler below as a confusing error.
            main_content = max(
                soup.find_all('div'),
                key=lambda tag: len(tag.text),
                default=None,
            )  # This will likely require manual adjustment for each site!
        if main_content is None:
            print(f"Warning: Could not automatically identify the main content area. You may need to adjust the selection logic in the script.")
            # soup.body can itself be None for fragments/malformed HTML;
            # fall back to the whole parsed document in that case.
            main_content = soup.body if soup.body is not None else soup

        # --- Extract Images and Update Source URLs ---
        # Rewrite relative image URLs to absolute ones so the images still
        # load when the saved file is opened locally.
        for img in main_content.find_all('img'):
            src = img.get('src')
            if src:
                img['src'] = urljoin(url, src)

        # --- Create Stripped-Down HTML ---
        html_content = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Extracted Article</title>
<style>
body {{
font-family: sans-serif;
margin: 20px;
line-height: 1.6;
}}
img {{
max-width: 100%;
height: auto;
}}
</style>
</head>
<body>
{main_content.prettify()}
</body>
</html>"""

        # --- Save to File ---
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"Article extracted and saved to {output_filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")