-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextractcontentfromhtml.py
More file actions
77 lines (67 loc) · 2.98 KB
/
extractcontentfromhtml.py
File metadata and controls
77 lines (67 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
def extract_article(url, output_filename="extracted_article.html"):
    """
    Extract the main article content and images from a webpage and save
    them to a simplified, self-contained HTML file.

    Args:
        url: The URL of the article or blog post.
        output_filename: Path of the HTML file to write the extracted
            content to (overwritten if it already exists).

    Side effects:
        Writes ``output_filename`` and prints status/error messages.
        Network and parsing failures are reported, not raised.
    """
    try:
        # A timeout is essential: requests has NO default timeout, so
        # without one a stalled server hangs this call forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        soup = BeautifulSoup(response.content, 'html.parser')

        # --- Identify Main Content Area ---
        # This is site-specific; adjust the selector for the target site.
        # Strategy, in order:
        #   1. A known container class (here: <div class="article-content">).
        #   2. The <div> containing the most text (crude heuristic).
        #   3. The whole <body> (or the full document) as last resort.
        main_content = soup.find('div', class_='article-content')
        if main_content is None:
            # default=None avoids the ValueError the original max(...) raised
            # on pages with no <div> at all, which would otherwise be
            # swallowed by the generic handler below as a confusing error.
            main_content = max(
                soup.find_all('div'),
                key=lambda tag: len(tag.text),
                default=None,
            )  # This will likely require manual adjustment for each site!
        if main_content is None:
            print(f"Warning: Could not automatically identify the main content area. You may need to adjust the selection logic in the script.")
            # soup.body can itself be None for fragments/malformed HTML;
            # fall back to the whole parsed document in that case.
            main_content = soup.body if soup.body is not None else soup

        # --- Extract Images and Update Source URLs ---
        # Rewrite relative image URLs to absolute ones so the images still
        # load when the saved file is opened locally.
        for img in main_content.find_all('img'):
            src = img.get('src')
            if src:
                img['src'] = urljoin(url, src)

        # --- Create Stripped-Down HTML ---
        html_content = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Extracted Article</title>
<style>
body {{
font-family: sans-serif;
margin: 20px;
line-height: 1.6;
}}
img {{
max-width: 100%;
height: auto;
}}
</style>
</head>
<body>
{main_content.prettify()}
</body>
</html>"""

        # --- Save to File ---
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"Article extracted and saved to {output_filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")