diff --git a/main.py b/main.py index 7863459..b7059be 100644 --- a/main.py +++ b/main.py @@ -15,6 +15,17 @@ from bs4 import BeautifulSoup, NavigableString from utils import Parser import re +import subprocess + + +HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Accept-Language': 'en-US,en;q=0.9', + 'Referer': 'https://blog.csdn.net/', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', +} parser = argparse.ArgumentParser('CSDN Blog Exporter: To Markdown or To PDF') @@ -47,7 +58,21 @@ args = parser.parse_args() def html2md(url, md_file, with_title=False, is_win=False): - response = httpx.get(url) + try: + # Force HTTP/1.1 to avoid [Errno 0] Error in some environments + with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client: + # Warm up session + try: + client.get("https://blog.csdn.net/", headers=HEADERS) + except Exception: + pass # Ignore warmup failure + + response = client.get(url, headers=HEADERS) + response.raise_for_status() + except Exception as e: + print(f"Error fetching {url}: {e}") + return + soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8") html = "" for child in soup.find_all('svg'): @@ -72,19 +97,22 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False): cmd = ["pandoc", "--toc", "--pdf-engine=xelatex", - "-V mainfont='Source Code Pro'", - "-V monofont='Source Code Pro'", - "-V documentclass='{}'".format(documentclass), - "-V geometry:'top=2cm, bottom=2cm, left=1.6cm, right=1.6cm'", - "-V pagestyle=plain", - "-V fontsize=11pt", - "-V colorlinks=blue", - "-s {}".format(input_md_file), - "-o {}".format(pdf_file), + "-V", "mainfont=Source Code Pro", + "-V", "monofont=Source Code Pro", + "-V", 
"documentclass={}".format(documentclass), + "-V", "geometry:top=2cm, bottom=2cm, left=1.6cm, right=1.6cm", + "-V", "pagestyle=plain", + "-V", "fontsize=11pt", + "-V", "colorlinks=blue", + "-s", "{}".format(input_md_file), + "-o", "{}".format(pdf_file), ] - cmd = ' '.join(cmd) + # cmd = ' '.join(cmd) print('Generate PDF File: {}'.format(pdf_file)) - os.system(cmd) + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as e: + print(f"Error generating PDF: {e}") def get_category_article_info(soup): url = soup.find_all('a')[0].attrs['href'] @@ -108,9 +136,26 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100, suffix = '.html' if page == 1 else '_{}.html'.format(page) category_url_new = category_url.rstrip('.html') + suffix print('Getting Response From {}'.format(category_url_new)) - response = httpx.get(category_url_new) + try: + with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client: + # Warm up session + try: + client.get("https://blog.csdn.net/", headers=HEADERS) + except Exception: + pass + + response = client.get(category_url_new, headers=HEADERS) + response.raise_for_status() + except Exception as e: + print(f"Error fetching category {category_url_new}: {e}") + break + soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8") - article_list = soup.find_all('ul', {'class': 'column_article_list'})[0] + article_list_tags = soup.find_all('ul', {'class': 'column_article_list'}) + if not article_list_tags: + print("No article list found") + break + article_list = article_list_tags[0] p = article_list.find_all('p') if p and p[0].string == '空空如也': print('No Content in {}, I Will Skip It!'.format(category_url_new)) @@ -135,9 +180,27 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100, def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf', to_pdf=False, is_win=False): if not exists(md_dir): 
os.makedirs(md_dir) - response = httpx.get(details_url) + + try: + with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client: + # Warm up session + try: + client.get("https://blog.csdn.net/", headers=HEADERS) + except Exception: + pass + + response = client.get(details_url, headers=HEADERS) + response.raise_for_status() + except Exception as e: + print(f"Error fetching article {details_url}: {e}") + return + soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8") - title = soup.find_all('h1', {'class': 'title-article'})[0].string ## 使用 html 的 title 作为 md 文件名 + title_tags = soup.find_all('h1', {'class': 'title-article'}) + if not title_tags: + print("Could not find article title. The page structure might have changed.") + return + title = title_tags[0].string ## 使用 html 的 title 作为 md 文件名 title = '_'.join(title.replace('*', '').replace('/','-').replace('\\','-').replace('//','-').strip().split()) md_file = join(md_dir, title + '.md') print('Export Markdown File To {}'.format(md_file)) diff --git a/utils.py b/utils.py index 6bd4594..052972a 100644 --- a/utils.py +++ b/utils.py @@ -7,6 +7,7 @@ import os from os.path import join, exists import re +import httpx special_characters = { "&lt;": "<", "&gt;": ">", "&nbsp;": " ", @@ -36,9 +37,10 @@ def remove_comment(self, soup): def recursive(self, soup): if isinstance(soup, Comment): return elif isinstance(soup, NavigableString): + text = soup.string for key, val in special_characters.items(): - soup.string = soup.string.replace(key, val) - self.outputs.append(soup.string) + text = text.replace(key, val) + self.outputs.append(text) elif isinstance(soup, Tag): tag = soup.name if tag in ['h1', 'h2', 'h3', 'h4', 'h5']: @@ -107,11 +109,23 @@ def recursive(self, soup): img_filename = result_tuple[1].split('/')[-1].rstrip('?') # img_file = re.findall(pattern, src)[0][0].split('/')[-1].rstrip('?') ## e.g. https://img-blog.csdnimg.cn/20200228210146931.png?
- download_img_cmd = 'aria2c --file-allocation=none -c -x 10 -s 10 -d {} -o {} {}'.format(self.fig_dir, img_filename, src) - img_file = join(self.fig_dir, img_filename) if not exists(img_file): - os.system(download_img_cmd) + try: + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36', + 'Referer': 'https://blog.csdn.net/' + } + with httpx.Client(http2=False, verify=False, timeout=30.0) as client: + resp = client.get(src, headers=headers) + if resp.status_code == 200: + with open(img_file, 'wb') as f: + f.write(resp.content) + else: + print(f"Download failed: {src}, status: {resp.status_code}") + except Exception as e: + print(f"Error downloading {src}: {e}") + # soup.attrs['src'] = img_file # self.outputs.append('\n' + str(soup.parent) + '\n') code = '![{}]({})'.format(img_file, img_file)