axzml · google-labs-jules · Jan 4, 2026 · Jan 4, 2026 · Jan 9, 2026 · Jan 9, 2026
diff --git a/main.py b/main.py
@@ -15,6 +15,17 @@
 from bs4 import BeautifulSoup, NavigableString
 from utils import Parser
 import re
+import subprocess
+
+
+HEADERS = {  # Browser-like request headers passed to the client.get() calls that fetch CSDN pages
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Language': 'en-US,en;q=0.9',
+    'Referer': 'https://blog.csdn.net/',
+    'Connection': 'keep-alive',
+    'Upgrade-Insecure-Requests': '1',
+}
 
 
 parser = argparse.ArgumentParser('CSDN Blog Exporter: To Markdown or To PDF')
@@ -47,7 +58,21 @@
 args = parser.parse_args()
 
 def html2md(url, md_file, with_title=False, is_win=False):
-    response = httpx.get(url)
+    try:
+        # Use HTTP/1.1 (http2=False) to avoid "[Errno 0] Error" in some environments. NOTE: verify=False disables TLS certificate verification.
+        with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client:
+            # Best-effort warm-up: request the CSDN homepage on this client before the real fetch; any failure is ignored
+            try:
+                client.get("https://blog.csdn.net/", headers=HEADERS)
+            except Exception:
+                pass # Ignore warmup failure
+
+            response = client.get(url, headers=HEADERS)
+            response.raise_for_status()
+    except Exception as e:
+        print(f"Error fetching {url}: {e}")
+        return
+
     soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")
     html = ""
     for child in soup.find_all('svg'):
@@ -72,19 +97,22 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False):
     cmd = ["pandoc",
         "--toc",
         "--pdf-engine=xelatex",
-        "-V mainfont='Source Code Pro'",
-        "-V monofont='Source Code Pro'",
-        "-V documentclass='{}'".format(documentclass),
-        "-V geometry:'top=2cm, bottom=2cm, left=1.6cm, right=1.6cm'",
-        "-V pagestyle=plain",
-        "-V fontsize=11pt",
-        "-V colorlinks=blue",
-        "-s {}".format(input_md_file),
-        "-o {}".format(pdf_file),
+        "-V", "mainfont=Source Code Pro",
+        "-V", "monofont=Source Code Pro",
+        "-V", "documentclass={}".format(documentclass),
+        "-V", "geometry:top=2cm, bottom=2cm, left=1.6cm, right=1.6cm",
+        "-V", "pagestyle=plain",
+        "-V", "fontsize=11pt",
+        "-V", "colorlinks=blue",
+        "-s", "{}".format(input_md_file),
+        "-o", "{}".format(pdf_file),
     ]
-    cmd = ' '.join(cmd)
+    # Keep cmd as an argument list: subprocess.run executes it without a shell, so no joining or quoting is needed
     print('Generate PDF File: {}'.format(pdf_file))
-    os.system(cmd)
+    try:
+        subprocess.run(cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Error generating PDF: {e}")
 
 def get_category_article_info(soup):
     url = soup.find_all('a')[0].attrs['href']
@@ -108,9 +136,26 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
         suffix = '.html' if page == 1 else '_{}.html'.format(page)
         category_url_new = category_url.rstrip('.html') + suffix
         print('Getting Response From {}'.format(category_url_new))
-        response = httpx.get(category_url_new)
+        try:
+            with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client:
+                # Best-effort warm-up: request the CSDN homepage on this client before the real fetch; any failure is ignored
+                try:
+                    client.get("https://blog.csdn.net/", headers=HEADERS)
+                except Exception:
+                    pass
+
+                response = client.get(category_url_new, headers=HEADERS)
+                response.raise_for_status()
+        except Exception as e:
+            print(f"Error fetching category {category_url_new}: {e}")
+            break
+
         soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")
-        article_list = soup.find_all('ul', {'class': 'column_article_list'})[0]
+        article_list_tags = soup.find_all('ul', {'class': 'column_article_list'})
+        if not article_list_tags:
+            print("No article list found")
+            break
+        article_list = article_list_tags[0]
         p = article_list.find_all('p')
         if p and p[0].string == '空空如也':
             print('No Content in {}, I Will Skip It!'.format(category_url_new))
@@ -135,9 +180,27 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
 def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf', to_pdf=False, is_win=False):
     if not exists(md_dir):
         os.makedirs(md_dir)
-    response = httpx.get(details_url)
+
+    try:
+        with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client:
+            # Best-effort warm-up: request the CSDN homepage on this client before the real fetch; any failure is ignored
+            try:
+                client.get("https://blog.csdn.net/", headers=HEADERS)
+            except Exception:
+                pass
+
+            response = client.get(details_url, headers=HEADERS)
+            response.raise_for_status()
+    except Exception as e:
+        print(f"Error fetching article {details_url}: {e}")
+        return
+
     soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")
-    title = soup.find_all('h1', {'class': 'title-article'})[0].string  ## 使用 html 的 title 作为 md 文件名
+    title_tags = soup.find_all('h1', {'class': 'title-article'})
+    if not title_tags:
+        print("Could not find article title. The page structure might have changed.")
+        return
+    title = title_tags[0].string  ## 使用 html 的 title 作为 md 文件名
     title = '_'.join(title.replace('*', '').replace('/','-').replace('\\','-').replace('//','-').strip().split())
     md_file = join(md_dir, title + '.md')
     print('Export Markdown File To {}'.format(md_file))

diff --git a/utils.py b/utils.py
@@ -7,6 +7,7 @@
 import os
 from os.path import join, exists
 import re
+import httpx
 
 special_characters = {
     "&lt;": "<", "&gt;": ">", "&nbsp": " ",
@@ -36,9 +37,10 @@ def remove_comment(self, soup):
     def recursive(self, soup):
         if isinstance(soup, Comment): return
         elif isinstance(soup, NavigableString):
+            text = soup.string
             for key, val in special_characters.items():
-                soup.string = soup.string.replace(key, val)
-            self.outputs.append(soup.string)
+                text = text.replace(key, val)
+            self.outputs.append(text)
         elif isinstance(soup, Tag):
             tag = soup.name
             if tag in ['h1', 'h2', 'h3', 'h4', 'h5']:
@@ -107,11 +109,23 @@ def recursive(self, soup):
                     img_filename = result_tuple[1].split('/')[-1].rstrip('?')
                 # img_file = re.findall(pattern, src)[0][0].split('/')[-1].rstrip('?') ## e.g. https://img-blog.csdnimg.cn/20200228210146931.png?
 
-                download_img_cmd = 'aria2c --file-allocation=none -c -x 10 -s 10 -d {} -o {} {}'.format(self.fig_dir, img_filename, src)
-
                 img_file = join(self.fig_dir, img_filename)
                 if not exists(img_file):
-                    os.system(download_img_cmd)
+                    try:
+                        headers = {
+                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
+                            'Referer': 'https://blog.csdn.net/'
+                        }
+                        with httpx.Client(http2=False, verify=False, timeout=30.0) as client:
+                            resp = client.get(src, headers=headers)
+                            if resp.status_code == 200:
+                                with open(img_file, 'wb') as f:
+                                    f.write(resp.content)
+                            else:
+                                print(f"Download failed: {src}, status: {resp.status_code}")
+                    except Exception as e:
+                        print(f"Error downloading {src}: {e}")
+
                 # soup.attrs['src'] = img_file
                 # self.outputs.append('\n' + str(soup.parent) + '\n')
                 code = '![{}]({})'.format(img_file, img_file)