From f2807bed34edcca98c5850dfa8373a659571cffc Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sun, 4 Jan 2026 03:59:32 +0000
Subject: [PATCH 1/4] Refactor CSDN exporter to remove aria2 dependency and
 improve robustness

- Removed `aria2c` dependency; images are now downloaded using `httpx`.
- Added User-Agent headers to all requests to mitigate anti-scraping blocks.
- Replaced `os.system` with `subprocess.run` for safer PDF generation.
- Added error handling for network requests and HTML parsing.
---
 main.py  | 68 +++++++++++++++++++++++++++++++++++++++++++-------------
 utils.py | 16 ++++++++++---
 2 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/main.py b/main.py
index 7863459..324ca09 100644
--- a/main.py
+++ b/main.py
@@ -15,6 +15,12 @@
 from bs4 import BeautifulSoup, NavigableString
 from utils import Parser
 import re
+import subprocess
+
+
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+}
 
 
 parser = argparse.ArgumentParser('CSDN Blog Exporter: To Markdown or To PDF')
@@ -47,7 +53,13 @@
 args = parser.parse_args()
 
 def html2md(url, md_file, with_title=False, is_win=False):
-    response = httpx.get(url)
+    try:
+        response = httpx.get(url, headers=HEADERS, timeout=20.0, follow_redirects=True)
+        response.raise_for_status()
+    except Exception as e:
+        print(f"Error fetching {url}: {e}")
+        return
+
     soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")
     html = ""
     for child in soup.find_all('svg'):
@@ -72,19 +84,22 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False):
     cmd = ["pandoc",
         "--toc",
         "--pdf-engine=xelatex",
-        "-V mainfont='Source Code Pro'",
-        "-V monofont='Source Code Pro'",
-        "-V documentclass='{}'".format(documentclass),
-        "-V geometry:'top=2cm, bottom=2cm, left=1.6cm, right=1.6cm'",
-        "-V pagestyle=plain",
-        "-V fontsize=11pt",
-        "-V colorlinks=blue",
-        "-s {}".format(input_md_file),
-        "-o {}".format(pdf_file),
+        "-V", "mainfont=Source Code Pro",
+        "-V", "monofont=Source Code Pro",
+        "-V", "documentclass={}".format(documentclass),
+        "-V", "geometry:top=2cm, bottom=2cm, left=1.6cm, right=1.6cm",
+        "-V", "pagestyle=plain",
+        "-V", "fontsize=11pt",
+        "-V", "colorlinks=blue",
+        "-s", "{}".format(input_md_file),
+        "-o", "{}".format(pdf_file),
     ]
-    cmd = ' '.join(cmd)
+    # cmd = ' '.join(cmd)
     print('Generate PDF File: {}'.format(pdf_file))
-    os.system(cmd)
+    try:
+        subprocess.run(cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Error generating PDF: {e}")
 
 def get_category_article_info(soup):
     url = soup.find_all('a')[0].attrs['href']
@@ -108,9 +123,19 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
         suffix = '.html' if page == 1 else '_{}.html'.format(page)
         category_url_new = category_url.rstrip('.html') + suffix
         print('Getting Response From {}'.format(category_url_new))
-        response = httpx.get(category_url_new)
+        try:
+            response = httpx.get(category_url_new, headers=HEADERS, timeout=20.0, follow_redirects=True)
+            response.raise_for_status()
+        except Exception as e:
+            print(f"Error fetching category {category_url_new}: {e}")
+            break
+
         soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")
-        article_list = soup.find_all('ul', {'class': 'column_article_list'})[0]
+        article_list_tags = soup.find_all('ul', {'class': 'column_article_list'})
+        if not article_list_tags:
+            print("No article list found")
+            break
+        article_list = article_list_tags[0]
         p = article_list.find_all('p')
         if p and p[0].string == '空空如也':
             print('No Content in {}, I Will Skip It!'.format(category_url_new))
@@ -135,9 +160,20 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
 def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf', to_pdf=False, is_win=False):
     if not exists(md_dir):
         os.makedirs(md_dir)
-    response = httpx.get(details_url)
+
+    try:
+        response = httpx.get(details_url, headers=HEADERS, timeout=20.0, follow_redirects=True)
+        response.raise_for_status()
+    except Exception as e:
+        print(f"Error fetching article {details_url}: {e}")
+        return
+
     soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")
-    title = soup.find_all('h1', {'class': 'title-article'})[0].string  ## 使用 html 的 title 作为 md 文件名
+    title_tags = soup.find_all('h1', {'class': 'title-article'})
+    if not title_tags:
+        print("Could not find article title. The page structure might have changed.")
+        return
+    title = title_tags[0].string  ## 使用 html 的 title 作为 md 文件名
     title = '_'.join(title.replace('*', '').replace('/','-').replace('\\','-').replace('//','-').strip().split())
     md_file = join(md_dir, title + '.md')
     print('Export Markdown File To {}'.format(md_file))
diff --git a/utils.py b/utils.py
index 6bd4594..eb0aab6 100644
--- a/utils.py
+++ b/utils.py
@@ -7,6 +7,7 @@
 import os
 from os.path import join, exists
 import re
+import httpx
 
 special_characters = {
     "&lt;": "<", "&gt;": ">", "&nbsp": " ",
@@ -107,11 +108,20 @@ def recursive(self, soup):
                     img_filename = result_tuple[1].split('/')[-1].rstrip('?')
                 # img_file = re.findall(pattern, src)[0][0].split('/')[-1].rstrip('?') ## e.g. https://img-blog.csdnimg.cn/20200228210146931.png?
                 
-                download_img_cmd = 'aria2c --file-allocation=none -c -x 10 -s 10 -d {} -o {} {}'.format(self.fig_dir, img_filename, src)
-
                 img_file = join(self.fig_dir, img_filename)
                 if not exists(img_file):
-                    os.system(download_img_cmd)
+                    try:
+                        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}
+                        with httpx.Client(verify=False) as client:
+                            resp = client.get(src, headers=headers, timeout=20.0)
+                            if resp.status_code == 200:
+                                with open(img_file, 'wb') as f:
+                                    f.write(resp.content)
+                            else:
+                                print(f"Download failed: {src}, status: {resp.status_code}")
+                    except Exception as e:
+                        print(f"Error downloading {src}: {e}")
+
                 # soup.attrs['src'] = img_file
                 # self.outputs.append('\n' + str(soup.parent) + '\n')
                 code = '![{}]({})'.format(img_file, img_file)

From 21fd84e3ec52638d4845cfda3948be116d765374 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sun, 4 Jan 2026 05:09:44 +0000
Subject: [PATCH 2/4] Refactor CSDN exporter to fix network and parsing errors

- Fixed `[Errno 0] Error` by adding `verify=False` to `httpx` requests, bypassing SSL handshake issues.
- Fixed `AttributeError` in `utils.py` caused by attempting to set the read-only `.string` property of `NavigableString` in newer BeautifulSoup versions.
- Removed `aria2c` dependency; images are now downloaded using `httpx`.
- Added User-Agent headers to all requests to mitigate anti-scraping blocks.
- Replaced `os.system` with `subprocess.run` for safer PDF generation.
- Added robust error handling for network requests and HTML parsing.
---
 main.py  | 6 +++---
 utils.py | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/main.py b/main.py
index 324ca09..79b3c03 100644
--- a/main.py
+++ b/main.py
@@ -54,7 +54,7 @@
 
 def html2md(url, md_file, with_title=False, is_win=False):
     try:
-        response = httpx.get(url, headers=HEADERS, timeout=20.0, follow_redirects=True)
+        response = httpx.get(url, headers=HEADERS, timeout=20.0, follow_redirects=True, verify=False)
         response.raise_for_status()
     except Exception as e:
         print(f"Error fetching {url}: {e}")
@@ -124,7 +124,7 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
         category_url_new = category_url.rstrip('.html') + suffix
         print('Getting Response From {}'.format(category_url_new))
         try:
-            response = httpx.get(category_url_new, headers=HEADERS, timeout=20.0, follow_redirects=True)
+            response = httpx.get(category_url_new, headers=HEADERS, timeout=20.0, follow_redirects=True, verify=False)
             response.raise_for_status()
         except Exception as e:
             print(f"Error fetching category {category_url_new}: {e}")
@@ -162,7 +162,7 @@ def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf
         os.makedirs(md_dir)
 
     try:
-        response = httpx.get(details_url, headers=HEADERS, timeout=20.0, follow_redirects=True)
+        response = httpx.get(details_url, headers=HEADERS, timeout=20.0, follow_redirects=True, verify=False)
         response.raise_for_status()
     except Exception as e:
         print(f"Error fetching article {details_url}: {e}")
diff --git a/utils.py b/utils.py
index eb0aab6..6d61a39 100644
--- a/utils.py
+++ b/utils.py
@@ -37,9 +37,10 @@ def remove_comment(self, soup):
     def recursive(self, soup):
         if isinstance(soup, Comment): return
         elif isinstance(soup, NavigableString):
+            text = soup.string
             for key, val in special_characters.items():
-                soup.string = soup.string.replace(key, val)
-            self.outputs.append(soup.string)
+                text = text.replace(key, val)
+            self.outputs.append(text)
         elif isinstance(soup, Tag):
             tag = soup.name
             if tag in ['h1', 'h2', 'h3', 'h4', 'h5']:

From 1a5615414ea8822f41bbd281bf2594a533a3b4c1 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Fri, 9 Jan 2026 03:50:13 +0000
Subject: [PATCH 3/4] Refactor CSDN exporter to fix network and parsing errors

- Fixed `[Errno 0] Error` by forcing HTTP/1.1 (`http2=False`) and bypassing SSL verification (`verify=False`) in `httpx` requests.
- Fixed `AttributeError` in `utils.py` caused by attempting to set the read-only `.string` property of `NavigableString`.
- Removed `aria2c` dependency; images are now downloaded natively using `httpx`.
- Added User-Agent headers to all requests to mitigate anti-scraping blocks.
- Replaced `os.system` with `subprocess.run` for safer PDF generation.
- Added robust error handling for network requests and HTML parsing.
---
 main.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/main.py b/main.py
index 79b3c03..05d50aa 100644
--- a/main.py
+++ b/main.py
@@ -54,8 +54,10 @@
 
 def html2md(url, md_file, with_title=False, is_win=False):
     try:
-        response = httpx.get(url, headers=HEADERS, timeout=20.0, follow_redirects=True, verify=False)
-        response.raise_for_status()
+        # Force HTTP/1.1 to avoid [Errno 0] Error in some environments
+        with httpx.Client(http2=False, verify=False, timeout=20.0, follow_redirects=True) as client:
+            response = client.get(url, headers=HEADERS)
+            response.raise_for_status()
     except Exception as e:
         print(f"Error fetching {url}: {e}")
         return
@@ -124,8 +126,9 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
         category_url_new = category_url.rstrip('.html') + suffix
         print('Getting Response From {}'.format(category_url_new))
         try:
-            response = httpx.get(category_url_new, headers=HEADERS, timeout=20.0, follow_redirects=True, verify=False)
-            response.raise_for_status()
+            with httpx.Client(http2=False, verify=False, timeout=20.0, follow_redirects=True) as client:
+                response = client.get(category_url_new, headers=HEADERS)
+                response.raise_for_status()
         except Exception as e:
             print(f"Error fetching category {category_url_new}: {e}")
             break
@@ -162,8 +165,9 @@ def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf
         os.makedirs(md_dir)
 
     try:
-        response = httpx.get(details_url, headers=HEADERS, timeout=20.0, follow_redirects=True, verify=False)
-        response.raise_for_status()
+        with httpx.Client(http2=False, verify=False, timeout=20.0, follow_redirects=True) as client:
+            response = client.get(details_url, headers=HEADERS)
+            response.raise_for_status()
     except Exception as e:
         print(f"Error fetching article {details_url}: {e}")
         return

From 02d80d088bb671ed5c6533cd1e831c36fa1fe446 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Fri, 9 Jan 2026 04:02:23 +0000
Subject: [PATCH 4/4] Refactor CSDN exporter to fix network and parsing errors

- Fixed `[Errno 0] Error` by forcing HTTP/1.1 (`http2=False`), bypassing SSL verification (`verify=False`), and adding session warm-up logic.
- Fixed `AttributeError` in `utils.py` caused by attempting to set the read-only `.string` property of `NavigableString`.
- Removed `aria2c` dependency; images are now downloaded natively using `httpx`.
- Added comprehensive browser headers (User-Agent, Referer, Accept) to bypass anti-scraping blocks.
- Replaced `os.system` with `subprocess.run` for safer PDF generation.
- Added robust error handling for network requests and HTML parsing.
---
 main.py  | 31 +++++++++++++++++++++++++++----
 utils.py |  9 ++++++---
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 05d50aa..b7059be 100644
--- a/main.py
+++ b/main.py
@@ -19,7 +19,12 @@
 
 
 HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Language': 'en-US,en;q=0.9',
+    'Referer': 'https://blog.csdn.net/',
+    'Connection': 'keep-alive',
+    'Upgrade-Insecure-Requests': '1',
 }
 
 
@@ -55,7 +60,13 @@
 def html2md(url, md_file, with_title=False, is_win=False):
     try:
         # Force HTTP/1.1 to avoid [Errno 0] Error in some environments
-        with httpx.Client(http2=False, verify=False, timeout=20.0, follow_redirects=True) as client:
+        with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client:
+            # Warm up session
+            try:
+                client.get("https://blog.csdn.net/", headers=HEADERS)
+            except Exception:
+                pass # Ignore warmup failure
+
             response = client.get(url, headers=HEADERS)
             response.raise_for_status()
     except Exception as e:
@@ -126,7 +137,13 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
         category_url_new = category_url.rstrip('.html') + suffix
         print('Getting Response From {}'.format(category_url_new))
         try:
-            with httpx.Client(http2=False, verify=False, timeout=20.0, follow_redirects=True) as client:
+            with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client:
+                # Warm up session
+                try:
+                    client.get("https://blog.csdn.net/", headers=HEADERS)
+                except Exception:
+                    pass
+
                 response = client.get(category_url_new, headers=HEADERS)
                 response.raise_for_status()
         except Exception as e:
@@ -165,7 +182,13 @@ def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf
         os.makedirs(md_dir)
 
     try:
-        with httpx.Client(http2=False, verify=False, timeout=20.0, follow_redirects=True) as client:
+        with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client:
+            # Warm up session
+            try:
+                client.get("https://blog.csdn.net/", headers=HEADERS)
+            except Exception:
+                pass
+
             response = client.get(details_url, headers=HEADERS)
             response.raise_for_status()
     except Exception as e:
diff --git a/utils.py b/utils.py
index 6d61a39..052972a 100644
--- a/utils.py
+++ b/utils.py
@@ -112,9 +112,12 @@ def recursive(self, soup):
                 img_file = join(self.fig_dir, img_filename)
                 if not exists(img_file):
                     try:
-                        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}
-                        with httpx.Client(verify=False) as client:
-                            resp = client.get(src, headers=headers, timeout=20.0)
+                        headers = {
+                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
+                            'Referer': 'https://blog.csdn.net/'
+                        }
+                        with httpx.Client(http2=False, verify=False, timeout=30.0) as client:
+                            resp = client.get(src, headers=headers)
                             if resp.status_code == 200:
                                 with open(img_file, 'wb') as f:
                                     f.write(resp.content)