Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 79 additions & 16 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@
from bs4 import BeautifulSoup, NavigableString
from utils import Parser
import re
import subprocess


# Default request headers sent with every CSDN page fetch in this module.
# The values describe a desktop Chrome 124 browser on 64-bit Windows 10;
# presumably this makes requests resemble a normal browser visit -- confirm
# against the site's behaviour before changing any of them.
HEADERS = {
    # Browser identification string (Chrome 124 / Windows 10 x64).
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    # Content types accepted, with q-weights in browser order.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    # Preferred response languages: US English first, then any English.
    'Accept-Language': 'en-US,en;q=0.9',
    # Claimed originating page: the CSDN blog root.
    'Referer': 'https://blog.csdn.net/',
    # Ask the server to keep the connection open between requests.
    'Connection': 'keep-alive',
    # Signal that the client prefers an HTTPS version of the page.
    'Upgrade-Insecure-Requests': '1',
}


parser = argparse.ArgumentParser('CSDN Blog Exporter: To Markdown or To PDF')
Expand Down Expand Up @@ -47,7 +58,21 @@
args = parser.parse_args()

def html2md(url, md_file, with_title=False, is_win=False):
response = httpx.get(url)
try:
# Force HTTP/1.1 to avoid [Errno 0] Error in some environments
with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client:
# Warm up session
try:
client.get("https://blog.csdn.net/", headers=HEADERS)
except Exception:
pass # Ignore warmup failure

response = client.get(url, headers=HEADERS)
response.raise_for_status()
except Exception as e:
print(f"Error fetching {url}: {e}")
return

soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")
html = ""
for child in soup.find_all('svg'):
Expand All @@ -72,19 +97,22 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False):
cmd = ["pandoc",
"--toc",
"--pdf-engine=xelatex",
"-V mainfont='Source Code Pro'",
"-V monofont='Source Code Pro'",
"-V documentclass='{}'".format(documentclass),
"-V geometry:'top=2cm, bottom=2cm, left=1.6cm, right=1.6cm'",
"-V pagestyle=plain",
"-V fontsize=11pt",
"-V colorlinks=blue",
"-s {}".format(input_md_file),
"-o {}".format(pdf_file),
"-V", "mainfont=Source Code Pro",
"-V", "monofont=Source Code Pro",
"-V", "documentclass={}".format(documentclass),
"-V", "geometry:top=2cm, bottom=2cm, left=1.6cm, right=1.6cm",
"-V", "pagestyle=plain",
"-V", "fontsize=11pt",
"-V", "colorlinks=blue",
"-s", "{}".format(input_md_file),
"-o", "{}".format(pdf_file),
]
cmd = ' '.join(cmd)
# cmd = ' '.join(cmd)
print('Generate PDF File: {}'.format(pdf_file))
os.system(cmd)
try:
subprocess.run(cmd, check=True)
except subprocess.CalledProcessError as e:
print(f"Error generating PDF: {e}")

def get_category_article_info(soup):
url = soup.find_all('a')[0].attrs['href']
Expand All @@ -108,9 +136,26 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
suffix = '.html' if page == 1 else '_{}.html'.format(page)
category_url_new = category_url.rstrip('.html') + suffix
print('Getting Response From {}'.format(category_url_new))
response = httpx.get(category_url_new)
try:
with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client:
# Warm up session
try:
client.get("https://blog.csdn.net/", headers=HEADERS)
except Exception:
pass

response = client.get(category_url_new, headers=HEADERS)
response.raise_for_status()
except Exception as e:
print(f"Error fetching category {category_url_new}: {e}")
break

soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")
article_list = soup.find_all('ul', {'class': 'column_article_list'})[0]
article_list_tags = soup.find_all('ul', {'class': 'column_article_list'})
if not article_list_tags:
print("No article list found")
break
article_list = article_list_tags[0]
p = article_list.find_all('p')
if p and p[0].string == '空空如也':
print('No Content in {}, I Will Skip It!'.format(category_url_new))
Expand All @@ -135,9 +180,27 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf', to_pdf=False, is_win=False):
if not exists(md_dir):
os.makedirs(md_dir)
response = httpx.get(details_url)

try:
with httpx.Client(http2=False, verify=False, timeout=30.0, follow_redirects=True) as client:
# Warm up session
try:
client.get("https://blog.csdn.net/", headers=HEADERS)
except Exception:
pass

response = client.get(details_url, headers=HEADERS)
response.raise_for_status()
except Exception as e:
print(f"Error fetching article {details_url}: {e}")
return

soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")
title = soup.find_all('h1', {'class': 'title-article'})[0].string ## 使用 html 的 title 作为 md 文件名
title_tags = soup.find_all('h1', {'class': 'title-article'})
if not title_tags:
print("Could not find article title. The page structure might have changed.")
return
title = title_tags[0].string ## 使用 html 的 title 作为 md 文件名
title = '_'.join(title.replace('*', '').replace('/','-').replace('\\','-').replace('//','-').strip().split())
md_file = join(md_dir, title + '.md')
print('Export Markdown File To {}'.format(md_file))
Expand Down
24 changes: 19 additions & 5 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
from os.path import join, exists
import re
import httpx

special_characters = {
"&lt;": "<", "&gt;": ">", "&nbsp": " ",
Expand Down Expand Up @@ -36,9 +37,10 @@ def remove_comment(self, soup):
def recursive(self, soup):
if isinstance(soup, Comment): return
elif isinstance(soup, NavigableString):
text = soup.string
for key, val in special_characters.items():
soup.string = soup.string.replace(key, val)
self.outputs.append(soup.string)
text = text.replace(key, val)
self.outputs.append(text)
elif isinstance(soup, Tag):
tag = soup.name
if tag in ['h1', 'h2', 'h3', 'h4', 'h5']:
Expand Down Expand Up @@ -107,11 +109,23 @@ def recursive(self, soup):
img_filename = result_tuple[1].split('/')[-1].rstrip('?')
# img_file = re.findall(pattern, src)[0][0].split('/')[-1].rstrip('?') ## e.g. https://img-blog.csdnimg.cn/20200228210146931.png?

download_img_cmd = 'aria2c --file-allocation=none -c -x 10 -s 10 -d {} -o {} {}'.format(self.fig_dir, img_filename, src)

img_file = join(self.fig_dir, img_filename)
if not exists(img_file):
os.system(download_img_cmd)
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
'Referer': 'https://blog.csdn.net/'
}
with httpx.Client(http2=False, verify=False, timeout=30.0) as client:
resp = client.get(src, headers=headers)
if resp.status_code == 200:
with open(img_file, 'wb') as f:
f.write(resp.content)
else:
print(f"Download failed: {src}, status: {resp.status_code}")
except Exception as e:
print(f"Error downloading {src}: {e}")

# soup.attrs['src'] = img_file
# self.outputs.append('\n' + str(soup.parent) + '\n')
code = '![{}]({})'.format(img_file, img_file)
Expand Down