Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,28 @@ CSDN 博客导出工具, 用于将 CSDN 博客导出为 Markdown / PDF 格式.

0. 安装必要的 Python 库, 如 `httpx`, `requests`, `BeautifulSoup`;
1. 为了解析图片链接, 需要安装 [aria2](https://aria2.github.io/), 并保证能在命令行启动;
2. 为了转换为 PDF, 需要安装 [Pandoc](https://pandoc.org/)
2. 为了转换为 PDF, 需要安装 [Pandoc](https://pandoc.org/), 同时需要安装[MiKTeX](https://miktex.org/download);
3. 安装字体 [Source Code Pro](https://github.com/adobe-fonts/source-code-pro), Windows 用户建议安装 TTF 格式; 或修改 main.py 中 generate_pdf 方法里的 '-V mainfont="Source Code Pro"' 等字体参数.

99. 下载过快容易被CSDN屏蔽!
100. TODO: 标题没有处理完善!
101. 如MD文件无法正常转换为PDF,可使用VSCODE + Markdown Preview Enhanced插件
(https://marketplace.visualstudio.com/items?itemName=shd101wyy.markdown-preview-enhanced)实现。
使用VSCODE打开MD文件-右键菜单-打开侧边预览-在预览页面打开右键菜单-Open in browser-打印为PDF即可。


此外, 正如博客标题 [导出 CSDN 博客至 Markdown 或 PDF 格式 (近乎完美)](https://blog.csdn.net/Eric_1993/article/details/104772437) 中说的, "近乎完美",
是因为该博客导出工具还有一些细节没有考虑, 没有需求就没有动力去改代码, 精力有限, 目前该工具让我满意 😂😂😂.

# 修复记录
20231109
1.在README中增加Pandoc和MiKTeX引擎安装信息。
2.修复Windows下`run.bat`中--to_pdf参数未生效的问题。
3.修复文章中存在gif,bmp格式图片时下载失败的问题。
4.修复文章中存在无后缀格式图片下载失败的问题。
5.修复文章中存在匹配图片格式但实际不是图片url导致下载失败的问题。
6.修复generate_pdf方法的is_win参数未生效的问题。
7.修复MD文件中图片路径由于img_file = join(self.fig_dir, img_file)导致未正常转义引发PDF转换失败的问题。

20231110
1.pandoc增加--verbose参数,便于后续调试。
21 changes: 15 additions & 6 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,13 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False):
md_name = os.path.basename(input_md_file)
pdf_name = md_name.replace('.md', '.pdf')
pdf_file = join(pdf_dir, pdf_name)

if exists(pdf_file):
return

if is_win:
cmd = ['pandoc',
"--verbose "
'--toc',
'--pdf-engine=xelatex',
'-V mainfont="Source Code Pro"',
Expand All @@ -84,6 +89,7 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False):
]
else:
cmd = ["pandoc",
"--verbose "
"--toc",
"--pdf-engine=xelatex",
"-V mainfont='Source Code Pro'",
Expand All @@ -93,8 +99,8 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False):
"-V pagestyle=plain",
"-V fontsize=11pt",
"-V colorlinks=blue",
"-s {}".format(input_md_file),
"-o {}".format(pdf_file),
"-s {''}".format(input_md_file),
"-o {''}".format(pdf_file),
]
cmd = ' '.join(cmd)
print('Generate PDF File: {}'.format(pdf_file))
Expand Down Expand Up @@ -142,8 +148,9 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
print('BlogNum: {}, Exporting Markdown File To {}'.format(idx, md_file))
if not exists(md_file):
html2md(url, md_file)
if to_pdf:
generate_pdf(md_file, pdf_dir, is_win)

if to_pdf:
generate_pdf(md_file, pdf_dir, is_win)


def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf', to_pdf=False, is_win=False):
Expand Down Expand Up @@ -179,13 +186,15 @@ def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf
start_page=args.start_page,
page_num=args.page_num,
pdf_dir=args.pdf_dir,
to_pdf=args.to_pdf)
to_pdf=args.to_pdf,
is_win=args.is_win == 1)
else:
download_csdn_single_page(args.article_url,
args.markdown_dir,
with_title=args.with_title,
pdf_dir=args.pdf_dir,
to_pdf=args.to_pdf)
to_pdf=args.to_pdf,
is_win=args.is_win == 1)
is_win = args.is_win == 1
if args.combine_together:
source_files = join(args.markdown_dir, '*.md')
Expand Down
10 changes: 4 additions & 6 deletions run.bat
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@
@echo ================================================================================

set download_category="true"
set category_url="https://blog.csdn.net/weixin_43792401/category_12292383.html"
set article_url="https://blog.csdn.net/weixin_43792401/article/details/130065744"
set category_url="https://blog.csdn.net/hiwangwenbing/category_10280587.html"
set article_url="https://blog.csdn.net/HiWangWenBing/article/details/112058664"
set start_page=1
set page_num=100
set markdown_dir=markdown
set pdf_dir=pdf\
set pdf_dir=pdf

if %download_category% == "true" (
echo "download a category"
Expand All @@ -24,7 +24,6 @@ if %download_category% == "true" (
--page_num %page_num% ^
--markdown_dir %markdown_dir% ^
--pdf_dir %pdf_dir% ^
--combine_together ^
--to_pdf ^
--is_win 1
@REM --with_title ^
Expand All @@ -34,11 +33,10 @@ if %download_category% == "true" (
python -u main.py ^
--article_url %article_url% ^
--markdown_dir %markdown_dir% ^
--pdf_dir %pdf_dir%
--pdf_dir %pdf_dir% ^
--to_pdf ^
--with_title ^
--rm_cache ^
--combine_together
--is_win 1
)
pause
21 changes: 16 additions & 5 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
# Author: axzml
# Date: 2020-03-07
#############################
from operator import index
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
import os
from os.path import join, exists
import re
import pathlib

special_characters = {
"&lt;": "<", "&gt;": ">", "&nbsp": " ",
Expand All @@ -18,7 +20,7 @@ def __init__(self, html, is_win=False):
self.html = html
self.soup = BeautifulSoup(html, 'html.parser')
self.outputs = []
self.fig_dir = './figures'
self.fig_dir = './figures/'
self.pre = False
self.equ_inline = False
self.is_win = is_win
Expand Down Expand Up @@ -99,13 +101,22 @@ def recursive(self, soup):
# soup.contents.insert(0, NavigableString('> '))
elif tag == 'img':
src = soup.attrs['src']
# pattern = r'.*\.png'
pattern = r'(.*\..*\?)|(.*\.(png|jpeg|jpg))'
result_tuple = re.findall(pattern, src)[0]
pattern = r'(.*\..*\?)|(.*\.(png|jpeg|jpg|gif|bmp))'
find_result = re.findall(pattern, src);
if len(find_result) == 0:
result_tuple = [src + '.jpg'] * 2
else:
result_tuple = re.findall(pattern, src)[0]

if result_tuple[0]:
img_file = result_tuple[0].split('/')[-1].rstrip('?')
else:
img_file = result_tuple[1].split('/')[-1].rstrip('?')
file_ext = pathlib.Path(img_file)
fileExtList = ['.png','.jpg','.jpeg','.gif','.bmp']
if not file_ext.suffix.lower() in fileExtList:
return

# img_file = re.findall(pattern, src)[0][0].split('/')[-1].rstrip('?') ## e.g. https://img-blog.csdnimg.cn/20200228210146931.png?
img_file = join(self.fig_dir, img_file)
if self.is_win:
Expand All @@ -116,7 +127,7 @@ def recursive(self, soup):
os.system(download_img_cmd)
# soup.attrs['src'] = img_file
# self.outputs.append('\n' + str(soup.parent) + '\n')
code = '![{}]({})'.format(img_file, img_file)
code = '![{}]({})'.format(img_file, src)
self.outputs.append('\n' + code + '\n')
return
if not hasattr(soup, 'children'): return
Expand Down