diff --git a/README.md b/README.md index 4ed48bc..fc154f4 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,28 @@ CSDN 博客导出工具, 用于将 CSDN 博客导出为 Markdown / PDF 格式. 0. 安装必要的 Python 库, 如 `httpx`, `requests`, `BeautifulSoup`; 1. 为了解析图片链接, 需要安装 [aria2](https://aria2.github.io/), 并保证能在命令行启动; -2. 为了转换为 PDF, 需要安装 [Pandoc](https://pandoc.org/) +2. 为了转换为 PDF, 需要安装 [Pandoc](https://pandoc.org/), 同时需要安装[MiKTeX](https://miktex.org/download); +3. 安装字体[Source Code Pro](https://github.com/adobe-fonts/source-code-pro), windows用户建议安装ttf格式; 或修改main.py->generate_pdf方法中'-V mainfont="Source Code Pro"'等字体参数. + +99. 下载过快容易被CSDN屏蔽! +100. TODO: 标题没有处理完善! +101. 如MD文件无法正常转换为PDF,可使用VSCODE + Markdown Preview Enhanced插件 +(https://marketplace.visualstudio.com/items?itemName=shd101wyy.markdown-preview-enhanced)实现。 +使用VSCODE打开MD文件-右键菜单-打开侧边预览-在预览页面打开右键菜单-Open in browser-打印为PDF即可。 + 此外, 正如博客标题 [导出 CSDN 博客至 Markdown 或 PDF 格式 (近乎完美)](https://blog.csdn.net/Eric_1993/article/details/104772437) 中说的, "近乎完美", 是因为该博客导出工具还有一些细节没有考虑, 没有需求就没有动力去改代码, 精力有限, 目前该工具让我满意 😂😂😂. 
+ +# 修复记录 +20231109 +1.在README中增加Pandoc和MiKTeX引擎安装信息。 +2.修复Windows下`run.bat`中--to_pdf参数未生效的问题。 +3.修复文章中存在gif,bmp格式图片时下载失败的问题。 +4.修复文章中存在无后缀格式图片下载失败的问题。 +5.修复文章中存在匹配图片格式但实际不是图片url导致下载失败的问题。 +6.修复generate_pdf方法的is_win参数未生效的问题。 +7.修复MD文件中图片路径由于img_file = join(self.fig_dir, img_file)导致未正常转义引发PDF转换失败的问题。 + +20231110 +1.pandoc增加--verbose参数,便于后续调试。 \ No newline at end of file diff --git a/main.py b/main.py index 08e7a1e..fe25a06 100644 --- a/main.py +++ b/main.py @@ -68,8 +68,13 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False): md_name = os.path.basename(input_md_file) pdf_name = md_name.replace('.md', '.pdf') pdf_file = join(pdf_dir, pdf_name) + + if exists(pdf_file): + return + if is_win: cmd = ['pandoc', + "--verbose " '--toc', '--pdf-engine=xelatex', '-V mainfont="Source Code Pro"', @@ -84,6 +89,7 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False): ] else: cmd = ["pandoc", + "--verbose " "--toc", "--pdf-engine=xelatex", "-V mainfont='Source Code Pro'", @@ -93,8 +99,8 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False): "-V pagestyle=plain", "-V fontsize=11pt", "-V colorlinks=blue", - "-s {}".format(input_md_file), - "-o {}".format(pdf_file), + "-s {''}".format(input_md_file), + "-o {''}".format(pdf_file), ] cmd = ' '.join(cmd) print('Generate PDF File: {}'.format(pdf_file)) @@ -142,8 +148,9 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100, print('BlogNum: {}, Exporting Markdown File To {}'.format(idx, md_file)) if not exists(md_file): html2md(url, md_file) - if to_pdf: - generate_pdf(md_file, pdf_dir, is_win) + + if to_pdf: + generate_pdf(md_file, pdf_dir, is_win) def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf', to_pdf=False, is_win=False): @@ -179,13 +186,15 @@ def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf start_page=args.start_page, page_num=args.page_num, pdf_dir=args.pdf_dir, - to_pdf=args.to_pdf) + to_pdf=args.to_pdf, + is_win=args.is_win == 
1) else: download_csdn_single_page(args.article_url, args.markdown_dir, with_title=args.with_title, pdf_dir=args.pdf_dir, - to_pdf=args.to_pdf) + to_pdf=args.to_pdf, + is_win=args.is_win == 1) is_win = args.is_win == 1 if args.combine_together: source_files = join(args.markdown_dir, '*.md') diff --git a/run.bat b/run.bat index 0e16814..ae626f6 100644 --- a/run.bat +++ b/run.bat @@ -9,12 +9,12 @@ @echo ================================================================================ set download_category="true" -set category_url="https://blog.csdn.net/weixin_43792401/category_12292383.html" -set article_url="https://blog.csdn.net/weixin_43792401/article/details/130065744" +set category_url="https://blog.csdn.net/hiwangwenbing/category_10280587.html" +set article_url="https://blog.csdn.net/HiWangWenBing/article/details/112058664" set start_page=1 set page_num=100 set markdown_dir=markdown -set pdf_dir=pdf\ +set pdf_dir=pdf if %download_category% == "true" ( echo "download a category" @@ -24,7 +24,6 @@ if %download_category% == "true" ( --page_num %page_num% ^ --markdown_dir %markdown_dir% ^ --pdf_dir %pdf_dir% ^ - --combine_together ^ --to_pdf ^ --is_win 1 @REM --with_title ^ @@ -34,11 +33,10 @@ if %download_category% == "true" ( python -u main.py ^ --article_url %article_url% ^ --markdown_dir %markdown_dir% ^ - --pdf_dir %pdf_dir% + --pdf_dir %pdf_dir% ^ --to_pdf ^ --with_title ^ --rm_cache ^ - --combine_together --is_win 1 ) pause \ No newline at end of file diff --git a/utils.py b/utils.py index 89f8d5f..deb7fca 100644 --- a/utils.py +++ b/utils.py @@ -3,10 +3,12 @@ # Author: axzml # Date: 2020-03-07 ############################# +from operator import index from bs4 import BeautifulSoup, Tag, NavigableString, Comment import os from os.path import join, exists import re +import pathlib special_characters = { "<": "<", ">": ">", " ": " ", @@ -18,7 +20,7 @@ def __init__(self, html, is_win=False): self.html = html self.soup = BeautifulSoup(html, 'html.parser') 
self.outputs = [] - self.fig_dir = './figures' + self.fig_dir = './figures/' self.pre = False self.equ_inline = False self.is_win = is_win @@ -99,13 +101,22 @@ def recursive(self, soup): # soup.contents.insert(0, NavigableString('> ')) elif tag == 'img': src = soup.attrs['src'] - # pattern = r'.*\.png' - pattern = r'(.*\..*\?)|(.*\.(png|jpeg|jpg))' - result_tuple = re.findall(pattern, src)[0] + pattern = r'(.*\..*\?)|(.*\.(png|jpeg|jpg|gif|bmp))' + find_result = re.findall(pattern, src); + if len(find_result) == 0: + result_tuple = [src + '.jpg'] * 2 + else: + result_tuple = re.findall(pattern, src)[0] + if result_tuple[0]: img_file = result_tuple[0].split('/')[-1].rstrip('?') else: img_file = result_tuple[1].split('/')[-1].rstrip('?') + file_ext = pathlib.Path(img_file) + fileExtList = ['.png','.jpg','.jpeg','.gif','.bmp'] + if not file_ext.suffix.lower() in fileExtList: + return + # img_file = re.findall(pattern, src)[0][0].split('/')[-1].rstrip('?') ## e.g. https://img-blog.csdnimg.cn/20200228210146931.png? img_file = join(self.fig_dir, img_file) if self.is_win: @@ -116,7 +127,7 @@ def recursive(self, soup): os.system(download_img_cmd) # soup.attrs['src'] = img_file # self.outputs.append('\n' + str(soup.parent) + '\n') - code = '![{}]({})'.format(img_file, img_file) + code = '![{}]({})'.format(img_file, src) self.outputs.append('\n' + code + '\n') return if not hasattr(soup, 'children'): return