axzml · w-r0rschach · Nov 9, 2023 · Nov 10, 2023
diff --git a/README.md b/README.md
@@ -16,7 +16,28 @@ CSDN 博客导出工具, 用于将 CSDN 博客导出为 Markdown / PDF 格式.
 
 0. 安装必要的 Python 库, 如 `httpx`, `requests`, `BeautifulSoup`;
 1. 为了解析图片链接, 需要安装 [aria2](https://aria2.github.io/), 并保证能在命令行启动;
-2. 为了转换为 PDF, 需要安装 [Pandoc](https://pandoc.org/)
+2. 为了转换为 PDF, 需要安装 [Pandoc](https://pandoc.org/), 同时需要安装[MiKTeX](https://miktex.org/download);
+3. 安装字体 [Source Code Pro](https://github.com/adobe-fonts/source-code-pro), Windows 用户建议安装 TTF 格式; 或修改 `main.py` 中 `generate_pdf` 方法里的 `-V mainfont="Source Code Pro"` 等字体参数.
+
+99. 下载过快容易被CSDN屏蔽！
+100. TODO: 标题没有处理完善！
+101. 如MD文件无法正常转换为PDF，可使用VSCODE + Markdown Preview Enhanced插件
+(https://marketplace.visualstudio.com/items?itemName=shd101wyy.markdown-preview-enhanced)实现。
+使用VSCODE打开MD文件-右键菜单-打开侧边预览-在预览页面打开右键菜单-Open in browser-打印为PDF即可。
+
 
 此外, 正如博客标题 [导出 CSDN 博客至 Markdown 或 PDF 格式 (近乎完美)](https://blog.csdn.net/Eric_1993/article/details/104772437) 中说的, "近乎完美",
 是因为该博客导出工具还有一些细节没有考虑, 没有需求就没有动力去改代码, 精力有限, 目前该工具让我满意 😂😂😂.
+
+# 修复记录
+20231109
+1. 在 README 中增加 Pandoc 和 MiKTeX 引擎安装信息。
+2. 修复 Windows 下 `run.bat` 中 `--to_pdf` 参数未生效的问题。
+3. 修复文章中存在 gif、bmp 格式图片时下载失败的问题。
+4. 修复文章中存在无后缀格式图片时下载失败的问题。
+5. 修复文章中存在匹配图片格式但实际不是图片的 URL 导致下载失败的问题。
+6. 修复 `generate_pdf` 方法的 `is_win` 参数未生效的问题。
+7. 修复 MD 文件中图片路径由于 `img_file = join(self.fig_dir, img_file)` 导致未正常转义, 引发 PDF 转换失败的问题。
+
+20231110
+1. Pandoc 增加 `--verbose` 参数，便于后续调试。
diff --git a/main.py b/main.py
@@ -68,8 +68,13 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False):
     md_name = os.path.basename(input_md_file)
     pdf_name = md_name.replace('.md', '.pdf')
     pdf_file = join(pdf_dir, pdf_name)
+
+    if exists(pdf_file):
+        return
+
     if is_win:
         cmd = ['pandoc',
+            "--verbose "
             '--toc',
             '--pdf-engine=xelatex',
             '-V mainfont="Source Code Pro"',
@@ -84,6 +89,7 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False):
         ]
     else:
         cmd = ["pandoc",
+            "--verbose "
             "--toc",
             "--pdf-engine=xelatex",
             "-V mainfont='Source Code Pro'",
@@ -93,8 +99,8 @@ def generate_pdf(input_md_file, pdf_dir, is_win=False):
             "-V pagestyle=plain",
             "-V fontsize=11pt",
             "-V colorlinks=blue",
-            "-s {}".format(input_md_file),
-            "-o {}".format(pdf_file),
+            "-s {''}".format(input_md_file),
+            "-o {''}".format(pdf_file),
         ]
     cmd = ' '.join(cmd)
     print('Generate PDF File: {}'.format(pdf_file))
@@ -142,8 +148,9 @@ def download_csdn_category_url(category_url, md_dir, start_page=1, page_num=100,
         print('BlogNum: {}, Exporting Markdown File To {}'.format(idx, md_file))
         if not exists(md_file):
             html2md(url, md_file)
-            if to_pdf:
-                generate_pdf(md_file, pdf_dir, is_win)
+
+        if to_pdf:
+            generate_pdf(md_file, pdf_dir, is_win)
 
 
 def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf', to_pdf=False, is_win=False):
@@ -179,13 +186,15 @@ def download_csdn_single_page(details_url, md_dir, with_title=True, pdf_dir='pdf
                                    start_page=args.start_page,
                                    page_num=args.page_num,
                                    pdf_dir=args.pdf_dir,
-                                   to_pdf=args.to_pdf)
+                                   to_pdf=args.to_pdf,
+                                   is_win=args.is_win == 1)
     else:
         download_csdn_single_page(args.article_url,
                                  args.markdown_dir,
                                  with_title=args.with_title,
                                  pdf_dir=args.pdf_dir,
-                                 to_pdf=args.to_pdf)
+                                 to_pdf=args.to_pdf,
+                                 is_win=args.is_win == 1)
     is_win = args.is_win == 1
     if args.combine_together:
         source_files = join(args.markdown_dir, '*.md')

diff --git a/run.bat b/run.bat
@@ -9,12 +9,12 @@
 @echo ================================================================================
 
 set download_category="true"
-set category_url="https://blog.csdn.net/weixin_43792401/category_12292383.html"
-set article_url="https://blog.csdn.net/weixin_43792401/article/details/130065744"
+set category_url="https://blog.csdn.net/hiwangwenbing/category_10280587.html"
+set article_url="https://blog.csdn.net/HiWangWenBing/article/details/112058664"
 set start_page=1
 set page_num=100
 set markdown_dir=markdown
-set pdf_dir=pdf\
+set pdf_dir=pdf
 
 if %download_category% == "true" (
     echo "download a category"
@@ -24,7 +24,6 @@ if %download_category% == "true" (
         --page_num %page_num% ^
         --markdown_dir %markdown_dir% ^
         --pdf_dir %pdf_dir% ^
-        --combine_together ^
         --to_pdf ^
         --is_win 1
         @REM --with_title ^
@@ -34,11 +33,10 @@ if %download_category% == "true" (
     python -u main.py ^
         --article_url %article_url% ^
         --markdown_dir %markdown_dir% ^
-        --pdf_dir %pdf_dir% 
+        --pdf_dir %pdf_dir% ^
         --to_pdf ^
         --with_title ^
         --rm_cache ^
-        --combine_together
         --is_win 1
 )
 pause
diff --git a/utils.py b/utils.py
@@ -3,10 +3,12 @@
 # Author: axzml
 # Date: 2020-03-07
 #############################
+from operator import index
 from bs4 import BeautifulSoup, Tag, NavigableString, Comment
 import os
 from os.path import join, exists
 import re
+import pathlib
 
 special_characters = {
     "&lt;": "<", "&gt;": ">", "&nbsp": " ",
@@ -18,7 +20,7 @@ def __init__(self, html, is_win=False):
         self.html = html
         self.soup = BeautifulSoup(html, 'html.parser')
         self.outputs = []
-        self.fig_dir = './figures'
+        self.fig_dir = './figures/'
         self.pre = False
         self.equ_inline = False
         self.is_win = is_win
@@ -99,13 +101,22 @@ def recursive(self, soup):
                 # soup.contents.insert(0, NavigableString('> '))
             elif tag == 'img':
                 src = soup.attrs['src']
-                # pattern = r'.*\.png'
-                pattern = r'(.*\..*\?)|(.*\.(png|jpeg|jpg))'
-                result_tuple = re.findall(pattern, src)[0]
+                pattern = r'(.*\..*\?)|(.*\.(png|jpeg|jpg|gif|bmp))'
+                find_result = re.findall(pattern, src);
+                if len(find_result) == 0:
+                    result_tuple = [src + '.jpg'] * 2
+                else:
+                    result_tuple = re.findall(pattern, src)[0]
+
                 if result_tuple[0]:
                     img_file = result_tuple[0].split('/')[-1].rstrip('?')
                 else:
                     img_file = result_tuple[1].split('/')[-1].rstrip('?')
+                file_ext = pathlib.Path(img_file)
+                fileExtList = ['.png','.jpg','.jpeg','.gif','.bmp']
+                if not file_ext.suffix.lower() in fileExtList:
+                    return
+
                 # img_file = re.findall(pattern, src)[0][0].split('/')[-1].rstrip('?') ## e.g. https://img-blog.csdnimg.cn/20200228210146931.png?
                 img_file = join(self.fig_dir, img_file)
                 if self.is_win:
@@ -116,7 +127,7 @@ def recursive(self, soup):
                     os.system(download_img_cmd)
                 # soup.attrs['src'] = img_file
                 # self.outputs.append('\n' + str(soup.parent) + '\n')
-                code = '![{}]({})'.format(img_file, img_file)
+                code = '![{}]({})'.format(img_file, src)
                 self.outputs.append('\n' + code + '\n')
                 return
         if not hasattr(soup, 'children'): return