-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathutils.py
More file actions
148 lines (138 loc) · 6.84 KB
/
utils.py
File metadata and controls
148 lines (138 loc) · 6.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#############################
# CopyRight ~~~~~~
# Author: axzml
# Date: 2020-03-07
#############################
import os
import re
import subprocess
import sys
from os.path import join, exists

from bs4 import BeautifulSoup, Tag, NavigableString, Comment
# Substring -> replacement table. Parser.recursive applies every pair with
# str.replace() to each text node before the text is added to the output.
# NOTE(review): as displayed here the first entries map a character to itself;
# the keys may originally have been HTML entity names (or invisible characters
# such as a non-breaking / zero-width space) that were decoded when this file
# was copied -- confirm against the upstream repository before relying on them.
special_characters = {
"<": "<", ">": ">", " ": " ",
"​": "",
}
class Parser(object):
    """Convert an HTML document into Markdown fragments.

    The constructor parses ``html`` with BeautifulSoup's ``html.parser`` and
    immediately walks the tree. The Markdown pieces are collected in document
    order in ``self.outputs``; join them with ``''`` to get the final text.

    Side effects: creates ``self.fig_dir`` when it is missing and, for every
    ``<img>`` whose file is not already present there, runs the external
    ``aria2c`` program to download it.
    """

    # Extracts the part of an image URL that ends with the file name: either
    # everything up to the last '?' or everything up to a known extension.
    _IMG_NAME_PATTERN = re.compile(r'(.*\..*\?)|(.*\.(png|jpeg|jpg))')

    # Language names searched for in the class attribute of a fenced <code>.
    _KNOWN_LANGUAGES = ('cpp', 'bash', 'python', 'java')

    _HEADINGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self, html):
        """Parse ``html`` and fill ``self.outputs`` with Markdown fragments."""
        self.html = html
        self.soup = BeautifulSoup(html, 'html.parser')
        self.outputs = []
        self.fig_dir = './figures'
        self.pre = False          # True while inside a <pre> awaiting its <code>
        self.equ_inline = False
        # Number of <th> cells seen in the current table header. Initialised
        # here so that tables without a <thead> do not raise AttributeError.
        self.head_num = 0
        os.makedirs(self.fig_dir, exist_ok=True)
        self.recursive(self.soup)

    def remove_comment(self, soup):
        """Detach every HTML comment found below ``soup`` (in place)."""
        if not hasattr(soup, 'children'):
            return
        # Iterate over a snapshot: extract() edits the live children list and
        # would otherwise make the loop skip the sibling after each comment.
        for child in list(soup.children):
            if isinstance(child, Comment):
                child.extract()
            else:
                self.remove_comment(child)

    @staticmethod
    def _wrap(node, prefix=None, suffix=None):
        """Add literal Markdown text before and/or after ``node``'s children."""
        if prefix is not None:
            node.contents.insert(0, NavigableString(prefix))
        if suffix is not None:
            node.contents.append(NavigableString(suffix))

    @classmethod
    def _code_language(cls, code_tag):
        """Return the fence language for a ``<code>`` that sits in a ``<pre>``.

        ``'bash'`` is the default when the tag has no class attribute; when it
        has one, the last known language name contained in it wins, and ``''``
        is returned if none matches.
        """
        if 'class' not in code_tag.attrs:
            return 'bash'  # default language
        class_attr = ' '.join(code_tag.attrs['class'])  # e.g. "prism language-cpp"
        language = ''
        for name in cls._KNOWN_LANGUAGES:
            if name in class_attr:
                language = name
        return language

    @staticmethod
    def _image_filename(src):
        """Return the local file name to use for the image URL ``src``.

        Returns ``''`` when no usable name can be derived (empty URL, ``data:``
        URI, URL ending in a slash).
        """
        if src.startswith('data:'):
            return ''
        match = Parser._IMG_NAME_PATTERN.search(src)
        if match is not None:
            matched = match.group(1) or match.group(2)
            return matched.split('/')[-1].rstrip('?')
        # URL with an extension the pattern does not know (e.g. .gif): use the
        # last path component without query string or fragment.
        path = src.split('?', 1)[0].split('#', 1)[0]
        return path.rstrip('/').split('/')[-1]

    def _download_image(self, src, img_filename):
        """Best-effort download of ``src`` into ``self.fig_dir`` via aria2c."""
        if src.startswith('-'):
            return  # would be read by aria2c as an option, not as a URL
        # An argument list (no shell) keeps characters in the untrusted URL
        # from being interpreted as shell syntax.
        command = ['aria2c', '--file-allocation=none', '-c', '-x', '10',
                   '-s', '10', '-d', self.fig_dir, '-o', img_filename, src]
        try:
            subprocess.run(command, check=False)
        except OSError as err:
            # aria2c missing or not executable: keep converting, as before.
            print('aria2c could not be started: {}'.format(err), file=sys.stderr)

    def _emit_image(self, img):
        """Download the image behind ``img`` if needed and emit its Markdown."""
        src = img.attrs.get('src')
        if not src:
            return
        img_filename = self._image_filename(src)
        if img_filename:
            img_file = join(self.fig_dir, img_filename)
            if not exists(img_file):
                self._download_image(src, img_filename)
        else:
            img_file = src  # no local copy possible; reference the source
        code = '![{}]({})'.format(img_file, img_file)
        self.outputs.append('\n' + code + '\n')

    def _emit_math(self, span):
        """Emit a KaTeX ``<span>`` as ``$...$`` or ``$$...$$``.

        Returns True when the span was a KaTeX equation and has been fully
        handled, False when it should be processed like any other span.
        """
        classes = span.attrs.get('class', ())
        inline = 'katex--inline' in classes
        if not inline and 'katex--display' not in classes:
            return False
        mathml = span.find('span', {'class': 'katex-mathml'})
        if mathml is None:
            return False
        raw = mathml.string
        if raw is None:
            # .string is None when the node has several children
            raw = mathml.get_text()
        # The equation source is taken from the last line of the node's text.
        equation = raw.strip().split('\n')[-1].strip()
        if inline:
            self.outputs.append('$' + equation + '$')
        else:
            self.outputs.append('\n\n$$' + equation + '$$\n\n')
        return True

    def recursive(self, soup):
        """Walk ``soup`` depth-first, appending Markdown to ``self.outputs``.

        Text nodes are emitted after the ``special_characters`` replacements;
        comments are dropped; tags get Markdown markers inserted around their
        children before the children are visited.
        """
        if isinstance(soup, Comment):
            return
        if isinstance(soup, NavigableString):
            # Work on a plain copy instead of assigning to soup.string.
            text = str(soup)
            for key, val in special_characters.items():
                text = text.replace(key, val)
            self.outputs.append(text)
            return

        tag = None
        if isinstance(soup, Tag):
            tag = soup.name
            if tag in self._HEADINGS:
                self._wrap(soup, '\n' + '#' * int(tag[1]) + ' ', '\n')
            elif tag == 'a' and 'href' in soup.attrs:
                self._wrap(soup, '[', "]({})".format(soup.attrs['href']))
            elif tag in ('b', 'strong'):
                self._wrap(soup, '**', '**')
            elif tag == 'em':
                self._wrap(soup, '*', '*')
            elif tag == 'pre':
                self.pre = True
            elif tag in ('code', 'tt'):
                if self.pre:
                    language = self._code_language(soup)
                    self._wrap(soup, '\n```{}\n'.format(language), '```\n')
                    self.pre = False  # assume <pre> contains only one <code>
                else:
                    self._wrap(soup, '`', '`')
            elif tag == 'p':
                parent = soup.parent
                if parent is None or parent.name != 'li':
                    self._wrap(soup, '\n')
            elif tag == 'span':
                if self._emit_math(soup):
                    return
            elif tag in ('ol', 'ul'):
                self._wrap(soup, '\n', '\n')
            elif tag == 'li':
                self._wrap(soup, '+ ')
            elif tag == 'img':
                self._emit_image(soup)
                return
            # parse table
            elif tag == 'table':
                self.head_num = 0  # do not reuse the previous table's count
                self._wrap(soup, '\n', '\n')
            elif tag == 'thead':
                self.head_num = 0
            elif tag == 'tbody':
                if self.head_num > 0:
                    self._wrap(soup, '|' + ':---:|' * self.head_num + '\n')
            elif tag == 'tr':
                self._wrap(soup, '|', '\n')
            elif tag == 'th':
                self.head_num += 1
                self._wrap(soup, suffix='|')
            elif tag == 'td':
                self._wrap(soup, suffix='|')

        if not hasattr(soup, 'children'):
            return
        for child in soup.children:
            self.recursive(child)
        if tag == 'pre':
            # A <pre> without <code> must not turn a later inline <code>
            # into a fenced block.
            self.pre = False
if __name__ == '__main__':
    # Demo: convert a small document with two headings and a 2x2 table,
    # then print the resulting Markdown.
    sample_html = '<body><!-- cde --><h1>hello</h1><h2>world</h2><table><thead><tr><th>H1</th><th>H2</th></tr></thead><tbody><tr><td>A</td><td>B</td></tr><tr><td>C</td><td>D</td></tr></tbody></table></body>'
    markdown = ''.join(Parser(sample_html).outputs)
    print(markdown)