-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHTMLParser.py
More file actions
134 lines (106 loc) · 4.18 KB
/
HTMLParser.py
File metadata and controls
134 lines (106 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from HTMl_Tags import Element, Text
from utils.checkEntity import checkEntity
# Tags that belong inside <head>; implicitTags uses this list to decide whether
# a missing container should be an implicit <head> or an implicit <body>.
HEAD_TAGS = [
    "base", "basefont", "bgsound", "noscript",
    "link", "meta", "title", "style", "script",
]


class HTMLParser:
    """Build a tree of Element and Text nodes from an HTML string.

    The parser scans the body one character at a time, splitting it into tags
    (the text between "<" and ">") and text runs. Open elements are kept on the
    ``unfinished`` stack; an element is attached to its parent when it is
    closed, or when ``finish`` collapses the stack.
    """

    def __init__(self, body) -> None:
        """Store the HTML source and reset the parser state.

        Args:
            body: The HTML text to parse.
        """
        self.body = body
        # Stack of elements that have been opened but not yet closed;
        # index 0 is the root and the last entry is the current parent.
        self.unfinished = []
        # Tags that don't require a closing tag.
        self.selfClosing = [
            "area", "base", "br", "col", "embed", "hr", "img", "input",
            "link", "meta", "param", "source", "track", "wbr",
        ]
        # True while inside a <style> tag; text is dropped while it is set.
        self.in_style_tag = False

    # parse body of html
    def parse(self, scheme):
        """Parse ``self.body`` and return the root node of the tree.

        Args:
            scheme: URL scheme of the document. For "view-source" the body is
                not parsed; it is returned as a single parentless Text node.

        Returns:
            The root node produced by ``finish``, or a Text node holding the
            raw body for "view-source".
        """
        # special case for view-source
        if scheme == "view-source":
            return Text(self.body, None)

        buffer = []  # characters of the current text run or tag
        in_tag = False  # True between "<" and the matching ">"

        for char in self.body:
            if char == "<":
                in_tag = True
                if buffer:
                    # Text is passed through checkEntity before being stored
                    # (presumably entity decoding -- confirm in utils).
                    self.addText(checkEntity("".join(buffer)))
                buffer = []
            elif char == ">" and in_tag:
                in_tag = False
                self.addTag("".join(buffer))
                buffer = []
            else:
                # Includes a ">" seen outside a tag: it is ordinary text
                # (e.g. "a > b" or a CSS child selector), not a tag end.
                buffer.append(char)

        # Flush trailing text; an unterminated tag at end of input is dropped.
        if not in_tag and buffer:
            self.addText(checkEntity("".join(buffer)))
        return self.finish()

    # gets attributes of text
    def getAttributes(self, text):
        """Split the inside of a tag into its name and attributes.

        The text is split on whitespace, so quoted values that contain spaces
        are not kept together.

        Args:
            text: The characters between "<" and ">".

        Returns:
            A ``(tag, attributes)`` tuple. ``tag`` is the case-folded tag name
            ("" when ``text`` is empty or blank). ``attributes`` maps
            case-folded attribute names to their values; attributes written
            without "=" map to "".
        """
        parts = text.split()
        tag = parts[0].casefold() if parts else ""
        attributes = {}
        for part in parts[1:]:
            if "=" not in part:
                attributes[part.casefold()] = ""
                continue
            key, value = part.split("=", 1)
            # Strip quotes only when the value is wrapped in a matching pair.
            # This also covers the empty value ("" or '') and avoids chopping
            # the last character of a value with only an opening quote.
            if len(value) >= 2 and value[0] in ("'", "\"") and value[-1] == value[0]:
                value = value[1:-1]
            attributes[key.casefold()] = value
        return tag, attributes

    # checks for header tags if they are not present
    def implicitTags(self, tag):
        """Insert <html>, <head>, <body> or </head> when the source omits them.

        Called before a node is added. It repeats until the stack of open tags
        is a valid place for ``tag``.

        Args:
            tag: The tag name about to be added, or None for a text node.
        """
        while True:
            open_tags = [node.tag for node in self.unfinished]
            if not open_tags and tag != "html":
                # Nothing is open yet: every document needs a root.
                self.addTag("html")
            elif open_tags == ["html"] and tag not in ("head", "body", "/html"):
                # Directly under <html>: head-only tags open <head>,
                # anything else (including text) opens <body>.
                self.addTag("head" if tag in HEAD_TAGS else "body")
            elif open_tags == ["html", "head"] and tag not in ["/head"] + HEAD_TAGS:
                # Non-head content ends an unclosed <head>.
                self.addTag("/head")
            else:
                break

    # creates a text node
    def addText(self, text):
        """Append a Text node to the innermost open element.

        Whitespace-only text and any text seen inside a <style> tag is ignored.

        Args:
            text: The text content of the new node.
        """
        if text.isspace() or self.in_style_tag:
            return
        self.implicitTags(None)
        parent = self.unfinished[-1]
        parent.children.append(Text(text, parent))

    # creates a tag node
    def addTag(self, tag):
        """Handle one tag: open an element, close one, or add a self-closing one.

        Args:
            tag: The characters between "<" and ">", attributes included.
        """
        tag, attributes = self.getAttributes(tag)
        if tag.startswith("!doctype"):
            return
        self.implicitTags(tag)

        if tag.startswith("/"):
            if tag == "/style":
                # Leave style mode before the early return below, otherwise a
                # </style> seen at the root would suppress all later text.
                self.in_style_tag = False
            # Never pop the root; finish() returns it.
            if len(self.unfinished) == 1:
                return
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        elif tag in self.selfClosing:
            parent = self.unfinished[-1]
            parent.children.append(Element(tag, parent, attributes))
        else:
            parent = self.unfinished[-1] if self.unfinished else None
            self.unfinished.append(Element(tag, parent, attributes))
            if tag == "style":
                self.in_style_tag = True  # Entering a <style> tag

    # finishes tree
    def finish(self):
        """Close every element still open and return the root node."""
        if not self.unfinished:
            # Empty document: create the implicit skeleton so a root exists.
            self.implicitTags(None)
        while len(self.unfinished) > 1:
            node = self.unfinished.pop()
            self.unfinished[-1].children.append(node)
        return self.unfinished.pop()

    # debug print
    def print_tree(self, node, indent=0):
        """Print ``node`` and its descendants, indenting two spaces per level.

        Args:
            node: The node to print; its ``children`` are printed recursively.
            indent: Number of spaces to print before this node.
        """
        print(" " * indent, node)
        for child in node.children:
            self.print_tree(child, indent + 2)