Web-Browser/HTMLParser.py at main · Macpickle/Web-Browser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from HTMl_Tags import Element, Text
from utils.checkEntity import checkEntity

# Tags that belong inside <head>; the parser uses this list to decide whether
# an implicit <head> or <body> should be opened for a tag.
HEAD_TAGS = [
    "base",
    "basefont",
    "bgsound",
    "noscript",
    "link",
    "meta",
    "title",
    "style",
    "script",
]

class HTMLParser:
    """Build a tree of Element / Text nodes from an HTML string.

    The parser scans the body character by character, treating everything
    between "<" and ">" as a tag and everything else as text. Open elements
    are kept on the ``unfinished`` stack until their closing tag (or the end
    of input) attaches them to their parent.
    """

    def __init__(self, body) -> None:
        """Store the HTML source to parse.

        Args:
            body: the HTML text that ``parse`` will walk over.
        """
        self.body = body
        # Stack of currently open elements: index 0 is the root, -1 the innermost.
        self.unfinished = []
        # Tags that don't require a closing tag; they are attached to their
        # parent immediately instead of being pushed on the stack.
        self.selfClosing = [
            "area", "base", "br", "col", "embed", "hr", "img", "input",
            "link", "meta", "param", "source", "track", "wbr",
        ]
        self.in_style_tag = False  # True while inside a <style> element

    def parse(self, scheme):
        """Parse ``self.body`` and return the root node of the tree.

        Args:
            scheme: URL scheme of the page. For "view-source" the whole body
                is returned as a single parentless Text node.

        Returns:
            The root node (a Text node for "view-source", otherwise the
            outermost element left on the stack).
        """
        # Special case for view-source: show the raw markup, build no tree.
        if scheme == "view-source":
            return Text(self.body, None)

        text = ""      # characters collected since the last "<" or ">"
        inTag = False  # True between "<" and the matching ">"

        for char in self.body:
            if char == "<":
                inTag = True
                if text:
                    self.addText(checkEntity(text))
                text = ""
            elif char == ">":
                inTag = False
                self.addTag(text)
                text = ""
            else:
                text += char

        # Trailing text counts only if the input did not end inside a tag.
        if not inTag and text:
            self.addText(checkEntity(text))

        return self.finish()

    def getAttributes(self, text):
        """Split the inside of a tag into its name and attributes.

        Args:
            text: the tag contents without the angle brackets.

        Returns:
            A ``(tag, attributes)`` tuple: the case-folded tag name ("" for
            empty input) and a dict mapping case-folded attribute names to
            values. Attributes without "=" map to "".
        """
        textParts = text.split()
        if not textParts:
            return "", {}

        tag = textParts[0].casefold()
        attributes = {}
        for part in textParts[1:]:
            key, _, value = part.partition("=")
            # Strip quotes only when the value is properly wrapped in a
            # matching pair; this also handles the empty value "" and avoids
            # chopping a real character off an unterminated value.
            if len(value) >= 2 and value[0] in ("'", '"') and value[-1] == value[0]:
                value = value[1:-1]
            attributes[key.casefold()] = value

        return tag, attributes

    def implicitTags(self, tag):
        """Open or close <html>, <head> and <body> when the source omits them.

        Args:
            tag: the tag about to be added, or None when text is being added
                or the tree is being finished.
        """
        while True:
            openTags = [node.tag for node in self.unfinished]

            if openTags == [] and tag != "html":
                self.addTag("html")

            # Only <html> is open: pick <head> or <body> based on the tag.
            elif openTags == ["html"] and tag not in ["head", "body", "/html"]:
                if tag in HEAD_TAGS:
                    self.addTag("head")
                else:
                    self.addTag("body")

            # A non-head tag while <head> is open implicitly ends the head.
            elif openTags == ["html", "head"] and tag not in ["/head"] + HEAD_TAGS:
                self.addTag("/head")

            else:
                break

    def addText(self, text):
        """Append a Text node to the innermost open element.

        Whitespace-only text and any text inside a <style> element are skipped.
        """
        if text.isspace() or self.in_style_tag:
            return
        self.implicitTags(None)

        parent = self.unfinished[-1]
        node = Text(text, parent)
        parent.children.append(node)

    def addTag(self, tag):
        """Handle one tag: open, close, or attach a self-closing element.

        Args:
            tag: the tag contents without the angle brackets.
        """
        tag, attributes = self.getAttributes(tag)
        # Doctype declarations and comments ("<!-- ... -->") are markup
        # declarations, not elements, so they produce no node.
        if tag.startswith("!"):
            return
        self.implicitTags(tag)

        if tag.startswith("/"):
            # Never pop the root; finish() is what returns it.
            if len(self.unfinished) == 1:
                return
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
            if tag == "/style":
                self.in_style_tag = False  # exiting a <style> tag

        elif tag in self.selfClosing:
            parent = self.unfinished[-1]
            node = Element(tag, parent, attributes)
            parent.children.append(node)

        else:
            parent = self.unfinished[-1] if self.unfinished else None
            node = Element(tag, parent, attributes)
            self.unfinished.append(node)
            if tag == "style":
                self.in_style_tag = True  # entering a <style> tag

    def finish(self):
        """Close every element still open and return the root node."""
        if not self.unfinished:
            self.implicitTags(None)

        while len(self.unfinished) > 1:
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)

        return self.unfinished.pop()

    def print_tree(self, node, indent=0):
        """Debug helper: print ``node`` and its descendants, indented by depth."""
        print(" " * indent, node)
        for child in node.children:
            self.print_tree(child, indent + 2)