# Source: html-parser/htmlparser.py (b4ubles/html-parser on GitHub, master branch)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re


class HTML(object):
    """Minimal regex-based HTML parser that turns markup into nested dicts.

    Each element becomes an entry ``{tagName: attributes}``.  The attribute
    dict additionally receives a ``"children"`` key (a nested tree) when the
    element contains child elements, or a ``"content"`` key (the raw text
    between the opening and closing tags) when it does not.

    Known limitations of this representation:

    * the tree is keyed by tag name, so sibling elements sharing a name
      overwrite each other (the last one wins);
    * an attribute literally named ``children`` or ``content`` collides
      with the structural keys above;
    * closing tags are not matched by name: any closing tag ends the
      element currently being parsed.
    """

    # One whole tag: from "<" up to the first ">" that is not inside a
    # quoted attribute value.
    tagRe = re.compile(r'''<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>''')
    # Legacy tokenizer, kept unchanged for backward compatibility only.
    # parseTag no longer uses it because it cannot tell attribute names
    # from values and keeps the surrounding quotes.
    attrRe = re.compile(r'''([\w-]+)|['"]{1}([^'"]*)['"]{1}''')
    # First run of characters that can form a tag name.
    _nameRe = re.compile(r'''[^\s<>/"'=]+''')
    # An attribute name, optionally followed by "=" and a double-quoted
    # (group 2), single-quoted (group 3) or unquoted (group 4) value.
    _attrPairRe = re.compile(
        r'''([^\s"'<>/=]+)'''
        r'''(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?''')

    def __init__(self):
        super(HTML, self).__init__()

    @classmethod
    def parse(cls, htmlString):
        """Parse *htmlString* and return its element tree as a dict.

        Args:
            htmlString: HTML markup as text.  Bytes input is accepted and
                decoded as UTF-8 (undecodable bytes are replaced).

        Returns:
            A dict mapping each top-level tag name to its attribute dict
            (see the class docstring for the structure).  Parsing stops at
            the first closing tag that has no matching opener at the top
            level, so anything after a stray ``</...>`` is ignored.
        """
        # Text patterns cannot be applied to bytes on Python 3; decode at
        # the boundary.  The ``str`` check keeps Python 2 byte strings as-is.
        if (isinstance(htmlString, (bytes, bytearray))
                and not isinstance(htmlString, str)):
            htmlString = htmlString.decode("utf-8", "replace")
        tree, _closeStart, _closeEnd = cls._parseNodes(htmlString, 0)
        return tree

    @classmethod
    def _parseNodes(cls, htmlString, pos):
        """Parse sibling elements starting at offset *pos*.

        Returns:
            ``(tree, closeStart, closeEnd)`` where *tree* holds the siblings
            found and the two offsets delimit the closing tag that ended the
            run.  Both offsets equal ``len(htmlString)`` when the input was
            exhausted without meeting a closing tag.  Offsets are absolute
            positions in *htmlString*.
        """
        tree = {}
        size = len(htmlString)
        while True:
            tag = cls.tagRe.search(htmlString, pos)
            if tag is None:
                # Always return the same triple so callers never have to
                # special-case an unterminated element.
                return tree, size, size
            string = tag.group(0)
            start = tag.start()
            pos = tag.end()
            if string.startswith("<!--"):
                # Skip the whole comment, including any markup inside it.
                # Searching from start + 2 also accepts "<!-->".
                close = htmlString.find("-->", start + 2)
                pos = size if close == -1 else close + 3
                continue
            if string[1] in "!?":
                # Doctype, CDATA and processing instructions carry no
                # element structure.
                continue
            if string[1] == "/":
                return tree, start, pos
            tagAttr = cls.parseTag(string)
            # parseTag is the single source of truth for the tag name, so
            # the key always exists ("<br/>" yields "br", not "br/").
            tagName = next(iter(tagAttr))
            if not tagName:
                continue
            attrs = tagAttr[tagName]
            # Only a closing tag *after* this opener can belong to it.
            if htmlString.find("</%s>" % tagName, pos) != -1:
                if tagName == "script":
                    # Script bodies are opaque text, never parsed as markup.
                    ret = cls.parseScript(htmlString[pos:])
                    children = ret[0]
                    closeStart = pos + ret[1]
                    closeEnd = pos + ret[2]
                else:
                    children, closeStart, closeEnd = cls._parseNodes(
                        htmlString, pos)
                if children:
                    attrs["children"] = children
                else:
                    attrs["content"] = htmlString[pos:closeStart]
                pos = closeEnd
            tree[tagName] = attrs

    @classmethod
    def parseScript(cls, scriptString):
        """Locate the end of a script body.

        Args:
            scriptString: the text immediately following a ``<script>`` tag.

        Returns:
            ``([], closeStart, closeEnd)``: an empty child list, the offset
            where ``</script>`` starts (the length of the script content)
            and the offset just past it.

        Raises:
            ValueError: if *scriptString* contains no ``</script>``.
        """
        closing = "</script>"
        closeStart = scriptString.index(closing)
        # The consumed length must include the closing tag itself,
        # otherwise the caller would meet "</script>" again as a stray
        # closing tag.
        return [], closeStart, closeStart + len(closing)

    @classmethod
    def parseTag(cls, tagString):
        """Split one opening tag into its name and attributes.

        Args:
            tagString: the tag text, e.g. ``'<a href="x" hidden>'``.

        Returns:
            ``{tagName: {attrName: value, ...}}``.  Quoted values are
            returned without their quotes; attributes given without a value
            map to ``""``.  A tag with no recognisable name yields
            ``{"": {}}``.
        """
        nameMatch = cls._nameRe.search(tagString)
        if nameMatch is None:
            return {"": {}}
        attrs = {}
        for pair in cls._attrPairRe.finditer(tagString, nameMatch.end()):
            value = ""
            # Compare against None so that an explicit empty value (a="")
            # is kept distinct from "no group matched".
            for candidate in pair.group(2, 3, 4):
                if candidate is not None:
                    value = candidate
                    break
            attrs[pair.group(1)] = value
        return {nameMatch.group(0): attrs}


if __name__ == '__main__':
    # Demo entry point: parse the bundled sample document and pretty-print
    # the resulting tree.
    import io
    import os
    from pprint import pprint

    # Build the path with os.path.join so the demo is not tied to the
    # Windows path separator.
    samplePath = os.path.join("test", "sample0.html")
    # Decode at the I/O boundary: the parser's regular expressions are text
    # patterns and cannot be applied to raw bytes on Python 3.  io.open
    # accepts the encoding argument on Python 2 as well.
    with io.open(samplePath, encoding="utf-8", errors="replace") as fh:
        pprint(HTML.parse(fh.read()))