HTML Lexerを書いてみた
厳密にHTMLの仕様に基づいて作っているわけではないですが、HTMLっぽいタグなどをトークン化するものを書きました。open_tagの後にちゃんとclose_tagがあるかどうかの検証といった仕事はしません。それはParserなどが行う仕事としています。標準ライブラリにはHTMLParserがありますが、トークン化だけしてくれる方がうれしいときもあると思います。
#!/usr/bin/env python3
"""A small HTML lexer (tokenizer).

Not strictly spec-compliant: it turns HTML-ish markup into a flat token
stream and performs no structural validation (e.g. it never checks that
an open tag has a matching close tag -- that is a parser's job).
"""
import re

# Shared regex fragments for the lexer rules below.
IDENTIFIER_STRING = r"[a-zA-Z][a-zA-Z0-9_-]*"
# NOTE(review): this charset accepts either quote character, so mismatched
# quoting like <a href='x"> is tolerated.  Fixing it would require a
# backreference group, which would shift the group numbering that
# tokeniter() maps onto token names -- left as-is deliberately.
ATTRIBUTE_VALUE = "[^'\"]*"
OPEN_TAG_STRING = r"([^<]*?)(<)(%s)" % IDENTIFIER_STRING
ATTRIBUTE_STRING = r"(%s)\s*=\s*(?:'|\")(%s|%s)(?:'|\")" % \
    (IDENTIFIER_STRING, ATTRIBUTE_VALUE, ATTRIBUTE_VALUE)
COMMENT_STRING = r'(<!--)(.*?)(--\s*>)'


class Lexer:
    """Tokenizes HTML-like text into (token_type, text) pairs."""

    def __init__(self):
        # Current scan offset into the source; advanced by tokeniter().
        self.pos = 0

    def tokenize(self, source):
        """Yield (type, value) pairs, filtering out whitespace tokens."""
        # 'tok_type' instead of 'type' -- avoid shadowing the builtin.
        for tok_type, value in self.tokeniter(source):
            if tok_type == 'white_space':
                continue
            yield tok_type, value

    def tokeniter(self, source):
        """Yield every token (including whitespace) found in *source*.

        Raises RuntimeError when no rule matches the current position.
        """
        c = lambda x: re.compile(x, re.M | re.S)
        # Each state maps to a list of (compiled_regex, token_names) rules.
        # token_names[i] labels capture group i+1 of a successful match.
        rules = {
            'root': [
                # start tag: leading data, '<', tag name
                (c(OPEN_TAG_STRING),
                 ('data', 'opentag_begin', 'tag_name')),
                # close tag (raw string: '\s' is not a valid str escape)
                (c(r'([^<]*?)(</)\s*(%s)\s*(>)' % IDENTIFIER_STRING),
                 ('data', 'closedtag_begin', 'tag_name', 'tag_end')),
                # comment
                (c(COMMENT_STRING),
                 ('comment_start', 'comment', 'comment_end')),
                # sentinel: nothing matched in root -> assume end of input
                (None, 'htmlend'),
            ],
            'opentag_begin': [
                (c(r'(\s+)'), ('white_space',)),
                (c(ATTRIBUTE_STRING), ('attribute_name', 'attribute_value')),
                (c(r'(>)'), ('tag_end',)),
                (c(r'(/>)'), ('openclosedtag_end',)),
            ],
        }
        stackstate = rules['root']
        stack = ['root']
        pos = self.pos
        while 1:
            for regex, tokens in stackstate:
                if regex is None:
                    # Sentinel rule: push the marker so the 'else' branch
                    # below can distinguish end-of-input from a bad token.
                    stack.append(tokens)
                    continue
                m = regex.match(source, pos)
                if m is None:
                    continue
                for idx, token in enumerate(tokens):
                    data = m.group(1 + idx)
                    if data != '':  # drop empty captures (e.g. no leading data)
                        yield token, data
                    # Entering a tag switches the active rule set; a tag
                    # terminator switches back to the previous one.
                    if token in ('opentag_begin', 'closedtag_begin'):
                        stack.append(token)
                    if token in ('tag_end', 'openclosedtag_end'):
                        stack.pop()
                pos = m.end()
                stackstate = rules[stack[-1]]
                break
            else:
                # No rule matched: either we are done or the input is bad.
                # NOTE(review): trailing plain text with no following tag is
                # silently dropped here, since the root rules require a '<'.
                if pos >= len(source) or stack.pop() == 'htmlend':
                    self.pos = pos
                    break
                raise RuntimeError('unknown token %s' % source[pos])


import unittest


class LexerTests(unittest.TestCase):
    """End-to-end token-stream checks for Lexer.tokenize()."""

    def test_openclosedtag(self):
        source = "<img src='path/to/file' />"
        expectations = [
            ('opentag_begin', '<'),
            ('tag_name', 'img'),
            ('attribute_name', 'src'),
            ('attribute_value', 'path/to/file'),
            ('openclosedtag_end', '/>'),
        ]
        # Compare full lists: an index-based loop over the generator would
        # silently pass if the lexer yielded fewer tokens than expected.
        self.assertEqual(expectations, list(Lexer().tokenize(source)))

    def test_nest(self):
        source = """\
<div id='content'>
  <h2 id='title'>TITLE</h2>
  <ul>
    <li><!--
      <a href='path/to/file1'>article1</a>
    --> </li>
    <li><a href='path/to/file2'>article2</a></li>
  </ul>
</div>"""
        expectations = [
            ('opentag_begin', '<'), ('tag_name', 'div'),
            ('attribute_name', 'id'), ('attribute_value', 'content'),
            ('tag_end', '>'),
            ('data', '\n  '),
            ('opentag_begin', '<'), ('tag_name', 'h2'),
            ('attribute_name', 'id'), ('attribute_value', 'title'),
            ('tag_end', '>'),
            ('data', 'TITLE'),
            ('closedtag_begin', '</'), ('tag_name', 'h2'), ('tag_end', '>'),
            ('data', '\n  '),
            ('opentag_begin', '<'), ('tag_name', 'ul'), ('tag_end', '>'),
            ('data', '\n    '),
            ('opentag_begin', '<'), ('tag_name', 'li'), ('tag_end', '>'),
            ('comment_start', '<!--'),
            ('comment', "\n      <a href='path/to/file1'>article1</a>\n    "),
            ('comment_end', '-->'),
            ('data', ' '),
            ('closedtag_begin', '</'), ('tag_name', 'li'), ('tag_end', '>'),
            ('data', '\n    '),
            ('opentag_begin', '<'), ('tag_name', 'li'), ('tag_end', '>'),
            ('opentag_begin', '<'), ('tag_name', 'a'),
            ('attribute_name', 'href'), ('attribute_value', 'path/to/file2'),
            ('tag_end', '>'),
            ('data', 'article2'),
            ('closedtag_begin', '</'), ('tag_name', 'a'), ('tag_end', '>'),
            ('closedtag_begin', '</'), ('tag_name', 'li'), ('tag_end', '>'),
            ('data', '\n  '),
            ('closedtag_begin', '</'), ('tag_name', 'ul'), ('tag_end', '>'),
            ('data', '\n'),
            ('closedtag_begin', '</'), ('tag_name', 'div'), ('tag_end', '>'),
        ]
        self.assertEqual(expectations, list(Lexer().tokenize(source)))


if __name__ == '__main__':
    unittest.main()