Writing an HTML Lexer

This isn't built strictly to the HTML spec, but I wrote something that tokenizes HTML-ish tags. It doesn't check things like whether an open_tag is eventually followed by a matching close_tag; I treat that as the parser's job. The standard library does have HTMLParser, but sometimes it's nicer to have something that only does the tokenization.
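
For example, the lexer flattens markup into a stream of (token_type, value) pairs. A quick sketch of the intended usage, using the Lexer class defined below:

for token_type, value in Lexer().tokenize("<a href='path/to/file'>link</a>"):
    print token_type, value
# opentag_begin <
# tag_name a
# attribute_name href
# attribute_value path/to/file
# tag_end >
# data link
# closedtag_begin </
# tag_name a
# tag_end >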

#!/usr/bin/env python2.6
import re
IDENTIFIER_STRING = r"[a-zA-Z][a-zA-Z0-9_-]*"
ATTRIBUTE_VALUE = "[^'\"]*"
OPEN_TAG_STRING = r"([^<]*?)(<)(%s)" % IDENTIFIER_STRING
ATTRIBUTE_STRING = r"(%s)\s*=\s*(?:'|\")(%s|%s)(?:'|\")" %\
  (IDENTIFIER_STRING, ATTRIBUTE_VALUE, ATTRIBUTE_VALUE)
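# For example, the two groups line up with the attribute token names below:
#   re.match(ATTRIBUTE_STRING, "src='path/to/file'").groups()
#   -> ('src', 'path/to/file')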
COMMENT_STRING = r'(<!--)(.*?)(--\s*>)'
class Lexer(object):
    def __init__(self):
        # Where lexing stopped last time; tokeniter() resumes from here.
        self.pos = 0
    def tokenize(self, source):
        # Same as tokeniter(), but with white_space tokens filtered out.
        for token_type, value in self.tokeniter(source):
            if token_type == 'white_space':
                continue
            yield token_type, value
    def tokeniter(self, source):
        c = lambda x: re.compile(x, re.M | re.S)
        # Each state maps to a list of (regex, token_names) rules; the
        # regex's groups line up one-to-one with the token names.
        rules = {
            'root': [
                # open tag
                (c(OPEN_TAG_STRING),
                    ('data', 'opentag_begin', 'tag_name')),
                # close tag
                (c(r'([^<]*?)(</)\s*(%s)\s*(>)' % IDENTIFIER_STRING),
                    ('data', 'closedtag_begin', 'tag_name', 'tag_end')),
                # comment
                (c(COMMENT_STRING),
                    ('data', 'comment_start', 'comment', 'comment_end')),
                # end-of-input sentinel (no regex to match)
                (None, 'htmlend')
                ],
            'opentag_begin': [
                # whitespace between attributes (dropped by tokenize())
                (c(r'(\s+)'), ('white_space',)),
                # attribute
                (c(ATTRIBUTE_STRING),
                    ('attribute_name', 'attribute_value')),
                # tag end
                (c(r'(>)'), ('tag_end',)),
                (c(r'(/>)'), ('openclosedtag_end',)),
                ]
        }
        stackstate = rules['root']
        stack = ['root']
        pos = self.pos
        while True:
            for regex, tokens in stackstate:
                if regex is None:
                    # Sentinel rule: nothing in 'root' matched here, so
                    # mark the end of lexable input and fall through.
                    stack.append(tokens)
                    continue
                m = regex.match(source, pos)
                if m is None:
                    continue
                for idx, token in enumerate(tokens):
                    data = m.group(1 + idx)
                    if data != '':
                        yield token, data
                    # Entering a tag pushes a state; '>' or '/>' pops it.
                    if token in ('opentag_begin', 'closedtag_begin'):
                        stack.append(token)
                    if token in ('tag_end', 'openclosedtag_end'):
                        stack.pop()
                pos = m.end()
                stackstate = rules[stack[-1]]
                break
            else:
                # No rule matched: stop cleanly at the end of input or at
                # the 'htmlend' sentinel; anything else is a lexing error.
                if pos >= len(source) or stack.pop() == 'htmlend':
                    self.pos = pos
                    break
                raise RuntimeError('unknown token %s' % source[pos])
import unittest
class LexerTests(unittest.TestCase):
    def test_openclosedtag(self):
        source = "<img src='path/to/file' />"
        expectations = [
            ('opentag_begin', '<'),
            ('tag_name', 'img'),
            ('attribute_name', 'src'),
            ('attribute_value', 'path/to/file'),
            ('openclosedtag_end', '/>')
        ]
        self.assertEqual(expectations, list(Lexer().tokenize(source)))
    def test_nest(self):
        source = """\
<div id='content'>
  <h2 id='title'>TITLE</h2>
    <ul>
      <li><!--
      <a href='path/to/file1'>article1</a>
      -->
      </li>     
      <li><a href='path/to/file2'>article2</a></li>
    </ul>
</div>\
"""
        expectations = [
        ('opentag_begin', '<'),
        ('tag_name', 'div'),
        ('attribute_name', 'id'),
        ('attribute_value', 'content'),
        ('tag_end', '>'),
        ('data', '\n  '),
        ('opentag_begin', '<'),
        ('tag_name', 'h2'),
        ('attribute_name', 'id'),
        ('attribute_value', 'title'),
        ('tag_end', '>'),
        ('data', 'TITLE'),
        ('closedtag_begin', '</'),
        ('tag_name', 'h2'),
        ('tag_end', '>'),
        ('data', '\n    '),
        ('opentag_begin', '<'),
        ('tag_name', 'ul'),
        ('tag_end', '>'),
        ('data', '\n      '),
        ('opentag_begin', '<'),
        ('tag_name', 'li'),
        ('tag_end', '>'),
        ('comment_start', '<!--'),
        ('comment', "\n      <a href='path/to/file1'>article1</a>\n      "),
        ('comment_end', '-->'),
        ('data', '\n      '),
        ('closedtag_begin', '</'),
        ('tag_name', 'li'),
        ('tag_end', '>'),
        ('data', '     \n      '),
        ('opentag_begin', '<'),
        ('tag_name', 'li'),
        ('tag_end', '>'),
        ('opentag_begin', '<'),
        ('tag_name', 'a'),
        ('attribute_name', 'href'),
        ('attribute_value', 'path/to/file2'),
        ('tag_end', '>'),
        ('data', 'article2'),
        ('closedtag_begin', '</'),
        ('tag_name', 'a'),
        ('tag_end', '>'),
        ('closedtag_begin', '</'),
        ('tag_name', 'li'),
        ('tag_end', '>'),
        ('data', '\n    '),
        ('closedtag_begin', '</'),
        ('tag_name', 'ul'),
        ('tag_end', '>'),
        ('data', '\n'),
        ('closedtag_begin', '</'),
        ('tag_name', 'div'),
        ('tag_end', '>'),
        ]
        self.assertEqual(expectations, list(Lexer().tokenize(source)))
if __name__ == '__main__':
    unittest.main()
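
If you save this as, say, lexer.py and run it with python2.6 lexer.py, unittest.main() picks up and runs the two test cases above.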