読者です 読者をやめる 読者になる 読者になる

DFAベース正規表現エンジンを追加(Python)

前回のものにDFAを構成しながらマッチングするクラスを追加しました。

class State(object):
    def transfer(self, c, nlist):
        pass
    def appendTo(self, nlist):
        nlist.append(self)
SuccessState = State()
class CharState(State):
    def __init__(self, c):
        self.c = c
        self.out = None
    def transfer(self, c, nlist):
        if self.c == c and self.out is not None:
            self.out.appendTo(nlist)
    def connect(self, s):
        self.out = s
class RegExpError(Exception):
    pass
class SplitState(State):
    def __init__(self):
        self.first = None
        self.second = None
    def transfer(self, c, nlist):
        raise RegExpError()
    def appendTo(self, nlist):
        self.first.appendTo(nlist)
        self.second.appendTo(nlist)
    def connect(self, s):
        assert(self.first is not None)# greedy
        self.second = s
class Fragment(object):
    def __init__(self, start, out):
        self.start = start
        self.out = out
    def patch(self, outer):
        self.connect(outer.start)
        return Fragment(self.start, outer.out)
    def connect(self, state):
        for s in self.out:
            s.connect(state)
def from_post(post, FA):
    stack = []
    for c in post:
        if c == '.':
            f2 = stack.pop()
            f1 = stack.pop()
            stack.append(f1.patch(f2))
        elif c == '|':
            f2 = stack.pop()
            f1 = stack.pop()
            s = SplitState()
            s.first = f1.start
            s.second = f2.start
            out = []
            out.extend(f1.out)
            out.extend(f2.out)
            stack.append(Fragment(s, out))
        elif c == '*':
            f = stack.pop()
            s = SplitState()
            f.connect(s)
            s.first = f.start
            stack.append(Fragment(s, [s]))
        elif c == '+':
            f = stack.pop()
            s = SplitState()
            f.connect(s)
            s.first = f.start
            stack.append(Fragment(f.start, [s]))
        elif c == '?':
            f = stack.pop()
            s = SplitState()
            s.first = f.start
            out = [s]
            out.extend(f.out)
            stack.append(Fragment(s, out))
        else:
            s = CharState(c)
            stack.append(Fragment(s, [s]))
    f = stack.pop()
    assert(0 == len(stack))
    f.connect(SuccessState)
    return FA(f.start)
class NFA(object):
    def __init__(self, start):
        self.start = start
    def match(self, string):
        clist = []
        nlist = []
        self.start.appendTo(clist)        
        for c in string:
            for s in clist:
                s.transfer(c, nlist)
            clist = nlist
            nlist = []
        return SuccessState in clist
    @staticmethod
    def from_post(post):
        return from_post(post, FA=NFA)
class DFA(object):
    def __init__(self, start):
        self.start = start
        self.states = []
    def match(self, string):
        nlist = []
        self.start.appendTo(nlist)
        s = self.find(nlist)
        if s is None:
            s = DState(nlist)
            self.states.append(s)
        nlist = []
        for c in string:
            s = s.getnext(c, nlist, self)
            nlist = []
        return SuccessState in s.nstates
    def find(self, nlist):
        nset = set(nlist)
        for s in self.states:
            if nset == s.nstates:
                return s
        return None
    @staticmethod
    def from_post(post):
        return from_post(post, FA=DFA)
class DState(object):
    def __init__(self, nstates):
        self.nstates = set(nstates)
        self.next = {}
    def transfer(self, c, nlist):
        for s in self.nstates:
            s.transfer(c, nlist)
    def getnext(self, c, nlist, dfa):
        ns = self.next.get(c)
        if ns is not None:
            return ns
        self.transfer(c, nlist)
        ns = dfa.find(nlist)
        if ns is None:
            ns = DState(nlist)
            dfa.states.append(ns)
        self.setnext(c, ns)
        return ns 
    def setnext(self, c, ns):
          self.next.update({c: ns})

続いてテスト。DFAとNFAのマッチングについてのテストは同じものを流用してます。

#!/usr/bin/env python2.6
import unittest
from test import test_support
from regex import *
class RegexTests(unittest.TestCase):
    def test_char_transfer(self):
        s1 = CharState('a')
        s1.out = CharState('b')
        nlist = []
        s1.transfer('b', nlist)
        self.assertEqual([], nlist)
        s1.transfer('a', nlist)
        self.assertEqual([s1.out], nlist)
        nlist = []
        s1.out.transfer('b', nlist)
        self.assertEqual([], nlist)
    def test_char_fragment(self):
        s1 = CharState('a')
        s2 = CharState('b')
        f1 = Fragment(s1, [s1])
        f2 = Fragment(s2, [s2])
        self.assertEqual(None, s1.out)
        f = f1.patch(f2)
        self.assertEqual(s2, s1.out)
        self.assertEqual(s1, f.start)
        self.assertEqual([s2], f.out)
    def test_cat_match(self):
        nfa = NFA.from_post('ab.')
        self.assertTrue(nfa.match('ab'))
        self.assertFalse(nfa.match('a'))
        self.assertFalse(nfa.match('b'))
        self.assertFalse(nfa.match('abc'))
    def test_split_appendTo(self):
        s = SplitState()
        s1 = CharState('a')
        s2 = CharState('b')
        s.first = s1
        s.second = s2
        nlist = []
        s.appendTo(nlist)
        self.assertEqual([s1, s2], nlist)
class MatchTests(unittest.TestCase):
    def test_alt_match(self):
        nfa = self.FA.from_post('ab|')
        self.assertFalse(nfa.match(''))
        self.assertTrue(nfa.match('a'))
        self.assertTrue(nfa.match('b'))
        self.assertFalse(nfa.match('ab'))
    def test_star_match(self):
        nfa = self.FA.from_post('a*')
        self.assertTrue(nfa.match(''))
        self.assertTrue(nfa.match('a'))
        self.assertTrue(nfa.match('aa'))
        self.assertTrue(nfa.match('aaaaa'))
        self.assertFalse(nfa.match('aab'))
    def testplus_match(self):
        nfa = self.FA.from_post('b+')
        self.assertFalse(nfa.match(''))
        self.assertTrue(nfa.match('b'))
        self.assertTrue(nfa.match('bbbbb'))
        self.assertFalse(nfa.match('bbba'))
    def test_quest_match(self):
        nfa = self.FA.from_post('a?')
        self.assertTrue(nfa.match(''))
        self.assertTrue(nfa.match('a'))
        self.assertFalse(nfa.match('aa'))
        self.assertFalse(nfa.match('abc'))
    def test_mixed_case(self):
        nfa = self.FA.from_post('a?b+.c*.')
        self.assertTrue(nfa.match('b'))
        self.assertTrue(nfa.match('abc'))
        self.assertTrue(nfa.match('abbc'))
        self.assertTrue(nfa.match('abbccc'))
        self.assertFalse(nfa.match('aabc'))
        self.assertFalse(nfa.match('ac'))
        self.assertFalse(nfa.match(''))
    def test_complex_case(self):
        buf = [] 
        buf.append('0r|') # 0|r
        buf.append('20*.1r|.') # 20*(1|r)
        buf.append('120*.2.|0*.')# (1|20*2)0*
        buf.append('2r|10*.1r|.|') #(2|r)|(10*(1|r))
        source = ''.join(buf) + '.||*'
        prog = self.FA.from_post(source)
        self.assertFalse(prog.match('11'))
        self.assertTrue(prog.match(''))
        self.assertTrue(prog.match('111'))
        self.assertTrue(prog.match('21'))
        self.assertTrue(prog.match('2100011r'))
        self.assertTrue(prog.match('r'))
        self.assertTrue(prog.match('r00'))
        self.assertTrue(prog.match('20002101'))
class NFAMatchTests(MatchTests):
    FA = NFA
class DFAMatchTests(MatchTests):
    FA = DFA
class DFATests(unittest.TestCase):
    def test_DState(self):
        nfa = NFA.from_post('ab*|')
        clist = []
        nfa.start.appendTo(clist)
        ds = DState(clist)
        s1 = nfa.start
        s2 = clist[0]
        s4 = clist[1]
        s3 = s4.out
        s5 = clist[2]
        nlist = []
        ds.transfer('a', nlist)
        self.assertEqual([s5], nlist)
        nlist = []
        ds.transfer('b', nlist)
        self.assertEqual([s4, s5], nlist)
    def test_find(self):
        dfa = DFA.from_post('ab*|')
        clist = []
        dfa.start.appendTo(clist)
        s = DState(clist)
        dfa.states.append(s)
        s1 = dfa.start
        s2 = clist[0]
        s4 = clist[1]
        s3 = s4.out
        s5 = clist[2] 
        self.assertEqual(s, dfa.find([s2, s4, s5]))
        self.assertEqual(s, dfa.find([s4, s2, s5]))
        self.assertEqual(None, dfa.find([s1, s2]))
    def test_match_simple(self):
        dfa = DFA.from_post('ab*|')
        self.assertTrue(dfa.match('a'))
        self.assertFalse(dfa.match('aa'))
        self.assertFalse(dfa.match('ab'))
        self.assertTrue(dfa.match('bb'))
        self.assertTrue(dfa.match(''))
        self.assertFalse(dfa.match('bba'))
        self.assertFalse(dfa.match('bbbba'))
        self.assertEqual(4, len(dfa.states))
if __name__ == '__main__':
    test_support.run_unittest(RegexTests, NFAMatchTests, DFAMatchTests, DFATests)