diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml index 1c48b4f..e753a51 100644 --- a/.github/workflows/emulated.yml +++ b/.github/workflows/emulated.yml @@ -44,7 +44,7 @@ jobs: # interpret the forward slashes. You're welcome. sed -i "s/getDataDir/\(return \"$(pwd | sed 's/\//\\\//g')\\/data\"\)/g" src/Main.hs sed -i "s/version/undefined/g" src/Main.hs - ghc -XHaskell2010 -XPatternSynonyms -XFlexibleContexts -XMagicHash -XCPP -XNondecreasingIndentation -XScopedTypeVariables -XTupleSections \ + ghc -XHaskell2010 -XPatternSynonyms -XFlexibleContexts -XMagicHash -XCPP -XNondecreasingIndentation -XScopedTypeVariables -XTupleSections -XDeriveFunctor \ -package array -package containers -package directory \ -isrc src/Main.hs \ -o alex diff --git a/alex.cabal b/alex.cabal index 4e5f174..f49fa58 100644 --- a/alex.cabal +++ b/alex.cabal @@ -91,6 +91,8 @@ extra-source-files: tests/issue_141.x tests/issue_197.x tests/issue_262.x + tests/issue_269_part1.x + tests/issue_269_part2.x tests/strict_text_typeclass.x tests/posn_typeclass_strict_text.x tests/tokens_monadUserState_strict_text.x @@ -113,6 +115,7 @@ executable alex default-language: Haskell2010 default-extensions: + DeriveFunctor PatternSynonyms ScopedTypeVariables TupleSections diff --git a/src/AbsSyn.hs b/src/AbsSyn.hs index 8db9871..56b2be0 100644 --- a/src/AbsSyn.hs +++ b/src/AbsSyn.hs @@ -123,7 +123,7 @@ data RightContext r = NoRightContext | RightContextRExp r | RightContextCode Code - deriving (Eq,Ord) + deriving (Eq, Ord, Functor) instance Show RECtx where showsPrec _ (RECtx scs _ r rctx code) = diff --git a/src/DFAMin.hs b/src/DFAMin.hs index 5a0a781..deceeb0 100644 --- a/src/DFAMin.hs +++ b/src/DFAMin.hs @@ -1,218 +1,253 @@ -{-# OPTIONS_GHC -fno-warn-name-shadowing #-} - -{-# LANGUAGE CPP #-} - module DFAMin (minimizeDFA) where import AbsSyn -import Data.IntMap ( IntMap ) -import Data.IntSet ( IntSet ) -import Data.Map ( Map ) -#if !MIN_VERSION_containers(0,6,0) -import Data.Maybe ( mapMaybe ) -#endif +import Control.Monad (guard) +import Data.Bifunctor (second) +import Data.IntMap (IntMap) +import Data.IntSet (IntSet) -import qualified Data.Map as Map -import qualified Data.IntSet as IntSet import qualified Data.IntMap as IntMap +import qualified Data.IntSet as IntSet import qualified Data.List as List +import qualified Data.Map as Map + +{- Note [Hopcroft's Algorithm] + +DFA minimization is implemented using Hopcroft's algorithm. The following +definition is mostly copied from Wikipedia. + +We assume the following definitions: + - Q is the set of all states in our DFA + - F is the subset of Q that contains all final (or "accepting") states + - ∑ is the set of input symbols (for us, [0..255]) + +We use the phrase "X refines Y into Y1 and Y2" to mean the following: + - Y1 := Y ∩ X + - Y2 := Y \ X + - |Y1| > 0 + - |Y2| > 0 + +The algorithm itself is defined thusly: + + P := {F, Q \ F} + W := {F, Q \ F} + while (W is not empty) do + choose and remove a set A from W + for each c in Σ do + let X be the set of states for which a transition on c leads to a state in A + for each set Y in P that is refined by X into Y1 and Y2 do + replace Y in P by the two sets Y1 and Y2 + if Y is in W + replace Y in W by Y1 and Y2 + else + if |Y1| <= |Y2| + add Y1 to W + else + add Y2 to W + + +Our implementation differs slightly, as we perform several optimizations. + +In the Wikipedia implementation, P and W are initialized to two subsets of Q, +specifically F and Q \ F. The exact subsets do not matter; what matters is the +following: + - P and W should contain all Q states + - equivalent states should all be in the same subset + +As per the first requirement, it would be fine for P and W to be initialized as +a set that only contains Q. Using more fine-grained subsets reduces the amount +of work that needs to be done. The second requirement stems from the fact that +our partition "refining" can divide subsets, but we do not have a way to +re-merge subsets. + +Our first optimization is that we use a more granular division of states in the +initial set. Specifically, we group all states by their list of "accepts", since +we know that for two states to be equivalent their list of "accepts" must be the +same: the resulting subsets therefore meet our two stated criteria. + + +Our second optimization relies on the observation that given that all states are +in W, then all states will appear in A; as a result, instead of starting with a +set P that contains all subsets, that we refine in parallel to W, we can instead +start with an empty set R, and add each A to R before iterating over P. This +makes updating R and W easier, and removes the need for the expensive "is Y in +W" check. + + +With those two optimizations, our implementation is therefore: + + R := {} + W := {all "accept" subsets of Q} + while (W is not empty) do + choose and remove a set A from W + add A to R + for each c in Σ do + let X be the set of states for which a transition on c leads to a state in A + for each set Y in R that is refined by X into Y1 and Y2 do + replace Y in R by the two sets Y1 and Y2 + if |Y1| <= |Y2| + add Y1 to W + else + add Y2 to W + for each set Y in W that is refined by X into Y1 and Y2 do + replace Y in W by the two sets Y1 and Y2 + +-} + +type OldSNum = Int -- ^ Old state number +type NewSNum = Int -- ^ New state number + +-- | Reduce the number of states in the given DFA by grouping indistinguishable +-- states. +minimizeDFA :: forall a. Ord a => DFA OldSNum a -> DFA NewSNum a +minimizeDFA dfa@(DFA starts statemap) = DFA starts $ Map.fromList new_states + where + -- Group the states into classes according to the language they accept. + equiv_classes :: [EquivalenceClass] + equiv_classes = groupEquivalentStates dfa + + -- A map from new state numbers to a class of equivalent old states. + numbered_states :: [(NewSNum, EquivalenceClass)] + numbered_states = number (length starts) starts equiv_classes + + -- Assign each state in the minimized DFA a number, making + -- sure that we assign the numbers [0..] to the start states. + number :: NewSNum -> [NewSNum] -> [EquivalenceClass] -> [(NewSNum, EquivalenceClass)] + number _ _ [] = [] + number n unassigned_starts (ss:sss) + | null starts_ss = (n,ss) : continue (n+1) + | otherwise = map (,ss) starts_ss ++ continue n + -- if one of the states of the minimized DFA corresponds + -- to multiple starts states, we just have to duplicate + -- that state. + where + -- All the start states in ss (starts_ss) are assigned this equivalence class. + -- The remaining ones are passed to the recursive call. + (starts_ss, starts_other) = List.partition (`IntSet.member` ss) unassigned_starts + continue n' = number n' starts_other sss + + -- Mapping new state numbers to their state description. + new_states :: [(NewSNum, State NewSNum a)] + new_states = map (second class_to_new_state) numbered_states + + -- Translate an equivalence class of old states into a new state description. + class_to_new_state :: EquivalenceClass -> State NewSNum a + class_to_new_state = + -- A new state is constructed from any of the old states in the equivalence class. + -- It does not matter which old state we pick since by construction of the classes + -- they have the same behavior, both in their output (accepts) and their transitions. + -- Since IntSet does not have a method to give an arbitrary element + -- (ideally the one that is fastest to retrieve) + -- we use findMin (always succeeds because the IntSet is non-empty). + old_state_to_new_state . lookupOrPanic statemap . IntSet.findMin + where + lookupOrPanic = flip $ Map.findWithDefault panic + panic = error "alex::DFAMin.minimizeDFA: panic: state not found" + + -- Convert all state numbers in the State structure to new ones. + old_state_to_new_state :: State OldSNum a -> State NewSNum a + old_state_to_new_state (State old_accepts old_transitions) = + State (map fix_acc old_accepts) (fmap get_new old_transitions) + + fix_acc :: Accept a -> Accept a + fix_acc acc = acc { accRightCtx = fmap get_new $ accRightCtx acc } --- % Hopcroft's Algorithm for DFA minimization (cut/pasted from Wikipedia): --- % X refines Y into Y1 and Y2 means --- % Y1 := Y ∩ X --- % Y2 := Y \ X --- % where both Y1 and Y2 are nonempty + get_new :: OldSNum -> NewSNum + get_new k = IntMap.findWithDefault panic k old_to_new + where + panic = error "alex::DFAMin.minimizeDFA: panic: state not found" + + -- Memoized translation of old state numbers to new state numbers. + old_to_new :: IntMap NewSNum + old_to_new = IntMap.fromList $ do + (n,ss) <- numbered_states + s <- IntSet.toList ss + pure (s,n) + + +-- | An equivalence class is a /non-empty/ set of states. +type EquivalenceClass = IntSet + + +-- | Creates the subsets of Q that are used to initialize W. -- --- P := {{all accepting states}, {all nonaccepting states}}; --- Q := {{all accepting states}}; --- while (Q is not empty) do --- choose and remove a set A from Q --- for each c in ∑ do --- let X be the set of states for which a transition on c leads to a state in A --- for each set Y in P for which X refines Y into Y1 and Y2 do --- replace Y in P by the two sets Y1 and Y2 --- if Y is in Q --- replace Y in Q by the same two sets --- else --- add the smaller of the two sets to Q --- end; --- end; --- end; +-- As per the two conditions listed in Note [Hopcroft's Algorithm], we have two +-- requirements: the union of all resulting sets must be equivalent to Q (the set +-- of all states), and all equivalent states must be in the same subsets. -- --- % X is a preimage of A under transition function. +-- We group states by their list of 'Accept'. +initialSubsets :: forall a. Ord a => DFA OldSNum a -> [EquivalenceClass] +initialSubsets dfa = Map.elems $ Map.fromListWith IntSet.union $ do + (stateIndex, State accepts _transitions) <- Map.toList $ dfa_states dfa + pure (accepts, IntSet.singleton stateIndex) + --- % observation : Q is always subset of P --- % let R = P \ Q. then following algorithm is the equivalent of the Hopcroft's Algorithm +-- | Creates a cache of all reverse transitions for a given DFA. -- --- R := {{all nonaccepting states}}; --- Q := {{all accepting states}}; --- while (Q is not empty) do --- choose a set A from Q --- remove A from Q and add it to R --- for each c in ∑ do --- let X be the set of states for which a transition on c leads to a state in A --- for each set Y in R for which X refines Y into Y1 and Y2 do --- replace Y in R by the greater of the two sets Y1 and Y2 --- add the smaller of the two sets to Q --- end; --- for each set Y in Q for which X refines Y into Y1 and Y2 do --- replace Y in Q by the two sets Y1 and Y2 --- end; --- end; --- end; +-- To each token c in Σ, the resulting map associates a reverse map of +-- transitions. That is: for each c, we have a map that, to a state s, +-- associates the set of states that can reach s via c. -- --- % The second for loop that iterates over R mutates Q, --- % but it does not affect the third for loop that iterates over Q. --- % Because once X refines Y into Y1 and Y2, Y1 and Y2 can't be more refined by X. - -minimizeDFA :: forall a. Ord a => DFA Int a -> DFA Int a -minimizeDFA dfa@(DFA { dfa_start_states = starts, - dfa_states = statemap - }) - = DFA { dfa_start_states = starts, - dfa_states = Map.fromList states } +-- Given that the actual value of c is never actually required, we flatten the +-- result into a list. +generateReverseTransitionCache :: forall a. Ord a => DFA OldSNum a -> [IntMap EquivalenceClass] +generateReverseTransitionCache dfa = IntMap.elems $ + IntMap.fromListWith (IntMap.unionWith IntSet.union) $ do + (sourceState, State _accepts transitions) <- Map.toList $ dfa_states dfa + (token, targetState) <- IntMap.toList transitions + pure (token, IntMap.singleton targetState (IntSet.singleton sourceState)) + + +-- | Given two sets X and Y, compute their intersection and difference. +-- Only returns both if both are non-empty, otherwise return neither. +refine + :: EquivalenceClass + -> EquivalenceClass + -> Maybe (EquivalenceClass, EquivalenceClass) +refine x y = + if IntSet.null intersection || IntSet.null difference + then Nothing + else Just (intersection, difference) where - equiv_classes :: [EquivalenceClass] - equiv_classes = groupEquivStates dfa - - numbered_states :: [(Int, EquivalenceClass)] - numbered_states = number (length starts) equiv_classes - - -- assign each state in the minimized DFA a number, making - -- sure that we assign the numbers [0..] to the start states. - number :: Int -> [EquivalenceClass] -> [(Int, EquivalenceClass)] - number _ [] = [] - number n (ss:sss) = - case filter (`IntSet.member` ss) starts of - [] -> (n,ss) : number (n+1) sss - starts' -> map (,ss) starts' ++ number n sss - -- if one of the states of the minimized DFA corresponds - -- to multiple starts states, we just have to duplicate - -- that state. - - states :: [(Int, State Int a)] - states = [ - let old_states = map (lookup statemap) (IntSet.toList equiv) - accs = map fix_acc (state_acc (headWithDefault undefined old_states)) - -- accepts should all be the same - out = IntMap.fromList [ (b, get_new old) - | State _ out <- old_states, - (b,old) <- IntMap.toList out ] - in (n, State accs out) - | (n, equiv) <- numbered_states - ] - - fix_acc :: Accept a -> Accept a - fix_acc acc = acc { accRightCtx = fix_rctxt (accRightCtx acc) } - - fix_rctxt :: RightContext SNum -> RightContext SNum - fix_rctxt (RightContextRExp s) = RightContextRExp (get_new s) - fix_rctxt other = other - - lookup :: Ord k => Map k v -> k -> v - lookup m k = Map.findWithDefault (error "minimizeDFA") k m - - get_new :: Int -> Int - get_new = lookup old_to_new - - old_to_new :: Map Int Int - old_to_new = Map.fromList [ (s,n) | (n,ss) <- numbered_states, - s <- IntSet.toList ss ] + intersection = IntSet.intersection y x + difference = IntSet.difference y x -type EquivalenceClass = IntSet -groupEquivStates :: forall a. Ord a => DFA Int a -> [EquivalenceClass] -groupEquivStates DFA { dfa_states = statemap } - = go init_r init_q +-- | Given a DFA, compute all sets of equivalent states. +-- +-- See Note [Hopcroft's Algorithm] for details. +groupEquivalentStates :: forall a. Ord a => DFA OldSNum a -> [EquivalenceClass] +groupEquivalentStates dfa = outerLoop ([], initialSubsets dfa) where - accepting, nonaccepting :: Map Int (State Int a) - (accepting, nonaccepting) = Map.partition acc statemap - where acc (State as _) = not (List.null as) - - nonaccepting_states :: EquivalenceClass - nonaccepting_states = IntSet.fromList (Map.keys nonaccepting) - - -- group the accepting states into equivalence classes - accept_map :: Map [Accept a] [Int] - accept_map = {-# SCC "accept_map" #-} - List.foldl' (\m (n,s) -> Map.insertWith (++) (state_acc s) [n] m) - Map.empty - (Map.toList accepting) - - accept_groups :: [EquivalenceClass] - accept_groups = map IntSet.fromList (Map.elems accept_map) - - init_r, init_q :: [EquivalenceClass] - init_r -- Issue #71: each EquivalenceClass needs to be a non-empty set - | IntSet.null nonaccepting_states = [] - | otherwise = [nonaccepting_states] - init_q = accept_groups - - -- a map from token T to - -- a map from state S to the set of states that transition to - -- S on token T - -- bigmap is an inversed transition function classified by each input token. - -- the codomain of each inversed function is a set of states rather than single state - -- since a transition function might not be an injective. - -- This is a cache of the information needed to compute xs below - bigmap :: IntMap (IntMap EquivalenceClass) - bigmap = IntMap.fromListWith (IntMap.unionWith IntSet.union) - [ (i, IntMap.singleton to (IntSet.singleton from)) - | (from, state) <- Map.toList statemap, - (i,to) <- IntMap.toList (state_out state) ] - - -- The outer loop: recurse on each set in R and Q - go :: [EquivalenceClass] -> [EquivalenceClass] -> [EquivalenceClass] - go r [] = r - go r (a:q) = uncurry go $ List.foldl' go0 (a:r,q) xs - where - preimage :: IntMap EquivalenceClass -- inversed transition function - -> EquivalenceClass -- subset of codomain of original transition function - -> EquivalenceClass -- preimage of given subset -#if MIN_VERSION_containers(0,6,0) - preimage invMap = IntSet.unions . IntMap.restrictKeys invMap -#else - preimage invMap = IntSet.unions . mapMaybe (`IntMap.lookup` invMap) . IntSet.toList -#endif - - xs :: [EquivalenceClass] - xs = - [ x - | invMap <- IntMap.elems bigmap - , let x = preimage invMap a - , not (IntSet.null x) - ] - - refineWith - :: EquivalenceClass -- preimage set that bisects the input equivalence class - -> EquivalenceClass -- input equivalence class - -> Maybe (EquivalenceClass, EquivalenceClass) -- refined equivalence class - refineWith x y = - if IntSet.null y1 || IntSet.null y2 - then Nothing - else Just (y1, y2) - where - y1 = IntSet.intersection y x - y2 = IntSet.difference y x - - go0 (r,q) x = go1 r [] [] - where - -- iterates over R - go1 [] r' q' = (r', go2 q q') - go1 (y:r) r' q' = case refineWith x y of - Nothing -> go1 r (y:r') q' - Just (y1, y2) - | IntSet.size y1 <= IntSet.size y2 -> go1 r (y2:r') (y1:q') - | otherwise -> go1 r (y1:r') (y2:q') - - -- iterates over Q - go2 [] q' = q' - go2 (y:q) q' = case refineWith x y of - Nothing -> go2 q (y:q') - Just (y1, y2) -> go2 q (y1:y2:q') - --- To pacify GHC 9.8's warning about 'head' -headWithDefault :: a -> [a] -> a -headWithDefault a [] = a -headWithDefault _ (a:_) = a + reverseTransitionCache :: [IntMap EquivalenceClass] + reverseTransitionCache = generateReverseTransitionCache dfa + + -- While W isn't empty, pick an A from W, add it to R + -- and iterate on X for each c in ∑. + outerLoop :: ([EquivalenceClass], [EquivalenceClass]) -> [EquivalenceClass] + outerLoop (r, []) = r + outerLoop (r, a:w) = outerLoop $ List.foldl' refineWithX (a:r,w) $ do + allPreviousStates <- reverseTransitionCache + let x = IntSet.unions $ do + (target, sources) <- IntMap.toList allPreviousStates + guard $ target `IntSet.member` a + pure sources + guard $ not $ IntSet.null x + pure x + + -- Given X, refine values in R, then refine values in W, building + -- the new values of R and W along the way. + refineWithX (r, w) x = + let (r', w') = List.foldl' (processR x) ([], []) r + in (r', List.foldl' (processW x) w' w) + + processR x (r', w') y = case refine x y of + Nothing -> (y:r', w') + Just (y1, y2) + | IntSet.size y1 <= IntSet.size y2 -> (y2:r', y1:w') + | otherwise -> (y1:r', y2:w') + + processW x w' y = case refine x y of + Nothing -> y:w' + Just (y1, y2) -> y1:y2:w' diff --git a/tests/Makefile b/tests/Makefile index eede942..97ecd22 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -59,6 +59,8 @@ TESTS = \ issue_141.x \ issue_197.x \ issue_262.x \ + issue_269_part1.x \ + issue_269_part2.x \ monad_typeclass.x \ monad_typeclass_bytestring.x \ monadUserState_typeclass.x \ diff --git a/tests/issue_269_part1.x b/tests/issue_269_part1.x new file mode 100644 index 0000000..84ea491 --- /dev/null +++ b/tests/issue_269_part1.x @@ -0,0 +1,40 @@ +{ + +-- Issue #269 +-- reported 2025-04-02 by Antoine Leblanc (https://github.com/nicuveo) +-- fixed 2025-04-03 by Antoine Leblanc +-- +-- Problem was: +-- The minimizer was not initialized with the proper subsets of +-- states, which could result in different states being erroneously +-- considered equivalent, which in turn could result in the wrong +-- rule being selected at runtime. +-- +-- This version of the test fails with the minimizer as implemented +-- after the changes in 4f0b51b8c370d0dd0f9c65af98282789f1cb035f. + +import Control.Monad (when) +import System.Exit + +} + +%wrapper "basic" + +tokens :- + [abc] { Left } + "abc" { const $ Right "abc" } + +{ + +test :: String -> [Either String String] -> IO () +test input expected = + when (expected /= alexScanTokens input) + exitFailure + +main :: IO () +main = do + test "abc" [Right "abc"] + test "bbb" [Left "b", Left "b", Left "b"] + test "bbc" [Left "b", Left "b", Left "c"] + +} diff --git a/tests/issue_269_part2.x b/tests/issue_269_part2.x new file mode 100644 index 0000000..0d1994f --- /dev/null +++ b/tests/issue_269_part2.x @@ -0,0 +1,40 @@ +{ + +-- Issue #269 +-- reported 2025-04-02 by Antoine Leblanc (https://github.com/nicuveo) +-- fixed 2025-04-03 by Antoine Leblanc +-- +-- Problem was: +-- The minimizer was not initialized with the proper subsets of +-- states, which could result in different states being erroneously +-- considered equivalent, which in turn could result in the wrong +-- rule being selected at runtime. +-- +-- This version of the test fails with the minimizer as implemented +-- pre 4f0b51b8c370d0dd0f9c65af98282789f1cb035f. + +import Control.Monad (when) +import System.Exit + +} + +%wrapper "basic" + +tokens :- + "abc" { const $ Right "abc" } + [abc] { Left } + +{ + +test :: String -> [Either String String] -> IO () +test input expected = + when (expected /= alexScanTokens input) + exitFailure + +main :: IO () +main = do + test "abc" [Right "abc"] + test "bbb" [Left "b", Left "b", Left "b"] + test "bbc" [Left "b", Left "b", Left "c"] + +}