diff --git a/FEXCore/Source/CMakeLists.txt b/FEXCore/Source/CMakeLists.txt
index de1de9bdec..da53311c96 100644
--- a/FEXCore/Source/CMakeLists.txt
+++ b/FEXCore/Source/CMakeLists.txt
@@ -7,6 +7,7 @@ set (FEXCORE_BASE_SRCS
   Utils/ForcedAssert.cpp
   Utils/LogManager.cpp
   Utils/SpinWaitLock.cpp
+  Utils/Regex.cpp
   )
 
 if (NOT MINGW_BUILD)
diff --git a/FEXCore/Source/Utils/Regex.cpp b/FEXCore/Source/Utils/Regex.cpp
new file mode 100644
index 0000000000..1668efbe0d
--- /dev/null
+++ b/FEXCore/Source/Utils/Regex.cpp
@@ -0,0 +1,307 @@
+#include "FEXCore/fextl/string.h"
+#include <FEXCore/Utils/Regex.h>
+
+#include <FEXCore/fextl/memory.h>
+#include <FEXCore/fextl/set.h>
+#include <FEXCore/fextl/vector.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <utility>
+
+// Implementation for FEX regex engine, please see unittests/APITests/Regex.cpp
+// for test cases
+
+// Inspiration taken from dragon book and
+// https://sh4dy.com/2025/05/01/regex_engine/
+namespace FEXCore::Utils {
+/////////////
+// STATE IMPL
+/////////////
+void State::addEpsilonTransition(State *nextState) {
+  assert(nextState && "state needs to be non null for addEpsilonTransition");
+  epsilonTransitions.push_back(nextState);
+}
+
+void State::addTransition(char c, State *nextState) {
+  assert(nextState && "state needs to be non null for addTransition");
+  transitions[c].push_back(nextState);
+}
+
+/////////////
+// NFA IMPL
+/////////////
+NFA::NFA() {
+  fextl::unique_ptr<State> start = fextl::make_unique<State>();
+  fextl::unique_ptr<State> accepting = fextl::make_unique<State>(true);
+  startState = start.get();
+  acceptingState = accepting.get();
+
+  // transfer the ownership to the states vector
+  states.push_back(std::move(start));
+  states.push_back(std::move(accepting));
+}
+
+void NFA::acquireStatesFrom(NFA &other) {
+  // Appending a vector to itself would invalidate the iterators being walked
+  if (&other == this)
+    return;
+
+  states.reserve(states.size() + other.states.size());
+  for (auto &s : other.states)
+    states.push_back(std::move(s));
+
+  other.states.clear();
+}
+
+NFA NFA::createForEpsilon() {
+  NFA nfa;
+  nfa.startState->addEpsilonTransition(nfa.acceptingState);
+  return nfa;
+}
+
+NFA NFA::createForDot() {
+  NFA nfa;
+
+  // INFO: For now i think let's keep it simple and spawn NFA for the whole
+  // alphabet
+  //
+  // See if performance is acceptable or not and then we can read more dragon
+  // book to find optimization
+  for (const char ch : Regex::Alphabet)
+    nfa.startState->addTransition(ch, nfa.acceptingState);
+  return nfa;
+}
+
+NFA NFA::createForChar(char c) {
+  NFA nfa;
+  nfa.startState->addTransition(c, nfa.acceptingState);
+  return nfa;
+}
+
+// Dragon book 2nd edition, figure 3.40: NFA for the union of two regular
+// expressions
+NFA NFA::createForUnion(NFA &nfa1, NFA &nfa2) {
+  NFA newNFA;
+  nfa1.acceptingState->isAccepting = false;
+  nfa2.acceptingState->isAccepting = false;
+  newNFA.startState->addEpsilonTransition(nfa1.startState);
+  newNFA.startState->addEpsilonTransition(nfa2.startState);
+  nfa1.acceptingState->addEpsilonTransition(newNFA.acceptingState);
+  nfa2.acceptingState->addEpsilonTransition(newNFA.acceptingState);
+
+  // NOTE: we acquire state here because the use case was that we were gonna
+  // leave nfa1 and nfa2 out of scope. Probably need to measure the performance
+  // first but it'd be great to move away from making up unique ptr everytime
+  // inside a loop
+  newNFA.acquireStatesFrom(nfa1);
+  newNFA.acquireStatesFrom(nfa2);
+  return newNFA;
+}
+
+// Dragon book 2nd edition, figure 3.41: NFA for the concat of two regular
+// expressions
+NFA NFA::createForConcatenation(NFA &nfa1, NFA &nfa2) {
+  NFA newNFA;
+  // The result reuses nfa1's start and nfa2's accepting state, so the pair of
+  // states allocated by the default constructor would only be dead weight
+  newNFA.states.clear();
+
+  nfa1.acceptingState->addEpsilonTransition(nfa2.startState);
+  nfa1.acceptingState->isAccepting = false;
+  newNFA.startState = nfa1.startState;
+  newNFA.acceptingState = nfa2.acceptingState;
+  newNFA.acquireStatesFrom(nfa1);
+  newNFA.acquireStatesFrom(nfa2);
+  return newNFA;
+}
+
+// One or more repetitions. Built like the closure from figure 3.42 minus the
+// edge that skips the operand. Fresh start/accepting states are used so the
+// loop-back edge can never be entered or left through a state that a later
+// construction step attaches more edges to
+NFA NFA::createForPlus(NFA &originalNFA) {
+  NFA newNFA;
+  newNFA.startState->addEpsilonTransition(originalNFA.startState);
+  originalNFA.acceptingState->addEpsilonTransition(originalNFA.startState);
+  originalNFA.acceptingState->addEpsilonTransition(newNFA.acceptingState);
+  originalNFA.acceptingState->isAccepting = false;
+  newNFA.acquireStatesFrom(originalNFA);
+  return newNFA;
+}
+
+// Zero or one occurrence. The skip edge goes between fresh start/accepting
+// states: adding it directly to the operand's own states would let a loop
+// inside the operand escape through it, e.g. "(a+b)?" would accept "a"
+NFA NFA::createForQuestion(NFA &originalNFA) {
+  NFA newNFA;
+  newNFA.startState->addEpsilonTransition(originalNFA.startState);
+  newNFA.startState->addEpsilonTransition(newNFA.acceptingState);
+  originalNFA.acceptingState->addEpsilonTransition(newNFA.acceptingState);
+  originalNFA.acceptingState->isAccepting = false;
+  newNFA.acquireStatesFrom(originalNFA);
+  return newNFA;
+}
+
+// Dragon book 2nd edition, figure 3.42: NFA for the closure of a regular
+// expression
+NFA NFA::createForKleeneStar(NFA &originalNFA) {
+  NFA newNFA;
+  newNFA.startState->addEpsilonTransition(originalNFA.startState);
+  newNFA.startState->addEpsilonTransition(newNFA.acceptingState);
+  originalNFA.acceptingState->addEpsilonTransition(originalNFA.startState);
+  originalNFA.acceptingState->addEpsilonTransition(newNFA.acceptingState);
+  originalNFA.acceptingState->isAccepting = false;
+  newNFA.acquireStatesFrom(originalNFA);
+  return newNFA;
+}
+
+// Find all the states that can be reached from the current set of states using
+// only epsilon transitions
+fextl::set<State *> NFA::epsilonClosure(const fextl::set<State *> &states) {
+  fextl::set<State *> result = states;
+  // Worklist of states whose epsilon edges still have to be followed
+  fextl::vector<State *> pending(states.begin(), states.end());
+
+  while (!pending.empty()) {
+    State *currState = pending.back();
+    pending.pop_back();
+    for (State *next : currState->epsilonTransitions) {
+      // insert() reports whether the state is new, so one lookup is enough
+      if (result.insert(next).second)
+        pending.push_back(next);
+    }
+  }
+  return result;
+}
+
+// Find all the states that can be reached from the current set of states using
+// only character transition
+fextl::set<State *> NFA::move(const fextl::set<State *> &states, const char c) {
+  fextl::set<State *> result;
+  for (const State *state : states) {
+    const auto &transitionMap = state->transitions;
+    if (auto itr = transitionMap.find(c); itr != transitionMap.end())
+      result.insert(itr->second.begin(), itr->second.end());
+  }
+  return result;
+}
+
+/////////////
+// REGEX IMPL
+/////////////
+Regex::Regex(const fextl::string &s) : Pattern(s), Pos(0) {
+  Nfa = parseExpression();
+
+  // The top level parse only stops early on a ')' that has no matching '('
+  if (Pos < Pattern.size()) {
+    fprintf(stderr, "Unexpected '%c' at position %zu while parsing "
+                    "regex for NFA creation\n",
+            Pattern[Pos], Pos);
+    std::exit(1);
+  }
+}
+
+NFA Regex::parseExpression() { return parseUnion(); }
+
+NFA Regex::parseUnion() {
+  NFA result = parseConcatenation();
+  while (Pos < Pattern.size() && Pattern[Pos] == '|') {
+    Pos++;
+    NFA nfaToMakeUnion = parseConcatenation();
+    result = NFA::createForUnion(result, nfaToMakeUnion);
+  }
+  return result;
+}
+
+NFA Regex::parseConcatenation() {
+  NFA result = parseStarPlusHuhhhh();
+  while (Pos < Pattern.size() && Pattern[Pos] != '|' && Pattern[Pos] != ')') {
+    NFA nfaToConcat = parseStarPlusHuhhhh();
+    result = NFA::createForConcatenation(result, nfaToConcat);
+  }
+  return result;
+}
+
+NFA Regex::parseStarPlusHuhhhh() {
+  NFA result = parseAtom();
+  while (Pos < Pattern.size()) {
+    if (Pattern[Pos] == '*') {
+      result = NFA::createForKleeneStar(result);
+    } else if (Pattern[Pos] == '+') {
+      result = NFA::createForPlus(result);
+    } else if (Pattern[Pos] == '?') {
+      result = NFA::createForQuestion(result);
+    } else {
+      break;
+    }
+    Pos++;
+  }
+  return result;
+}
+
+// Algo 3.23: Basis
+NFA Regex::parseAtom() {
+  // Nothing left, or an operand position directly followed by '|' or ')':
+  // the operand is the empty expression ("a|", "(a|)", "()"). The delimiter is
+  // left in place for the caller that owns it
+  if (Pos >= Pattern.size() || Pattern[Pos] == '|' || Pattern[Pos] == ')') {
+    return NFA::createForEpsilon();
+  }
+  const char curChar = Pattern[Pos++];
+
+  if (curChar == '\\') {
+    if (Pos >= Pattern.size()) {
+      fprintf(stderr, "Expected an acceptable escapable character which "
+                      "consists of \"%s\", but has no more character to parse "
+                      "from regex for NFA creation\n",
+              AcceptableEscapable.c_str());
+      std::exit(1);
+    }
+    const char escapedChar = Pattern[Pos++];
+    if (AcceptableEscapable.find(escapedChar) == fextl::string::npos) {
+      fprintf(stderr, "Expected an acceptable escapable character which "
+                      "consists of \"%s\", but found '%c' "
+                      "while parsing regex for NFA creation\n",
+              AcceptableEscapable.c_str(), escapedChar);
+      std::exit(1);
+    }
+    return NFA::createForChar(escapedChar);
+  }
+
+  // Move past the opening brace and get the NFA for the expression till a
+  // closing brace is found.
+  if (curChar == '(') {
+    NFA result = parseExpression();
+    if (Pos < Pattern.size() && Pattern[Pos] == ')') {
+      Pos++;
+    } else {
+      // parseExpression only returns at a ')' or at the end of the pattern
+      // TODO: Add error prop here somewhere
+      fprintf(stderr, "Expected ')', but has no more character to parse from "
+                      "regex for NFA creation\n");
+      std::exit(1);
+    }
+    return result;
+  }
+
+  if (curChar == '.')
+    return NFA::createForDot();
+  return NFA::createForChar(curChar);
+}
+
+// Dragon book 2nd edition, algorithm 3.22: Simulating an NFA
+bool Regex::matches(const fextl::string &target) const {
+  fextl::set<State *> currentStates = NFA::epsilonClosure({Nfa.startState});
+
+  for (const char c : target) {
+    currentStates = NFA::epsilonClosure(NFA::move(currentStates, c));
+    if (currentStates.empty())
+      return false;
+  }
+  return std::ranges::any_of(currentStates, &State::isAccepting);
+}
+
+} // namespace FEXCore::Utils
diff --git a/FEXCore/include/FEXCore/Utils/Regex.h b/FEXCore/include/FEXCore/Utils/Regex.h
new file mode 100644
index 0000000000..b31941c987
--- /dev/null
+++ b/FEXCore/include/FEXCore/Utils/Regex.h
@@ -0,0 +1,84 @@
+#pragma once
+#include "FEXCore/fextl/memory.h"
+#include "FEXCore/fextl/string.h"
+#include <FEXCore/fextl/map.h>
+#include <FEXCore/fextl/set.h>
+#include <FEXCore/fextl/vector.h>
+
+#include <cstddef>
+
+namespace FEXCore::Utils {
+
+// A single NFA node. Outgoing edges are plain, non-owning pointers; every State
+// is owned by the NFA::states vector of the automaton it currently belongs to
+class State {
+public:
+  fextl::vector<State *> epsilonTransitions;
+  fextl::map<char, fextl::vector<State *>> transitions;
+  bool isAccepting;
+  State(bool accepting = false) : isAccepting(accepting) {}
+  void addEpsilonTransition(State *nextState);
+  void addTransition(char c, State *nextState);
+};
+
+class NFA {
+public:
+  State *startState;
+  State *acceptingState;
+  fextl::vector<fextl::unique_ptr<State>> states;
+
+  NFA();
+  // Transfers the ownership of the states (unique_ptr) of other NFA to the
+  // current NFA.
+  void acquireStatesFrom(NFA &other);
+
+  // Functions for creating NFA using the McNaughton-Yamada-Thompson algorithm.
+  // The operand NFAs are consumed: their states move into the returned NFA
+  static NFA createForEpsilon();
+  static NFA createForChar(char c);
+  static NFA createForDot();
+  static NFA createForUnion(NFA &nfa1, NFA &nfa2);
+  static NFA createForConcatenation(NFA &nfa1, NFA &nfa2);
+  static NFA createForKleeneStar(NFA &originalNFA);
+  static NFA createForPlus(NFA &originalNFA);
+  static NFA createForQuestion(NFA &originalNFA);
+  static fextl::set<State *> epsilonClosure(const fextl::set<State *> &states);
+  static fextl::set<State *> move(const fextl::set<State *> &states, char c);
+};
+
+// TODO: probably an NFA vector would be better instead of State vector inside
+// each NFA
+
+// TODO: Better error reporting?
+class Regex {
+  fextl::string Pattern;
+  // Index of the next unparsed character of Pattern
+  size_t Pos;
+  NFA Nfa;
+
+  // Top level parser, calls parseUnion
+  NFA parseExpression();
+
+  // INFO: "a|b"
+  NFA parseUnion();
+
+  // INFO: "ab"
+  NFA parseConcatenation();
+
+  // INFO: "a*", ".*"
+  NFA parseStarPlusHuhhhh();
+
+  // INFO: "(abc)" or a
+  NFA parseAtom();
+
+public:
+  // Characters matched by '.'. Bytes outside this set (e.g. non-ASCII) are not
+  // matched by '.' but can still be matched literally
+  static inline const fextl::string Alphabet =
+    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,./<>?;':\"[]\\{}|1234567890!@#$%^&*()-=_+ `~";
+  // Characters that may follow a backslash to be taken literally
+  static inline const fextl::string AcceptableEscapable = ".?*+()[]\\|";
+  Regex(const fextl::string &s);
+  bool matches(const fextl::string &s) const;
+};
+} // namespace FEXCore::Utils
diff --git a/Source/Common/Config.cpp b/Source/Common/Config.cpp
index 4b7e80f4fc..5221cef4f8 100644
--- a/Source/Common/Config.cpp
+++ b/Source/Common/Config.cpp
@@ -13 +13 @@
-
+#include <FEXCore/Utils/Regex.h>
@@ -43,22 +43,67 @@ namespace JSON {
       return;
     }
 
-    for (const json_t* ConfigItem = json_getChild(ConfigList); ConfigItem != nullptr; ConfigItem = json_getSibling(ConfigItem)) {
-      const char* ConfigName = json_getName(ConfigItem);
-      const char* ConfigString = json_getValue(ConfigItem);
-
-      if (!ConfigName) {
-        LogMan::Msg::EFmt("JSON file '{}': Couldn't get config name for an item", Config);
-        return;
-      }
-
-      if (!ConfigString) {
-        LogMan::Msg::EFmt("JSON file '{}': Couldn't get value for config item '{}'", Config, ConfigName);
-        return;
-      }
-
-      Func(ConfigName, ConfigString);
-    }
+    // Feeds every "name": "value" child of JsonList to Func. Stops and returns false after
+    // logging as soon as an entry has no name or no value
+    auto ListApplier = [&Config, &Func](const json_t* JsonList) -> bool {
+      for (const json_t* ConfigItem = json_getChild(JsonList); ConfigItem != nullptr; ConfigItem = json_getSibling(ConfigItem)) {
+        const char* ConfigName = json_getName(ConfigItem);
+        const char* ConfigString = json_getValue(ConfigItem);
+
+        if (!ConfigName) {
+          LogMan::Msg::EFmt("JSON file '{}': Couldn't get config name for an item", Config);
+          return false;
+        }
+
+        if (!ConfigString) {
+          LogMan::Msg::EFmt("JSON file '{}': Couldn't get value for config item '{}'", Config, ConfigName);
+          return false;
+        }
+
+        Func(ConfigName, ConfigString);
+      }
+      return true;
+    };
+
+    // A malformed entry in the "Config" section aborts the load; the regex sections are not consulted
+    if (!ListApplier(ConfigList)) {
+      return;
+    }
+
+    const json_t* RegexList = json_getProperty(json, "RegexConfig");
+    if (!RegexList) {
+      // This is a non-error if the configuration file exists but no RegexConfig section
+      return;
+    }
+
+    // json_getChild() is only meaningful for containers
+    if (json_getType(RegexList) != JSON_OBJ) {
+      LogMan::Msg::EFmt("JSON file '{}': 'RegexConfig' is not an object", Config);
+      return;
+    }
+
+    using FEXCore::Utils::Regex;
+
+    for (const json_t* RegexItem = json_getChild(RegexList); RegexItem != nullptr; RegexItem = json_getSibling(RegexItem)) {
+      // The name doubles as the regex pattern, so it has to be checked before anything uses it
+      const char* RegexName = json_getName(RegexItem);
+      if (!RegexName) {
+        LogMan::Msg::EFmt("JSON file '{}': Couldn't get config name for an item", Config);
+        return;
+      }
+
+      // The value has to be an object holding the "name": "value" pairs to apply
+      if (json_getType(RegexItem) != JSON_OBJ) {
+        LogMan::Msg::EFmt("JSON file '{}': Couldn't get value for config item '{}'", Config, RegexName);
+        return;
+      }
+
+      // Only the first pattern that matches is applied
+      if (Regex(RegexName).matches(Config)) {
+        ListApplier(RegexItem);
+        break;
+      }
+    }
   }
 } // namespace JSON
 
diff --git a/unittests/APITests/CMakeLists.txt b/unittests/APITests/CMakeLists.txt
index 1ae70cbf72..7d94c8bd6b 100644
--- a/unittests/APITests/CMakeLists.txt
+++ b/unittests/APITests/CMakeLists.txt
@@ -7,6 +7,7 @@ set (TESTS
   Filesystem
   InterruptableConditionVariable
   StringUtils
+  Regex
   )
 
 list(APPEND LIBS Common FEXCore JemallocLibs)
diff --git a/unittests/APITests/Regex.cpp b/unittests/APITests/Regex.cpp
new file mode 100644
index 0000000000..bdc1d6eff9
--- /dev/null
+++ b/unittests/APITests/Regex.cpp
@@ -0,0 +1,82 @@
+#include "FEXCore/fextl/string.h"
+#include <catch2/catch_test_macros.hpp>
+#include <FEXCore/Utils/Regex.h>
+
+using namespace FEXCore::Utils;
+
+TEST_CASE("Singular regex") {
+  CHECK(Regex("a").matches("a") == true);
+  CHECK(Regex("a*").matches("aaaaaaa") == true);
+  CHECK(Regex(".").matches("a") == true);
+}
+
+TEST_CASE("Concat regex") {
+  CHECK(Regex("aaa").matches("aaa") == true);
+  CHECK(Regex("ab").matches("ab") == true);
+  CHECK(Regex("a").matches("ab") == false);
+  CHECK(Regex("ab").matches("a") == false);
+  CHECK(Regex("(aab)").matches("aab") == true);
+}
+
+TEST_CASE("Union regex") {
+  CHECK(Regex("a|b").matches("a") == true);
+  CHECK(Regex("a|b").matches("b") == true);
+  CHECK(Regex("(ab)|b").matches("ab") == true);
+  CHECK(Regex("(ab)|b").matches("b") == true);
+  CHECK(Regex("(ab)|b").matches("abb") == false);
+  // Empty alternatives and groups
+  CHECK(Regex("(a|)b").matches("b") == true);
+  CHECK(Regex("(a|)b").matches("ab") == true);
+  CHECK(Regex("(a|)b").matches(")b") == false);
+  CHECK(Regex("a()b").matches("ab") == true);
+}
+
+TEST_CASE("Dot regex") {
+  CHECK(Regex(".*").matches("") == true);
+  CHECK(Regex(".*").matches("setup.json") == true);
+  CHECK(Regex("setup.*").matches("setup.json") == true);
+  CHECK(Regex("setup.*").matches("setup/setup.json") == true);
+  CHECK(Regex(".*setup.*").matches("setup/setup.json") == true);
+  CHECK(Regex(".*").matches("My Games/setup~1.json") == true);
+}
+
+TEST_CASE("Escape regex") {
+  CHECK(Regex("setup\\.*").matches("setup/setup.json") == false);
+  CHECK(Regex("setup\\.*").matches("setup.....") == true);
+  CHECK(Regex("setup\\.*").matches("setup.aaaa") == false);
+
+  // An escaped backslash followed by ".*"
+  CHECK(Regex("setup\\\\.*").matches("setup\\.aaaa") == true);
+  CHECK(Regex("setup\\\\.*").matches("setup\\") == true);
+  CHECK(Regex("setup\\\\.*").matches("setup.aaaa") == false);
+
+  // Escaped operators and parentheses are literals
+  CHECK(Regex("a\\+").matches("a+") == true);
+  CHECK(Regex("a\\+").matches("aa") == false);
+  CHECK(Regex("\\(a\\*\\)").matches("(a*)") == true);
+}
+
+TEST_CASE("Plus regex") {
+  CHECK(Regex("setup.+").matches("setup") == false);
+  CHECK(Regex("aa").matches("aa") == true);
+  CHECK(Regex("aa+").matches("aaa") == true);
+  CHECK(Regex("aa+").matches("aab") == false);
+}
+
+TEST_CASE("Question regex") {
+  CHECK(Regex(".?").matches("") == true);
+  CHECK(Regex(".?").matches("aa") == false);
+  CHECK(Regex("setup.?").matches("setup") == true);
+  CHECK(Regex("setup.?").matches("setupa") == true);
+  CHECK(Regex("setup.?").matches("setupb") == true);
+  CHECK(Regex("aa?").matches("aa") == true);
+  // '?' must not let a loop inside its operand leave early
+  CHECK(Regex("(a+b)?").matches("") == true);
+  CHECK(Regex("(a+b)?").matches("aab") == true);
+  CHECK(Regex("(a+b)?").matches("a") == false);
+}
+
+// Tests potential usage inside fex itself
+TEST_CASE("FEX regex") {
+  CHECK(Regex(".*Config.*").matches("/home/ubuntu/.fex-emu/Config.json") == true);
+}