From 449340eb20defbee6bc5bae9d58e95d06c11b8ab Mon Sep 17 00:00:00 2001 From: Benjamin <5719034+bnjjj@users.noreply.github.com> Date: Wed, 22 Jan 2025 15:34:22 +0100 Subject: [PATCH 1/2] graphql parser wip Signed-off-by: Benjamin <5719034+bnjjj@users.noreply.github.com> --- .gitignore | 1 + Cargo.toml | 1 + src/graphql/error.rs | 15 +++ src/graphql/mod.rs | 124 +++++++++++++++++++++ src/graphql/parsing.rs | 237 +++++++++++++++++++++++++++++++++++++++++ src/graphql/types.rs | 31 ++++++ src/lib.rs | 5 +- 7 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 src/graphql/error.rs create mode 100644 src/graphql/mod.rs create mode 100644 src/graphql/parsing.rs create mode 100644 src/graphql/types.rs diff --git a/.gitignore b/.gitignore index 9dbb7b2a..62347741 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ benchmarks/results # Remove doc build folders .cache/ +.cargo_check build/ rust-coverage/ target/ diff --git a/Cargo.toml b/Cargo.toml index f49df1ae..1d716dcf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ hf-hub = "=0.3.2" tokenizers = { version = "=0.20.3", features = ["http"] } rustc-hash = "2.1.0" regex-automata = "0.4.9" +apollo-compiler = "1.25.0" [features] python-bindings = ["pyo3"] diff --git a/src/graphql/error.rs b/src/graphql/error.rs new file mode 100644 index 00000000..5cce3070 --- /dev/null +++ b/src/graphql/error.rs @@ -0,0 +1,15 @@ +use apollo_compiler::validation::DiagnosticList; +use apollo_compiler::Name; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum GraphQLParserError { + #[error("GraphQL apollo compiler error: {0}")] + ApolloCompiler(String), + #[error("Can't find any Query type in your schema, it must exists and is the entrypoint")] + UnknownQuery, + #[error("Can't find any type {0} in your schema")] + UnknownType(Name), + #[error("Input value definition is not supported")] + InputValueDefinitionNotSupported, +} diff --git a/src/graphql/mod.rs b/src/graphql/mod.rs new file mode 100644 index 00000000..9fd612dd --- /dev/null +++ b/src/graphql/mod.rs @@ -0,0 +1,124 @@ +mod error; +mod parsing; +mod types; + +use error::GraphQLParserError; +pub use types::*; + +type Result = std::result::Result; + +pub fn build_regex_from_schema( + graphql_schema: &str, + whitespace_pattern: Option<&str>, +) -> Result { + let mut parser = parsing::Parser::new(graphql_schema)?; + if let Some(pattern) = whitespace_pattern { + parser = parser.with_whitespace_pattern(pattern) + } + parser.to_regex() +} + +#[cfg(test)] +mod tests { + use regex::Regex; + + use super::*; + + fn should_match(re: &Regex, value: &str) { + // Asserts that value is fully matched. + match re.find(value) { + Some(matched) => { + assert_eq!( + matched.as_str(), + value, + "Value should match, but does not for: {value}, re:\n{re}" + ); + assert_eq!(matched.range(), 0..value.len()); + } + None => unreachable!( + "Value should match, but does not, in unreachable for: {value}, re:\n{re}" + ), + } + } + + fn should_not_match(re: &Regex, value: &str) { + // Asserts that regex does not find a match or not a full match. + if let Some(matched) = re.find(value) { + assert_ne!( + matched.as_str(), + value, + "Value should NOT match, but does for: {value}, re:\n{re}" + ); + assert_ne!(matched.range(), 0..value.len()); + } + } + + #[test] + fn test_schema_matches_regex() { + for (schema, regex, a_match, not_a_match) in [ + // ========================================================== + // Integer Type + // ========================================================== + // Required integer property + ( + r#"type Query { + count: Int! + }"#, + r#"\{[ ]?"count"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)[ ]?\}"#, + vec![r#"{ "count": 100 }"#], + vec![r#"{ "count": "a" }"#, ""], + ), + ( + r#"type Query { + count: Int + }"#, + r#"\{([ ]?"count"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))?)?[ ]?\}"#, + vec![r#"{ "count": 100 }"#], + vec![r#"{ "count": "a" }"#, ""], + ), + // ========================================================== + // Number Type + // ========================================================== + // Required number property + ( + r#"type Query { + count: Float! + }"#, + r#"\{[ ]?"count"[ ]?:[ ]?((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?[ ]?\}"#, + vec![r#"{ "count": 100 }"#, r#"{ "count": 100.5 }"#], + vec![""], + ), + ( + r#"type Query { + count: Float + }"#, + r#"\{([ ]?"count"[ ]?:[ ]?(((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?)?)?[ ]?\}"#, + vec![r#"{ "count": 100 }"#, r#"{ "count": 100.5 }"#], + vec![""], + ), + // ========================================================== + // Array Type + // ========================================================== + // Required number property + ( + r#"type Query { + count: [Float]! + }"#, + r#"\{[ ]?"count"[ ]?:[ ]?\[[ ]?(((((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?)?)(,[ ]?((((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?)?)){0,})[ ]?\][ ]?\}"#, + vec![r#"{ "count": [100.5] }"#, r#"{ "count": [100] }"#], + vec![""], + ), + ] { + let result = build_regex_from_schema(schema, None).expect("To regex failed"); + assert_eq!(result, regex, "JSON Schema {} didn't match", schema); + + let re = Regex::new(&result).expect("Regex failed"); + for m in a_match { + should_match(&re, m); + } + for not_m in not_a_match { + should_not_match(&re, not_m); + } + } + } +} diff --git a/src/graphql/parsing.rs b/src/graphql/parsing.rs new file mode 100644 index 00000000..680ac5a6 --- /dev/null +++ b/src/graphql/parsing.rs @@ -0,0 +1,237 @@ +use apollo_compiler::ast::Type; +use apollo_compiler::schema::EnumType; +use apollo_compiler::schema::ExtendedType; +use apollo_compiler::schema::InterfaceType; +use apollo_compiler::schema::ObjectType; +use apollo_compiler::schema::ScalarType; +use apollo_compiler::schema::UnionType; +use apollo_compiler::Schema; +use regex::escape; + +use super::error::GraphQLParserError; +use super::types; + +type Result = std::result::Result; + +pub(crate) struct Parser<'a> { + root: Schema, + whitespace_pattern: &'a str, + recursion_depth: usize, + max_recursion_depth: usize, +} + +impl<'a> Parser<'a> { + // Max recursion depth is defined at level 3. + // Defining recursion depth higher than that should be done cautiously, since + // each +1 step on the depth blows up regex's size exponentially. + // + // For example, for simple referential json schema level 5 will produce regex size over 700K, + // which seems counterproductive and likely to introduce performance issues. + // It also breaks even `regex` sensible defaults with `CompiledTooBig` error. + pub fn new(root: &'a str) -> Result { + let root = Schema::parse_and_validate(root, "sdl.graphql") + .map_err(|err| GraphQLParserError::ApolloCompiler(err.to_string()))? + .into_inner(); + + if root.schema_definition.query.is_none() { + return Err(GraphQLParserError::UnknownQuery); + } + + Ok(Self { + root, + whitespace_pattern: types::WHITESPACE, + recursion_depth: 0, + max_recursion_depth: 3, + }) + } + + pub fn with_whitespace_pattern(self, whitespace_pattern: &'a str) -> Self { + Self { + whitespace_pattern, + ..self + } + } + + #[allow(dead_code)] + pub fn with_max_recursion_depth(self, max_recursion_depth: usize) -> Self { + Self { + max_recursion_depth, + ..self + } + } + + #[allow(clippy::wrong_self_convention)] + pub fn to_regex(&mut self) -> Result { + let query_obj_name = self + .root + .schema_definition + .query + .as_ref() + .ok_or_else(|| GraphQLParserError::UnknownQuery)?; + + let query_type = self + .root + .types + .get(&query_obj_name.name) + .ok_or_else(|| GraphQLParserError::UnknownQuery)?; + + self.parse_type(query_type) + } + + fn parse_type(&self, node: &ExtendedType) -> Result { + match node { + ExtendedType::Scalar(node) => self.parse_scalar(node), + ExtendedType::Object(node) => self.parse_object(node), + ExtendedType::Interface(node) => self.parse_interface(node), + ExtendedType::Union(node) => self.parse_union(node), + ExtendedType::Enum(node) => self.parse_enum(node), + ExtendedType::InputObject(_node) => { + Err(GraphQLParserError::InputValueDefinitionNotSupported) + } + } + } + + fn parse_scalar(&self, node: &ScalarType) -> Result { + match node.name.as_str() { + "Int" => Ok(types::INTEGER.to_string()), + "Float" => Ok(types::NUMBER.to_string()), + "String" => Ok(types::STRING.to_string()), + "Boolean" => Ok(types::BOOLEAN.to_string()), + "ID" => Ok(types::STRING.to_string()), + other => { + todo!(); + } + } + } + + fn parse_object(&self, node: &ObjectType) -> Result { + let mut regex = String::from(r"\{"); + + let last_required_pos = node + .fields + .iter() + .enumerate() + .filter_map(|(i, (_field_name, field_def))| { + if matches!(field_def.ty, Type::NonNullList(_) | Type::NonNullNamed(_)) { + Some(i) + } else { + None + } + }) + .max(); + + match last_required_pos { + // We have required fields + Some(last_required_pos) => { + for (i, (field_name, field_def)) in node.fields.iter().enumerate() { + let mut subregex = format!( + r#"{0}"{1}"{0}:{0}"#, + self.whitespace_pattern, + escape(field_name.as_str()) + ); + + // TODO: add * for list + let (inner_ty_regex, is_required) = self.parse_inner_type(&field_def.ty)?; + subregex += &inner_ty_regex; + + if i < last_required_pos { + subregex = format!("{}{},", subregex, self.whitespace_pattern) + } else if i > last_required_pos { + subregex = format!("{},{}", self.whitespace_pattern, subregex) + } + + if is_required { + regex += &subregex; + } else { + regex += &format!("({})?", subregex); + }; + } + } + // We don't have any required fields + None => { + let mut property_subregexes = Vec::with_capacity(node.fields.len()); + + for (field_name, field_def) in &node.fields { + let mut subregex = format!( + r#"{0}"{1}"{0}:{0}"#, + self.whitespace_pattern, + escape(field_name.as_str()) + ); + + let (inner_ty_regex, _is_required) = self.parse_inner_type(&field_def.ty)?; + subregex += &inner_ty_regex; + + property_subregexes.push(subregex); + } + + let mut possible_patterns = Vec::new(); + for i in 0..property_subregexes.len() { + let mut pattern = String::new(); + for subregex in &property_subregexes[..i] { + pattern += &format!("({}{},)?", subregex, self.whitespace_pattern); + } + pattern += &property_subregexes[i]; + possible_patterns.push(pattern); + } + + regex += &format!("({})?", possible_patterns.join("|")); + } + } + regex += &format!("{}\\}}", self.whitespace_pattern); + + Ok(regex) + } + + fn parse_inner_type(&self, ty: &Type) -> Result<(String, bool)> { + let mut subregex = String::new(); + let mut is_required = false; + match ty { + Type::Named(name) => { + let ty = self + .root + .types + .get(name) + .ok_or_else(|| GraphQLParserError::UnknownType(name.clone()))?; + subregex += &format!("({})?", self.parse_type(ty)?); + } + Type::List(ty) => { + subregex += &format!( + r"\[{0}(({1})(,{0}({1})){{0,}}){0}\]?", + self.whitespace_pattern, + self.parse_inner_type(ty)?.0 + ); + } + Type::NonNullNamed(name) => { + is_required = true; + let ty = self + .root + .types + .get(name) + .ok_or_else(|| GraphQLParserError::UnknownType(name.clone()))?; + subregex += &self.parse_type(ty)?; + } + Type::NonNullList(ty) => { + is_required = true; + subregex += &format!( + r"\[{0}(({1})(,{0}({1})){{0,}}){0}\]", + self.whitespace_pattern, + self.parse_inner_type(ty)?.0 + ); + } + } + + Ok((subregex, is_required)) + } + + fn parse_interface(&self, node: &InterfaceType) -> Result { + todo!() + } + + fn parse_union(&self, node: &UnionType) -> Result { + todo!() + } + + fn parse_enum(&self, node: &EnumType) -> Result { + todo!() + } +} diff --git a/src/graphql/types.rs b/src/graphql/types.rs new file mode 100644 index 00000000..156861e3 --- /dev/null +++ b/src/graphql/types.rs @@ -0,0 +1,31 @@ +// allow `\"`, `\\`, or any character which isn't a control sequence +pub const STRING_INNER: &str = r#"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])"#; +pub const STRING: &str = r#""([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*""#; + +pub const INTEGER: &str = r#"(-)?(0|[1-9][0-9]*)"#; +pub const NUMBER: &str = r#"((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?"#; +pub const BOOLEAN: &str = r#"(true|false)"#; +pub const NULL: &str = r#"null"#; + +pub const WHITESPACE: &str = r#"[ ]?"#; + +#[derive(Debug, PartialEq)] +pub enum GraphQLType { + String, + Integer, + Number, + Boolean, + Null, +} + +impl GraphQLType { + pub fn to_regex(&self) -> &'static str { + match self { + GraphQLType::String => STRING, + GraphQLType::Integer => INTEGER, + GraphQLType::Number => NUMBER, + GraphQLType::Boolean => BOOLEAN, + GraphQLType::Null => NULL, + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 538152f1..7afe57ad 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,14 @@ pub mod error; +pub mod graphql; pub mod index; pub mod json_schema; pub mod prelude; pub mod primitives; pub mod vocabulary; -pub use error::{Error, JsonSchemaParserError, Result}; +pub use error::Error; +pub use error::JsonSchemaParserError; +pub use error::Result; #[cfg(feature = "python-bindings")] mod python_bindings; From 7bc9a8af4cfee30e311ba712b0bc889060d244b2 Mon Sep 17 00:00:00 2001 From: Benjamin <5719034+bnjjj@users.noreply.github.com> Date: Thu, 23 Jan 2025 18:30:55 +0100 Subject: [PATCH 2/2] add support for enums Signed-off-by: Benjamin <5719034+bnjjj@users.noreply.github.com> --- src/graphql/mod.rs | 32 +++++++++++++++++++++++++++++++- src/graphql/parsing.rs | 17 +++++++++++++---- src/graphql/types.rs | 7 +++++++ 3 files changed, 51 insertions(+), 5 deletions(-) diff --git a/src/graphql/mod.rs b/src/graphql/mod.rs index 9fd612dd..dd368a16 100644 --- a/src/graphql/mod.rs +++ b/src/graphql/mod.rs @@ -97,6 +97,32 @@ mod tests { vec![""], ), // ========================================================== + // Enum + // ========================================================== + // Required number property + ( + r#" + enum Episode { + NEWHOPE + EMPIRE + JEDI + } + type Query { + episode: Episode! + }"#, + r#"\{[ ]?"episode"[ ]?:[ ]?("NEWHOPE"|"EMPIRE"|"JEDI")[ ]?\}"#, + vec![r#"{ "episode": "NEWHOPE" }"#, r#"{ "episode": "JEDI" }"#], + vec![""], + ), + ( + r#"type Query { + count: Float + }"#, + r#"\{([ ]?"count"[ ]?:[ ]?(((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?)?)?[ ]?\}"#, + vec![r#"{ "count": 100 }"#, r#"{ "count": 100.5 }"#], + vec![""], + ), + // ========================================================== // Array Type // ========================================================== // Required number property @@ -105,7 +131,11 @@ mod tests { count: [Float]! }"#, r#"\{[ ]?"count"[ ]?:[ ]?\[[ ]?(((((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?)?)(,[ ]?((((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?)?)){0,})[ ]?\][ ]?\}"#, - vec![r#"{ "count": [100.5] }"#, r#"{ "count": [100] }"#], + vec![ + r#"{ "count": [100.5] }"#, + r#"{ "count": [100] }"#, + r#"{ "count": [100, 101] }"#, + ], vec![""], ), ] { diff --git a/src/graphql/parsing.rs b/src/graphql/parsing.rs index 680ac5a6..ced0ce8c 100644 --- a/src/graphql/parsing.rs +++ b/src/graphql/parsing.rs @@ -98,9 +98,11 @@ impl<'a> Parser<'a> { "String" => Ok(types::STRING.to_string()), "Boolean" => Ok(types::BOOLEAN.to_string()), "ID" => Ok(types::STRING.to_string()), - other => { - todo!(); - } + "Date" => Ok(types::DATE.to_string()), + "Uri" => Ok(types::URI.to_string()), + "Uuid" => Ok(types::UUID.to_string()), + "Email" => Ok(types::EMAIL.to_string()), + _ => Ok(types::STRING.to_string()), } } @@ -232,6 +234,13 @@ impl<'a> Parser<'a> { } fn parse_enum(&self, node: &EnumType) -> Result { - todo!() + let variants = node + .values + .iter() + .map(|(_name, def)| format!(r#""{}""#, def.value.as_str())) + .collect::>() + .join("|"); + + Ok(format!(r"({variants})")) } } diff --git a/src/graphql/types.rs b/src/graphql/types.rs index 156861e3..72ad7bf3 100644 --- a/src/graphql/types.rs +++ b/src/graphql/types.rs @@ -7,6 +7,13 @@ pub const NUMBER: &str = r#"((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?"#; pub const BOOLEAN: &str = r#"(true|false)"#; pub const NULL: &str = r#"null"#; +pub const DATE_TIME: &str = r#""(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]{3})?(Z)?""#; +pub const DATE: &str = r#""(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])""#; +pub const TIME: &str = r#""(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?""#; +pub const UUID: &str = r#""[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}""#; +pub const URI: &str = r#"^(https?|ftp):\/\/([^\s:@]+(:[^\s:@]*)?@)?([a-zA-Z\d.-]+\.[a-zA-Z]{2,}|localhost)(:\d+)?(\/[^\s?#]*)?(\?[^\s#]*)?(#[^\s]*)?$|^urn:[a-zA-Z\d][a-zA-Z\d\-]{0,31}:[^\s]+$"#; +pub const EMAIL: &str = r#"^(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$"#; + pub const WHITESPACE: &str = r#"[ ]?"#; #[derive(Debug, PartialEq)]