Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9d60dc7
feat(isthmus): udf support for substrait<->calcite
ZorinAnton Jul 8, 2025
556847b
chore(isthmus): handle nullability and EnumArgument in SimplExtension…
ZorinAnton Aug 13, 2025
857fe2c
feat(isthmus): udf support for substrait<->calcite
ZorinAnton Jul 8, 2025
eeb97cd
chore(isthmus): handle nullability and EnumArgument in SimplExtension…
ZorinAnton Aug 13, 2025
f30b556
Merge branch 'main' into zor-udf
ZorinAnton Oct 13, 2025
6d9cc2a
chore: resolve pmd
ZorinAnton Oct 13, 2025
1a1bf47
chore: sync submodules with main
ZorinAnton Oct 13, 2025
834ab5e
chore: sync substrait with main
ZorinAnton Oct 13, 2025
8340cc5
chore(isthmus): resolve conflicts after submodule update
ZorinAnton Oct 14, 2025
6ba115f
chore(isthmus): fix yaml for custom udf
ZorinAnton Oct 14, 2025
e739184
Merge branch 'main' into zor-udf
ZorinAnton Nov 26, 2025
838a0a7
chore(isthmus): fix regression
ZorinAnton Nov 26, 2025
32ae275
fix: add missing serialVersionUIDs (#621)
nielspardon Nov 26, 2025
e4f656d
Merge branch 'main' into zor-udf
ZorinAnton Nov 26, 2025
8d2b4ed
chore(isthmus): fix regression
ZorinAnton Nov 26, 2025
9b0c027
feat(isthmus): enable dynamic UDFs with FeatureBoard
ZorinAnton Dec 3, 2025
a9e7c60
chore(isthmus): refactor SimpleExtensionToSqlOperatorTest
ZorinAnton Dec 3, 2025
2f6cd28
chore(isthmus): address minor PR comments
ZorinAnton Dec 3, 2025
71fcdfc
feat(isthmus): make dynamic UDF configurable for substrait to sql con…
ZorinAnton Dec 3, 2025
ca3d967
Merge branch 'main' into zor-udf
ZorinAnton Dec 3, 2025
142d372
chore(isthmus): fix format violations
ZorinAnton Dec 3, 2025
74fd2e7
chore(isthmus): minor refactoring
ZorinAnton Dec 3, 2025
ec45173
chore(isthmus): check arg types in SimpleExtensionToSqlOperatorTest
ZorinAnton Dec 4, 2025
6eedacd
chore(isthmus): minor refactoring of SimpleExtensionToSqlOperator
ZorinAnton Dec 4, 2025
b002d9b
chore(isthmus): fix opt-in for dynamic operators
ZorinAnton Dec 5, 2025
fcf24bb
Merge branch 'main' into zor-udf
ZorinAnton Dec 5, 2025
fc2d576
chore(isthmus): fix regression in test
ZorinAnton Dec 5, 2025
26c15dc
Merge branch 'main' into zor-udf
ZorinAnton Dec 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions isthmus/src/main/java/io/substrait/isthmus/ExtensionUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package io.substrait.isthmus;

import io.substrait.extension.DefaultExtensionCatalog;
import io.substrait.extension.SimpleExtension;
import io.substrait.isthmus.calcite.SubstraitOperatorTable;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;

public class ExtensionUtils {

public static SimpleExtension.ExtensionCollection getDynamicExtensions(
SimpleExtension.ExtensionCollection extensions) {
Set<String> knownFunctionNames =
SubstraitOperatorTable.INSTANCE.getOperatorList().stream()
.map(op -> op.getName().toLowerCase(Locale.ROOT))
.collect(Collectors.toSet());

List<SimpleExtension.ScalarFunctionVariant> customFunctions =
extensions.scalarFunctions().stream()
.filter(f -> !knownFunctionNames.contains(f.name().toLowerCase(Locale.ROOT)))
.collect(Collectors.toList());

return SimpleExtension.ExtensionCollection.builder()
.scalarFunctions(customFunctions)
// TODO: handle aggregates and other functions
.build();
}

public static SimpleExtension.ExtensionCollection loadExtensions(List<String> yamlFunctionFiles) {
SimpleExtension.ExtensionCollection allExtensions = DefaultExtensionCatalog.DEFAULT_COLLECTION;
if (yamlFunctionFiles != null && !yamlFunctionFiles.isEmpty()) {
allExtensions = allExtensions.merge(SimpleExtension.load(yamlFunctionFiles));
}
return allExtensions;
}
Copy link
Member

@vbarua vbarua Nov 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't appear that this is used. Do we need this or can we delete this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dropped

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,342 @@
package io.substrait.isthmus;

import io.substrait.extension.SimpleExtension;
import io.substrait.function.ParameterizedType;
import io.substrait.function.ParameterizedTypeVisitor;
import io.substrait.function.TypeExpression;
import io.substrait.type.Type;
import io.substrait.type.TypeExpressionEvaluator;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.calcite.jdbc.JavaTypeFactoryImpl;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeFactory;
import org.apache.calcite.sql.SqlFunction;
import org.apache.calcite.sql.SqlFunctionCategory;
import org.apache.calcite.sql.SqlKind;
import org.apache.calcite.sql.SqlOperator;
import org.apache.calcite.sql.SqlOperatorBinding;
import org.apache.calcite.sql.type.OperandTypes;
import org.apache.calcite.sql.type.SqlReturnTypeInference;
import org.apache.calcite.sql.type.SqlTypeFamily;
import org.apache.calcite.sql.type.SqlTypeName;

public final class SimpleExtensionToSqlOperator {

private static final RelDataTypeFactory DEFAULT_TYPE_FACTORY =
new JavaTypeFactoryImpl(SubstraitTypeSystem.TYPE_SYSTEM);

private SimpleExtensionToSqlOperator() {}

public static List<SqlOperator> from(SimpleExtension.ExtensionCollection collection) {
return from(collection, DEFAULT_TYPE_FACTORY);
}

public static List<SqlOperator> from(
SimpleExtension.ExtensionCollection collection, RelDataTypeFactory typeFactory) {
TypeConverter typeConverter = TypeConverter.DEFAULT;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest making the TypeConverter injectable as well to allow users with custom extension with custom types to use this capability more easily.

Effectively we can have these 2 constructors:

public static List<SqlOperator> from(SimpleExtension.ExtensionCollection collection) {}
public static List<SqlOperator> from(
      SimpleExtension.ExtensionCollection collection, RelDataTypeFactory typeFactory, TypeConverter typeConverter)

The first for default behaviour, and the second for customizable behaviour.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

return Stream.concat(
collection.scalarFunctions().stream(), collection.aggregateFunctions().stream())
.map(function -> toSqlFunction(function, typeFactory, typeConverter))
.collect(Collectors.toList());
}

private static SqlFunction toSqlFunction(
SimpleExtension.Function function,
RelDataTypeFactory typeFactory,
TypeConverter typeConverter) {

List<SqlTypeFamily> argFamilies = new ArrayList<>();

for (SimpleExtension.Argument arg : function.requiredArguments()) {
if (arg instanceof SimpleExtension.ValueArgument) {
SimpleExtension.ValueArgument valueArg = (SimpleExtension.ValueArgument) arg;
SqlTypeName typeName = valueArg.value().accept(new CalciteTypeVisitor());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: The CalciteTypeVisitor is stateless, so it might be worth creating one at the class level once rather than instantiating one of for every individual argument we process.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

argFamilies.add(typeName.getFamily());
} else if (arg instanceof SimpleExtension.EnumArgument) {
// Treat an EnumArgument as a required string literal.
argFamilies.add(SqlTypeFamily.STRING);
}
}

SqlReturnTypeInference returnTypeInference =
new SubstraitReturnTypeInference(function, typeFactory, typeConverter);

return new SqlFunction(
function.name(),
SqlKind.OTHER_FUNCTION,
returnTypeInference,
null,
OperandTypes.family(argFamilies),
SqlFunctionCategory.USER_DEFINED_FUNCTION);
}

private static class SubstraitReturnTypeInference implements SqlReturnTypeInference {

private final SimpleExtension.Function function;
private final RelDataTypeFactory typeFactory;
private final TypeConverter typeConverter;

private SubstraitReturnTypeInference(
SimpleExtension.Function function,
RelDataTypeFactory typeFactory,
TypeConverter typeConverter) {
this.function = function;
this.typeFactory = typeFactory;
this.typeConverter = typeConverter;
}

@Override
public RelDataType inferReturnType(SqlOperatorBinding opBinding) {
List<Type> substraitArgTypes =
opBinding.collectOperandTypes().stream()
.map(typeConverter::toSubstrait)
.collect(Collectors.toList());

TypeExpression returnExpression = function.returnType();
Type resolvedSubstraitType =
TypeExpressionEvaluator.evaluateExpression(
returnExpression, function.args(), substraitArgTypes);

boolean finalIsNullable;
switch (function.nullability()) {
case MIRROR:
// If any input is nullable, the output is nullable.
finalIsNullable =
opBinding.collectOperandTypes().stream().anyMatch(RelDataType::isNullable);
break;
case DISCRETE:
// The function can return null even if inputs are not null.
finalIsNullable = true;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DISCRETE should also use the nullability as declared in the Substrait type like DECLARED_OUTPUT does. It's a stricter version of DECLARED_OUTPUT, because the argument nullabilities must also match.

See https://substrait.io/expressions/scalar_functions/#nullability-handling

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

break;
case DECLARED_OUTPUT:
default:
// Use the nullability declared on the resolved Substrait type.
finalIsNullable = resolvedSubstraitType.nullable();
break;
}

RelDataType baseCalciteType = typeConverter.toCalcite(typeFactory, resolvedSubstraitType);

return typeFactory.createTypeWithNullability(baseCalciteType, finalIsNullable);
}
}

private static class CalciteTypeVisitor
extends ParameterizedTypeVisitor.ParameterizedTypeThrowsVisitor<
SqlTypeName, RuntimeException> {

private CalciteTypeVisitor() {
super("Type not supported for Calcite conversion.");
}

@Override
public SqlTypeName visit(Type.Bool expr) {
return SqlTypeName.BOOLEAN;
}

@Override
public SqlTypeName visit(Type.I8 expr) {
return SqlTypeName.TINYINT;
}

@Override
public SqlTypeName visit(Type.I16 expr) {
return SqlTypeName.SMALLINT;
}

@Override
public SqlTypeName visit(Type.I32 expr) {
return SqlTypeName.INTEGER;
}

@Override
public SqlTypeName visit(Type.I64 expr) {
return SqlTypeName.BIGINT;
}

@Override
public SqlTypeName visit(Type.FP32 expr) {
return SqlTypeName.FLOAT;
}

@Override
public SqlTypeName visit(Type.FP64 expr) {
return SqlTypeName.DOUBLE;
}

@Override
public SqlTypeName visit(Type.Str expr) {
return SqlTypeName.VARCHAR;
}

@Override
public SqlTypeName visit(Type.Binary expr) {
return SqlTypeName.VARBINARY;
}

@Override
public SqlTypeName visit(Type.Date expr) {
return SqlTypeName.DATE;
}

@Override
public SqlTypeName visit(Type.Time expr) {
return SqlTypeName.TIME;
}

@Override
public SqlTypeName visit(Type.TimestampTZ expr) {
return SqlTypeName.TIMESTAMP_WITH_LOCAL_TIME_ZONE;
}

@Override
public SqlTypeName visit(Type.Timestamp expr) {
return SqlTypeName.TIMESTAMP;
}

@Override
public SqlTypeName visit(Type.IntervalYear year) {
return SqlTypeName.INTERVAL_YEAR_MONTH;
}

@Override
public SqlTypeName visit(Type.IntervalDay day) {
return SqlTypeName.INTERVAL_DAY;
}

@Override
public SqlTypeName visit(Type.UUID expr) {
return SqlTypeName.VARCHAR;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use SqlTypeName.UUID here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

}

@Override
public SqlTypeName visit(Type.Struct struct) {
return SqlTypeName.ROW;
}

@Override
public SqlTypeName visit(Type.ListType listType) {
return SqlTypeName.ARRAY;
}

@Override
public SqlTypeName visit(Type.Map map) {
return SqlTypeName.MAP;
}

@Override
public SqlTypeName visit(ParameterizedType.FixedChar expr) {
return SqlTypeName.CHAR;
}

@Override
public SqlTypeName visit(ParameterizedType.VarChar expr) {
return SqlTypeName.VARCHAR;
}

@Override
public SqlTypeName visit(ParameterizedType.FixedBinary expr) {
return SqlTypeName.BINARY;
}

@Override
public SqlTypeName visit(ParameterizedType.Decimal expr) {
return SqlTypeName.DECIMAL;
}

@Override
public SqlTypeName visit(ParameterizedType.Struct expr) {
return SqlTypeName.ROW;
}

@Override
public SqlTypeName visit(ParameterizedType.ListType expr) {
return SqlTypeName.ARRAY;
}

@Override
public SqlTypeName visit(ParameterizedType.Map expr) {
return SqlTypeName.MAP;
}

@Override
public SqlTypeName visit(ParameterizedType.PrecisionTimestamp expr) {
return SqlTypeName.TIMESTAMP;
}

@Override
public SqlTypeName visit(ParameterizedType.PrecisionTimestampTZ expr) {
return SqlTypeName.TIMESTAMP_WITH_LOCAL_TIME_ZONE;
}

@Override
public SqlTypeName visit(ParameterizedType.PrecisionTime expr) {
return SqlTypeName.TIME;
}

@Override
public SqlTypeName visit(ParameterizedType.IntervalDay expr) {
return SqlTypeName.INTERVAL_DAY;
}

@Override
public SqlTypeName visit(ParameterizedType.IntervalCompound expr) {
// TODO: double check
return SqlTypeName.INTERVAL_DAY_HOUR;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't thinks this is correct. IntervalCompound is defined as

A compound interval type that is composed of elements of the underlying elements and rules of both interval_month and interval_day to express arbitrary durations across multiple grains. Substrait gives no definition for the conversion of values between independent grains (e.g. months to days).

So it would be a combination of Calcite's INTERVAL_DAY and INTERVAL_YEAR_MONTH.

I would suggest avoiding dealing with this mapping for and and just not generating dynamic operators for Substrait functions that use it. For users trying to process SQL, this shouldn't cause any issues because Calcite can't generate that type. For users trying to process Substrait plans containing those functions into Calcite, those would fail anyways because the TypeConverter from Substrait to Calcite can't handle it currently anyways.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dropped the overridden 'visit' method

}

@Override
public SqlTypeName visit(ParameterizedType.StringLiteral expr) {
String type = expr.value().toUpperCase();

if (type.startsWith("ANY")) {
return SqlTypeName.ANY;
}

switch (type) {
case "BOOLEAN":
return SqlTypeName.BOOLEAN;
case "I8":
return SqlTypeName.TINYINT;
case "I16":
return SqlTypeName.SMALLINT;
case "I32":
return SqlTypeName.INTEGER;
case "I64":
return SqlTypeName.BIGINT;
case "FP32":
return SqlTypeName.FLOAT;
case "FP64":
return SqlTypeName.DOUBLE;
case "STRING":
return SqlTypeName.VARCHAR;
case "BINARY":
return SqlTypeName.VARBINARY;
case "TIMESTAMP":
return SqlTypeName.TIMESTAMP;
case "TIMESTAMP_TZ":
return SqlTypeName.TIMESTAMP_WITH_LOCAL_TIME_ZONE;
case "DATE":
return SqlTypeName.DATE;
case "TIME":
return SqlTypeName.TIME;
case "UUID":
return SqlTypeName.VARCHAR;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be SqlTypeName.UUID?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

default:
if (type.startsWith("DECIMAL")) {
return SqlTypeName.DECIMAL;
}
if (type.startsWith("STRUCT")) {
return SqlTypeName.ROW;
}
if (type.startsWith("LIST")) {
return SqlTypeName.ARRAY;
}
return super.visit(expr);
}
}
}
}
Loading