From f86a6ccea2da632fda2bf7f2563e42bb8c72b86a Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 11 Dec 2024 13:58:59 +0100 Subject: [PATCH 1/3] implicit conversion from Char to String in DataColumn.convertTo and DataFrame.convert() --- .../kotlinx/dataframe/impl/api/convert.kt | 8 ++++++- .../kotlinx/dataframe/api/convert.kt | 22 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index f7cdacd630..8b97babaf2 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -367,7 +367,13 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n Char::class -> when (toClass) { Int::class -> convert { it.code } - else -> null + + else -> // convert char to string and then to target type + getConverter(typeOf(), to, options)?.let { stringConverter -> + convert { + stringConverter(it.toString()) + } + } } Int::class -> when (toClass) { diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index 47c49736db..55d4a802ea 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.api import io.kotest.assertions.throwables.shouldNotThrow import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.shouldBe +import io.kotest.matchers.shouldNotBe import kotlinx.datetime.Clock import kotlinx.datetime.Instant import kotlinx.datetime.LocalTime @@ -69,6 +70,20 @@ class ConvertTests { @Test fun `convert string to enum`() { columnOf("A", "B").convertTo() shouldBe columnOf(EnumClass.A, EnumClass.B) + + dataFrameOf(columnOf("A", "B") named "colA") + .convert("colA").to() + .getColumn("colA") shouldBe columnOf(EnumClass.A, EnumClass.B).named("colA") + } + + @Test + fun `convert char to enum`() { + // Char -> String -> Enum + columnOf('A', 'B').convertTo() shouldBe columnOf(EnumClass.A, EnumClass.B) + + dataFrameOf(columnOf('A', 'B') named "colA") + .convert("colA").to() + .getColumn("colA") shouldBe columnOf(EnumClass.A, EnumClass.B).named("colA") } @JvmInline @@ -199,6 +214,13 @@ class ConvertTests { val col = columnOf(65, 66) col.convertTo() shouldBe columnOf('A', 'B') col.convertTo().convertTo() shouldBe col + + // this means + columnOf('1', '2').convertToInt() shouldNotBe columnOf(1, 2) + columnOf('1', '2').convertToInt() shouldBe columnOf(49, 50) + + // but + columnOf('1', '2').convertToString().convertToInt() shouldBe columnOf(1, 2) } @Test From 5c54f58224497ecb6200f3796152b21419a766e0 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 11 Dec 2024 13:59:46 +0100 Subject: [PATCH 2/3] implicit conversion from Char to String in DataFrame.convertTo { parser { ... } } --- .../kotlinx/dataframe/api/convertTo.kt | 6 +++- .../kotlinx/dataframe/impl/api/convertTo.kt | 19 ++++++++++- .../kotlinx/dataframe/api/convertTo.kt | 32 +++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt index 70f2954940..4e7fd9aa08 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt @@ -52,7 +52,7 @@ public class ConverterScope(public val fromType: KType, public val toSchema: Col * df.convertTo { * // defines how to convert Int? -> String * convert().with { it?.toString() ?: "No input given" } - * // defines how to convert String -> SomeType + * // defines how to convert String -> SomeType (and Char.toString() -> SomeType) * parser { SomeType(it) } * // fill missing column `sum` with expression `a+b` * fill { sum }.with { a + b } @@ -102,6 +102,10 @@ public fun ConvertToFill.with(expr: RowExpression) { /** * Defines how to convert `String` values into given type [C]. + * + * This method is a shortcut for `convert().with { }`. + * + * If no converter is defined for `Char` values, this converter will be used for them as well. */ public inline fun ConvertSchemaDsl<*>.parser(noinline parser: (String) -> C): Unit = convert().with(parser) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt index 48c2864df8..452abe96e1 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt @@ -45,8 +45,10 @@ import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import org.jetbrains.kotlinx.dataframe.size import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.full.withNullability import kotlin.reflect.jvm.jvmErasure +import kotlin.reflect.typeOf private val logger = KotlinLogging.logger {} @@ -143,7 +145,22 @@ internal fun AnyFrame.convertToImpl( // try to perform any user-specified conversions first val from = originalColumn.type() val to = targetSchema.type - val converter = dsl.getConverter(from, targetSchema) + var converter = dsl.getConverter(from, targetSchema) + + // special case for Char columns; check if we have any converters for String -> target + // if so, we can convert Char -> String -> target + if (converter == null && from.isSubtypeOf(typeOf())) { + val stringConverter = dsl.getConverter( + fromType = typeOf().withNullability(from.isMarkedNullable), + toSchema = targetSchema, + ) + if (stringConverter != null) { + converter = Converter( + transform = { stringConverter.transform(this, (it as Char?)?.toString()) }, + skipNulls = stringConverter.skipNulls, + ) + } + } val convertedColumn = if (converter != null) { val nullsAllowed = to.isMarkedNullable diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt index 176ab06975..56e5ff36a2 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convertTo.kt @@ -51,6 +51,38 @@ class ConvertToTests { df.convertTo { parser { A(it.toInt()) } } .single() .a.value shouldBe 1 + + // shortcut for: + df.convertTo { convert().with { A(it.toInt()) } } + .single() + .a.value shouldBe 1 + } + + @Test + fun `convert from char with parser`() { + val df = dataFrameOf("a")('1') + + shouldThrow { + df.convertTo() + } + + // Char -> String -> Target + df.convertTo { parser { A(it.toInt()) } } + .single() + .a.value shouldBe 1 + + // shortcut for: + df.convertTo { convert().with { A(it.toInt()) } } + .single() + .a.value shouldBe 1 + + // Char -> Target + df.convertTo { + parser { error("should not be triggered if convert() is present") } + convert().with<_, A> { error("should not be triggered if convert() is present") } + + convert().with { A(it.digitToInt()) } + }.single().a.value shouldBe 1 } @Test From eee84c26aa2839a4a045959e4162580e0c83fe4e Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 11 Dec 2024 14:40:27 +0100 Subject: [PATCH 3/3] introducing parsing of Char? columns. It works the same as String parsing, but can never result in Char and can never fail (since it can parse to String) --- core/api/core.api | 4 ++ .../jetbrains/kotlinx/dataframe/api/parse.kt | 40 +++++++++++++++++++ .../kotlinx/dataframe/impl/api/parse.kt | 17 ++++---- .../kotlinx/dataframe/api/convert.kt | 2 + .../jetbrains/kotlinx/dataframe/api/parse.kt | 16 ++++++++ .../kotlinx/dataframe/io/ParserTests.kt | 6 +++ 6 files changed, 76 insertions(+), 9 deletions(-) diff --git a/core/api/core.api b/core/api/core.api index c5fbefa9b8..02e1a19a52 100644 --- a/core/api/core.api +++ b/core/api/core.api @@ -6501,8 +6501,12 @@ public final class org/jetbrains/kotlinx/dataframe/api/ParseKt { public static synthetic fun parse$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lorg/jetbrains/kotlinx/dataframe/columns/ColumnReference;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static final fun parseAnyFrameNullable (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; public static synthetic fun parseAnyFrameNullable$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; + public static final fun parseChar (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; + public static synthetic fun parseChar$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; public static final fun tryParse (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; public static synthetic fun tryParse$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; + public static final fun tryParseChar (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; + public static synthetic fun tryParseChar$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn; } public final class org/jetbrains/kotlinx/dataframe/api/ParserOptions { diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index 36c76537ec..40f5078f7a 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -18,6 +18,7 @@ import java.time.format.DateTimeFormatter import java.util.Locale import kotlin.reflect.KProperty import kotlin.reflect.KType +import kotlin.reflect.typeOf /** * ### Global Parser Options @@ -197,6 +198,28 @@ public class ParserOptions( /** @include [tryParseImpl] */ public fun DataColumn.tryParse(options: ParserOptions? = null): DataColumn<*> = tryParseImpl(options) +/** + * Tries to parse a column of chars into a column of a different type. + * Each parser in [Parsers] is run in order until a valid parser is found, + * a.k.a. that parser was able to parse all values in the column successfully. If a parser + * fails to parse any value, the next parser is tried. If all the others fail, the final parser + * returns strings. + * + * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped. + * + * @param options options for parsing, like providing a locale or a custom date-time formatter + * @throws IllegalStateException if no valid parser is found (unlikely, unless the `String` parser is disabled) + * @return a new column with parsed values + */ +@JvmName("tryParseChar") +public fun DataColumn.tryParse(options: ParserOptions? = null): DataColumn<*> { + // skip the Char parser, as we're trying to parse away from Char + val providedSkipTypes = options?.skipTypes ?: DataFrame.parser.skipTypes + val parserOptions = (options ?: ParserOptions()).copy(skipTypes = providedSkipTypes + typeOf()) + + return map { it?.toString() }.tryParse(parserOptions) +} + public fun DataFrame.parse(options: ParserOptions? = null): DataFrame = parse(options) { colsAtAnyDepth { !it.isColumnGroup() } @@ -220,6 +243,23 @@ public fun DataFrame.parse(options: ParserOptions? = null): DataFrame public fun DataColumn.parse(options: ParserOptions? = null): DataColumn<*> = tryParse(options).also { if (it.typeClass == String::class) error("Can't guess column type") } +/** + * Tries to parse a column of chars as strings into a column of a different type. + * Each parser in [Parsers] is run in order until a valid parser is found, + * a.k.a. that parser was able to parse all values in the column successfully. If a parser + * fails to parse any value, the next parser is tried. + * + * If all fail, the column is returned as `String`, this can never fail. + * + * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped. + * + * @param options options for parsing, like providing a locale or a custom date-time formatter + * @return a new column with parsed values + */ +@JvmName("parseChar") +public fun DataColumn.parse(options: ParserOptions? = null): DataColumn<*> = + tryParse(options) // no need to throw an exception, as Char can always be parsed as String + @JvmName("parseAnyFrameNullable") public fun DataColumn.parse(options: ParserOptions? = null): DataColumn = map { it?.parse(options) } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 239c22d5c4..af66d0db2a 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -564,29 +564,28 @@ internal fun DataFrame.parseImpl(options: ParserOptions?, columns: Column when { // when a frame column is requested to be parsed, // parse each value/frame column at any depth inside each DataFrame in the frame column - col.isFrameColumn() -> { + col.isFrameColumn() -> col.map { it.parseImpl(options) { colsAtAnyDepth { !it.isColumnGroup() } } } - } // when a column group is requested to be parsed, // parse each column in the group - col.isColumnGroup() -> { + col.isColumnGroup() -> col.parseImpl(options) { all() } .asColumnGroup(col.name()) .asDataColumn() - } + + // Base case, parse the column as String if it's a `Char?` column + col.isSubtypeOf() -> + col.cast().map { it?.toString() }.tryParseImpl(options) // Base case, parse the column if it's a `String?` column - col.isSubtypeOf() -> { + col.isSubtypeOf() -> col.cast().tryParseImpl(options) - } - else -> { - col - } + else -> col } } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index 55d4a802ea..b8ca8cca1a 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -221,6 +221,8 @@ class ConvertTests { // but columnOf('1', '2').convertToString().convertToInt() shouldBe columnOf(1, 2) + // or + columnOf('1', '2').parse() shouldBe columnOf(1, 2) } @Test diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index 081e28d078..9a1fd2e63d 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -32,6 +32,22 @@ import java.time.Duration as JavaDuration import java.time.Instant as JavaInstant class ParseTests { + + @Test + fun `parse chars to string`() { + val char = columnOf('a', 'b', 'c') + char.parse() shouldBe columnOf("a", "b", "c") + char.tryParse() shouldBe columnOf("a", "b", "c") + char.parse().cast().parse() shouldBe char + } + + @Test + fun `parse chars to int`() { + val char = columnOf('1', '2', '3') + char.parse() shouldBe columnOf(1, 2, 3) + char.tryParse() shouldBe columnOf(1, 2, 3) + } + @Test fun parseDate() { val currentLocale = Locale.getDefault() diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 64d2ced7b1..93b6a97ecc 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -46,6 +46,12 @@ class ParserTests { DataFrame.parser.resetToDefault() } + @Test + fun `parse to Char`() { + val col by columnOf("a", "b") + col.parse().type() shouldBe typeOf() + } + @Test(expected = IllegalStateException::class) fun `parse should throw`() { val col by columnOf("a", "bc")