Skip to content

Handling Chars as Strings implicitly #999

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions core/api/core.api
Original file line number Diff line number Diff line change
@@ -6501,8 +6501,12 @@ public final class org/jetbrains/kotlinx/dataframe/api/ParseKt {
public static synthetic fun parse$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lorg/jetbrains/kotlinx/dataframe/columns/ColumnReference;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun parseAnyFrameNullable (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun parseAnyFrameNullable$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun parseChar (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun parseChar$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun tryParse (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun tryParse$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun tryParseChar (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun tryParseChar$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
}

public final class org/jetbrains/kotlinx/dataframe/api/ParserOptions {
Original file line number Diff line number Diff line change
@@ -52,7 +52,7 @@ public class ConverterScope(public val fromType: KType, public val toSchema: Col
* df.convertTo<SomeSchema> {
* // defines how to convert Int? -> String
* convert<Int?>().with { it?.toString() ?: "No input given" }
* // defines how to convert String -> SomeType
* // defines how to convert String -> SomeType (and Char.toString() -> SomeType)
* parser { SomeType(it) }
* // fill missing column `sum` with expression `a+b`
* fill { sum }.with { a + b }
@@ -102,6 +102,10 @@ public fun <T, C> ConvertToFill<T, C>.with(expr: RowExpression<T, C>) {

/**
 * Registers [parser] as the conversion from `String` values to the target type [C].
 *
 * Equivalent to writing `convert<String>().with { }`.
 *
 * When no converter for `Char` values has been defined, this converter is used for them as well.
 */
public inline fun <reified C> ConvertSchemaDsl<*>.parser(noinline parser: (String) -> C): Unit {
    convert<String>().with(parser)
}
40 changes: 40 additions & 0 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@ import java.time.format.DateTimeFormatter
import java.util.Locale
import kotlin.reflect.KProperty
import kotlin.reflect.KType
import kotlin.reflect.typeOf

/**
* ### Global Parser Options
@@ -197,6 +198,28 @@ public class ParserOptions(
/** @include [tryParseImpl] */
public fun DataColumn<String?>.tryParse(options: ParserOptions? = null): DataColumn<*> {
    // Public entry point; all work is delegated to tryParseImpl.
    return tryParseImpl(options)
}

/**
 * Attempts to parse a column of chars into a column of another type.
 *
 * The chars are first turned into strings, after which each parser in [Parsers] is tried in order
 * until one manages to parse every value in the column. As soon as a parser fails on any value,
 * the next one is tried. If all the others fail, the final parser returns strings.
 *
 * The `Char` parser is always skipped, since the goal is to parse away from `Char`.
 * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped too.
 *
 * @param options options for parsing, like providing a locale or a custom date-time formatter
 * @throws IllegalStateException if no valid parser is found (unlikely, unless the `String` parser is disabled)
 * @return a new column with parsed values
 */
@JvmName("tryParseChar")
public fun DataColumn<Char?>.tryParse(options: ParserOptions? = null): DataColumn<*> {
    // Start from the caller's skip types (or the global ones) and add Char on top,
    // so the values are never parsed straight back into Char.
    val skipTypesWithChar = (options?.skipTypes ?: DataFrame.parser.skipTypes) + typeOf<Char>()
    val charlessOptions = (options ?: ParserOptions()).copy(skipTypes = skipTypesWithChar)

    val asStrings = map { it?.toString() }
    return asStrings.tryParse(charlessOptions)
}

public fun <T> DataFrame<T>.parse(options: ParserOptions? = null): DataFrame<T> =
parse(options) {
colsAtAnyDepth { !it.isColumnGroup() }
@@ -220,6 +243,23 @@ public fun <T> DataFrame<T>.parse(options: ParserOptions? = null): DataFrame<T>
public fun DataColumn<String?>.parse(options: ParserOptions? = null): DataColumn<*> {
    val parsed = tryParse(options)
    // A result that is still a String column means no parser could narrow the type.
    if (parsed.typeClass == String::class) error("Can't guess column type")
    return parsed
}

/**
 * Parses a column of chars, treated as strings, into a column of another type.
 *
 * Each parser in [Parsers] is tried in order until one manages to parse every value in the column.
 * As soon as a parser fails on any value, the next one is tried.
 *
 * If all of them fail, the column is returned as `String`. This can never fail.
 *
 * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped.
 *
 * @param options options for parsing, like providing a locale or a custom date-time formatter
 * @return a new column with parsed values
 */
@JvmName("parseChar")
public fun DataColumn<Char?>.parse(options: ParserOptions? = null): DataColumn<*> {
    // no need to throw an exception, as Char can always be parsed as String
    return tryParse(options)
}

@JvmName("parseAnyFrameNullable")
public fun DataColumn<AnyFrame?>.parse(options: ParserOptions? = null): DataColumn<AnyFrame?> {
    // Parse every non-null frame in the column; null entries stay null.
    return map { frame -> frame?.parse(options) }
}
Original file line number Diff line number Diff line change
@@ -367,7 +367,13 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n

Char::class -> when (toClass) {
Int::class -> convert<Char> { it.code }
else -> null

else -> // convert char to string and then to target type
getConverter(typeOf<String>(), to, options)?.let { stringConverter ->
convert<Char> {
stringConverter(it.toString())
}
}
}

Int::class -> when (toClass) {
Original file line number Diff line number Diff line change
@@ -45,8 +45,10 @@ import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
import org.jetbrains.kotlinx.dataframe.size
import kotlin.reflect.KType
import kotlin.reflect.full.isSubtypeOf
import kotlin.reflect.full.withNullability
import kotlin.reflect.jvm.jvmErasure
import kotlin.reflect.typeOf

private val logger = KotlinLogging.logger {}

@@ -143,7 +145,22 @@ internal fun AnyFrame.convertToImpl(
// try to perform any user-specified conversions first
val from = originalColumn.type()
val to = targetSchema.type
val converter = dsl.getConverter(from, targetSchema)
var converter = dsl.getConverter(from, targetSchema)

// special case for Char columns; check if we have any converters for String -> target
// if so, we can convert Char -> String -> target
if (converter == null && from.isSubtypeOf(typeOf<Char?>())) {
val stringConverter = dsl.getConverter(
fromType = typeOf<String>().withNullability(from.isMarkedNullable),
toSchema = targetSchema,
)
if (stringConverter != null) {
converter = Converter(
transform = { stringConverter.transform(this, (it as Char?)?.toString()) },
skipNulls = stringConverter.skipNulls,
)
}
}

val convertedColumn = if (converter != null) {
val nullsAllowed = to.isMarkedNullable
Original file line number Diff line number Diff line change
@@ -564,29 +564,28 @@ internal fun <T> DataFrame<T>.parseImpl(options: ParserOptions?, columns: Column
when {
// when a frame column is requested to be parsed,
// parse each value/frame column at any depth inside each DataFrame in the frame column
col.isFrameColumn() -> {
col.isFrameColumn() ->
col.map {
it.parseImpl(options) {
colsAtAnyDepth { !it.isColumnGroup() }
}
}
}

// when a column group is requested to be parsed,
// parse each column in the group
col.isColumnGroup() -> {
col.isColumnGroup() ->
col.parseImpl(options) { all() }
.asColumnGroup(col.name())
.asDataColumn()
}

// Base case, parse the column as String if it's a `Char?` column
col.isSubtypeOf<Char?>() ->
col.cast<Char?>().map { it?.toString() }.tryParseImpl(options)

// Base case, parse the column if it's a `String?` column
col.isSubtypeOf<String?>() -> {
col.isSubtypeOf<String?>() ->
col.cast<String?>().tryParseImpl(options)
}

else -> {
col
}
else -> col
}
}
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.api
import io.kotest.assertions.throwables.shouldNotThrow
import io.kotest.assertions.throwables.shouldThrow
import io.kotest.matchers.shouldBe
import io.kotest.matchers.shouldNotBe
import kotlinx.datetime.Clock
import kotlinx.datetime.Instant
import kotlinx.datetime.LocalTime
@@ -69,6 +70,20 @@ class ConvertTests {
@Test
fun `convert string to enum`() {
    val expected = columnOf(EnumClass.A, EnumClass.B)

    // column-level conversion
    columnOf("A", "B").convertTo<EnumClass>() shouldBe expected

    // the same conversion through the DataFrame convert DSL
    val converted = dataFrameOf(columnOf("A", "B") named "colA")
        .convert("colA").to<EnumClass>()
        .getColumn("colA")
    converted shouldBe expected.named("colA")
}

@Test
fun `convert char to enum`() {
    val expected = columnOf(EnumClass.A, EnumClass.B)

    // Char -> String -> Enum, at column level
    columnOf('A', 'B').convertTo<EnumClass>() shouldBe expected

    // the same conversion through the DataFrame convert DSL
    val converted = dataFrameOf(columnOf('A', 'B') named "colA")
        .convert("colA").to<EnumClass>()
        .getColumn("colA")
    converted shouldBe expected.named("colA")
}

@JvmInline
@@ -199,6 +214,15 @@ class ConvertTests {
val col = columnOf(65, 66)
col.convertTo<Char>() shouldBe columnOf('A', 'B')
col.convertTo<Char>().convertTo<Int>() shouldBe col

// this means Char -> Int yields the character code, not the digit the char represents:
columnOf('1', '2').convertToInt() shouldNotBe columnOf(1, 2)
columnOf('1', '2').convertToInt() shouldBe columnOf(49, 50)

// but converting to String first yields the digit values:
columnOf('1', '2').convertToString().convertToInt() shouldBe columnOf(1, 2)
// or, equivalently, parse the Char column directly:
columnOf('1', '2').parse() shouldBe columnOf(1, 2)
}

@Test
Original file line number Diff line number Diff line change
@@ -51,6 +51,38 @@ class ConvertToTests {
df.convertTo<Schema> { parser { A(it.toInt()) } }
.single()
.a.value shouldBe 1

// shortcut for:
df.convertTo<Schema> { convert<String>().with { A(it.toInt()) } }
.single()
.a.value shouldBe 1
}

// Checks that a Char column can be converted to a schema type through a String converter,
// and that an explicitly registered Char converter is used instead when one is present.
// NOTE(review): `Schema` and `A` are declared outside this view — presumably `Schema.a` has type `A`; confirm.
@Test
fun `convert from char with parser`() {
// frame whose column "a" holds the Char '1'
val df = dataFrameOf("a")('1')

// with no converter registered at all, the conversion is expected to fail
shouldThrow<TypeConverterNotFoundException> {
df.convertTo<Schema>()
}

// Char -> String -> Target
df.convertTo<Schema> { parser { A(it.toInt()) } }
.single()
.a.value shouldBe 1

// shortcut for:
df.convertTo<Schema> { convert<String>().with { A(it.toInt()) } }
.single()
.a.value shouldBe 1

// Char -> Target
// Both String converters below throw if invoked, so this passes only when the Char converter is chosen.
df.convertTo<Schema> {
parser<A> { error("should not be triggered if convert<Char>() is present") }
convert<String>().with<_, A> { error("should not be triggered if convert<Char>() is present") }

convert<Char>().with { A(it.digitToInt()) }
}.single().a.value shouldBe 1
}

@Test
16 changes: 16 additions & 0 deletions core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt
Original file line number Diff line number Diff line change
@@ -32,6 +32,22 @@ import java.time.Duration as JavaDuration
import java.time.Instant as JavaInstant

class ParseTests {

@Test
fun `parse chars to string`() {
    val chars = columnOf('a', 'b', 'c')
    val strings = columnOf("a", "b", "c")

    // letters cannot be parsed to anything more specific, so both variants end up as String
    chars.parse() shouldBe strings
    chars.tryParse() shouldBe strings

    // parsing the resulting strings again brings back the original Char column
    val roundTripped = chars.parse().cast<String>().parse()
    roundTripped shouldBe chars
}

@Test
fun `parse chars to int`() {
    val digits = columnOf('1', '2', '3')
    val ints = columnOf(1, 2, 3)

    // digit chars are parsed through their string form into Int
    digits.parse() shouldBe ints
    digits.tryParse() shouldBe ints
}

@Test
fun parseDate() {
val currentLocale = Locale.getDefault()
Original file line number Diff line number Diff line change
@@ -46,6 +46,12 @@ class ParserTests {
DataFrame.parser.resetToDefault()
}

@Test
fun `parse to Char`() {
    val col by columnOf("a", "b")

    // single-character strings are expected to be parsed into a Char column
    val parsedType = col.parse().type()
    parsedType shouldBe typeOf<Char>()
}

@Test(expected = IllegalStateException::class)
fun `parse should throw`() {
val col by columnOf("a", "bc")