@@ -43,6 +43,7 @@ namespace compute {
43
43
namespace internal {
44
44
45
45
namespace {
46
+
46
47
// ----------------------------------------------------------------------
47
48
// re2 utilities
48
49
@@ -2201,7 +2202,9 @@ struct BaseExtractRegexData {
2201
2202
}
2202
2203
return Status::OK ();
2203
2204
}
2205
+
2204
2206
// Returns the size of `group_names` converted to a signed 64-bit count.
int64_t num_groups() const {
  const auto group_total = group_names.size();
  return static_cast<int64_t>(group_total);
}
2207
+
2205
2208
std::unique_ptr<RE2> regex;
2206
2209
std::vector<std::string> group_names;
2207
2210
@@ -2297,14 +2300,15 @@ struct ExtractRegex : public ExtractRegexBase {
2297
2300
std::shared_ptr<DataType> type = out->array_data ()->type ;
2298
2301
ARROW_ASSIGN_OR_RAISE (std::unique_ptr<ArrayBuilder> array_builder,
2299
2302
MakeBuilder (type, ctx->memory_pool ()));
2300
- auto struct_builder = checked_pointer_cast <StructBuilder>( std::move ( array_builder));
2301
- ARROW_RETURN_NOT_OK (struct_builder->Reserve (batch[0 ].array . length ));
2303
+ StructBuilder* struct_builder = checked_cast <StructBuilder*>( array_builder. get ( ));
2304
+ ARROW_RETURN_NOT_OK (struct_builder->Reserve (batch[0 ].length () ));
2302
2305
2303
2306
std::vector<BuilderType*> field_builders;
2304
2307
field_builders.reserve (group_count);
2305
2308
for (int i = 0 ; i < group_count; i++) {
2306
2309
field_builders.push_back (
2307
2310
checked_cast<BuilderType*>(struct_builder->field_builder (i)));
2311
+ RETURN_NOT_OK (field_builders.back ()->Reserve (batch[0 ].length ()));
2308
2312
}
2309
2313
2310
2314
auto visit_null = [&]() { return struct_builder->AppendNull (); };
@@ -2353,6 +2357,7 @@ void AddAsciiStringExtractRegex(FunctionRegistry* registry) {
2353
2357
}
2354
2358
DCHECK_OK (registry->AddFunction (std::move (func)));
2355
2359
}
2360
+
2356
2361
struct ExtractRegexSpanData : public BaseExtractRegexData {
2357
2362
static Result<ExtractRegexSpanData> Make (const std::string& pattern,
2358
2363
bool is_utf8 = true ) {
@@ -2367,12 +2372,11 @@ struct ExtractRegexSpanData : public BaseExtractRegexData {
2367
2372
return nullptr ;
2368
2373
}
2369
2374
DCHECK (is_base_binary_like (input_type->id ()));
2370
- const size_t field_count = num_groups ();
2371
2375
FieldVector fields;
2372
- fields.reserve (field_count );
2376
+ fields.reserve (num_groups () );
2373
2377
auto index_type = is_binary_like (input_type->id ()) ? int32 () : int64 ();
2374
2378
for (const auto & group_name : group_names) {
2375
- // size list is 2 as every span contains position and length
2379
+ // list size is 2 as every span contains position and length
2376
2380
fields.push_back (field (group_name, fixed_size_list (index_type, 2 )));
2377
2381
}
2378
2382
return struct_ (std::move (fields));
@@ -2401,12 +2405,14 @@ struct ExtractRegexSpan : ExtractRegexBase {
2401
2405
ExtractRegexSpanData::Make (options.pattern , Type::is_utf8));
2402
2406
return ExtractRegexSpan{data}.Extract (ctx, batch, out);
2403
2407
}
2408
+
2404
2409
Status Extract (KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
2405
2410
DCHECK_NE (out->array_data (), nullptr );
2406
2411
std::shared_ptr<DataType> out_type = out->array_data ()->type ;
2407
2412
ARROW_ASSIGN_OR_RAISE (auto out_builder, MakeBuilder (out_type, ctx->memory_pool ()));
2408
- auto struct_builder = checked_pointer_cast <StructBuilder>( std::move ( out_builder));
2413
+ StructBuilder* struct_builder = checked_cast <StructBuilder*>( out_builder. get ( ));
2409
2414
ARROW_RETURN_NOT_OK (struct_builder->Reserve (batch[0 ].array .length ));
2415
+
2410
2416
std::vector<FixedSizeListBuilder*> span_builders;
2411
2417
std::vector<OffsetBuilderType*> array_builders;
2412
2418
span_builders.reserve (group_count);
@@ -2416,8 +2422,8 @@ struct ExtractRegexSpan : ExtractRegexBase {
2416
2422
checked_cast<FixedSizeListBuilder*>(struct_builder->field_builder (i)));
2417
2423
array_builders.push_back (
2418
2424
checked_cast<OffsetBuilderType*>(span_builders.back ()->value_builder ()));
2419
- RETURN_NOT_OK (span_builders.back ()->Reserve (batch[0 ].array . length ));
2420
- RETURN_NOT_OK (array_builders.back ()->Reserve (2 * batch[0 ].array . length ));
2425
+ RETURN_NOT_OK (span_builders.back ()->Reserve (batch[0 ].length () ));
2426
+ RETURN_NOT_OK (array_builders.back ()->Reserve (2 * batch[0 ].length () ));
2421
2427
}
2422
2428
2423
2429
auto visit_null = [&]() { return struct_builder->AppendNull (); };
@@ -2451,15 +2457,20 @@ struct ExtractRegexSpan : ExtractRegexBase {
2451
2457
};
2452
2458
2453
2459
const FunctionDoc extract_regex_span_doc (
2454
- " Extract substrings captured by a regex pattern and Save the result in the form of "
2455
- " (offset,length)" ,
2456
- " For each string in strings, match the regular expression and, if\n "
2457
- " successful, emit a struct with field names and values coming from the\n "
2458
- " regular expression's named capture groups, which are stored in a form of a\n "
2459
- " fixed_size_list(offset, length). If the input is null or the regular \n "
2460
- " expression Fails matching, a null output value is emitted.\n "
2461
- " Regular expression matching is done using the Google RE2 library." ,
2462
- {" strings" }, " ExtractRegexSpanOptions" , true );
2460
+ " Extract string spans captured by a regex pattern" ,
2461
+ (" For each string in strings, match the regular expression and, if\n "
2462
+ " successful, emit a struct with field names and values coming from the\n "
2463
+ " regular expression's named capture groups. Each struct field value\n "
2464
+ " will be a fixed_size_list(offset_type, 2) where offset_type is int32\n "
2465
+ " or int64, depending on the input string type. The two elements in\n "
2466
+ " each fixed-size list are the index and the length of the substring\n "
2467
+ " matched by the corresponding named capture group.\n "
2468
+ " \n "
2469
+ " If the input is null or the regular expression fails matching,\n "
2470
+ " a null output value is emitted.\n "
2471
+ " \n "
2472
+ " Regular expression matching is done using the Google RE2 library." ),
2473
+ {" strings" }, " ExtractRegexSpanOptions" , /* options_required=*/ true );
2463
2474
2464
2475
Result<TypeHolder> ResolveExtractRegexSpanOutputType (
2465
2476
KernelContext* ctx, const std::vector<TypeHolder>& types) {
0 commit comments