@@ -43,7 +43,6 @@ namespace compute {
43
43
namespace internal {
44
44
45
45
namespace {
46
-
47
46
// ----------------------------------------------------------------------
48
47
// re2 utilities
49
48
@@ -2185,9 +2184,34 @@ void AddAsciiStringReplaceSubstring(FunctionRegistry* registry) {
2185
2184
2186
2185
using ExtractRegexState = OptionsWrapper<ExtractRegexOptions>;
2187
2186
2187
+ struct BaseExtractRegexData {
2188
+ Status Init () {
2189
+ RETURN_NOT_OK (RegexStatus (*regex));
2190
+ const int group_count = regex->NumberOfCapturingGroups ();
2191
+ const auto & name_map = regex->CapturingGroupNames ();
2192
+ group_names.reserve (group_count);
2193
+
2194
+ for (int i = 0 ; i < group_count; i++) {
2195
+ auto item = name_map.find (i + 1 ); // re2 starts counting from 1
2196
+ if (item == name_map.end ()) {
2197
+ // XXX should we instead just create fields with an empty name?
2198
+ return Status::Invalid (" Regular expression contains unnamed groups" );
2199
+ }
2200
+ group_names.emplace_back (item->second );
2201
+ }
2202
+ return Status::OK ();
2203
+ }
2204
+ int64_t num_groups () const { return static_cast <int64_t >(group_names.size ()); }
2205
+ std::unique_ptr<RE2> regex;
2206
+ std::vector<std::string> group_names;
2207
+
2208
+ protected:
2209
+ explicit BaseExtractRegexData (const std::string& pattern, bool is_utf8 = true )
2210
+ : regex(new RE2(pattern, MakeRE2Options(is_utf8))) {}
2211
+ };
2212
+
2188
2213
// TODO cache this once per ExtractRegexOptions
2189
- class ExtractRegexData {
2190
- public:
2214
+ struct ExtractRegexData : public BaseExtractRegexData {
2191
2215
static Result<ExtractRegexData> Make (const ExtractRegexOptions& options,
2192
2216
bool is_utf8 = true ) {
2193
2217
ExtractRegexData data (options.pattern , is_utf8);
@@ -2197,50 +2221,24 @@ class ExtractRegexData {
2197
2221
2198
2222
/// Build the output struct type: one field per capture group name, each field
/// having the same type as the input (types[0]).
/// Returns a null type holder when no input type is specified.
Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) const {
  const DataType* input_type = types[0].type;
  if (input_type == nullptr) {
    // No input type specified
    return nullptr;
  }
  // Input type is either [Large]Binary or [Large]String and is also the type
  // of each field in the output struct type.
  DCHECK(is_base_binary_like(input_type->id()));
  std::shared_ptr<DataType> value_type = input_type->GetSharedPtr();
  FieldVector fields;
  fields.reserve(num_groups());
  for (const std::string& name : group_names) {
    fields.push_back(field(name, value_type));
  }
  return struct_(std::move(fields));
}
2217
- int64_t num_group () const { return group_names_.size (); }
2218
- std::shared_ptr<RE2> regex () const { return regex_; }
2219
2238
2220
- protected :
2239
+ private :
2221
2240
explicit ExtractRegexData (const std::string& pattern, bool is_utf8 = true )
2222
- : regex_(new RE2(pattern, MakeRE2Options(is_utf8))) {}
2223
-
2224
- Status Init () {
2225
- RETURN_NOT_OK (RegexStatus (*regex_));
2226
-
2227
- const int group_count = regex_->NumberOfCapturingGroups ();
2228
- const auto & name_map = regex_->CapturingGroupNames ();
2229
- group_names_.reserve (group_count);
2230
-
2231
- for (int i = 0 ; i < group_count; i++) {
2232
- auto item = name_map.find (i + 1 ); // re2 starts counting from 1
2233
- if (item == name_map.end ()) {
2234
- // XXX should we instead just create fields with an empty name?
2235
- return Status::Invalid (" Regular expression contains unnamed groups" );
2236
- }
2237
- group_names_.emplace_back (item->second );
2238
- }
2239
- return Status::OK ();
2240
- }
2241
-
2242
- std::shared_ptr<RE2> regex_;
2243
- std::vector<std::string> group_names_;
2241
+ : BaseExtractRegexData(pattern, is_utf8) {}
2244
2242
};
2245
2243
2246
2244
Result<TypeHolder> ResolveExtractRegexOutput (KernelContext* ctx,
@@ -2251,17 +2249,17 @@ Result<TypeHolder> ResolveExtractRegexOutput(KernelContext* ctx,
2251
2249
}
2252
2250
2253
2251
struct ExtractRegexBase {
2254
- const ExtractRegexData & data;
2252
+ const BaseExtractRegexData & data;
2255
2253
const int group_count;
2256
2254
std::vector<re2::StringPiece> found_values;
2257
2255
std::vector<RE2::Arg> args;
2258
2256
std::vector<const RE2::Arg*> args_pointers;
2259
2257
const RE2::Arg** args_pointers_start;
2260
2258
const RE2::Arg* null_arg = nullptr ;
2261
2259
2262
- explicit ExtractRegexBase (const ExtractRegexData & data)
2260
+ explicit ExtractRegexBase (const BaseExtractRegexData & data)
2263
2261
: data(data),
2264
- group_count(static_cast <int >(data.num_group ())),
2262
+ group_count(static_cast <int >(data.num_groups ())),
2265
2263
found_values(group_count) {
2266
2264
args.reserve (group_count);
2267
2265
args_pointers.reserve (group_count);
@@ -2276,7 +2274,7 @@ struct ExtractRegexBase {
2276
2274
}
2277
2275
2278
2276
bool Match (std::string_view s) {
2279
- return RE2::PartialMatchN (ToStringPiece (s), *data.regex () , args_pointers_start,
2277
+ return RE2::PartialMatchN (ToStringPiece (s), *data.regex , args_pointers_start,
2280
2278
group_count);
2281
2279
}
2282
2280
};
@@ -2291,18 +2289,16 @@ struct ExtractRegex : public ExtractRegexBase {
2291
2289
/// Kernel entry point: build the regex data from the options obtained through
/// ExtractRegexState::Get(ctx), then run Extract() on the batch.
/// Returns the error from ExtractRegexData::Make if that fails.
static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
  // Bind by reference: the options (including the pattern string) are only read
  // here, so copying them on every call is unnecessary.
  const auto& options = ExtractRegexState::Get(ctx);
  ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options, Type::is_utf8));
  return ExtractRegex(data).Extract(ctx, batch, out);
}
2296
2294
2297
2295
Status Extract (KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
2298
- ExtractRegexOptions options = ExtractRegexState::Get (ctx);
2299
- DCHECK_NE (out->array_data (), NULLPTR);
2296
+ DCHECK_NE (out->array_data (), nullptr );
2300
2297
std::shared_ptr<DataType> type = out->array_data ()->type ;
2301
- DCHECK_NE (type, NULLPTR);
2302
-
2303
- std::unique_ptr<ArrayBuilder> array_builder;
2304
- RETURN_NOT_OK (MakeBuilder (ctx->memory_pool (), type, &array_builder));
2305
- StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get ());
2298
+ ARROW_ASSIGN_OR_RAISE (std::unique_ptr<ArrayBuilder> array_builder,
2299
+ MakeBuilder (type, ctx->memory_pool ()));
2300
+ auto struct_builder = checked_pointer_cast<StructBuilder>(std::move (array_builder));
2301
+ ARROW_RETURN_NOT_OK (struct_builder->Reserve (batch[0 ].array .length ));
2306
2302
2307
2303
std::vector<BuilderType*> field_builders;
2308
2304
field_builders.reserve (group_count);
@@ -2357,82 +2353,83 @@ void AddAsciiStringExtractRegex(FunctionRegistry* registry) {
2357
2353
}
2358
2354
DCHECK_OK (registry->AddFunction (std::move (func)));
2359
2355
}
2360
- class ExtractRegexSpanData : public ExtractRegexData {
2361
- public:
2362
- static Result<ExtractRegexSpanData> Make ( const std::string& pattern ) {
2363
- auto data = ExtractRegexSpanData (pattern, true );
2356
+ struct ExtractRegexSpanData : public BaseExtractRegexData {
2357
+ static Result<ExtractRegexSpanData> Make ( const std::string& pattern,
2358
+ bool is_utf8 = true ) {
2359
+ auto data = ExtractRegexSpanData (pattern, is_utf8 );
2364
2360
ARROW_RETURN_NOT_OK (data.Init ());
2365
2361
return data;
2366
2362
}
2367
2363
2368
2364
Result<TypeHolder> ResolveOutputType (const std::vector<TypeHolder>& types) const {
2369
2365
const DataType* input_type = types[0 ].type ;
2370
- if (input_type == NULLPTR ) {
2371
- return NULLPTR ;
2366
+ if (input_type == nullptr ) {
2367
+ return nullptr ;
2372
2368
}
2373
2369
DCHECK (is_base_binary_like (input_type->id ()));
2374
- const size_t field_count = group_names_. size ();
2370
+ const size_t field_count = num_groups ();
2375
2371
FieldVector fields;
2376
2372
fields.reserve (field_count);
2377
- const auto owned_type = input_type->GetSharedPtr ();
2378
- for (const auto & group_name : group_names_) {
2379
- auto type = is_binary_like (owned_type->id ()) ? int32 () : int64 ();
2373
+ auto index_type = is_binary_like (input_type->id ()) ? int32 () : int64 ();
2374
+ for (const auto & group_name : group_names) {
2380
2375
// size list is 2 as every span contains position and length
2381
- fields.push_back (field (group_name + " _span " , fixed_size_list (type , 2 )));
2376
+ fields.push_back (field (group_name, fixed_size_list (index_type , 2 )));
2382
2377
}
2383
- return struct_ (fields);
2378
+ return struct_ (std::move ( fields) );
2384
2379
}
2385
2380
2386
2381
private:
2387
2382
ExtractRegexSpanData (const std::string& pattern, const bool is_utf8)
2388
- : ExtractRegexData (pattern, is_utf8) {}
2383
+ : BaseExtractRegexData (pattern, is_utf8) {}
2389
2384
};
2390
2385
2391
2386
template <typename Type>
2392
2387
struct ExtractRegexSpan : ExtractRegexBase {
2393
2388
using ArrayType = typename TypeTraits<Type>::ArrayType;
2394
2389
using BuilderType = typename TypeTraits<Type>::BuilderType;
2390
+ using offset_type = typename Type::offset_type;
2391
+ using OffsetBuilderType =
2392
+ typename TypeTraits<typename CTypeTraits<offset_type>::ArrowType>::BuilderType;
2393
+ using OffsetCType =
2394
+ typename TypeTraits<typename CTypeTraits<offset_type>::ArrowType>::CType;
2395
+
2395
2396
using ExtractRegexBase::ExtractRegexBase;
2396
2397
2397
2398
/// Kernel entry point: build the span regex data from the
/// ExtractRegexSpanOptions obtained from `ctx`, then run Extract() on the batch.
/// Returns the error from ExtractRegexSpanData::Make if that fails.
static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
  // Bind by reference: only `pattern` is read, so copying the whole options
  // object for every batch is unnecessary.
  const auto& options = OptionsWrapper<ExtractRegexSpanOptions>::Get(ctx);
  ARROW_ASSIGN_OR_RAISE(auto data,
                        ExtractRegexSpanData::Make(options.pattern, Type::is_utf8));
  return ExtractRegexSpan{data}.Extract(ctx, batch, out);
}
2402
2404
Status Extract (KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
2403
- DCHECK_NE (out->array_data (), NULLPTR );
2405
+ DCHECK_NE (out->array_data (), nullptr );
2404
2406
std::shared_ptr<DataType> out_type = out->array_data ()->type ;
2405
- DCHECK_NE (out_type, NULLPTR);
2406
- std::unique_ptr<ArrayBuilder> out_builder;
2407
- ARROW_RETURN_NOT_OK (
2408
- MakeBuilder (ctx->memory_pool (), out->type ()->GetSharedPtr (), &out_builder));
2407
+ ARROW_ASSIGN_OR_RAISE (auto out_builder, MakeBuilder (out_type, ctx->memory_pool ()));
2409
2408
auto struct_builder = checked_pointer_cast<StructBuilder>(std::move (out_builder));
2409
+ ARROW_RETURN_NOT_OK (struct_builder->Reserve (batch[0 ].array .length ));
2410
2410
std::vector<FixedSizeListBuilder*> span_builders;
2411
- std::vector<ArrayBuilder *> array_builders;
2411
+ std::vector<OffsetBuilderType *> array_builders;
2412
2412
span_builders.reserve (group_count);
2413
2413
array_builders.reserve (group_count);
2414
2414
for (int i = 0 ; i < group_count; i++) {
2415
2415
span_builders.push_back (
2416
2416
checked_cast<FixedSizeListBuilder*>(struct_builder->field_builder (i)));
2417
- array_builders.push_back (span_builders[i]->value_builder ());
2417
+ array_builders.push_back (
2418
+ checked_cast<OffsetBuilderType*>(span_builders.back ()->value_builder ()));
2419
+ RETURN_NOT_OK (span_builders.back ()->Reserve (batch[0 ].array .length ));
2420
+ RETURN_NOT_OK (array_builders.back ()->Reserve (2 * batch[0 ].array .length ));
2418
2421
}
2422
+
2419
2423
auto visit_null = [&]() { return struct_builder->AppendNull (); };
2420
2424
auto visit_value = [&](std::string_view element) -> Status {
2421
2425
if (Match (element)) {
2422
2426
for (int i = 0 ; i < group_count; i++) {
2423
2427
// https://github.com/google/re2/issues/24#issuecomment-97653183
2424
- if (found_values[i].data () != NULLPTR ) {
2428
+ if (found_values[i].data () != nullptr ) {
2425
2429
int64_t begin = found_values[i].data () - element.data ();
2426
2430
int64_t size = found_values[i].size ();
2427
- if (is_binary_like (batch.GetTypes ()[0 ].id ())) {
2428
- ARROW_RETURN_NOT_OK (checked_cast<Int32Builder*>(array_builders[i])
2429
- ->AppendValues ({static_cast <int32_t >(begin),
2430
- static_cast <int32_t >(size)}));
2431
- } else {
2432
- ARROW_RETURN_NOT_OK (checked_cast<Int64Builder*>(array_builders[i])
2433
- ->AppendValues ({begin, size}));
2434
- }
2435
-
2431
+ array_builders[i]->UnsafeAppend (static_cast <OffsetCType>(begin));
2432
+ array_builders[i]->UnsafeAppend (static_cast <OffsetCType>(size));
2436
2433
ARROW_RETURN_NOT_OK (span_builders[i]->Append ());
2437
2434
} else {
2438
2435
ARROW_RETURN_NOT_OK (span_builders[i]->AppendNull ());
@@ -2448,25 +2445,33 @@ struct ExtractRegexSpan : ExtractRegexBase {
2448
2445
VisitArraySpanInline<Type>(batch[0 ].array , visit_value, visit_null));
2449
2446
2450
2447
ARROW_ASSIGN_OR_RAISE (auto out_array, struct_builder->Finish ());
2451
- out->value = out_array->data ();
2448
+ out->value = std::move ( out_array->data () );
2452
2449
return Status::OK ();
2453
2450
}
2454
2451
};
2455
2452
2456
- const FunctionDoc extract_regex_doc_span (
2457
- " likes extract_regex; however, it contains the position and length of results" , " " ,
2453
+ const FunctionDoc extract_regex_span_doc (
2454
+ " Extract substrings captured by a regex pattern and Save the result in the form of "
2455
+ " (offset,length)" ,
2456
+ " For each string in strings, match the regular expression and, if\n "
2457
+ " successful, emit a struct with field names and values coming from the\n "
2458
+ " regular expression's named capture groups, which are stored in a form of a\n "
2459
+ " fixed_size_list(offset, length). If the input is null or the regular \n "
2460
+ " expression Fails matching, a null output value is emitted.\n "
2461
+ " Regular expression matching is done using the Google RE2 library." ,
2458
2462
{" strings" }, " ExtractRegexSpanOptions" , true );
2459
2463
2460
- Result<TypeHolder> resolver (KernelContext* ctx, const std::vector<TypeHolder>& types) {
2464
+ Result<TypeHolder> ResolveExtractRegexSpanOutputType (
2465
+ KernelContext* ctx, const std::vector<TypeHolder>& types) {
2461
2466
auto options = OptionsWrapper<ExtractRegexSpanOptions>::Get (*ctx->state ());
2462
2467
ARROW_ASSIGN_OR_RAISE (auto span, ExtractRegexSpanData::Make (options.pattern ));
2463
2468
return span.ResolveOutputType (types);
2464
2469
}
2465
2470
2466
2471
void AddAsciiStringExtractRegexSpan (FunctionRegistry* registry) {
2467
2472
auto func = std::make_shared<ScalarFunction>(" extract_regex_span" , Arity::Unary (),
2468
- extract_regex_doc_span );
2469
- OutputType output_type (resolver );
2473
+ extract_regex_span_doc );
2474
+ OutputType output_type (ResolveExtractRegexSpanOutputType );
2470
2475
for (const auto & type : BaseBinaryTypes ()) {
2471
2476
ScalarKernel kernel ({type}, output_type,
2472
2477
GenerateVarBinaryToVarBinary<ExtractRegexSpan>(type),
0 commit comments