|
| 1 | +from __future__ import annotations # Python 3.9 requires it for X | Y type hints |
| 2 | + |
1 | 3 | from collections.abc import Generator |
| 4 | +from typing import cast |
2 | 5 |
|
3 | 6 | import pyarrow as pa |
4 | 7 |
|
5 | 8 |
|
6 | | -def is_pa_type_a_list(pa_type: type[pa.Array]) -> bool: |
| 9 | +def is_pa_type_a_list(pa_type: pa.DataType) -> bool: |
7 | 10 | """Check if the given pyarrow type is a list type. |
8 | 11 |
|
9 | 12 | I.e. one of the following types: ListArray, LargeListArray, |
@@ -39,3 +42,148 @@ def enumerate_chunks(array: pa.ChunkedArray) -> Generator[tuple[slice, pa.Array] |
39 | 42 | index_stop = index_start + len(chunk) |
40 | 43 | yield slice(index_start, index_stop), chunk |
41 | 44 | index_start = index_stop |
| 45 | + |
| 46 | + |
def validate_struct_list_array_for_equal_lengths(array: pa.StructArray) -> None:
    """Ensure every field of a struct array is a list array with shared offsets.

    The fields are visited in the order given by the struct type. The first
    field serves as the reference; the offsets of every following field are
    compared against the offsets of that reference.

    Parameters
    ----------
    array : pa.StructArray
        Struct array whose fields are expected to be list arrays.

    Raises
    ------
    ValueError
        If the array type is not a struct, if any field is not a list array
        according to `is_pa_type_a_list`, or if the offsets of some field
        differ from the offsets of the first field.
    """
    struct_type = array.type
    if not pa.types.is_struct(struct_type):
        raise ValueError(f"Expected a StructArray, got {struct_type}")

    reference: pa.ListArray | None = None
    for struct_field in struct_type:
        child = array.field(struct_field.name)
        child_type = child.type
        if not is_pa_type_a_list(child_type):
            raise ValueError(f"Expected a ListArray, got {child_type}")
        current = cast(pa.ListArray, child)

        if reference is None:
            # The first field only becomes the reference; there is nothing
            # to compare it against yet.
            reference = current
        elif not reference.offsets.equals(current.offsets):
            raise ValueError("Offsets of all ListArrays must be the same")
| 77 | + |
| 78 | + |
def transpose_struct_list_type(t: pa.StructType) -> pa.ListType:
    """Build the list-of-struct type that corresponds to a struct-of-lists type.

    Each field of the input struct must be a list type; the resulting struct
    keeps the field names and uses the value type of each list as the field
    type. The result is wrapped with `pa.list_`.

    Parameters
    ----------
    t : pa.StructType
        Type of a struct array whose fields are all lists.

    Returns
    -------
    pa.ListType
        List type whose value type is a struct of the list value types.

    Raises
    ------
    ValueError
        If `t` is not a struct type or one of its fields is not a list type
        according to `is_pa_type_a_list`.
    """
    if not pa.types.is_struct(t):
        raise ValueError(f"Expected a StructType, got {t}")

    item_fields = []
    for struct_field in t:
        field_type = struct_field.type
        if not is_pa_type_a_list(field_type):
            raise ValueError(f"Expected a ListType, got {field_type}")
        # Strip one list level: the struct field now holds a single item.
        item_type = cast(pa.ListType, field_type).value_type
        item_fields.append(pa.field(struct_field.name, item_type))

    return cast(pa.ListType, pa.list_(pa.struct(item_fields)))
| 109 | + |
| 110 | + |
def transpose_struct_list_array(array: pa.StructArray, validate: bool = True) -> pa.ListArray:
    """Turn a struct array of lists into a list array of structs.

    Parameters
    ----------
    array : pa.StructArray
        Struct array whose fields are list arrays sharing the same offsets.
    validate : bool, default True
        When true, `validate_struct_list_array_for_equal_lengths` is run on
        the input first and its ValueError propagates on invalid input.

    Returns
    -------
    pa.ListArray
        List array whose values are structs built from the flat values of
        every input field.
    """
    if validate:
        validate_struct_list_array_for_equal_lengths(array)

    # Every field is expected to share the same offsets, so the offsets of
    # the first field describe the list boundaries of the result.
    shared_offsets = array.field(0).offsets
    flat_children = [list_child.values for list_child in array.flatten()]
    field_names = array.type.names
    flat_structs = pa.StructArray.from_arrays(flat_children, names=field_names)
    # NOTE(review): no validity mask is passed here — confirm how nulls in
    # the input are expected to be represented in the output.
    return pa.ListArray.from_arrays(shared_offsets, flat_structs)
| 137 | + |
| 138 | + |
def transpose_list_struct_type(t: pa.ListType) -> pa.StructType:
    """Converts a type of list-struct array into a type of struct-list array.

    Field names of the inner struct are kept; each field type is wrapped
    with `pa.list_`.

    Parameters
    ----------
    t : pa.ListType
        Input type of list-struct array.

    Returns
    -------
    pa.StructType
        Type of struct-list array.

    Raises
    ------
    ValueError
        If the input type is not a list type, or its value type is not a
        struct type.
    """
    if not is_pa_type_a_list(t):
        raise ValueError(f"Expected a ListType, got {t}")

    # Validate the inner level as well, so a list of non-structs is reported
    # with the documented ValueError instead of failing later on iteration.
    value_type = t.value_type
    if not pa.types.is_struct(value_type):
        raise ValueError(f"Expected a StructType, got {value_type}")

    inner_struct = cast(pa.StructType, value_type)
    list_fields = [pa.field(field.name, pa.list_(field.type)) for field in inner_struct]
    return cast(pa.StructType, pa.struct(list_fields))
| 167 | + |
| 168 | + |
def transpose_list_struct_array(array: pa.ListArray) -> pa.StructArray:
    """Turn a list array of structs into a struct array of lists.

    The offsets of the input are reused for every output field, so all
    resulting list arrays share the same list boundaries.

    Parameters
    ----------
    array : pa.ListArray
        List array whose values are structs.

    Returns
    -------
    pa.StructArray
        Struct array with one list array per field of the inner struct,
        named after the fields of the input value type.
    """
    shared_offsets = array.offsets
    struct_values = array.values

    # One list array per struct field, each built on the same offsets.
    list_columns = [
        pa.ListArray.from_arrays(shared_offsets, column) for column in struct_values.flatten()
    ]
    column_names = array.type.value_type.names
    return pa.StructArray.from_arrays(list_columns, names=column_names)
0 commit comments