Skip to content

Commit 5529ac2

Browse files
committed
refactor: extract auto-alignment logic to _tbl_data_align module
Extract dtype classification logic from Boxhead.align_from_data() into a dedicated module that uses singledispatch for a cleaner separation of concerns.

Changes:
- Create _tbl_data_align.py with the classify_dtype_for_alignment() and is_number_like_column() functions
- Simplify align_from_data() to use the new module
- Add comprehensive tests in test_tbl_data_align.py (32 tests)

The refactor preserves existing behavior:
- Uses re.match (not re.search) to match the original _str_detect behavior
- Supports both pandas 2.x ("object") and 3.x ("str") string dtypes
- Number-like string detection works for both dtype variants
1 parent b328de0 commit 5529ac2

File tree

3 files changed

+302
-43
lines changed

3 files changed

+302
-43
lines changed

great_tables/_gt_data.py

Lines changed: 17 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from __future__ import annotations
22

33
import copy
4-
import re
54
from collections.abc import Mapping, Sequence
65
from dataclasses import dataclass, field, replace
76
from enum import Enum, auto
@@ -28,8 +27,13 @@
2827
to_list,
2928
validate_frame,
3029
)
30+
from ._tbl_data_align import (
31+
ALIGNMENT_MAP,
32+
classify_dtype_for_alignment,
33+
is_number_like_column,
34+
)
3135
from ._text import BaseText
32-
from ._utils import OrderedSet, _str_detect
36+
from ._utils import OrderedSet
3337

3438
if TYPE_CHECKING:
3539
from ._helpers import UnitStr
@@ -361,7 +365,7 @@ def set_cols_unhidden(self, colnames: list[str]) -> Self:
361365
def align_from_data(self, data: TblData) -> Self:
362366
"""Updates align attribute in entries based on data types."""
363367

364-
# TODO: validate that data columns and ColInfo list correspond
368+
# Validate that data columns and ColInfo list correspond
365369
if len(get_column_names(data)) != len(self._d):
366370
raise ValueError("Number of data columns must match length of Boxhead")
367371

@@ -370,49 +374,19 @@ def align_from_data(self, data: TblData) -> Self:
370374
):
371375
raise ValueError("Column names must match between data and Boxhead")
372376

373-
# Obtain a list of column classes for each of the column names by iterating
374-
# through each of the columns and obtaining the type of the column from
375-
# a Pandas DataFrame or a Polars DataFrame
376-
col_classes = []
377+
# Classify each column and map to alignment
378+
align: list[str] = []
377379
for col in get_column_names(data):
378-
dtype = _get_column_dtype(data, col)
379-
380-
if dtype == "object" or str(dtype).lower() == "str":
381-
# Check whether all values in 'object' columns (pandas 2.x) or
382-
# 'str' columns (pandas 3.x) are strings that are 'number-like'
383-
384-
col_vals = data[col].to_list()
385-
386-
# Detect whether all non-NA values in the column are 'number-like'
387-
# through use of a regular expression
388-
number_like_matches = (
389-
re.match("^[0-9 -/:\\.]*$", val) for val in col_vals if isinstance(val, str)
390-
)
391-
392-
# If all values in the column are 'number-like', then set the
393-
# dtype to 'character-numeric'
394-
if all(number_like_matches):
395-
dtype = "character-numeric"
380+
classification = classify_dtype_for_alignment(data, col)
396381

397-
col_classes.append(dtype)
382+
# Special case: string columns with number-like content -> right-align
383+
# This handles both "object" (pandas 2.x) and "str" (pandas 3.x) dtypes
384+
if classification == "string":
385+
dtype = str(_get_column_dtype(data, col)).lower()
386+
if dtype in ("object", "str") and is_number_like_column(data, col):
387+
classification = "numeric"
398388

399-
# Get a list of `align` values by translating the column classes
400-
align: list[str] = []
401-
402-
align_to_left = {"object", "utf8", "string", "str"}
403-
for col_class in col_classes:
404-
# Ensure that `col_class` is lowercase
405-
col_class = str(col_class).lower()
406-
407-
# Translate the column classes to an alignment value of 'left', 'right', or 'center'
408-
if col_class == "character-numeric" or _str_detect(
409-
col_class, r"int|uint|float|date|double"
410-
):
411-
align.append("right")
412-
elif col_class in align_to_left:
413-
align.append("left")
414-
else:
415-
align.append("center")
389+
align.append(ALIGNMENT_MAP[classification])
416390

417391
# Set the alignment for each column in the boxhead
418392
new_cols: list[ColInfo] = [

great_tables/_tbl_data_align.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
"""Type classification for auto-alignment.
2+
3+
This module provides backend-specific dtype classification for determining
4+
column text alignment. The approach uses singledispatch to handle pandas,
5+
polars, and pyarrow backends with their native type APIs.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import re
11+
from functools import singledispatch
12+
from typing import TYPE_CHECKING, Literal
13+
14+
from ._tbl_data import (
15+
PdDataFrame,
16+
PlDataFrame,
17+
PyArrowTable,
18+
_get_column_dtype,
19+
to_list,
20+
)
21+
22+
if TYPE_CHECKING:
23+
from ._tbl_data import DataFrameLike
24+
25+
# Coarse classification of a column's dtype; each class maps to one alignment.
AlignmentClass = Literal["numeric", "string", "other"]

# Text alignment applied to a column for each classification.
ALIGNMENT_MAP: dict[AlignmentClass, str] = {
    "numeric": "right",
    "string": "left",
    "other": "center",
}

# Dtype names treated as numeric. The pattern is applied with `.match()`, so it
# is anchored at the start of the (lowercased) dtype string: "int64", "uint8"
# and "datetime64[ns]" match, while "list(int64)" does not.
NUMERIC_DTYPE_PATTERN = re.compile(r"int|uint|float|date|double")

# Lowercased dtype names that are left-aligned as text.
# "str" is included for pandas 3.x compatibility (pandas 2.x uses "object").
STRING_DTYPES = {"object", "utf8", "string", "str"}

# Pattern for "number-like" strings (dates, times, formatted numbers).
# The pre-refactor code used the non-raw literal "^[0-9 -/:\\.]*$", i.e. the
# regex `^[0-9 -/:\.]*$`. Copying those characters unchanged into a raw string
# would add a literal backslash to the character class and accept strings that
# contain "\", so the dot escape is written with a single backslash here.
# NOTE: " -/" inside the class is a range from space (0x20) to "/" (0x2F), so
# characters such as "$", "%", "," and "+" are accepted as well. That quirk is
# kept on purpose so existing alignment results do not change.
NUMBER_LIKE_PATTERN = re.compile(r"^[0-9 -/:\.]*$")
47+
48+
49+
def is_number_like_column(data: DataFrameLike, column: str) -> bool:
    """Report whether every string value in a column looks number-like.

    The column is converted to a list and each `str` value is tested against
    `NUMBER_LIKE_PATTERN`. Values that are not `str` instances are skipped, so
    a column without any string values yields `True`.

    Args:
        data: The table holding the column.
        column: Name of the column to inspect.

    Returns:
        `False` as soon as one string value fails the pattern, `True` otherwise.
    """
    for value in to_list(data[column]):
        # Only strings are checked; anything else is ignored.
        if isinstance(value, str) and NUMBER_LIKE_PATTERN.match(value) is None:
            return False
    return True
64+
65+
66+
def _classify_dtype_name(dtype: object) -> AlignmentClass:
    """Map a dtype (or its name) to an alignment class.

    The dtype is stringified and lowercased, then compared against
    `NUMERIC_DTYPE_PATTERN` (anchored at the start via `.match()`) and
    `STRING_DTYPES`. Shared by every backend so the rules cannot drift apart.
    """
    dtype_name = str(dtype).lower()

    if NUMERIC_DTYPE_PATTERN.match(dtype_name):
        return "numeric"
    if dtype_name in STRING_DTYPES:
        return "string"
    return "other"


@singledispatch
def classify_dtype_for_alignment(data: DataFrameLike, column: str) -> AlignmentClass:
    """Classify a column's dtype for alignment purposes.

    Dispatches on the type of `data`; this fallback obtains the dtype through
    `_get_column_dtype`.

    Args:
        data: The table holding the column.
        column: Name of the column to classify.

    Returns:
        "numeric" -> right-aligned (numbers, dates, times)
        "string"  -> left-aligned (text)
        "other"   -> center-aligned (boolean, etc.)
    """
    return _classify_dtype_name(_get_column_dtype(data, column))


@classify_dtype_for_alignment.register(PdDataFrame)
def _classify_pandas(data: PdDataFrame, column: str) -> AlignmentClass:
    """Classify a pandas column from its `.dtype`."""
    return _classify_dtype_name(data[column].dtype)


@classify_dtype_for_alignment.register(PlDataFrame)
def _classify_polars(data: PlDataFrame, column: str) -> AlignmentClass:
    """Classify a polars column from its `.dtype`."""
    return _classify_dtype_name(data[column].dtype)


@classify_dtype_for_alignment.register(PyArrowTable)
def _classify_pyarrow(data: PyArrowTable, column: str) -> AlignmentClass:
    """Classify a pyarrow column from its `.type`."""
    return _classify_dtype_name(data.column(column).type)

tests/test_tbl_data_align.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
"""Tests for dtype classification and auto-alignment."""
2+
3+
import pandas as pd
4+
import polars as pl
5+
import pytest
6+
7+
from great_tables._tbl_data_align import (
8+
ALIGNMENT_MAP,
9+
classify_dtype_for_alignment,
10+
is_number_like_column,
11+
)
12+
import great_tables as gt
13+
14+
15+
class TestClassifyDtypePandas:
    """Dtype classification checks for pandas-backed columns."""

    @pytest.mark.parametrize(
        "dtype,expected",
        [
            ("int64", "numeric"),
            ("int32", "numeric"),
            ("float64", "numeric"),
            ("float32", "numeric"),
        ],
    )
    def test_numeric_types(self, dtype, expected):
        values = pd.array([1, 2, 3], dtype=dtype)
        frame = pd.DataFrame({"col": values})
        result = classify_dtype_for_alignment(frame, "col")
        assert result == expected

    def test_str_type_classified_as_string(self):
        # Pandas 3.x reports "str" for string columns, which is in STRING_DTYPES
        frame = pd.DataFrame({"col": ["a", "b", "c"]})
        result = classify_dtype_for_alignment(frame, "col")
        assert result == "string"

    def test_bool_type(self):
        frame = pd.DataFrame({"col": [True, False, True]})
        result = classify_dtype_for_alignment(frame, "col")
        assert result == "other"

    def test_datetime_type(self):
        stamps = pd.to_datetime(["2024-01-01", "2024-01-02"])
        frame = pd.DataFrame({"col": stamps})
        result = classify_dtype_for_alignment(frame, "col")
        assert result == "numeric"
43+
44+
45+
class TestClassifyDtypePolars:
    """Dtype classification checks for polars-backed columns."""

    @pytest.mark.parametrize(
        "dtype,expected",
        [
            (pl.Int64, "numeric"),
            (pl.Int32, "numeric"),
            (pl.UInt8, "numeric"),
            (pl.UInt64, "numeric"),
            (pl.Float64, "numeric"),
            (pl.Float32, "numeric"),
        ],
    )
    def test_numeric_types(self, dtype, expected):
        series = pl.Series([1, 2, 3]).cast(dtype)
        frame = pl.DataFrame({"col": series})
        result = classify_dtype_for_alignment(frame, "col")
        assert result == expected

    def test_string_type(self):
        # The polars "String" dtype lowercases to "string", which is in STRING_DTYPES
        frame = pl.DataFrame({"col": ["a", "b", "c"]})
        result = classify_dtype_for_alignment(frame, "col")
        assert result == "string"

    def test_bool_type(self):
        frame = pl.DataFrame({"col": [True, False, True]})
        result = classify_dtype_for_alignment(frame, "col")
        assert result == "other"

    def test_date_type(self):
        dates = pl.Series(["2024-01-01", "2024-01-02"]).str.to_date()
        frame = pl.DataFrame({"col": dates})
        result = classify_dtype_for_alignment(frame, "col")
        assert result == "numeric"

    def test_datetime_type(self):
        stamps = pl.Series(["2024-01-01", "2024-01-02"]).str.to_datetime()
        frame = pl.DataFrame({"col": stamps})
        result = classify_dtype_for_alignment(frame, "col")
        assert result == "numeric"
79+
80+
81+
class TestNumberLikeDetection:
    """Checks for is_number_like_column()."""

    def test_date_like_strings(self):
        frame = pd.DataFrame({"col": ["2024-01-15", "2024-02-20", "2024-03-25"]})
        outcome = is_number_like_column(frame, "col")
        assert outcome is True

    def test_time_like_strings(self):
        frame = pd.DataFrame({"col": ["10:30:00", "14:45:30", "23:59:59"]})
        outcome = is_number_like_column(frame, "col")
        assert outcome is True

    def test_mixed_text_not_number_like(self):
        frame = pd.DataFrame({"col": ["abc", "123", "def"]})
        outcome = is_number_like_column(frame, "col")
        assert outcome is False

    def test_pure_text_not_number_like(self):
        frame = pd.DataFrame({"col": ["hello", "world", "test"]})
        outcome = is_number_like_column(frame, "col")
        assert outcome is False

    def test_numeric_strings(self):
        frame = pd.DataFrame({"col": ["123", "456", "789"]})
        outcome = is_number_like_column(frame, "col")
        assert outcome is True

    def test_empty_strings(self):
        # Empty strings satisfy the pattern because it uses a `*` quantifier;
        # this test pins that current behavior.
        frame = pd.DataFrame({"col": ["", "", ""]})
        outcome = is_number_like_column(frame, "col")
        assert outcome is True

    def test_polars_number_like(self):
        # Number-like detection also works on polars string columns
        df_str = pl.DataFrame({"col": ["2024-01-15", "2024-02-20"]})
        outcome = is_number_like_column(df_str, "col")
        assert outcome is True
114+
115+
116+
class TestAlignmentMap:
    """Checks for the ALIGNMENT_MAP constant."""

    def test_numeric_maps_to_right(self):
        alignment = ALIGNMENT_MAP["numeric"]
        assert alignment == "right"

    def test_string_maps_to_left(self):
        alignment = ALIGNMENT_MAP["string"]
        assert alignment == "left"

    def test_other_maps_to_center(self):
        alignment = ALIGNMENT_MAP["other"]
        assert alignment == "center"
127+
128+
129+
class TestAutoAlignIntegration:
    """End-to-end checks of the alignment GT assigns to columns."""

    @staticmethod
    def _column_aligns(table):
        """Collect `column_align` from every boxhead entry of a GT object."""
        return [entry.column_align for entry in table._boxhead._d]

    def test_pandas_auto_align(self):
        frame = pd.DataFrame({"num": [1, 2], "text": ["a", "b"]})
        aligns = self._column_aligns(gt.GT(frame))
        # int -> right, str -> left
        assert aligns == ["right", "left"]

    def test_polars_auto_align(self):
        frame = pl.DataFrame({"num": [1, 2], "text": ["a", "b"]})
        aligns = self._column_aligns(gt.GT(frame))
        # int -> right, String -> left
        assert aligns == ["right", "left"]

    def test_auto_align_disabled(self):
        frame = pd.DataFrame({"num": [1, 2], "text": ["a", "b"]})
        aligns = self._column_aligns(gt.GT(frame, auto_align=False))
        # With auto_align=False the columns keep their default alignment (None)
        assert aligns == [None, None]

    def test_pandas_date_strings_right_align(self):
        # String columns whose content is number-like (dates) are right-aligned
        frame = pd.DataFrame({"dates": ["2024-01-15", "2024-02-20", "2024-03-25"]})
        aligns = self._column_aligns(gt.GT(frame))
        assert aligns == ["right"]

    def test_pandas_mixed_text_left_align(self):
        # String columns with content that is not number-like are left-aligned
        frame = pd.DataFrame({"mixed": ["hello", "123", "world"]})
        aligns = self._column_aligns(gt.GT(frame))
        assert aligns == ["left"]

0 commit comments

Comments
 (0)