refactor: extract auto-alignment logic to _tbl_data_align module

machow · machow · commit 5529ac2cac8d · 2026-01-22T13:41:39.000-05:00
Extract dtype classification logic from Boxhead.align_from_data() into
a dedicated module using singledispatch for cleaner separation of concerns.

Changes:
- Create _tbl_data_align.py with classify_dtype_for_alignment() and
  is_number_like_column() functions
- Simplify align_from_data() to use the new module
- Add comprehensive tests in test_tbl_data_align.py (32 tests)

The refactor preserves existing behavior:
- Uses re.match (not re.search) to match original _str_detect behavior
- Supports both pandas 2.x ("object") and 3.x ("str") string dtypes
- Number-like string detection works for both dtype variants
diff --git a/great_tables/_gt_data.py b/great_tables/_gt_data.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import copy
-import re
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass, field, replace
 from enum import Enum, auto
@@ -28,8 +27,13 @@
     to_list,
     validate_frame,
 )
+from ._tbl_data_align import (
+    ALIGNMENT_MAP,
+    classify_dtype_for_alignment,
+    is_number_like_column,
+)
 from ._text import BaseText
-from ._utils import OrderedSet, _str_detect
+from ._utils import OrderedSet
 
 if TYPE_CHECKING:
     from ._helpers import UnitStr
@@ -361,7 +365,7 @@ def set_cols_unhidden(self, colnames: list[str]) -> Self:
     def align_from_data(self, data: TblData) -> Self:
         """Updates align attribute in entries based on data types."""
 
-        # TODO: validate that data columns and ColInfo list correspond
+        # Validate that data columns and ColInfo list correspond
         if len(get_column_names(data)) != len(self._d):
             raise ValueError("Number of data columns must match length of Boxhead")
 
@@ -370,49 +374,19 @@ def align_from_data(self, data: TblData) -> Self:
         ):
             raise ValueError("Column names must match between data and Boxhead")
 
-        # Obtain a list of column classes for each of the column names by iterating
-        # through each of the columns and obtaining the type of the column from
-        # a Pandas DataFrame or a Polars DataFrame
-        col_classes = []
+        # Classify each column and map to alignment
+        align: list[str] = []
         for col in get_column_names(data):
-            dtype = _get_column_dtype(data, col)
-
-            if dtype == "object" or str(dtype).lower() == "str":
-                # Check whether all values in 'object' columns (pandas 2.x) or
-                # 'str' columns (pandas 3.x) are strings that are 'number-like'
-
-                col_vals = data[col].to_list()
-
-                # Detect whether all non-NA values in the column are 'number-like'
-                # through use of a regular expression
-                number_like_matches = (
-                    re.match("^[0-9 -/:\\.]*$", val) for val in col_vals if isinstance(val, str)
-                )
-
-                # If all values in the column are 'number-like', then set the
-                # dtype to 'character-numeric'
-                if all(number_like_matches):
-                    dtype = "character-numeric"
+            classification = classify_dtype_for_alignment(data, col)
 
-            col_classes.append(dtype)
+            # Special case: string columns with number-like content -> right-align
+            # This handles both "object" (pandas 2.x) and "str" (pandas 3.x) dtypes
+            if classification == "string":
+                dtype = str(_get_column_dtype(data, col)).lower()
+                if dtype in ("object", "str") and is_number_like_column(data, col):
+                    classification = "numeric"
 
-        # Get a list of `align` values by translating the column classes
-        align: list[str] = []
-
-        align_to_left = {"object", "utf8", "string", "str"}
-        for col_class in col_classes:
-            # Ensure that `col_class` is lowercase
-            col_class = str(col_class).lower()
-
-            # Translate the column classes to an alignment value of 'left', 'right', or 'center'
-            if col_class == "character-numeric" or _str_detect(
-                col_class, r"int|uint|float|date|double"
-            ):
-                align.append("right")
-            elif col_class in align_to_left:
-                align.append("left")
-            else:
-                align.append("center")
+            align.append(ALIGNMENT_MAP[classification])
 
         # Set the alignment for each column in the boxhead
         new_cols: list[ColInfo] = [
diff --git a/great_tables/_tbl_data_align.py b/great_tables/_tbl_data_align.py
@@ -0,0 +1,122 @@
+"""Type classification for auto-alignment.
+
+This module provides backend-specific dtype classification for determining
+column text alignment. The approach uses singledispatch to handle pandas,
+polars, and pyarrow backends with their native type APIs.
+"""
+
+from __future__ import annotations
+
+import re
+from functools import singledispatch
+from typing import TYPE_CHECKING, Literal
+
+from ._tbl_data import (
+    PdDataFrame,
+    PlDataFrame,
+    PyArrowTable,
+    _get_column_dtype,
+    to_list,
+)
+
+if TYPE_CHECKING:
+    from ._tbl_data import DataFrameLike
+
+AlignmentClass = Literal["numeric", "string", "other"]
+
+ALIGNMENT_MAP: dict[AlignmentClass, str] = {
+    "numeric": "right",
+    "string": "left",
+    "other": "center",
+}
+
+# Alternation of numeric/date dtype-name prefixes, tested on lowercased dtype strings
+# NOTE: Applied via re.match (start anchored) to match original _str_detect behavior,
+# so "int64" matches while "list(int64)" does not
+NUMERIC_DTYPE_PATTERN = re.compile(r"int|uint|float|date|double")
+
+# Lowercased dtype names that classify as "string" (left-aligned via ALIGNMENT_MAP)
+# Includes "str" for pandas 3.x compatibility (pandas 2.x uses "object")
+STRING_DTYPES = {"object", "utf8", "string", "str"}
+
+# Pattern for "number-like" strings (dates, times, formatted numbers)
+# NOTE: Keeps the original character class, quirk included: " -/" is a range from
+# space to "/", so punctuation such as "+" and "," is accepted as well.
+# The original non-raw literal "\\." is the regex "\."; in a raw string that is r"\."
+NUMBER_LIKE_PATTERN = re.compile(r"^[0-9 -/:\.]*$")
+
+
+def is_number_like_column(data: DataFrameLike, column: str) -> bool:
+    """Report whether every string value in a column looks number-like.
+
+    Used to right-align object/string columns holding formatted dates or numbers.
+
+    Non-string values (None, NaN, other objects) are ignored, so a column
+    without any strings yields True.
+    """
+    values = to_list(data[column])
+
+    # Only str entries are tested against NUMBER_LIKE_PATTERN; all() over an
+    # empty sequence of matches is True, which is the no-strings case above.
+    return all(
+        NUMBER_LIKE_PATTERN.match(item) for item in values if isinstance(item, str)
+    )
+
+
+@singledispatch
+def classify_dtype_for_alignment(data: DataFrameLike, column: str) -> AlignmentClass:
+    """Bucket a column's dtype into one of the alignment classes.
+
+    This is the generic fallback, used when no backend-specific handler is
+    registered for the type of `data`; it inspects the lowercased dtype name.
+
+    Returns:
+        "numeric", "string" or "other" (see ALIGNMENT_MAP for the alignments).
+    """
+    dtype_name = str(_get_column_dtype(data, column)).lower()
+
+    # Numeric/date prefixes are tested before the string-name lookup.
+    if NUMERIC_DTYPE_PATTERN.match(dtype_name) is not None:
+        return "numeric"
+    if dtype_name in STRING_DTYPES:
+        return "string"
+    return "other"
+
+
+@classify_dtype_for_alignment.register(PdDataFrame)
+def _classify_pandas(data: PdDataFrame, column: str) -> AlignmentClass:
+    """Classify a pandas column from the lowercased name of its dtype."""
+    dtype_name = str(data[column].dtype).lower()
+
+    # Numeric/date prefixes are tested before the string-name lookup.
+    if NUMERIC_DTYPE_PATTERN.match(dtype_name) is not None:
+        return "numeric"
+    if dtype_name in STRING_DTYPES:
+        return "string"
+    return "other"
+
+
+@classify_dtype_for_alignment.register(PlDataFrame)
+def _classify_polars(data: PlDataFrame, column: str) -> AlignmentClass:
+    """Classify a polars column from the lowercased name of its dtype."""
+    dtype_name = str(data[column].dtype).lower()
+
+    # Numeric/date prefixes are tested before the string-name lookup.
+    if NUMERIC_DTYPE_PATTERN.match(dtype_name) is not None:
+        return "numeric"
+    if dtype_name in STRING_DTYPES:
+        return "string"
+    return "other"
+
+
+@classify_dtype_for_alignment.register(PyArrowTable)
+def _classify_pyarrow(data: PyArrowTable, column: str) -> AlignmentClass:
+    """Classify a pyarrow column from the lowercased name of its type."""
+    type_name = str(data.column(column).type).lower()
+
+    # Numeric/date prefixes are tested before the string-name lookup.
+    if NUMERIC_DTYPE_PATTERN.match(type_name) is not None:
+        return "numeric"
+    if type_name in STRING_DTYPES:
+        return "string"
+    return "other"
diff --git a/tests/test_tbl_data_align.py b/tests/test_tbl_data_align.py
@@ -0,0 +1,163 @@
+"""Tests for dtype classification and auto-alignment."""
+
+import pandas as pd
+import polars as pl
+import pytest
+
+from great_tables._tbl_data_align import (
+    ALIGNMENT_MAP,
+    classify_dtype_for_alignment,
+    is_number_like_column,
+)
+import great_tables as gt
+
+
+class TestClassifyDtypePandas:
+    """Check classify_dtype_for_alignment() on pandas columns of several dtypes."""
+
+    @pytest.mark.parametrize(
+        "dtype,expected",
+        [
+            ("int64", "numeric"),
+            ("int32", "numeric"),
+            ("float64", "numeric"),
+            ("float32", "numeric"),
+        ],
+    )
+    def test_numeric_types(self, dtype, expected):
+        df = pd.DataFrame({"col": pd.array([1, 2, 3], dtype=dtype)})
+        assert classify_dtype_for_alignment(df, "col") == expected
+
+    def test_str_type_classified_as_string(self):
+        # pandas 3.x reports "str" here (2.x reports "object"); both are in STRING_DTYPES
+        df = pd.DataFrame({"col": ["a", "b", "c"]})
+        assert classify_dtype_for_alignment(df, "col") == "string"
+
+    def test_bool_type(self):
+        df = pd.DataFrame({"col": [True, False, True]})
+        assert classify_dtype_for_alignment(df, "col") == "other"
+
+    def test_datetime_type(self):
+        df = pd.DataFrame({"col": pd.to_datetime(["2024-01-01", "2024-01-02"])})
+        assert classify_dtype_for_alignment(df, "col") == "numeric"
+
+
+class TestClassifyDtypePolars:
+    """Check classify_dtype_for_alignment() on polars columns of several dtypes."""
+
+    @pytest.mark.parametrize(
+        "dtype,expected",
+        [
+            (pl.Int64, "numeric"),
+            (pl.Int32, "numeric"),
+            (pl.UInt8, "numeric"),
+            (pl.UInt64, "numeric"),
+            (pl.Float64, "numeric"),
+            (pl.Float32, "numeric"),
+        ],
+    )
+    def test_numeric_types(self, dtype, expected):
+        df = pl.DataFrame({"col": pl.Series([1, 2, 3]).cast(dtype)})
+        assert classify_dtype_for_alignment(df, "col") == expected
+
+    def test_string_type(self):
+        df = pl.DataFrame({"col": ["a", "b", "c"]})
+        # The polars "String" dtype name lowercases to "string", a STRING_DTYPES member
+        assert classify_dtype_for_alignment(df, "col") == "string"
+
+    def test_bool_type(self):
+        df = pl.DataFrame({"col": [True, False, True]})
+        assert classify_dtype_for_alignment(df, "col") == "other"
+
+    def test_date_type(self):
+        df = pl.DataFrame({"col": pl.Series(["2024-01-01", "2024-01-02"]).str.to_date()})
+        assert classify_dtype_for_alignment(df, "col") == "numeric"
+
+    def test_datetime_type(self):
+        df = pl.DataFrame({"col": pl.Series(["2024-01-01", "2024-01-02"]).str.to_datetime()})
+        assert classify_dtype_for_alignment(df, "col") == "numeric"
+
+
+class TestNumberLikeDetection:
+    """Check is_number_like_column() on date, time, digit, text and empty strings."""
+
+    def test_date_like_strings(self):
+        df = pd.DataFrame({"col": ["2024-01-15", "2024-02-20", "2024-03-25"]})
+        assert is_number_like_column(df, "col") is True
+
+    def test_time_like_strings(self):
+        df = pd.DataFrame({"col": ["10:30:00", "14:45:30", "23:59:59"]})
+        assert is_number_like_column(df, "col") is True
+
+    def test_mixed_text_not_number_like(self):
+        df = pd.DataFrame({"col": ["abc", "123", "def"]})
+        assert is_number_like_column(df, "col") is False
+
+    def test_pure_text_not_number_like(self):
+        df = pd.DataFrame({"col": ["hello", "world", "test"]})
+        assert is_number_like_column(df, "col") is False
+
+    def test_numeric_strings(self):
+        df = pd.DataFrame({"col": ["123", "456", "789"]})
+        assert is_number_like_column(df, "col") is True
+
+    def test_empty_strings(self):
+        # NOTE: Empty strings count as number-like because the pattern's `*`
+        # quantifier accepts zero characters; this documents current behavior
+        df = pd.DataFrame({"col": ["", "", ""]})
+        assert is_number_like_column(df, "col") is True
+
+    def test_polars_number_like(self):
+        # Number-like detection also accepts a polars frame with a string column
+        df_str = pl.DataFrame({"col": ["2024-01-15", "2024-02-20"]})
+        assert is_number_like_column(df_str, "col") is True
+
+
+class TestAlignmentMap:
+    """Pin the class -> alignment values stored in the ALIGNMENT_MAP constant."""
+
+    def test_numeric_maps_to_right(self):
+        assert ALIGNMENT_MAP["numeric"] == "right"
+
+    def test_string_maps_to_left(self):
+        assert ALIGNMENT_MAP["string"] == "left"
+
+    def test_other_maps_to_center(self):
+        assert ALIGNMENT_MAP["other"] == "center"
+
+
+class TestAutoAlignIntegration:
+    """Integration tests reading column_align from the boxhead of a built GT object."""
+
+    def test_pandas_auto_align(self):
+        df = pd.DataFrame({"num": [1, 2], "text": ["a", "b"]})
+        gt_tbl = gt.GT(df)
+        aligns = [col.column_align for col in gt_tbl._boxhead._d]
+        assert aligns == ["right", "left"]  # int -> right, str -> left
+
+    def test_polars_auto_align(self):
+        df = pl.DataFrame({"num": [1, 2], "text": ["a", "b"]})
+        gt_tbl = gt.GT(df)
+        aligns = [col.column_align for col in gt_tbl._boxhead._d]
+        assert aligns == ["right", "left"]  # int -> right, String -> left
+
+    def test_auto_align_disabled(self):
+        df = pd.DataFrame({"num": [1, 2], "text": ["a", "b"]})
+        gt_tbl = gt.GT(df, auto_align=False)
+        aligns = [col.column_align for col in gt_tbl._boxhead._d]
+        # With auto_align=False no alignment is inferred, so each column_align is None
+        assert aligns == [None, None]
+
+    def test_pandas_date_strings_right_align(self):
+        # A string column whose values are all number-like (dates) is right-aligned
+        df = pd.DataFrame({"dates": ["2024-01-15", "2024-02-20", "2024-03-25"]})
+        gt_tbl = gt.GT(df)
+        aligns = [col.column_align for col in gt_tbl._boxhead._d]
+        assert aligns == ["right"]  # number-like strings -> right
+
+    def test_pandas_mixed_text_left_align(self):
+        # A string column with any non-number-like value stays left-aligned
+        df = pd.DataFrame({"mixed": ["hello", "123", "world"]})
+        gt_tbl = gt.GT(df)
+        aligns = [col.column_align for col in gt_tbl._boxhead._d]
+        assert aligns == ["left"]  # mixed text -> left