Skip to content

Commit f6e2cbe

Browse files
GH-45295: [Python][CI] Make download_tzdata_on_windows more robust and use tzdata package for tzinfo database on Windows for ORC (#45425)
### Rationale for this change

We have two Windows issues and this PR addresses both:

1. PyArrow's `download_tzdata_on_windows` can fail due to TLS issues in certain CI environments.
2. The Python wheel test infrastructure needs a tzinfo database for ORC, and the automation fetching it started failing because the URL was made invalid upstream.

These two issues are being solved in one PR simply because they appeared together during the 19.0.1 release process, but they are separate.

### What changes are included in this PR?

1. Makes `download_tzdata_on_windows` more robust to TLS errors by attempting to use `requests` if it's available and falling back to urllib otherwise.
2. Switches our Windows wheel test infrastructure to grab a tzinfo database from the tzdata package on PyPI instead of from a mirror URL. This should be much more stable for us over time.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.

* GitHub Issue: #45295

Lead-authored-by: Bryce Mecum <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Bryce Mecum <[email protected]>
1 parent f1961ec commit f6e2cbe

File tree

4 files changed

+46
-18
lines changed

4 files changed

+46
-18
lines changed

ci/scripts/python_wheel_windows_test.bat

-7
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,5 @@ py -0p
5858
@REM Validate wheel contents
5959
%PYTHON_CMD% C:\arrow\ci\scripts\python_wheel_validate_contents.py --path C:\arrow\python\repaired_wheels || exit /B 1
6060

61-
@rem Download IANA Timezone Database for ORC C++
62-
curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B
63-
mkdir %USERPROFILE%\Downloads\test\tzdata
64-
arc unarchive tzdata.tar.xz %USERPROFILE%\Downloads\test\tzdata || exit /B
65-
set TZDIR=%USERPROFILE%\Downloads\test\tzdata\usr\share\zoneinfo
66-
dir %TZDIR%
67-
6861
@REM Execute unittest
6962
%PYTHON_CMD% -m pytest -r s --pyargs pyarrow || exit /B 1

python/pyarrow/tests/conftest.py

+16
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,22 @@
5555
set_timezone_db_path(tzdata_set_path)
5656

5757

58+
# GH-45295: For ORC on Windows, fill in the TZDIR environment variable from
# the resource path of the "tzdata" package, unless TZDIR is already set.
#
# Note that this is a different kind of database than the one that can be
# configured through `PYARROW_TZDATA_PATH` and is handed to
# set_timezone_db_path.
if sys.platform == 'win32' and 'TZDIR' not in os.environ:
    from importlib import resources
    try:
        # resources.files() raises ModuleNotFoundError when the "tzdata"
        # package is not installed; in that case TZDIR is left untouched.
        os.environ['TZDIR'] = os.path.join(
            resources.files('tzdata'), 'zoneinfo'
        )
    except ModuleNotFoundError:
        print(
            'Package "tzdata" not found. Not setting TZDIR environment variable.'
        )
72+
73+
5874
def pytest_addoption(parser):
5975
# Create options to selectively enable test groups
6076
def bool_env(name, default=None):

python/pyarrow/util.py

+29-11
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,20 @@ def _break_traceback_cycle_from_frame(frame):
230230
refs = frame = this_frame = None
231231

232232

233+
def _download_urllib(url, out_path, timeout=60):
    """
    Download ``url`` to ``out_path`` using only the standard library.

    The response body is streamed to disk in chunks instead of being read
    into memory in one piece.

    Parameters
    ----------
    url : str
        Location to fetch; anything ``urllib.request.urlopen`` accepts.
    out_path : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file is overwritten.
    timeout : float, default 60
        Timeout in seconds passed to ``urlopen`` for blocking operations,
        so that a stalled connection fails instead of hanging forever.

    Raises
    ------
    urllib.error.URLError
        If the URL cannot be opened (``HTTPError``, a subclass, is raised
        by ``urlopen`` for HTTP error responses).
    """
    import shutil
    from urllib.request import urlopen

    with urlopen(url, timeout=timeout) as response:
        with open(out_path, 'wb') as f:
            # Copy chunk by chunk rather than response.read() to avoid
            # holding the whole payload in memory.
            shutil.copyfileobj(response, f)
238+
239+
240+
def _download_requests(url, out_path, timeout=60):
    """
    Download ``url`` to ``out_path`` using the ``requests`` package.

    ``requests`` is imported lazily so that callers can catch
    ``ImportError`` and fall back to another download method when the
    package is not installed.

    Parameters
    ----------
    url : str
        Location to fetch.
    out_path : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file is overwritten.
    timeout : float, default 60
        Timeout in seconds passed to ``requests.get``; without it a stalled
        server could block the call indefinitely.

    Raises
    ------
    ImportError
        If ``requests`` is not installed.
    requests.HTTPError
        If the server answers with a 4xx or 5xx status code.
    """
    import requests

    with requests.get(url, timeout=timeout) as response:
        # requests does not raise on HTTP error statuses by itself. Check
        # explicitly so an error page is never written out as the payload.
        response.raise_for_status()
        with open(out_path, 'wb') as f:
            f.write(response.content)
245+
246+
233247
def download_tzdata_on_windows():
234248
r"""
235249
Download and extract latest IANA timezone database into the
@@ -240,19 +254,23 @@ def download_tzdata_on_windows():
240254

241255
import tarfile
242256

257+
tzdata_url = "https://data.iana.org/time-zones/tzdata-latest.tar.gz"
243258
tzdata_path = os.path.expandvars(r"%USERPROFILE%\Downloads\tzdata")
244-
tzdata_compressed = os.path.join(tzdata_path, "tzdata.tar.gz")
259+
tzdata_compressed_path = os.path.join(tzdata_path, "tzdata.tar.gz")
260+
windows_zones_url = "https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml" # noqa
261+
windows_zones_path = os.path.join(tzdata_path, "windowsZones.xml")
245262
os.makedirs(tzdata_path, exist_ok=True)
246263

247-
from urllib.request import urlopen
248-
with urlopen('https://data.iana.org/time-zones/tzdata-latest.tar.gz') as response:
249-
with open(tzdata_compressed, 'wb') as f:
250-
f.write(response.read())
251-
252-
assert os.path.exists(tzdata_compressed)
264+
# Try to download the files with requests and then fall back to urllib. This
265+
# works around possible issues in certain older environments (GH-45295)
266+
try:
267+
_download_requests(tzdata_url, tzdata_compressed_path)
268+
_download_requests(windows_zones_url, windows_zones_path)
269+
except ImportError:
270+
_download_urllib(tzdata_url, tzdata_compressed_path)
271+
_download_urllib(windows_zones_url, windows_zones_path)
253272

254-
tarfile.open(tzdata_compressed).extractall(tzdata_path)
273+
assert os.path.exists(tzdata_compressed_path)
274+
assert os.path.exists(windows_zones_path)
255275

256-
with urlopen('https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml') as response_zones: # noqa
257-
with open(os.path.join(tzdata_path, "windowsZones.xml"), 'wb') as f:
258-
f.write(response_zones.read())
276+
tarfile.open(tzdata_compressed_path).extractall(tzdata_path)

python/requirements-wheel-test.txt

+1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ hypothesis
44
pytest
55
pytz
66
pyuwsgi; sys.platform != 'win32' and python_version < '3.13'
7+
requests; sys_platform == 'win32'
78
tzdata; sys_platform == 'win32'
89

910
# We generally test with the oldest numpy version that supports a given Python

0 commit comments

Comments
 (0)