Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

time coding refactor #9906

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ Internal Changes
within ``as_compatible_data``. This is consistent with how lists of these objects
will be converted (:pull:`9900`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Refactor of time coding to prepare for relaxing nanosecond restriction (:pull:`9906`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

.. _whats-new.2024.11.0:

Expand Down
125 changes: 88 additions & 37 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like
from xarray.core.duck_array_ops import asarray, ravel, reshape
from xarray.core.formatting import first_n_items, format_timestamp, last_item
from xarray.core.pdcompat import nanosecond_precision_timestamp
from xarray.core.pdcompat import nanosecond_precision_timestamp, timestamp_as_unit
from xarray.core.utils import attempt_import, emit_user_level_warning
from xarray.core.variable import Variable
from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type
Expand All @@ -36,7 +36,11 @@
except ImportError:
cftime = None

from xarray.core.types import CFCalendar, NPDatetimeUnitOptions, T_DuckArray
from xarray.core.types import (
CFCalendar,
NPDatetimeUnitOptions,
T_DuckArray,
)

T_Name = Union[Hashable, None]

Expand Down Expand Up @@ -189,18 +193,26 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
return delta_units, ref_date


def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp:
# If the ref_date Timestamp is timezone-aware, convert to UTC and
# make it timezone-naive (GH 2649).
if date.tz is not None:
return date.tz_convert("UTC").tz_convert(None)
return date


def _unpack_time_unit_and_ref_date(
units: str,
) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]:
# same us _unpack_netcdf_time_units but finalizes ref_date for
# processing in encode_cf_datetime
time_units, _ref_date = _unpack_netcdf_time_units(units)
time_unit, _ref_date = _unpack_netcdf_time_units(units)
time_unit = _netcdf_to_numpy_timeunit(time_unit)
# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
ref_date = nanosecond_precision_timestamp(_ref_date)
# If the ref_date Timestamp is timezone-aware, convert to UTC and
# make it timezone-naive (GH 2649).
if ref_date.tz is not None:
ref_date = ref_date.tz_convert(None)
return time_units, ref_date
ref_date = _maybe_strip_tz_from_timestamp(ref_date)
return time_unit, ref_date


def _decode_cf_datetime_dtype(
Expand Down Expand Up @@ -247,6 +259,30 @@ def _decode_datetime_with_cftime(
return np.array([], dtype=object)


def _check_date_for_units_since_refdate(
date, unit: str, ref_date: pd.Timestamp
) -> pd.Timestamp:
# check for out-of-bounds floats and raise
if date > np.iinfo("int64").max or date < np.iinfo("int64").min:
raise OutOfBoundsTimedelta(
f"Value {date} can't be represented as Datetime/Timedelta."
)
delta = date * np.timedelta64(1, unit)
if not np.isnan(delta):
# this will raise on dtype overflow for integer dtypes
if date.dtype.kind in "iu" and not np.int64(delta) == date:
raise OutOfBoundsTimedelta(
"DType overflow in Datetime/Timedelta calculation."
)
# this will raise on overflow if ref_date + delta
# can't be represented in the current ref_date resolution
return timestamp_as_unit(ref_date + delta, ref_date.unit)
else:
# if date is exactly NaT (np.iinfo("int64").min) return refdate
# to make follow-up checks work
return ref_date


def _decode_datetime_with_pandas(
flat_num_dates: np.ndarray, units: str, calendar: str
) -> np.ndarray:
Expand All @@ -265,12 +301,8 @@ def _decode_datetime_with_pandas(
elif flat_num_dates.dtype.kind == "u":
flat_num_dates = flat_num_dates.astype(np.uint64)

time_units, ref_date_str = _unpack_netcdf_time_units(units)
time_units = _netcdf_to_numpy_timeunit(time_units)
try:
# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
ref_date = nanosecond_precision_timestamp(ref_date_str)
time_unit, ref_date = _unpack_time_unit_and_ref_date(units)
except ValueError as err:
# ValueError is raised by pd.Timestamp for non-ISO timestamp
# strings, in which case we fall back to using cftime
Expand All @@ -280,8 +312,12 @@ def _decode_datetime_with_pandas(
warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
if flat_num_dates.size > 0:
# avoid size 0 datetimes GH1329
pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date
pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date
_check_date_for_units_since_refdate(
flat_num_dates.min(), time_unit, ref_date
)
_check_date_for_units_since_refdate(
flat_num_dates.max(), time_unit, ref_date
)

# To avoid integer overflow when converting to nanosecond units for integer
# dtypes smaller than np.int64 cast all integer and unsigned integer dtype
Expand All @@ -294,20 +330,24 @@ def _decode_datetime_with_pandas(
elif flat_num_dates.dtype.kind in "f":
flat_num_dates = flat_num_dates.astype(np.float64)

# Cast input ordinals to integers of nanoseconds because pd.to_timedelta
# works much faster when dealing with integers (GH 1399).
# properly handle NaN/NaT to prevent casting NaN to int
# keep NaT/nan mask
nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units]
flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)
# in case we need to change the unit, we fix the numbers here
# this should be safe, as errors would have been raised above
ns_time_unit = _NS_PER_TIME_DELTA[time_unit]
ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_date.unit]
if flat_num_dates.dtype.kind in "iuf" and (ns_time_unit > ns_ref_date_unit):
flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit)
time_unit = ref_date.unit

# Cast input ordinals to integers and properly handle NaN/NaT
# to prevent casting NaN to int
flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64)
flat_num_dates_int[nan] = np.iinfo(np.int64).min
flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64)

# Use pd.to_timedelta to safely cast integer values to timedeltas,
# and add those to a Timestamp to safely produce a DatetimeIndex. This
# ensures that we do not encounter integer overflow at any point in the
# process without raising OutOfBoundsDatetime.
return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values
# cast to timedelta64[time_unit] and add to ref_date
return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_unit}]")


def decode_cf_datetime(
Expand Down Expand Up @@ -339,10 +379,16 @@ def decode_cf_datetime(
dates = _decode_datetime_with_cftime(
flat_num_dates.astype(float), units, calendar
)
# retrieve cftype
cftype = type(dates[np.nanargmin(num_dates)])
# "ns" borders
# between ['1677-09-21T00:12:43.145224193', '2262-04-11T23:47:16.854775807']
lower = cftype(1677, 9, 21, 0, 12, 43, 145224)
upper = cftype(2262, 4, 11, 23, 47, 16, 854775)

if (
dates[np.nanargmin(num_dates)].year < 1678
or dates[np.nanargmax(num_dates)].year >= 2262
dates[np.nanargmin(num_dates)] < lower
or dates[np.nanargmax(num_dates)] > upper
):
if _is_standard_calendar(calendar):
warnings.warn(
Expand Down Expand Up @@ -763,8 +809,8 @@ def _eagerly_encode_cf_datetime(
raise OutOfBoundsDatetime
assert dates.dtype == "datetime64[ns]"

time_units, ref_date = _unpack_time_units_and_ref_date(units)
time_delta = _time_units_to_timedelta64(time_units)
time_unit, ref_date = _unpack_time_unit_and_ref_date(units)
time_delta = np.timedelta64(1, time_unit)

# Wrap the dates in a DatetimeIndex to do the subtraction to ensure
# an OverflowError is raised if the ref_date is too far away from
Expand All @@ -773,16 +819,17 @@ def _eagerly_encode_cf_datetime(
time_deltas = dates_as_index - ref_date

# retrieve needed units to faithfully encode to int64
needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units)
needed_unit, data_ref_date = _unpack_time_unit_and_ref_date(data_units)
needed_units = _numpy_to_netcdf_timeunit(needed_unit)
if data_units != units:
# this accounts for differences in the reference times
ref_delta = abs(data_ref_date - ref_date).to_timedelta64()
data_delta = _time_units_to_timedelta64(needed_units)
data_delta = np.timedelta64(1, needed_unit)
if (ref_delta % data_delta) > np.timedelta64(0, "ns"):
needed_units = _infer_time_units_from_diff(ref_delta)

# needed time delta to encode faithfully to int64
needed_time_delta = _time_units_to_timedelta64(needed_units)
needed_time_delta = _unit_timedelta_numpy(needed_units)

floor_division = np.issubdtype(dtype, np.integer) or dtype is None
if time_delta > needed_time_delta:
Expand All @@ -795,6 +842,7 @@ def _eagerly_encode_cf_datetime(
f"Set encoding['dtype'] to floating point dtype to silence this warning."
)
elif np.issubdtype(dtype, np.integer) and allow_units_modification:
floor_division = True
new_units = f"{needed_units} since {format_timestamp(ref_date)}"
emit_user_level_warning(
f"Times can't be serialized faithfully to int64 with requested units {units!r}. "
Expand All @@ -804,9 +852,12 @@ def _eagerly_encode_cf_datetime(
)
units = new_units
time_delta = needed_time_delta
floor_division = True

num = _division(time_deltas, time_delta, floor_division)
# get resolution of TimedeltaIndex and align time_delta
# todo: check, if this works in any case
num = _division(
time_deltas, time_delta.astype(f"=m8[{time_deltas.unit}]"), floor_division
)
num = reshape(num.values, dates.shape)

except (OutOfBoundsDatetime, OverflowError, ValueError):
Expand Down
17 changes: 16 additions & 1 deletion xarray/core/pdcompat.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,13 @@
from __future__ import annotations

from enum import Enum
from typing import Literal
from typing import Literal, cast

import pandas as pd
from packaging.version import Version

from xarray.core.types import PDDatetimeUnitOptions


def count_not_none(*args) -> int:
"""Compute the number of non-None arguments.
Expand Down Expand Up @@ -73,6 +75,19 @@ def __repr__(self) -> str:
NoDefault = Literal[_NoDefault.no_default] # For typing following pandas


def timestamp_as_unit(date: pd.Timestamp, unit: str) -> pd.Timestamp:
# compatibility function for pandas issue
# where "as_unit" is not defined for pandas.Timestamp
# in pandas versions < 2.2
# can be removed minimum pandas version is >= 2.2
unit = cast(PDDatetimeUnitOptions, unit)
if hasattr(date, "as_unit"):
date = date.as_unit(unit)
elif hasattr(date, "_as_unit"):
date = date._as_unit(unit)
return date


def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
"""Return a nanosecond-precision Timestamp object.

Expand Down
4 changes: 2 additions & 2 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ def test_decode_cf_datetime_overflow() -> None:
units = "days since 2000-01-01 00:00:00"

# date after 2262 and before 1678
days = (-117608, 95795)
expected = (datetime(1677, 12, 31), datetime(2262, 4, 12))
days = (-117710, 95795)
expected = (datetime(1677, 9, 20), datetime(2262, 4, 12))

for i, day in enumerate(days):
with warnings.catch_warnings():
Expand Down
Loading