diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index f08904ba70a5f..a5dc28eb9508c 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,7 +1,5 @@ -import warnings - import numpy as np -from pandas import DataFrame, Panel, date_range, HDFStore, read_hdf +from pandas import DataFrame, date_range, HDFStore, read_hdf import pandas.util.testing as tm from ..pandas_vb_common import BaseIO @@ -99,31 +97,6 @@ def time_store_info(self): self.store.info() -class HDFStorePanel(BaseIO): - - def setup(self): - self.fname = '__test__.h5' - with warnings.catch_warnings(record=True): - self.p = Panel(np.random.randn(20, 1000, 25), - items=['Item%03d' % i for i in range(20)], - major_axis=date_range('1/1/2000', periods=1000), - minor_axis=['E%03d' % i for i in range(25)]) - self.store = HDFStore(self.fname) - self.store.append('p1', self.p) - - def teardown(self): - self.store.close() - self.remove(self.fname) - - def time_read_store_table_panel(self): - with warnings.catch_warnings(record=True): - self.store.select('p1') - - def time_write_store_table_panel(self): - with warnings.catch_warnings(record=True): - self.store.append('p2', self.p) - - class HDF(BaseIO): params = ['table', 'fixed'] diff --git a/ci/deps/azure-27-compat.yaml b/ci/deps/azure-27-compat.yaml index 8899e22bdf6cf..986855c464852 100644 --- a/ci/deps/azure-27-compat.yaml +++ b/ci/deps/azure-27-compat.yaml @@ -20,6 +20,7 @@ dependencies: # universal - pytest - pytest-xdist + - pytest-mock - pip: - html5lib==1.0b2 - beautifulsoup4==4.2.1 diff --git a/ci/deps/azure-27-locale.yaml b/ci/deps/azure-27-locale.yaml index 0846ef5e8264e..f73079ecbe3d2 100644 --- a/ci/deps/azure-27-locale.yaml +++ b/ci/deps/azure-27-locale.yaml @@ -22,6 +22,7 @@ dependencies: # universal - pytest - pytest-xdist + - pytest-mock - hypothesis>=3.58.0 - pip: - html5lib==1.0b2 diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index c7d2334623501..6b8d38fd25082 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -28,6 +28,7 @@ dependencies: # universal - pytest - pytest-xdist + - pytest-mock - moto - pip: - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index b5a05c49b8083..569b71dae003b 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -27,6 +27,7 @@ dependencies: # universal - pytest - pytest-xdist + - pytest-mock - pip: - hypothesis>=3.58.0 - moto # latest moto in conda-forge fails with 3.7, move to conda dependencies when this is fixed diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 99ae228f25de3..a37be124cc546 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -8,6 +8,7 @@ dependencies: # universal - pytest - pytest-xdist + - pytest-mock - hypothesis>=3.58.0 - pip: - "git+git://github.com/dateutil/dateutil.git" diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 58abbabce3d86..d1fe926744ecd 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -24,6 +24,7 @@ dependencies: # universal - pytest - pytest-xdist + - pytest-mock - pip: - python-dateutil==2.5.3 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-27.yaml b/ci/deps/azure-windows-27.yaml index b1533b071fa74..74faeed83c387 100644 --- a/ci/deps/azure-windows-27.yaml +++ b/ci/deps/azure-windows-27.yaml @@ -27,5 +27,6 @@ dependencies: - cython>=0.28.2 - pytest - pytest-xdist + - 
pytest-mock - moto - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 7b132a134c44e..94d67b3d37788 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -25,4 +25,5 @@ dependencies: - cython>=0.28.2 - pytest - pytest-xdist + - pytest-mock - hypothesis>=3.58.0 diff --git a/ci/deps/travis-27.yaml b/ci/deps/travis-27.yaml index 2624797b24fa1..4915c003bce4e 100644 --- a/ci/deps/travis-27.yaml +++ b/ci/deps/travis-27.yaml @@ -41,6 +41,7 @@ dependencies: # universal - pytest - pytest-xdist + - pytest-mock - moto==1.3.4 - hypothesis>=3.58.0 - pip: diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 2b38465c04512..2a7692f10752c 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -30,6 +30,7 @@ dependencies: # universal - pytest - pytest-xdist + - pytest-mock - moto - pip: - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index a6ffdb95e5e7c..7934d179c8618 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -27,5 +27,6 @@ dependencies: # universal - pytest - pytest-xdist + - pytest-mock - moto - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36.yaml b/ci/deps/travis-36.yaml index 74db888d588f4..857c3fadfdaeb 100644 --- a/ci/deps/travis-36.yaml +++ b/ci/deps/travis-36.yaml @@ -36,6 +36,7 @@ dependencies: - pytest - pytest-xdist - pytest-cov + - pytest-mock - hypothesis>=3.58.0 - pip: - brotlipy diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index c503124d8cd26..125750191de7d 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -14,6 +14,7 @@ dependencies: - pytz - pytest - pytest-xdist + - pytest-mock - hypothesis>=3.58.0 - s3fs - pip: diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf index d50896dc5ccc5..48da05d053b96 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf and b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx index 95f2771017db5..039b3898fa301 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx and b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pdf index 05e4b87f6a210..cf1e40e627f33 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pdf and b/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pptx index cb0f058db5448..564d92ddbb56a 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pptx and b/doc/cheatsheet/Pandas_Cheat_Sheet_JA.pptx differ diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 1dc74ad83b7e6..a129b75636536 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -120,6 +120,7 @@ Methods Timestamp.timetuple Timestamp.timetz Timestamp.to_datetime64 + Timestamp.to_numpy Timestamp.to_julian_date Timestamp.to_period Timestamp.to_pydatetime @@ -191,6 +192,7 @@ Methods Timedelta.round Timedelta.to_pytimedelta Timedelta.to_timedelta64 + Timedelta.to_numpy Timedelta.total_seconds A collection of timedeltas may be stored in a :class:`TimedeltaArray`. 
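(For context on the two reference entries added above: the new scalar ``to_numpy`` methods are thin aliases for ``to_datetime64``/``to_timedelta64``, so a session would look roughly like the following illustrative sketch; the example values are assumed and are not part of the patch.)
>>> import pandas as pd
>>> pd.Timestamp('2019-03-01 12:00').to_numpy()
numpy.datetime64('2019-03-01T12:00:00.000000000')
>>> pd.Timedelta('1 day').to_numpy()
numpy.timedelta64(86400000000000,'ns')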
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 2c2e5c5425216..e4dd82afcdf65 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1317,7 +1317,7 @@ arbitrary function, for example: df.groupby(['Store', 'Product']).pipe(mean) where ``mean`` takes a GroupBy object and finds the mean of the Revenue and Quantity -columns repectively for each Store-Product combination. The ``mean`` function can +columns respectively for each Store-Product combination. The ``mean`` function can be any function that takes in a GroupBy object; the ``.pipe`` will pass the GroupBy object as a parameter into the function you specify. diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index bc2a4918bc27b..2d6550bb6888d 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -370,7 +370,7 @@ Updated PyTables Support df1.get_dtype_counts() - performance improvements on table writing -- support for arbitrarly indexed dimensions +- support for arbitrarily indexed dimensions - ``SparseSeries`` now has a ``density`` property (:issue:`2384`) - enable ``Series.str.strip/lstrip/rstrip`` methods to take an input argument to strip arbitrary characters (:issue:`2411`) diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index 7621cb9c1e27c..cbcb23e356577 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -136,7 +136,7 @@ groupby operations on the index will preserve the index nature as well reindexing operations, will return a resulting index based on the type of the passed indexer, meaning that passing a list will return a plain-old-``Index``; indexing with a ``Categorical`` will return a ``CategoricalIndex``, indexed according to the categories -of the PASSED ``Categorical`` dtype. This allows one to arbitrarly index these even with +of the PASSED ``Categorical`` dtype. This allows one to arbitrarily index these even with values NOT in the categories, similarly to how you can reindex ANY pandas index. .. code-block:: ipython diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 5ae777ca68eba..8e59c2300e7ca 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -25,6 +25,7 @@ Fixed Regressions - Fixed regression in :meth:`DataFrame.apply` causing ``RecursionError`` when ``dict``-like classes were passed as argument. (:issue:`25196`) - Fixed regression in :meth:`DataFrame.duplicated()`, where empty dataframe was not returning a boolean dtyped Series. (:issue:`25184`) +- Fixed regression in :meth:`Series.min` and :meth:`Series.max` where ``numeric_only=True`` was ignored when the ``Series`` contained ``Categorical`` data (:issue:`25299`) ..
_whatsnew_0242.enhancements: @@ -53,6 +54,7 @@ Bug Fixes **I/O** +- Better handling of terminal printing when the terminal dimensions are not known (:issue:`25080`) - Bug in reading a HDF5 table-format ``DataFrame`` created in Python 2, in Python 3 (:issue:`24925`) - Bug in reading a JSON with ``orient='table'`` generated by :meth:`DataFrame.to_json` with ``index=False`` (:issue:`25170`) - Bug where float indexes could have misaligned values when printing (:issue:`25061`) @@ -78,8 +80,8 @@ Bug Fixes **Reshaping** -- -- +- Bug in :meth:`pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`) +- Bug in :func:`DataFrame.join` when joining on a timezone aware :class:`DatetimeIndex` (:issue:`23931`) - **Visualization** diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 11e735028a7d5..686c5ad0165e7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -20,7 +20,7 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) -- +- :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`) - .. _whatsnew_0250.api_breaking: @@ -34,6 +34,7 @@ Other API Changes ^^^^^^^^^^^^^^^^^ - :class:`DatetimeTZDtype` will now standardize pytz timezones to a common timezone instance (:issue:`24713`) +- ``Timestamp`` and ``Timedelta`` scalars now implement the :meth:`to_numpy` method as aliases to :meth:`Timestamp.to_datetime64` and :meth:`Timedelta.to_timedelta64`, respectively. (:issue:`24653`) - - @@ -172,7 +173,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) - - @@ -183,7 +184,6 @@ Reshaping - Bug in :func:`pandas.merge` adds a string of ``None`` if ``None`` is assigned in suffixes instead of remain the column name as-is (:issue:`24782`).
- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) - :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`) -- Sparse diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 950ba3f89ffb7..e6b6e2c8a0055 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -382,6 +382,10 @@ def group_any_all(uint8_t[:] out, if values[i] == flag_val: out[lab] = flag_val +# ---------------------------------------------------------------------- +# group_add, group_prod, group_var, group_mean, group_ohlc +# ---------------------------------------------------------------------- + @cython.wraparound(False) @cython.boundscheck(False) @@ -396,9 +400,9 @@ def _group_add(floating[:, :] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, count - ndarray[floating, ndim=2] sumx, nobs + floating[:, :] sumx, nobs - if not len(values) == len(labels): + if len(values) != len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) @@ -407,7 +411,6 @@ def _group_add(floating[:, :] out, N, K = (values).shape with nogil: - for i in range(N): lab = labels[i] if lab < 0: @@ -433,5 +436,213 @@ def _group_add(floating[:, :] out, group_add_float32 = _group_add['float'] group_add_float64 = _group_add['double'] + +@cython.wraparound(False) +@cython.boundscheck(False) +def _group_prod(floating[:, :] out, + int64_t[:] counts, + floating[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=0): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + floating val, count + floating[:, :] prodx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] < min_count: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] + + +group_prod_float32 = _group_prod['float'] +group_prod_float64 = _group_prod['double'] + + +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +def _group_var(floating[:, :] out, + int64_t[:] counts, + floating[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + floating val, ct, oldmean + floating[:, :] nobs, mean + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + mean = np.zeros_like(out) + + N, K = (values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] /= (ct - 1) + + +group_var_float32 = _group_var['float'] +group_var_float64 = _group_var['double'] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def 
_group_mean(floating[:, :] out, + int64_t[:] counts, + floating[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + floating val, count + floating[:, :] sumx, nobs + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count + + +group_mean_float32 = _group_mean['float'] +group_mean_float64 = _group_mean['double'] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def _group_ohlc(floating[:, :] out, + int64_t[:] counts, + floating[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab + floating val, count + Py_ssize_t ngroups = len(counts) + + assert min_count == -1, "'min_count' only used in add and prod" + + if len(labels) == 0: + return + + N, K = (values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + if K > 1: + raise NotImplementedError("Argument 'values' must have only " + "one dimension") + out[:] = np.nan + + with nogil: + for i in range(N): + lab = labels[i] + if lab == -1: + continue + + counts[lab] += 1 + val = values[i, 0] + if val != val: + continue + + if out[lab, 0] != out[lab, 0]: + out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val + else: + out[lab, 1] = max(out[lab, 1], val) + out[lab, 2] = min(out[lab, 2], val) + out[lab, 3] = val + + +group_ohlc_float32 = _group_ohlc['float'] +group_ohlc_float64 = _group_ohlc['double'] + # generated from template include "groupby_helper.pxi" diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index db7018e1a7254..63cd4d6ac6ff2 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -8,219 +8,6 @@ cdef extern from "numpy/npy_math.h": float64_t NAN "NPY_NAN" _int64_max = np.iinfo(np.int64).max -# ---------------------------------------------------------------------- -# group_prod, group_var, group_mean, group_ohlc -# ---------------------------------------------------------------------- - -{{py: - -# name, c_type -dtypes = [('float64', 'float64_t'), - ('float32', 'float32_t')] - -def get_dispatch(dtypes): - - for name, c_type in dtypes: - yield name, c_type -}} - -{{for name, c_type in get_dispatch(dtypes)}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=0): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{c_type}} val, count - ndarray[{{c_type}}, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - 
if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] < min_count: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{c_type}} val, ct, oldmean - ndarray[{{c_type}}, ndim=2] nobs, mean - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - - N, K = (values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) -# add passing bin edges, instead of labels - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{c_type}} val, count - ndarray[{{c_type}}, ndim=2] sumx, nobs - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - {{c_type}} val, count - Py_ssize_t ngroups = len(counts) - - assert min_count == -1, "'min_count' only used in add and prod" - - if len(labels) == 0: - return - - N, K = (values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out[:] = np.nan - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = min(out[lab, 2], val) - out[lab, 3] = val - -{{endfor}} - # ---------------------------------------------------------------------- # group_nth, group_last, group_rank # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/nattype.pyx 
b/pandas/_libs/tslibs/nattype.pyx index b64c3479f23fe..a13fcfdc855d5 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -188,6 +188,26 @@ cdef class _NaT(datetime): """ return np.datetime64('NaT', 'ns') + def to_numpy(self, dtype=None, copy=False): + """ + Convert the Timestamp to a NumPy datetime64. + + .. versionadded:: 0.25.0 + + This is an alias method for `Timestamp.to_datetime64()`. The dtype and + copy parameters are available here only for compatibility. Their values + will not affect the return value. + + Returns + ------- + numpy.datetime64 + + See Also + -------- + DatetimeIndex.to_numpy : Similar method for DatetimeIndex. + """ + return self.to_datetime64() + def __repr__(self): return 'NaT' diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 58b2faac8b06b..6e40063fb925a 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -824,6 +824,26 @@ cdef class _Timedelta(timedelta): """ Returns a numpy.timedelta64 object with 'ns' precision """ return np.timedelta64(self.value, 'ns') + def to_numpy(self, dtype=None, copy=False): + """ + Convert the Timedelta to a NumPy timedelta64. + + .. versionadded:: 0.25.0 + + This is an alias method for `Timedelta.to_timedelta64()`. The dtype and + copy parameters are available here only for compatibility. Their values + will not affect the return value. + + Returns + ------- + numpy.timedelta64 + + See Also + -------- + Series.to_numpy : Similar method for Series. + """ + return self.to_timedelta64() + def total_seconds(self): """ Total duration of timedelta in seconds (to ns precision) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 8a95d2494dfa4..a2929dbeb471f 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -345,6 +345,26 @@ cdef class _Timestamp(datetime): """ return np.datetime64(self.value, 'ns') + def to_numpy(self, dtype=None, copy=False): + """ + Convert the Timestamp to a NumPy datetime64. + + .. versionadded:: 0.25.0 + + This is an alias method for `Timestamp.to_datetime64()`. The dtype and + copy parameters are available here only for compatibility. Their values + will not affect the return value. + + Returns + ------- + numpy.datetime64 + + See Also + -------- + DatetimeIndex.to_numpy : Similar method for DatetimeIndex. + """ + return self.to_datetime64() + def __add__(self, other): cdef: int64_t other_int, nanos diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0e89a4a082999..7ae52e06dc5b8 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -214,7 +214,7 @@ def contains(cat, key, container): class Categorical(ExtensionArray, PandasObject): """ - Represent a categorical variable in classic R / S-plus fashion + Represent a categorical variable in classic R / S-plus fashion. `Categoricals` can only take on only a limited, and usually fixed, number of possible values (`categories`). In contrast to statistical categorical @@ -235,7 +235,7 @@ class Categorical(ExtensionArray, PandasObject): The unique categories for this categorical. If not given, the categories are assumed to be the unique values of `values` (sorted, if possible, otherwise in the order in which they appear). - ordered : boolean, (default False) + ordered : bool, default False Whether or not this categorical is treated as a ordered categorical. If True, the resulting categorical will be ordered. An ordered categorical respects, when sorted, the order of its
An ordered categorical respects, when sorted, the order of its @@ -253,7 +253,7 @@ class Categorical(ExtensionArray, PandasObject): codes : ndarray The codes (integer positions, which point to the categories) of this categorical, read only. - ordered : boolean + ordered : bool Whether or not this Categorical is ordered. dtype : CategoricalDtype The instance of ``CategoricalDtype`` storing the ``categories`` @@ -297,7 +297,7 @@ class Categorical(ExtensionArray, PandasObject): Ordered `Categoricals` can be sorted according to the custom order of the categories and can have a min and max value. - >>> c = pd.Categorical(['a','b','c','a','b','c'], ordered=True, + >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, ... categories=['c', 'b', 'a']) >>> c [a, b, c, a, b, c] @@ -618,7 +618,7 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): ---------- codes : array-like, integers An integer array, where each integer points to a category in - categories or dtype.categories, or else is -1 for NaN + categories or dtype.categories, or else is -1 for NaN. categories : index-like, optional The categories for the categorical. Items need to be unique. If the categories are not given here, then they must be provided @@ -700,7 +700,7 @@ def _set_categories(self, categories, fastpath=False): Parameters ---------- - fastpath : boolean (default: False) + fastpath : bool, default False Don't perform validation of the categories for uniqueness or nulls Examples @@ -747,15 +747,15 @@ def _set_dtype(self, dtype): def set_ordered(self, value, inplace=False): """ - Set the ordered attribute to the boolean value + Set the ordered attribute to the boolean value. Parameters ---------- - value : boolean to set whether this categorical is ordered (True) or - not (False) - inplace : boolean (default: False) - Whether or not to set the ordered attribute inplace or return a copy - of this categorical with ordered set to the value + value : bool + Set whether this categorical is ordered (True) or not (False). + inplace : bool, default False + Whether or not to set the ordered attribute in-place or return + a copy of this categorical with ordered set to the value. """ inplace = validate_bool_kwarg(inplace, 'inplace') new_dtype = CategoricalDtype(self.categories, ordered=value) @@ -770,9 +770,9 @@ def as_ordered(self, inplace=False): Parameters ---------- - inplace : boolean (default: False) - Whether or not to set the ordered attribute inplace or return a copy - of this categorical with ordered set to True + inplace : bool, default False + Whether or not to set the ordered attribute in-place or return + a copy of this categorical with ordered set to True. """ inplace = validate_bool_kwarg(inplace, 'inplace') return self.set_ordered(True, inplace=inplace) @@ -783,9 +783,9 @@ def as_unordered(self, inplace=False): Parameters ---------- - inplace : boolean (default: False) - Whether or not to set the ordered attribute inplace or return a copy - of this categorical with ordered set to False + inplace : bool, default False + Whether or not to set the ordered attribute in-place or return + a copy of this categorical with ordered set to False. """ inplace = validate_bool_kwarg(inplace, 'inplace') return self.set_ordered(False, inplace=inplace) @@ -815,19 +815,19 @@ def set_categories(self, new_categories, ordered=None, rename=False, ---------- new_categories : Index-like The categories in new order. 
- ordered : boolean, (default: False) + ordered : bool, default False Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. - rename : boolean (default: False) + rename : bool, default False Whether or not the new_categories should be considered as a rename of the old categories or as reordered categories. - inplace : boolean (default: False) - Whether or not to reorder the categories inplace or return a copy of - this categorical with reordered categories. + inplace : bool, default False + Whether or not to reorder the categories in-place or return a copy + of this categorical with reordered categories. Returns ------- - cat : Categorical with reordered categories or None if inplace. + Categorical with reordered categories or None if inplace. Raises ------ @@ -890,7 +890,7 @@ def rename_categories(self, new_categories, inplace=False): Currently, Series are considered list like. In a future version of pandas they'll be considered dict-like. - inplace : boolean (default: False) + inplace : bool, default False Whether or not to rename the categories inplace or return a copy of this categorical with renamed categories. @@ -967,10 +967,10 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): ---------- new_categories : Index-like The categories in new order. - ordered : boolean, optional + ordered : bool, optional Whether or not the categorical is treated as a ordered categorical. If not given, do not change the ordered information. - inplace : boolean (default: False) + inplace : bool, default False Whether or not to reorder the categories inplace or return a copy of this categorical with reordered categories. @@ -1010,7 +1010,7 @@ def add_categories(self, new_categories, inplace=False): ---------- new_categories : category or list-like of category The new categories to be included. - inplace : boolean (default: False) + inplace : bool, default False Whether or not to add the categories inplace or return a copy of this categorical with added categories. @@ -1060,7 +1060,7 @@ def remove_categories(self, removals, inplace=False): ---------- removals : category or list of categories The categories which should be removed. - inplace : boolean (default: False) + inplace : bool, default False Whether or not to remove the categories inplace or return a copy of this categorical with removed categories. @@ -1108,7 +1108,7 @@ def remove_unused_categories(self, inplace=False): Parameters ---------- - inplace : boolean (default: False) + inplace : bool, default False Whether or not to drop unused categories inplace or return a copy of this categorical with unused categories dropped. @@ -1460,7 +1460,7 @@ def value_counts(self, dropna=True): Parameters ---------- - dropna : boolean, default True + dropna : bool, default True Don't include counts of NaN. Returns @@ -1581,9 +1581,9 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): Parameters ---------- - inplace : boolean, default False + inplace : bool, default False Do operation in place. - ascending : boolean, default True + ascending : bool, default True Order ascending. Passing False orders descending. The ordering parameter provides the method by which the category values are organized. 
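(The ``ordered``/``inplace`` parameter cleanups above all describe the same copy-versus-in-place contract. A minimal illustrative sketch of that behaviour, not part of the patch:)
>>> c = pd.Categorical(['a', 'b', 'a'])
>>> c.ordered
False
>>> c.as_ordered().ordered  # inplace=False by default, so a copy is returned
True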
@@ -2172,7 +2172,7 @@ def _reverse_indexer(self): return result # reduction ops # - def _reduce(self, name, axis=0, skipna=True, **kwargs): + def _reduce(self, name, axis=0, **kwargs): func = getattr(self, name, None) if func is None: msg = 'Categorical cannot perform the operation {op}' @@ -2239,7 +2239,7 @@ def mode(self, dropna=True): Parameters ---------- - dropna : boolean, default True + dropna : bool, default True Don't consider counts of NaN/NaT. .. versionadded:: 0.24.0 @@ -2332,7 +2332,7 @@ def equals(self, other): Returns ------- - are_equal : boolean + bool """ if self.is_dtype_equal(other): if self.categories.equals(other.categories): @@ -2356,7 +2356,7 @@ def is_dtype_equal(self, other): Returns ------- - are_equal : boolean + bool """ try: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 1b2a4da389dc4..cd8e8ed520ddc 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -799,14 +799,14 @@ def tz_convert(self, tz): Parameters ---------- - tz : string, pytz.timezone, dateutil.tz.tzfile or None + tz : str, pytz.timezone, dateutil.tz.tzfile or None Time zone for time. Corresponding timestamps would be converted to this time zone of the Datetime Array/Index. A `tz` of None will convert to UTC and remove the timezone information. Returns ------- - normalized : same type as self + Array or Index Raises ------ @@ -842,7 +842,7 @@ def tz_convert(self, tz): With the ``tz=None``, we can remove the timezone (after converting to UTC if necessary): - >>> dti = pd.date_range(start='2014-08-01 09:00',freq='H', + >>> dti = pd.date_range(start='2014-08-01 09:00', freq='H', ... periods=3, tz='Europe/Berlin') >>> dti @@ -882,7 +882,7 @@ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', Parameters ---------- - tz : string, pytz.timezone, dateutil.tz.tzfile or None + tz : str, pytz.timezone, dateutil.tz.tzfile or None Time zone to convert timestamps to. Passing ``None`` will remove the time zone information preserving local time. ambiguous : 'infer', 'NaT', bool array, default 'raise' @@ -930,7 +930,7 @@ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', Returns ------- - result : same type as self + Same type as self Array/Index converted to the specified time zone. Raises @@ -970,43 +970,39 @@ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', Be careful with DST changes. When there is sequential data, pandas can infer the DST time: - >>> s = pd.to_datetime(pd.Series([ - ... '2018-10-28 01:30:00', - ... '2018-10-28 02:00:00', - ... '2018-10-28 02:30:00', - ... '2018-10-28 02:00:00', - ... '2018-10-28 02:30:00', - ... '2018-10-28 03:00:00', - ... '2018-10-28 03:30:00'])) + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 03:00:00', + ... '2018-10-28 03:30:00'])) >>> s.dt.tz_localize('CET', ambiguous='infer') - 2018-10-28 01:30:00+02:00 0 - 2018-10-28 02:00:00+02:00 1 - 2018-10-28 02:30:00+02:00 2 - 2018-10-28 02:00:00+01:00 3 - 2018-10-28 02:30:00+01:00 4 - 2018-10-28 03:00:00+01:00 5 - 2018-10-28 03:30:00+01:00 6 - dtype: int64 + 0 2018-10-28 01:30:00+02:00 + 1 2018-10-28 02:00:00+02:00 + 2 2018-10-28 02:30:00+02:00 + 3 2018-10-28 02:00:00+01:00 + 4 2018-10-28 02:30:00+01:00 + 5 2018-10-28 03:00:00+01:00 + 6 2018-10-28 03:30:00+01:00 + dtype: datetime64[ns, CET] In some cases, inferring the DST is impossible. 
In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly - >>> s = pd.to_datetime(pd.Series([ - ... '2018-10-28 01:20:00', - ... '2018-10-28 02:36:00', - ... '2018-10-28 03:46:00'])) + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:20:00', + ... '2018-10-28 02:36:00', + ... '2018-10-28 03:46:00'])) >>> s.dt.tz_localize('CET', ambiguous=np.array([True, True, False])) 0 2018-10-28 01:20:00+02:00 1 2018-10-28 02:36:00+02:00 2 2018-10-28 03:46:00+01:00 dtype: datetime64[ns, CET] If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. - >>> s = pd.to_datetime(pd.Series([ - ... '2015-03-29 02:30:00', - ... '2015-03-29 03:30:00'])) + >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', + ... '2015-03-29 03:30:00'])) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') 0 2015-03-29 03:00:00+02:00 1 2015-03-29 03:30:00+02:00 @@ -1129,7 +1125,7 @@ def to_period(self, freq=None): Parameters ---------- - freq : string or Offset, optional + freq : str or Offset, optional One of pandas' :ref:`offset strings ` or an Offset object. Will be inferred by default. @@ -1150,7 +1146,7 @@ def to_period(self, freq=None): Examples -------- - >>> df = pd.DataFrame({"y": [1,2,3]}, + >>> df = pd.DataFrame({"y": [1, 2, 3]}, ... index=pd.to_datetime(["2000-03-31 00:00:00", ... "2000-05-31 00:00:00", ... "2000-08-31 00:00:00"])) diff --git a/pandas/core/base.py b/pandas/core/base.py index 5a98e83c65884..7fdc64a8d9f85 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -794,7 +794,7 @@ def array(self): Returns ------- - array : ExtensionArray + ExtensionArray An ExtensionArray of the values stored within. For extension types, this is the actual array. For NumPy native types, this is a thin (no copy) wrapper around :class:`numpy.ndarray`. @@ -1022,7 +1022,7 @@ def max(self, axis=None, skipna=True): def argmax(self, axis=None, skipna=True): """ - Return a ndarray of the maximum argument indexer. + Return an ndarray of the maximum argument indexer. Parameters ---------- @@ -1087,6 +1087,10 @@ def argmin(self, axis=None, skipna=True): Dummy argument for consistency with Series skipna : bool, default True + Returns + ------- + numpy.ndarray + See Also -------- numpy.ndarray.argmin @@ -1102,6 +1106,10 @@ def tolist(self): (for str, int, float) or a pandas scalar (for Timestamp/Timedelta/Interval/Period) + Returns + ------- + list + See Also -------- numpy.ndarray.tolist @@ -1162,7 +1170,7 @@ def _map_values(self, mapper, na_action=None): Returns ------- - applied : Union[Index, MultiIndex], inferred + Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned.
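(A short illustrative sketch of the accessors whose docstrings are touched above, namely ``tolist``, ``is_monotonic`` and ``array``; the example values are assumed and are not part of the patch:)
>>> s = pd.Series([1, 2, 3])
>>> s.tolist()
[1, 2, 3]
>>> s.is_monotonic
True
>>> s.array
<PandasArray>
[1, 2, 3]
Length: 3, dtype: int64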
@@ -1246,7 +1254,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, Returns ------- - counts : Series + Series See Also -------- @@ -1363,7 +1371,7 @@ def is_unique(self): Returns ------- - is_unique : boolean + bool """ return self.nunique(dropna=False) == len(self) @@ -1377,7 +1385,7 @@ def is_monotonic(self): Returns ------- - is_monotonic : boolean + bool """ from pandas import Index return Index(self).is_monotonic @@ -1394,7 +1402,7 @@ def is_monotonic_decreasing(self): Returns ------- - is_monotonic_decreasing : boolean + bool """ from pandas import Index return Index(self).is_monotonic_decreasing diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index f187d786d9f61..640d43f3b0e03 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -8,7 +8,8 @@ from pandas._libs.interval import Interval from pandas._libs.tslibs import NaT, Period, Timestamp, timezones -from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCIndexClass +from pandas.core.dtypes.generic import ( + ABCCategoricalIndex, ABCDateOffset, ABCIndexClass) from pandas import compat @@ -758,8 +759,7 @@ def __new__(cls, freq=None): # empty constructor for pickle compat return object.__new__(cls) - from pandas.tseries.offsets import DateOffset - if not isinstance(freq, DateOffset): + if not isinstance(freq, ABCDateOffset): freq = cls._parse_dtype_strict(freq) try: @@ -790,12 +790,10 @@ def construct_from_string(cls, string): Strict construction from a string, raise a TypeError if not possible """ - from pandas.tseries.offsets import DateOffset - if (isinstance(string, compat.string_types) and (string.startswith('period[') or string.startswith('Period[')) or - isinstance(string, DateOffset)): + isinstance(string, ABCDateOffset)): # do not parse string like U as period[U] # avoid tuple to be regarded as freq try: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 3c6d3f212342b..697c58a365233 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -221,8 +221,8 @@ def _isna_ndarraylike(obj): # box if isinstance(obj, ABCSeries): - from pandas import Series - result = Series(result, index=obj.index, name=obj.name, copy=False) + result = obj._constructor( + result, index=obj.index, name=obj.name, copy=False) return result @@ -250,8 +250,8 @@ def _isna_ndarraylike_old(obj): # box if isinstance(obj, ABCSeries): - from pandas import Series - result = Series(result, index=obj.index, name=obj.name, copy=False) + result = obj._constructor( + result, index=obj.index, name=obj.name, copy=False) return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d125b561ab988..d5f5174b739d7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1065,7 +1065,7 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None): Returns ------- - pandas.DataFrame + DataFrame See Also -------- @@ -3542,7 +3542,7 @@ def _sanitize_column(self, key, value, broadcast=True): Returns ------- - numpy-array + numpy.ndarray """ def reindexer(value): @@ -3811,7 +3817,7 @@ def drop(self, labels=None, axis=0, index=None, columns=None, Returns ------- - pandas.DataFrame + DataFrame Raises ------ @@ -4981,7 +4993,7 @@ def swaplevel(self, i=-2, j=-1, axis=0): Returns ------- - same type as
caller (new object) + DataFrame .. versionchanged:: 0.18.1 @@ -5761,9 +5773,9 @@ def stack(self, level=-1, dropna=True): Notes ----- The function is named by analogy with a collection of books - being re-organised from being side by side on a horizontal + being reorganized from being side by side on a horizontal position (the columns of the dataframe) to being stacked - vertically on top of of each other (in the index of the + vertically on top of each other (in the index of the dataframe). Examples @@ -5907,7 +5919,7 @@ def unstack(self, level=-1, fill_value=None): Returns ------- - DataFrame or Series + Series or DataFrame See Also -------- @@ -6345,7 +6363,9 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, Returns ------- Series or DataFrame + Result of applying ``func`` along the given axis of the + DataFrame. See Also -------- @@ -6364,7 +6390,7 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, Examples -------- - >>> df = pd.DataFrame([[4, 9],] * 3, columns=['A', 'B']) + >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) >>> df A B 0 4 9 @@ -6956,12 +6982,13 @@ def corr(self, method='pearson', min_periods=1): min_periods : int, optional Minimum number of observations required per pair of columns - to have a valid result. Currently only available for pearson - and spearman correlation + to have a valid result. Currently only available for Pearson + and Spearman correlation. Returns ------- DataFrame + Correlation matrix. See Also -------- @@ -6970,14 +7003,15 @@ Examples -------- - >>> histogram_intersection = lambda a, b: np.minimum(a, b - ... ).sum().round(decimals=1) + >>> def histogram_intersection(a, b): + ... v = np.minimum(a, b).sum().round(decimals=1) + ... return v >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], ... columns=['dogs', 'cats']) >>> df.corr(method=histogram_intersection) - dogs cats - dogs 1.0 0.3 - cats 0.3 1.0 + dogs cats + dogs 1.0 0.3 + cats 0.3 1.0 """ numeric_df = self._get_numeric_data() cols = numeric_df.columns @@ -7140,10 +7174,11 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'): Parameters ---------- other : DataFrame, Series + Object with which to compute correlations. axis : {0 or 'index', 1 or 'columns'}, default 0 - 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise - drop : boolean, default False - Drop missing indices from result + 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise. + drop : bool, default False + Drop missing indices from result. method : {'pearson', 'kendall', 'spearman'} or callable * pearson : standard correlation coefficient * kendall : Kendall Tau correlation coefficient * spearman : Spearman rank correlation @@ -7155,7 +7190,8 @@ Returns ------- Series + Pairwise correlations.
See Also -------- DataFrame.corr @@ -7236,7 +7278,7 @@ def count(self, axis=0, level=None, numeric_only=False): If the axis is a `MultiIndex` (hierarchical), count along a particular `level`, collapsing into a `DataFrame`. A `str` specifies the level name. - numeric_only : boolean, default False + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. Returns @@ -7523,7 +7565,8 @@ def idxmin(self, axis=0, skipna=True): Returns ------- Series + Indexes of minima along the specified axis. Raises ------ @@ -7559,7 +7608,8 @@ def idxmax(self, axis=0, skipna=True): Returns ------- Series + Indexes of maxima along the specified axis. Raises ------ @@ -7776,15 +7832,15 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True): Parameters ---------- - freq : string, default frequency of PeriodIndex - Desired frequency + freq : str, default frequency of PeriodIndex + Desired frequency. how : {'s', 'e', 'start', 'end'} Convention for converting period to timestamp; start of period - vs. end + vs. end. axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to convert (the index by default) - copy : boolean, default True - If false then underlying input data is not copied + The axis to convert (the index by default). + copy : bool, default True + If False then underlying input data is not copied. Returns ------- @@ -7812,11 +7868,12 @@ def to_period(self, freq=None, axis=0, copy=True): Parameters ---------- - freq : string, default + freq : str, default + Frequency of the PeriodIndex. axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to convert (the index by default) - copy : boolean, default True - If False then underlying input data is not copied + The axis to convert (the index by default). + copy : bool, default True + If False then underlying input data is not copied. Returns ------- @@ -7893,7 +7950,7 @@ def isin(self, values): match. Note that 'falcon' does not match based on the number of legs in df2. - >>> other = pd.DataFrame({'num_legs': [8, 2],'num_wings': [0, 2]}, + >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]}, ... index=['spider', 'falcon']) >>> df.isin(other) num_legs num_wings diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ac7ce7726ed4e..bf23fbdc71f9d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -774,18 +774,18 @@ def pop(self, item): Parameters ---------- item : str - Column label to be popped + Label of column to be popped. Returns ------- - popped : Series + Series Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan)], + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], ... columns=('name', 'class', 'max_speed')) >>> df name class max_speed @@ -937,7 +937,7 @@ def swaplevel(self, i=-2, j=-1, axis=0): Parameters ---------- - i, j : int, string (can be mixed) + i, j : int, str (can be mixed) Level of index to be swapped. Can pass level name as string. Returns @@ -973,9 +973,9 @@ def rename(self, *args, **kwargs): and raise on DataFrame or Panel.
dict-like or functions are transformations to apply to that axis' values - copy : boolean, default True - Also copy underlying data - inplace : boolean, default False + copy : bool, default True + Also copy underlying data. + inplace : bool, default False Whether to return a new %(klass)s. If True then value of copy is ignored. level : int or level name, default None @@ -2947,7 +2947,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, will treat them as non-numeric. quotechar : str, default '\"' String of length 1. Character used to quote fields. - line_terminator : string, optional + line_terminator : str, optional The newline character or character sequence to use in the output file. Defaults to `os.linesep`, which depends on the OS in which this method is called ('\n' for linux, '\r\n' for Windows, i.e.). @@ -5963,17 +5963,18 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, value : scalar, dict, Series, or DataFrame Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame of values specifying which value to use for - each index (for a Series) or column (for a DataFrame). (values not - in the dict/Series/DataFrame will not be filled). This value cannot + each index (for a Series) or column (for a DataFrame). Values not + in the dict/Series/DataFrame will not be filled. This value cannot be a list. method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap + backfill / bfill: use next valid observation to fill gap. axis : %(axes_single_arg)s - inplace : boolean, default False - If True, fill in place. Note: this will modify any - other views on this object, (e.g. a no-copy slice for a column in a + Axis along which to fill missing values. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a DataFrame). limit : int, default None If method is specified, this is the maximum number of consecutive @@ -5983,18 +5984,20 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. downcast : dict, default is None - a dict of item->dtype of what to downcast if possible, + A dict of item->dtype of what to downcast if possible, or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible) + equal type (e.g. float64 to int64 if possible). Returns ------- - filled : %(klass)s + %(klass)s + Object with missing values filled. See Also -------- interpolate : Fill NaN values using interpolation. - reindex, asfreq + reindex : Conform object to new index. + asfreq : Convert TimeSeries to specified frequency. Examples -------- @@ -6002,7 +6005,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, ... [3, 4, np.nan, 1], ... [np.nan, np.nan, np.nan, 5], ... [np.nan, 3, np.nan, 4]], - ... columns=list('ABCD')) + ... columns=list('ABCD')) >>> df A B C D 0 NaN 2.0 NaN 0 @@ -6756,7 +6759,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, Note how the first entry in column 'b' remains ``NaN``, because there is no entry befofe it to use for interpolation. 
- >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0), + >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0), ... (np.nan, 2.0, np.nan, np.nan), ... (2.0, 3.0, np.nan, 9.0), ... (np.nan, 4.0, -4.0, 16.0)], @@ -6888,7 +6891,7 @@ def asof(self, where, subset=None): or when `self` is a DataFrame and `where` is a scalar * DataFrame : when `self` is a DataFrame and `where` is an array-like - Return scala, Sereis, or DataFrame. + Return scalar, Series, or DataFrame. See Also -------- @@ -7228,9 +7231,9 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, upper : float or array_like, default None Maximum threshold value. All values above this threshold will be set to it. - axis : int or string axis name, optional + axis : int or str axis name, optional Align object with lower and upper along the given axis. - inplace : boolean, default False + inplace : bool, default False Whether to perform the operation in place on the data. .. versionadded:: 0.21.0 @@ -7352,7 +7355,7 @@ def clip_upper(self, threshold, axis=None, inplace=False): axis : {0 or 'index', 1 or 'columns'}, default 0 Align object with `threshold` along the given axis. - inplace : boolean, default False + inplace : bool, default False Whether to perform the operation in place on the data. .. versionadded:: 0.21.0 @@ -7433,7 +7436,7 @@ def clip_lower(self, threshold, axis=None, inplace=False): axis : {0 or 'index', 1 or 'columns'}, default 0 Align `self` with `threshold` along the given axis. - inplace : boolean, default False + inplace : bool, default False Whether to perform the operation in place on the data. .. versionadded:: 0.21.0 @@ -7590,9 +7593,9 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, Examples -------- - >>> df = pd.DataFrame({'Animal' : ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed' : [380., 370., 24., 26.]}) + >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', + ... 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.]}) >>> df Animal Max Speed 0 Falcon 380.0 @@ -7611,16 +7614,16 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, using the `level` parameter: >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Capitve', 'Wild', 'Capitve', 'Wild']] + ... ['Captive', 'Wild', 'Captive', 'Wild']] >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = pd.DataFrame({'Max Speed' : [390., 350., 30., 20.]}, - ... index=index) + >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, + ... index=index) >>> df Max Speed Animal Type - Falcon Capitve 390.0 + Falcon Captive 390.0 Wild 350.0 - Parrot Capitve 30.0 + Parrot Captive 30.0 Wild 20.0 >>> df.groupby(level=0).mean() Max Speed @@ -7630,7 +7633,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, >>> df.groupby(level=1).mean() Max Speed Type - Capitve 210.0 + Captive 210.0 Wild 185.0 """ from pandas.core.groupby.groupby import groupby @@ -7747,14 +7750,14 @@ def at_time(self, time, asof=False, axis=None): Parameters ---------- - time : datetime.time or string + time : datetime.time or str axis : {0 or 'index', 1 or 'columns'}, default 0 ..
versionadded:: 0.24.0 Returns ------- - values_at_time : same type as caller + Series or DataFrame Raises ------ @@ -7772,7 +7775,7 @@ def at_time(self, time, asof=False, axis=None): Examples -------- >>> i = pd.date_range('2018-04-09', periods=4, freq='12H') - >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i) + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 00:00:00 1 @@ -7807,17 +7810,17 @@ def between_time(self, start_time, end_time, include_start=True, Parameters ---------- - start_time : datetime.time or string - end_time : datetime.time or string - include_start : boolean, default True - include_end : boolean, default True + start_time : datetime.time or str + end_time : datetime.time or str + include_start : bool, default True + include_end : bool, default True axis : {0 or 'index', 1 or 'columns'}, default 0 .. versionadded:: 0.24.0 Returns ------- - values_between_time : same type as caller + Series or DataFrame Raises ------ @@ -7835,7 +7838,7 @@ def between_time(self, start_time, end_time, include_start=True, Examples -------- >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min') - >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i) + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 00:00:00 1 @@ -10289,7 +10292,7 @@ def _doc_parms(cls): Parameters ---------- axis : %(axis_descr)s -skipna : boolean, default True +skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA level : int or level name, default None @@ -10298,7 +10301,7 @@ def _doc_parms(cls): ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. -numeric_only : boolean, default None +numeric_only : bool, default None Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series. 
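(For quick reference, the time-based row selection described by the ``at_time``/``between_time`` docstrings above behaves like this illustrative sketch, reusing the docstring's own example data:)
>>> i = pd.date_range('2018-04-09', periods=4, freq='12H')
>>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
>>> ts.at_time('12:00')
                     A
2018-04-09 12:00:00  2
2018-04-10 12:00:00  4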
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 27e13e86a6e9e..52056a6842ed9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -964,7 +964,7 @@ def _transform_fast(self, func, func_nm): ids, _, ngroup = self.grouper.group_info cast = self._transform_should_cast(func_nm) - out = algorithms.take_1d(func().values, ids) + out = algorithms.take_1d(func()._values, ids) if cast: out = self._try_cast(out, self.obj) return Series(out, index=self.obj.index, name=self.obj.name) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index df91c71cfe238..1037e2d9a3bd6 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1010,7 +1010,7 @@ def get_loc(self, key, method=None, tolerance=None): except (KeyError, ValueError, TypeError): try: return self._get_string_slice(key) - except (TypeError, KeyError, ValueError): + except (TypeError, KeyError, ValueError, OverflowError): pass try: @@ -1302,20 +1302,19 @@ def indexer_at_time(self, time, asof=False): -------- indexer_between_time, DataFrame.at_time """ - from dateutil.parser import parse - if asof: raise NotImplementedError("'asof' argument is not supported") if isinstance(time, compat.string_types): + from dateutil.parser import parse time = parse(time).time() if time.tzinfo: - # TODO - raise NotImplementedError("argument 'time' with timezone info is " - "not supported") - - time_micros = self._get_time_micros() + if self.tz is None: + raise ValueError("Index must be timezone aware.") + time_micros = self.tz_convert(time.tzinfo)._get_time_micros() + else: + time_micros = self._get_time_micros() micros = _time_to_micros(time) return (micros == time_micros).nonzero()[0] diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 53671e00e88b4..a6c945ac2e464 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -38,15 +38,15 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, If a dict is passed, the sorted keys will be used as the `keys` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless - they are all None in which case a ValueError will be raised + they are all None in which case a ValueError will be raised. axis : {0/'index', 1/'columns'}, default 0 - The axis to concatenate along + The axis to concatenate along. join : {'inner', 'outer'}, default 'outer' - How to handle indexes on other axis(es) + How to handle indexes on other axis (or axes). join_axes : list of Index objects Specific indexes to use for the other n - 1 axes instead of performing - inner/outer set logic - ignore_index : boolean, default False + inner/outer set logic. + ignore_index : bool, default False If True, do not use the index values along the concatenation axis. The resulting axis will be labeled 0, ..., n - 1. This is useful if you are concatenating objects where the concatenation axis does not have @@ -54,16 +54,16 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, axes are still respected in the join. keys : sequence, default None If multiple levels passed, should contain tuples. Construct - hierarchical index using the passed keys as the outermost level + hierarchical index using the passed keys as the outermost level. levels : list of sequences, default None Specific levels (unique values) to use for constructing a - MultiIndex. 
Otherwise they will be inferred from the keys + MultiIndex. Otherwise they will be inferred from the keys. names : list, default None - Names for the levels in the resulting hierarchical index - verify_integrity : boolean, default False + Names for the levels in the resulting hierarchical index. + verify_integrity : bool, default False Check whether the new concatenated axis contains duplicates. This can - be very expensive relative to the actual data concatenation - sort : boolean, default None + be very expensive relative to the actual data concatenation. + sort : bool, default None Sort non-concatenation axis if it is not already aligned when `join` is 'outer'. The current default of sorting is deprecated and will change to not-sorting in a future version of pandas. @@ -76,12 +76,12 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, .. versionadded:: 0.23.0 - copy : boolean, default True - If False, do not copy data unnecessarily + copy : bool, default True + If False, do not copy data unnecessarily. Returns ------- - concatenated : object, type of objs + object, type of objs When concatenating all ``Series`` along the index (axis=0), a ``Series`` is returned. When ``objs`` contains at least one ``DataFrame``, a ``DataFrame`` is returned. When concatenating along @@ -89,10 +89,10 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, See Also -------- - Series.append - DataFrame.append - DataFrame.join - DataFrame.merge + Series.append : Concatenate Series. + DataFrame.append : Concatenate DataFrames. + DataFrame.join : Join DataFrames using indexes. + DataFrame.merge : Merge DataFrames by indexes or columns. Notes ----- @@ -128,7 +128,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, Add a hierarchical index at the outermost level of the data with the ``keys`` option. - >>> pd.concat([s1, s2], keys=['s1', 's2',]) + >>> pd.concat([s1, s2], keys=['s1', 's2']) s1 0 a 1 b s2 0 c diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ad3327e694b67..fb50a3c60f705 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -909,7 +909,7 @@ def _get_merge_keys(self): in zip(self.right.index.levels, self.right.index.codes)] else: - right_keys = [self.right.index.values] + right_keys = [self.right.index._values] elif _any(self.right_on): for k in self.right_on: if is_rkey(k): diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 54f11646fc753..8d7616c4b6b61 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -392,36 +392,36 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False): """ - Compute a simple cross-tabulation of two (or more) factors. By default + Compute a simple cross tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an - aggregation function are passed + aggregation function are passed. Parameters ---------- index : array-like, Series, or list of arrays/Series - Values to group by in the rows + Values to group by in the rows. columns : array-like, Series, or list of arrays/Series - Values to group by in the columns + Values to group by in the columns. values : array-like, optional Array of values to aggregate according to the factors. Requires `aggfunc` be specified. 
rownames : sequence, default None - If passed, must match number of row arrays passed + If passed, must match number of row arrays passed. colnames : sequence, default None - If passed, must match number of column arrays passed + If passed, must match number of column arrays passed. aggfunc : function, optional - If specified, requires `values` be specified as well - margins : boolean, default False - Add row/column margins (subtotals) - margins_name : string, default 'All' - Name of the row / column that will contain the totals + If specified, requires `values` be specified as well. + margins : bool, default False + Add row/column margins (subtotals). + margins_name : str, default 'All' + Name of the row/column that will contain the totals when margins is True. .. versionadded:: 0.21.0 - dropna : boolean, default True - Do not include columns whose entries are all NaN - normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False + dropna : bool, default True + Do not include columns whose entries are all NaN. + normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False Normalize by dividing all values by the sum of values. - If passed 'all' or `True`, will normalize over all values. @@ -433,7 +433,13 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, Returns ------- - crosstab : DataFrame + DataFrame + Cross tabulation of the data. + + See Also + -------- + DataFrame.pivot : Reshape data based on column values. + pivot_table : Create a pivot table as a DataFrame. Notes ----- @@ -455,32 +461,26 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, ... "one", "two", "two", "two", "one"], dtype=object) >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", ... "shiny", "dull", "shiny", "shiny", "shiny"], - ... dtype=object) - + ... dtype=object) >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) - ... # doctest: +NORMALIZE_WHITESPACE b one two c dull shiny dull shiny a bar 1 2 1 0 foo 2 2 1 2 + Here 'c' and 'f' are not represented in the data and will not be + shown in the output because dropna is True by default. Set + dropna=False to preserve categories with no data. + >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) - >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, - # and will not be shown in the output because - # dropna is True by default. Set 'dropna=False' - # to preserve categories with no data - ... # doctest: +SKIP + >>> pd.crosstab(foo, bar) col_0 d e row_0 a 1 0 b 0 1 - - >>> crosstab(foo, bar, dropna=False) # 'c' and 'f' are not represented - # in the data, but they still will be counted - # and shown in the output - ... # doctest: +SKIP + >>> pd.crosstab(foo, bar, dropna=False) col_0 d e f row_0 a 1 0 0 diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f436b3b92a359..6ba33301753d6 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -701,19 +701,20 @@ def _convert_level_number(level_num, columns): def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None): """ - Convert categorical variable into dummy/indicator variables + Convert categorical variable into dummy/indicator variables. Parameters ---------- data : array-like, Series, or DataFrame - prefix : string, list of strings, or dict of strings, default None + Data of which to get dummy indicators. 
+ prefix : str, list of str, or dict of str, default None String to append DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. - prefix_sep : string, default '_' + prefix_sep : str, default '_' If appending prefix, separator/delimiter to use. Or pass a - list or dictionary as with `prefix.` + list or dictionary as with `prefix`. dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None @@ -736,11 +737,12 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, Returns ------- - dummies : DataFrame + DataFrame + Dummy-coded data. See Also -------- - Series.str.get_dummies + Series.str.get_dummies : Convert Series to dummy codes. Examples -------- diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 2a654fec36a9f..f99fd9004bb31 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -163,7 +163,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Use `drop` optional when bins is not unique >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, - ... right=False, duplicates='drop') + ... right=False, duplicates='drop') ... # doctest: +ELLIPSIS (a 0.0 b 1.0 diff --git a/pandas/core/series.py b/pandas/core/series.py index 6dd7582a8d570..68c7aa0744e7f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1648,10 +1648,19 @@ def unique(self): Returns ------- ndarray or ExtensionArray - The unique values returned as a NumPy array. In case of an - extension-array backed Series, a new - :class:`~api.extensions.ExtensionArray` of that type with just - the unique values is returned. This includes + The unique values returned as a NumPy array. See Notes. + + See Also + -------- + unique : Top-level unique method for any 1-d array-like object. + Index.unique : Return Index with unique values from an Index object. + + Notes + ----- + Returns the unique values as a NumPy array. In case of an + extension-array backed Series, a new + :class:`~api.extensions.ExtensionArray` of that type with just + the unique values is returned. This includes * Categorical * Period @@ -1660,11 +1669,6 @@ def unique(self): * Sparse * IntegerNA . - See Also - -------- - unique : Top-level unique method for any 1-d array-like object. - Index.unique : Return Index with unique values from an Index object. 
- Examples -------- >>> pd.Series([2, 1, 3, 3], name='A').unique() @@ -3674,8 +3678,12 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, if axis is not None: self._get_axis_number(axis) - # dispatch to ExtensionArray interface - if isinstance(delegate, ExtensionArray): + if isinstance(delegate, Categorical): + # TODO deprecate numeric_only argument for Categorical and use + # skipna as well, see GH25303 + return delegate._reduce(name, numeric_only=numeric_only, **kwds) + elif isinstance(delegate, ExtensionArray): + # dispatch to ExtensionArray interface return delegate._reduce(name, skipna=skipna, **kwds) elif is_datetime64_dtype(delegate): # use DatetimeIndex implementation to handle skipna correctly diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 183a91c952140..cc7a4db515c42 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -120,7 +120,7 @@ def str_count(arr, pat, flags=0): Returns ------- - counts : Series or Index + Series or Index Same type as the calling object containing the integer counts. See Also @@ -283,7 +283,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): return `True`. However, '.0' as a regex matches any character followed by a 0. - >>> s2 = pd.Series(['40','40.0','41','41.0','35']) + >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) >>> s2.str.contains('.0', regex=True) 0 True 1 True @@ -433,13 +433,13 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): Parameters ---------- - pat : string or compiled regex + pat : str or compiled regex String can be a character sequence or regular expression. .. versionadded:: 0.20.0 `pat` also accepts a compiled regex. - repl : string or callable + repl : str or callable Replacement string or a callable. The callable is passed the regex match object and must return a replacement string to be used. See :func:`re.sub`. @@ -448,15 +448,15 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): `repl` also accepts a callable. n : int, default -1 (all) - Number of replacements to make from start - case : boolean, default None + Number of replacements to make from start. + case : bool, default None - If True, case sensitive (the default if `pat` is a string) - Set to False for case insensitive - Cannot be set if `pat` is a compiled regex flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE - Cannot be set if `pat` is a compiled regex - regex : boolean, default True + regex : bool, default True - If True, assumes the passed-in pattern is a regular expression. 
- If False, treats the pattern as a literal string - Cannot be set to False if `pat` is a compiled regex or `repl` is @@ -537,6 +537,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): Using a compiled regex with flags + >>> import re >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar') 0 foo @@ -604,6 +605,7 @@ def str_repeat(arr, repeats): 0 a 1 b 2 c + dtype: object Single int repeats string in Series @@ -611,6 +613,7 @@ def str_repeat(arr, repeats): 0 aa 1 bb 2 cc + dtype: object Sequence of int repeats corresponding string in Series @@ -618,6 +621,7 @@ def str_repeat(arr, repeats): 0 a 1 bb 2 ccc + dtype: object """ if is_scalar(repeats): def rep(x): @@ -646,13 +650,14 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): Parameters ---------- - pat : string - Character sequence or regular expression - case : boolean, default True - If True, case sensitive + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE - na : default NaN, fill value for missing values + re module flags, e.g. re.IGNORECASE. + na : default NaN + Fill value for missing values. Returns ------- @@ -768,7 +773,7 @@ def str_extract(arr, pat, flags=0, expand=True): Parameters ---------- - pat : string + pat : str Regular expression pattern with capturing groups. flags : int, default 0 (no flags) Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that @@ -966,21 +971,23 @@ def str_extractall(arr, pat, flags=0): def str_get_dummies(arr, sep='|'): """ - Split each string in the Series by sep and return a frame of - dummy/indicator variables. + Split each string in the Series by sep and return a DataFrame + of dummy/indicator variables. Parameters ---------- - sep : string, default "|" + sep : str, default "|" String to split on. Returns ------- - dummies : DataFrame + DataFrame + Dummy variables corresponding to values of the Series. See Also -------- - get_dummies + get_dummies : Convert categorical variable into dummy/indicator + variables. Examples -------- @@ -1089,11 +1096,11 @@ def str_findall(arr, pat, flags=0): Parameters ---------- - pat : string + pat : str Pattern or regular expression. flags : int, default 0 - ``re`` module flags, e.g. `re.IGNORECASE` (default is 0, which means - no flags). + Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which + means no flags). Returns ------- @@ -1182,17 +1189,18 @@ def str_find(arr, sub, start=0, end=None, side='left'): Parameters ---------- sub : str - Substring being searched + Substring being searched. start : int - Left edge index + Left edge index. end : int - Right edge index + Right edge index. side : {'left', 'right'}, default 'left' - Specifies a starting side, equivalent to ``find`` or ``rfind`` + Specifies a starting side, equivalent to ``find`` or ``rfind``. Returns ------- - found : Series/Index of integer values + Series or Index + Indexes where substring is found. """ if not isinstance(sub, compat.string_types): @@ -1430,7 +1438,7 @@ def str_slice_replace(arr, start=None, stop=None, repl=None): Returns ------- - replaced : Series or Index + Series or Index Same type as the original object. 
See Also @@ -1513,7 +1521,7 @@ def str_strip(arr, to_strip=None, side='both'): Returns ------- - stripped : Series/Index of objects + Series or Index """ if side == 'both': f = lambda x: x.strip(to_strip) @@ -1537,30 +1545,30 @@ def str_wrap(arr, width, **kwargs): Parameters ---------- width : int - Maximum line-width + Maximum line width. expand_tabs : bool, optional - If true, tab characters will be expanded to spaces (default: True) + If True, tab characters will be expanded to spaces (default: True). replace_whitespace : bool, optional - If true, each whitespace character (as defined by string.whitespace) + If True, each whitespace character (as defined by string.whitespace) remaining after tab expansion will be replaced by a single space - (default: True) + (default: True). drop_whitespace : bool, optional - If true, whitespace that, after wrapping, happens to end up at the - beginning or end of a line is dropped (default: True) + If True, whitespace that, after wrapping, happens to end up at the + beginning or end of a line is dropped (default: True). break_long_words : bool, optional - If true, then words longer than width will be broken in order to ensure + If True, then words longer than width will be broken in order to ensure that no lines are longer than width. If it is false, long words will - not be broken, and some lines may be longer than width. (default: True) + not be broken, and some lines may be longer than width (default: True). break_on_hyphens : bool, optional - If true, wrapping will occur preferably on whitespace and right after + If True, wrapping will occur preferably on whitespace and right after hyphens in compound words, as it is customary in English. If false, only whitespaces will be considered as potentially good places for line breaks, but you need to set break_long_words to false if you want truly - insecable words. (default: True) + insecable words (default: True). Returns ------- - wrapped : Series/Index of objects + Series or Index Notes ----- @@ -1581,6 +1589,7 @@ def str_wrap(arr, width, **kwargs): >>> s.str.wrap(12) 0 line to be\nwrapped 1 another line\nto be\nwrapped + dtype: object """ kwargs['width'] = width @@ -1613,7 +1622,7 @@ def str_translate(arr, table, deletechars=None): Returns ------- - translated : Series/Index of objects + Series or Index """ if deletechars is None: f = lambda x: x.translate(table) @@ -1641,15 +1650,16 @@ def str_get(arr, i): Returns ------- - items : Series/Index of objects + Series or Index Examples -------- >>> s = pd.Series(["String", - (1, 2, 3), - ["a", "b", "c"], - 123, -456, - {1:"Hello", "2":"World"}]) + ... (1, 2, 3), + ... ["a", "b", "c"], + ... 123, + ... -456, + ... {1: "Hello", "2": "World"}]) >>> s 0 String 1 (1, 2, 3) @@ -1674,7 +1684,7 @@ def str_get(arr, i): 2 c 3 NaN 4 NaN - 5 NaN + 5 None dtype: object """ def f(x): @@ -1699,7 +1709,7 @@ def str_decode(arr, encoding, errors="strict"): Returns ------- - decoded : Series/Index of objects + Series or Index """ if encoding in _cpython_optimized_decoders: # CPython optimized implementation @@ -2091,7 +2101,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Returns ------- - concat : str or Series/Index of objects + str, Series or Index If `others` is None, `str` is returned, otherwise a `Series/Index` (same type as caller) of objects is returned. 
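The strings.py hunks above are documentation fixes plus doctest repairs (the missing `import re`, the `dtype: object` lines). For reference, the compiled-regex usage that the str_replace doctest documents looks like this; a sketch mirroring that doctest, with no new behavior assumed:

import re

import numpy as np
import pandas as pd

# A compiled pattern carries its own flags, so `case` and `flags`
# cannot also be passed to str.replace alongside it.
regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
s = pd.Series(['foo', 'fuz', np.nan])
s.str.replace(regex_pat, 'bar')
# 0    foo
# 1    bar
# 2    NaN
# dtype: object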
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 3da349c570274..0c76ac6cd75ac 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -588,9 +588,8 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, if not cache_array.empty: result = arg.map(cache_array) else: - from pandas import Series values = convert_listlike(arg._values, True, format) - result = Series(values, index=arg.index, name=arg.name) + result = arg._constructor(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, compat.MutableMapping)): result = _assemble_from_unit_mappings(arg, errors, box, tz) elif isinstance(arg, ABCIndexClass): @@ -827,7 +826,6 @@ def to_time(arg, format=None, infer_time_format=False, errors='raise'): ------- datetime.time """ - from pandas.core.series import Series def _convert_listlike(arg, format): @@ -892,9 +890,9 @@ def _convert_listlike(arg, format): return arg elif isinstance(arg, time): return arg - elif isinstance(arg, Series): + elif isinstance(arg, ABCSeries): values = _convert_listlike(arg._values, format) - return Series(values, index=arg.index, name=arg.name) + return arg._constructor(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, format) elif is_list_like(arg): diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 30cb15f311b9f..7ebaf3056e79e 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -6,12 +6,12 @@ import numpy as np +from pandas._libs.tslibs import NaT from pandas._libs.tslibs.timedeltas import Timedelta, parse_timedelta_unit from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries -import pandas as pd from pandas.core.arrays.timedeltas import sequence_to_td64ns @@ -100,10 +100,9 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'): if arg is None: return arg elif isinstance(arg, ABCSeries): - from pandas import Series values = _convert_listlike(arg._values, unit=unit, box=False, errors=errors) - return Series(values, index=arg.index, name=arg.name) + return arg._constructor(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, unit=unit, box=box, errors=errors, name=arg.name) @@ -136,7 +135,7 @@ def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): return r # coerce - result = pd.NaT + result = NaT return result diff --git a/pandas/io/formats/terminal.py b/pandas/io/formats/terminal.py index bb34259d710c7..cf2383955d593 100644 --- a/pandas/io/formats/terminal.py +++ b/pandas/io/formats/terminal.py @@ -15,6 +15,7 @@ import os import shutil +import subprocess from pandas.compat import PY3 @@ -94,22 +95,29 @@ def _get_terminal_size_tput(): # get terminal width # src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width # -height-of-a-terminal-window + try: - import subprocess proc = subprocess.Popen(["tput", "cols"], stdin=subprocess.PIPE, stdout=subprocess.PIPE) - output = proc.communicate(input=None) - cols = int(output[0]) + output_cols = proc.communicate(input=None) proc = subprocess.Popen(["tput", "lines"], stdin=subprocess.PIPE, stdout=subprocess.PIPE) - output = proc.communicate(input=None) - rows = int(output[0]) - return (cols, rows) + output_rows = proc.communicate(input=None) except OSError: return None + try: + # Some terminals (e.g. 
spyder) may report a terminal size of '', + # making the `int` fail. + + cols = int(output_cols[0]) + rows = int(output_rows[0]) + return cols, rows + except (ValueError, IndexError): + return None + def _get_terminal_size_linux(): def ioctl_GWINSZ(fd): diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index ba647c42083b2..98d5370e7b070 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2051,8 +2051,16 @@ def plot_series(data, kind='line', ax=None, # Series unique Returns ------- result + See Notes. - The return type depends on the `return_type` parameter: + See Also + -------- + Series.plot.hist: Make a histogram. + matplotlib.pyplot.boxplot : Matplotlib equivalent plot. + + Notes + ----- + The return type depends on the `return_type` parameter: * 'axes' : object of class matplotlib.axes.Axes * 'dict' : dict of matplotlib.lines.Line2D objects @@ -2064,13 +2072,6 @@ def plot_series(data, kind='line', ax=None, # Series unique * :class:`~numpy.array` (for ``return_type = None``) Return Series or numpy.array. - See Also - -------- - Series.plot.hist: Make a histogram. - matplotlib.pyplot.boxplot : Matplotlib equivalent plot. - - Notes - ----- Use ``return_type='dict'`` when you want to tweak the appearance of the lines after plotting. In this case a dict containing the Lines making up the boxes, caps, fliers, medians, and whiskers is returned. diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index bc37317f72802..31e81a9ca77c2 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -6,6 +6,7 @@ import numpy as np import pytest +import pytz from pandas.compat import product @@ -647,6 +648,28 @@ def test_at_time(self): rs = ts.at_time('16:00') assert len(rs) == 0 + @pytest.mark.parametrize('hour', ['1:00', '1:00AM', time(1), + time(1, tzinfo=pytz.UTC)]) + def test_at_time_errors(self, hour): + # GH 24043 + dti = pd.date_range('2018', periods=3, freq='H') + df = pd.DataFrame(list(range(len(dti))), index=dti) + if getattr(hour, 'tzinfo', None) is None: + result = df.at_time(hour) + expected = df.iloc[1:2] + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, match="Index must be timezone"): + df.at_time(hour) + + def test_at_time_tz(self): + # GH 24043 + dti = pd.date_range('2018', periods=3, freq='H', tz='US/Pacific') + df = pd.DataFrame(list(range(len(dti))), index=dti) + result = df.at_time(time(4, tzinfo=pytz.timezone('US/Eastern'))) + expected = df.iloc[1:2] + tm.assert_frame_equal(result, expected) + def test_at_time_raises(self): # GH20725 df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index f120402e6e8ca..b645073fcf72a 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -834,3 +834,14 @@ def demean_rename(x): tm.assert_frame_equal(result, expected) result_single = df.groupby('group').value.transform(demean_rename) tm.assert_series_equal(result_single, expected['value']) + + +@pytest.mark.parametrize('func', [min, max, np.min, np.max, 'first', 'last']) +def test_groupby_transform_timezone_column(func): + # GH 24198 + ts = pd.to_datetime('now', utc=True).tz_convert('Asia/Singapore') + result = pd.DataFrame({'end_time': [ts], 'id': [1]}) + result['max_end_time'] = result.groupby('id').end_time.transform(func) + expected = pd.DataFrame([[ts, 1, ts]], columns=['end_time', 'id', + 'max_end_time']) + 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c99007cef90d4..8415bab802239 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1541,8 +1541,9 @@ def test_slice_locs(self, dtype): assert index2.slice_locs(8, 2) == (2, 6) assert index2.slice_locs(7, 3) == (2, 5) - def test_slice_float_locs(self): - index = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=float)) + @pytest.mark.parametrize("dtype", [int, float]) + def test_slice_float_locs(self, dtype): + index = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=dtype)) n = len(index) assert index.slice_locs(5.0, 10.0) == (3, n) assert index.slice_locs(4.5, 10.5) == (3, 8) @@ -1551,24 +1552,6 @@ def test_slice_float_locs(self): assert index2.slice_locs(8.5, 1.5) == (2, 6) assert index2.slice_locs(10.5, -1) == (0, n) - @pytest.mark.xfail(reason="Assertions were not correct - see GH#20915") - def test_slice_ints_with_floats_raises(self): - # int slicing with floats - # GH 4892, these are all TypeErrors - index = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=int)) - n = len(index) - - pytest.raises(TypeError, - lambda: index.slice_locs(5.0, 10.0)) - pytest.raises(TypeError, - lambda: index.slice_locs(4.5, 10.5)) - - index2 = index[::-1] - pytest.raises(TypeError, - lambda: index2.slice_locs(8.5, 1.5), (2, 6)) - pytest.raises(TypeError, - lambda: index2.slice_locs(10.5, -1), (0, n)) - def test_slice_locs_dup(self): index = Index(['a', 'a', 'b', 'c', 'd', 'd']) assert index.slice_locs('a', 'd') == (0, 6) diff --git a/pandas/tests/io/formats/test_console.py b/pandas/tests/io/formats/test_console.py index 055763bf62d6e..a3e0e195f4864 100644 --- a/pandas/tests/io/formats/test_console.py +++ b/pandas/tests/io/formats/test_console.py @@ -1,6 +1,9 @@ +import subprocess # noqa: F401 + import pytest from pandas.io.formats.console import detect_console_encoding +from pandas.io.formats.terminal import _get_terminal_size_tput class MockEncoding(object): # TODO(py27): replace with mock @@ -72,3 +75,18 @@ def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale): context.setattr('sys.stdout', MockEncoding(std)) context.setattr('sys.getdefaultencoding', lambda: 'sysDefaultEncoding') assert detect_console_encoding() == 'sysDefaultEncoding' + + +@pytest.mark.parametrize("size", ['', ['']]) +def test_terminal_unknown_dimensions(monkeypatch, size, mocker): + + def communicate(*args, **kwargs): + return size + + monkeypatch.setattr('subprocess.Popen', mocker.Mock()) + monkeypatch.setattr('subprocess.Popen.return_value.returncode', None) + monkeypatch.setattr( + 'subprocess.Popen.return_value.communicate', communicate) + result = _get_terminal_size_tput() + + assert result is None diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 173f719edd465..8520855d14918 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -960,6 +960,27 @@ def test_min_max(self): assert np.isnan(_min) assert _max == 1 + def test_min_max_numeric_only(self): + # TODO deprecate numeric_only argument for Categorical and use + # skipna as well, see GH25303 + cat = Series(Categorical( + ["a", "b", np.nan, "a"], categories=['b', 'a'], ordered=True)) + + _min = cat.min() + _max = cat.max() + assert np.isnan(_min) + assert _max == "a" + + _min = cat.min(numeric_only=True) + _max = cat.max(numeric_only=True) + assert _min == "b" + assert _max == "a" + + 
_min = cat.min(numeric_only=False) + _max = cat.max(numeric_only=False) + assert np.isnan(_min) + assert _max == "a" + class TestSeriesMode(object): # Note: the name TestSeriesMode indicates these tests diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index ceccb48194f85..71b100401ec21 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1160,9 +1160,13 @@ def test_resample_nunique_with_date_gap(): @pytest.mark.parametrize('k', [10, 100, 1000]) def test_resample_group_info(n, k): # GH10914 + + # use a fixed seed to always have the same uniques + prng = np.random.RandomState(1234) + dr = date_range(start='2015-08-27', periods=n // 10, freq='T') - ts = Series(np.random.randint(0, n // k, n).astype('int64'), - index=np.random.choice(dr, n)) + ts = Series(prng.randint(0, n // k, n).astype('int64'), + index=prng.choice(dr, n)) left = ts.resample('30T').nunique() ix = date_range(start=ts.index.min(), end=ts.index.max(), diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 69acf4ba6bde8..97f1e07380ef9 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -549,3 +549,25 @@ def test_selection_api_validation(): exp.index.name = 'd' assert_frame_equal(exp, df.resample('2D', level='d').sum()) + + +@pytest.mark.parametrize('col_name', ['t2', 't2x', 't2q', 'T_2M', + 't2p', 't2m', 't2m1', 'T2M']) +def test_agg_with_datetime_index_list_agg_func(col_name): + # GH 22660 + # The parametrized column names would get converted to dates by our + # date parser. Some would result in OutOfBoundsError (ValueError) while + # others would result in OverflowError when passed into Timestamp. + # We catch these errors and move on to the correct branch. 
+ df = pd.DataFrame(list(range(200)), + index=pd.date_range(start='2017-01-01', freq='15min', + periods=200, tz='Europe/Berlin'), + columns=[col_name]) + result = df.resample('1d').aggregate(['mean']) + expected = pd.DataFrame([47.5, 143.5, 195.5], + index=pd.date_range(start='2017-01-01', freq='D', + periods=3, tz='Europe/Berlin'), + columns=pd.MultiIndex(levels=[[col_name], + ['mean']], + codes=[[0], [0]])) + assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 5d7a9ab6f4cf0..62c9047b17f3d 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -682,6 +682,28 @@ def test_join_multi_to_multi(self, join_type): with pytest.raises(ValueError, match=msg): right.join(left, on=['abc', 'xy'], how=join_type) + def test_join_on_tz_aware_datetimeindex(self): + # GH 23931 + df1 = pd.DataFrame( + { + 'date': pd.date_range(start='2018-01-01', periods=5, + tz='America/Chicago'), + 'vals': list('abcde') + } + ) + + df2 = pd.DataFrame( + { + 'date': pd.date_range(start='2018-01-03', periods=5, + tz='America/Chicago'), + 'vals_2': list('tuvwx') + } + ) + result = df1.join(df2.set_index('date'), on='date') + expected = df1.copy() + expected['vals_2'] = pd.Series([np.nan] * len(expected), dtype=object) + assert_frame_equal(result, expected) + def _check_join(left, right, result, join_col, how='left', lsuffix='_x', rsuffix='_y'): diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index abf95b276cda1..43747ea8621d9 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -9,7 +9,7 @@ from pandas import ( DatetimeIndex, Index, NaT, Period, Series, Timedelta, TimedeltaIndex, - Timestamp) + Timestamp, isna) from pandas.core.arrays import PeriodArray from pandas.util import testing as tm @@ -201,9 +201,10 @@ def _get_overlap_public_nat_methods(klass, as_tuple=False): "fromtimestamp", "isocalendar", "isoformat", "isoweekday", "month_name", "now", "replace", "round", "strftime", "strptime", "time", "timestamp", "timetuple", "timetz", - "to_datetime64", "to_pydatetime", "today", "toordinal", - "tz_convert", "tz_localize", "tzname", "utcfromtimestamp", - "utcnow", "utcoffset", "utctimetuple", "weekday"]), + "to_datetime64", "to_numpy", "to_pydatetime", "today", + "toordinal", "tz_convert", "tz_localize", "tzname", + "utcfromtimestamp", "utcnow", "utcoffset", "utctimetuple", + "weekday"]), (Timedelta, ["total_seconds"]) ]) def test_overlap_public_nat_methods(klass, expected): @@ -339,3 +340,11 @@ def test_nat_arithmetic_td64_vector(op_name, box): def test_nat_pinned_docstrings(): # see gh-17327 assert NaT.ctime.__doc__ == datetime.ctime.__doc__ + + +def test_to_numpy_alias(): + # GH 24653: alias .to_numpy() for scalars + expected = NaT.to_datetime64() + result = NaT.to_numpy() + + assert isna(expected) and isna(result) diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 7d5b479810205..bf71c37aa9c3d 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -414,6 +414,11 @@ def test_timedelta_conversions(self): assert (Timedelta(timedelta(days=1)) == np.timedelta64(1, 'D').astype('m8[ns]')) + def test_to_numpy_alias(self): + # GH 24653: alias .to_numpy() for scalars + td = Timedelta('10m7s') + assert td.to_timedelta64() == td.to_numpy() + def test_round(self): t1 = Timedelta('1 days 02:34:56.789123456') diff 
--git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index c27ef3d0662c8..f42fad4c925f0 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -969,3 +969,8 @@ def test_to_period_tz_warning(self): with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost ts.to_period('D') + + def test_to_numpy_alias(self): + # GH 24653: alias .to_numpy() for scalars + ts = Timestamp(datetime.now()) + assert ts.to_datetime64() == ts.to_numpy() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3d28b17750540..888cf78a1c66a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1222,7 +1222,7 @@ def test_group_var_constant(self): class TestGroupVarFloat64(GroupVarTestMixin): __test__ = True - algo = libgroupby.group_var_float64 + algo = staticmethod(libgroupby.group_var_float64) dtype = np.float64 rtol = 1e-5 @@ -1245,7 +1245,7 @@ def test_group_var_large_inputs(self): class TestGroupVarFloat32(GroupVarTestMixin): __test__ = True - algo = libgroupby.group_var_float32 + algo = staticmethod(libgroupby.group_var_float32) dtype = np.float32 rtol = 1e-2 diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index e22b9a0ef25e3..92b4e5a99041a 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -9,7 +9,7 @@ import numpy as np # noqa import pytest -from pandas.compat import PY36 +from pandas.compat import PY2, PY36, is_platform_windows from pandas import DataFrame from pandas.util import testing as tm @@ -58,6 +58,8 @@ def test_xarray(df): assert df.to_xarray() is not None +@pytest.mark.skipif(is_platform_windows() and PY2, + reason="Broken on Windows / Py2") def test_oo_optimizable(): # GH 21071 subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"])
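The scalar tests in the hunks above exercise the `.to_numpy()` alias added for GH 24653. A compact sketch of what they assert (values are illustrative; this requires a pandas build containing the patch):

from datetime import datetime

import pandas as pd

ts = pd.Timestamp(datetime(2019, 1, 1))
assert ts.to_numpy() == ts.to_datetime64()  # Timestamp alias

td = pd.Timedelta('10m7s')
assert td.to_numpy() == td.to_timedelta64()  # Timedelta alias

assert pd.isna(pd.NaT.to_numpy())  # NaT round-trips as datetime64('NaT')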