Skip to content

Commit

Permalink
fix load_dill_with_pandas_backward_compatibility() bug (#382)
Browse files Browse the repository at this point in the history
* seek(0)

* fix FileLike

* add seekable check

* change order

* check seekable

* add assert seekable

* add seekable

* add comments
  • Loading branch information
mski-iksm authored Jul 18, 2024
1 parent 465cf3c commit 03e719b
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 6 deletions.
6 changes: 4 additions & 2 deletions gokart/file_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,10 @@ def format(self):
return luigi.format.Nop

def load(self, file):
if not ObjectStorage.is_buffered_reader(file):
# we cannot use dill.load(file) because ReadableS3File does not have 'readline' method
if not file.seekable():
# load_dill_with_pandas_backward_compatibility() requires a file object that implements seek() and readline().
# Therefore, we wrap a non-seekable file in BytesIO, which provides both.
# For example, ReadableS3File is not seekable.
return load_dill_with_pandas_backward_compatibility(BytesIO(file.read()))
return load_dill_with_pandas_backward_compatibility(_ChunkedLargeFileReader(file))

Expand Down
12 changes: 8 additions & 4 deletions gokart/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ def read(self, n: int) -> bytes: ...

def readline(self) -> bytes: ...

def seek(self, offset: int) -> None: ...

def seekable(self) -> bool: ...


def add_config(file_path: str):
_, ext = os.path.splitext(file_path)
Expand All @@ -27,8 +31,6 @@ def add_config(file_path: str):

FlattenableItems: TypeAlias = T | Iterable['FlattenableItems[T]'] | dict[str, 'FlattenableItems[T]']
else:
from typing import Union

FlattenableItems = Union[T, Iterable['FlattenableItems[T]'], dict[str, 'FlattenableItems[T]']]


Expand Down Expand Up @@ -74,6 +76,8 @@ def load_dill_with_pandas_backward_compatibility(file: FileLike) -> Any:
It is unclear whether all objects dumped by dill can be loaded by pd.read_pickle, so we use dill.load as a fallback.
"""
try:
return pd.read_pickle(file)
except Exception:
return dill.load(file)
except Exception:
assert file.seekable(), f'{file} is not seekable.'
file.seek(0)
return pd.read_pickle(file)

0 comments on commit 03e719b

Please sign in to comment.