Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sanitize content hashes in archive methods #175

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion servicelayer/archive/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from normality import safe_filename

from servicelayer.archive.archive import Archive
from servicelayer.archive.util import ensure_path, checksum, BUF_SIZE
from servicelayer.archive.util import ensure_path, checksum, sanitize_checksum, BUF_SIZE
from servicelayer.archive.util import path_prefix, path_content_hash

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -33,6 +33,8 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
"""Import the given file into the archive."""
if content_hash is None:
content_hash = checksum(file_path)
else:
content_hash = sanitize_checksum(content_hash)

if content_hash is None:
return
Expand All @@ -51,6 +53,7 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
return content_hash

def load_file(self, content_hash, file_name=None, temp_path=None):
content_hash = sanitize_checksum(content_hash)
return self._locate_key(content_hash)

def list_files(self, prefix=None):
Expand All @@ -67,6 +70,7 @@ def list_files(self, prefix=None):
yield path_content_hash(file_path)

def delete_file(self, content_hash):
content_hash = sanitize_checksum(content_hash)
prefix = path_prefix(content_hash)
if prefix is None:
return
Expand Down
6 changes: 5 additions & 1 deletion servicelayer/archive/gs.py
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Content hash also needes to be sanitized for the generate_url!

Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from google.resumable_media.common import DataCorruption, InvalidResponse

from servicelayer.archive.virtual import VirtualArchive
from servicelayer.archive.util import checksum, ensure_path
from servicelayer.archive.util import checksum, sanitize_checksum, ensure_path
from servicelayer.archive.util import path_prefix, ensure_posix_path
from servicelayer.archive.util import path_content_hash, HASH_LENGTH
from servicelayer.util import service_retries, backoff
Expand Down Expand Up @@ -89,6 +89,8 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
file_path = ensure_path(file_path)
if content_hash is None:
content_hash = checksum(file_path)
else:
content_hash = sanitize_checksum(content_hash)

if content_hash is None:
return
Expand All @@ -111,6 +113,7 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
def load_file(self, content_hash, file_name=None, temp_path=None):
"""Retrieve a file from Google storage and put it onto the local file
system for further processing."""
content_hash = sanitize_checksum(content_hash)
for attempt in service_retries():
try:
blob = self._locate_contenthash(content_hash)
Expand Down Expand Up @@ -147,6 +150,7 @@ def delete_file(self, content_hash):
"""Check if a file with the given hash exists on S3."""
if content_hash is None or len(content_hash) < HASH_LENGTH:
return
content_hash = sanitize_checksum(content_hash)
prefix = path_prefix(content_hash)
if prefix is None:
return
Expand Down
6 changes: 5 additions & 1 deletion servicelayer/archive/s3.py
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Content hash needs to be sanitized for the generate_url method as well!

Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from servicelayer import settings
from servicelayer.archive.virtual import VirtualArchive
from servicelayer.archive.util import checksum, ensure_path
from servicelayer.archive.util import checksum, sanitize_checksum, ensure_path
from servicelayer.archive.util import path_prefix, path_content_hash

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -86,6 +86,8 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
file_path = ensure_path(file_path)
if content_hash is None:
content_hash = checksum(file_path)
else:
content_hash = sanitize_checksum(content_hash)

# if content_hash is None:
# return
Expand All @@ -105,6 +107,7 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
def load_file(self, content_hash, file_name=None, temp_path=None):
"""Retrieve a file from S3 storage and put it onto the local file
system for further processing."""
content_hash = sanitize_checksum(content_hash)
key = self._locate_key(content_hash)
if key is not None:
path = self._local_path(content_hash, file_name, temp_path)
Expand All @@ -114,6 +117,7 @@ def load_file(self, content_hash, file_name=None, temp_path=None):
def delete_file(self, content_hash):
if content_hash is None:
return
content_hash = sanitize_checksum(content_hash)
prefix = path_prefix(content_hash)
if prefix is None:
return
Expand Down
13 changes: 13 additions & 0 deletions servicelayer/archive/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import string
from hashlib import sha1
from pathlib import Path

Expand Down Expand Up @@ -32,6 +33,18 @@ def checksum(file_name):
return str(digest.hexdigest())


def sanitize_checksum(checksum):
"""Normalize the checksum. Raises an error if the given checksum invalid."""
if not checksum:
raise ValueError("Checksum is empty")

for char in checksum:
if char not in string.hexdigits:
raise ValueError(f'Checksum contains invalid character "{char}"')

return checksum


def path_prefix(content_hash):
"""Get a prefix for a content hashed folder structure."""
if content_hash is None:
Expand Down
3 changes: 2 additions & 1 deletion servicelayer/archive/virtual.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from normality import safe_filename

from servicelayer.archive.archive import Archive
from servicelayer.archive.util import ensure_path
from servicelayer.archive.util import ensure_path, sanitize_checksum

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -33,6 +33,7 @@ def cleanup_file(self, content_hash, temp_path=None):
"""Delete the local cached version of the file."""
if content_hash is None:
return
content_hash = sanitize_checksum(content_hash)
path = self._get_local_prefix(content_hash, temp_path=temp_path)
try:
shutil.rmtree(path, ignore_errors=True)
Expand Down
29 changes: 24 additions & 5 deletions tests/archive/test_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
import shutil
import tempfile
from unittest import TestCase
Expand Down Expand Up @@ -26,9 +27,11 @@ def test_basic_archive(self):
assert out == out2, (out, out2)

def test_basic_archive_with_checksum(self):
checksum_ = "banana"
out = self.archive.archive_file(self.file, checksum_)
assert checksum_ == out, (checksum_, out)
with pytest.raises(ValueError):
self.archive.archive_file(self.file, content_hash="banana")

out = self.archive.archive_file(self.file, content_hash="01234567890abcdef")
assert out == "01234567890abcdef"

def test_generate_url(self):
out = self.archive.archive_file(self.file)
Expand All @@ -39,6 +42,15 @@ def test_publish(self):
assert not self.archive.can_publish

def test_load_file(self):
# Invalid content hash
with pytest.raises(ValueError):
self.archive.load_file("banana")

# Valid content hash, but file does not exist
path = self.archive.load_file("01234567890abcdef")
assert path is None

# Valid content hash, file exists
out = self.archive.archive_file(self.file)
path = self.archive.load_file(out)
assert path is not None, path
Expand All @@ -64,10 +76,17 @@ def test_list_files(self):
assert len(keys) == 0, keys

def test_delete_file(self):
# Invalid content hash
with pytest.raises(ValueError):
self.archive.delete_file("banana")

# File does not exist
assert self.archive.delete_file("01234567890abcdef") is None

# Valid content hash, file exists
out = self.archive.archive_file(self.file)
path = self.archive.load_file(out)
assert path is not None, path
self.archive.cleanup_file(out)
self.archive.delete_file(out)
assert self.archive.delete_file(out) is None
path = self.archive.load_file(out)
assert path is None, path
34 changes: 31 additions & 3 deletions tests/archive/test_s3.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
from unittest import TestCase
from urllib.parse import urlparse, parse_qs

Expand Down Expand Up @@ -26,9 +27,11 @@ def test_basic_archive(self):
assert out == out2, (out, out2)

def test_basic_archive_with_checksum(self):
checksum_ = "banana"
out = self.archive.archive_file(self.file, checksum_)
assert checksum_ == out, (checksum_, out)
with pytest.raises(ValueError):
self.archive.archive_file(self.file, content_hash="banana")

out = self.archive.archive_file(self.file, content_hash="01234567890abcdef")
assert out == "01234567890abcdef"

def test_generate_url(self):
content_hash = self.archive.archive_file(self.file)
Expand Down Expand Up @@ -60,12 +63,29 @@ def test_publish_file(self):
assert "https://foo.s3.amazonaws.com/self.py" in url, url

def test_load_file(self):
# Invalid content hash
with pytest.raises(ValueError):
self.archive.load_file("banana")

# Valid content hash, but file does not exist
path = self.archive.load_file("01234567890abcdef")
assert path is None

# Valid content hash, file exists
out = self.archive.archive_file(self.file)
path = self.archive.load_file(out)
assert path is not None, path
assert path.is_file(), path

def test_cleanup_file(self):
# Invalid content hash
with pytest.raises(ValueError):
self.archive.cleanup_file("banana")

# File does not exist
assert self.archive.cleanup_file("01234567890abcdef") is None

# Valid content hash, file exists
out = self.archive.archive_file(self.file)
self.archive.cleanup_file(out)
path = self.archive.load_file(out)
Expand All @@ -86,6 +106,14 @@ def test_list_files(self):
assert len(keys) == 0, keys

def test_delete_file(self):
# Invalid content hash
with pytest.raises(ValueError):
self.archive.delete_file("banana")

# File does not exist
assert self.archive.delete_file("01234567890abcdef") is None

# Valid content hash, file exists
out = self.archive.archive_file(self.file)
path = self.archive.load_file(out)
assert path is not None, path
Expand Down
21 changes: 21 additions & 0 deletions tests/archive/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pytest
from unittest import TestCase

from servicelayer.archive.util import sanitize_checksum


class UtilTest(TestCase):
def test_sanitize_checksum(self):
assert sanitize_checksum("0123456789abcdef") == "0123456789abcdef"

with pytest.raises(ValueError, match="Checksum is empty"):
sanitize_checksum(None)

with pytest.raises(ValueError, match="Checksum is empty"):
sanitize_checksum("")

with pytest.raises(ValueError, match='Checksum contains invalid character "n"'):
sanitize_checksum("banana")

with pytest.raises(ValueError, match='Checksum contains invalid character "/"'):
sanitize_checksum("/")
Loading