You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

234 lines
8.8 KiB

import os
import sys
import zipfile
from pathlib import Path
import pytest
from nltk.downloader import ErrorMessage, _unzip_iter
def _make_zip(file_path: Path, members: dict[str, bytes]) -> None:
"""
Create a ZIP file at file_path, with the given arcname->content mapping.
"""
with zipfile.ZipFile(file_path, "w") as zf:
for arcname, content in members.items():
zf.writestr(arcname, content)
def _run_unzip_iter(zip_path: Path, extract_root: Path, verbose: bool = False):
"""
Convenience wrapper that runs _unzip_iter and returns the list of yielded
messages (if any).
"""
return list(_unzip_iter(str(zip_path), str(extract_root), verbose=verbose))
class TestSecureUnzip:
"""
Tests for secure ZIP extraction behaviour in nltk.downloader._unzip_iter.
These tests are specifically designed so that:
* On the old implementation (using zf.extractall(root) without checks),
the "Zip-Slip" tests will fail because they rely on the new behaviour
(yielding ErrorMessage for escaping entries, and NOT writing outside
the extraction root).
* On the new implementation, the tests pass, because the new path
validation prevents escapes and emits the expected ErrorMessage.
"""
def test_normal_relative_paths_are_extracted(self, tmp_path: Path) -> None:
"""
A ZIP with only safe, relative paths should fully extract under the
given root, and should not yield any ErrorMessage.
"""
zip_path = tmp_path / "safe.zip"
extract_root = tmp_path / "extract"
members = {
"pkg/file.txt": b"hello",
"pkg/subdir/other.txt": b"world",
}
_make_zip(zip_path, members)
messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
# No ErrorMessage should be yielded for valid archives.
assert not any(isinstance(m, ErrorMessage) for m in messages)
assert (extract_root / "pkg" / "file.txt").read_bytes() == b"hello"
assert (extract_root / "pkg" / "subdir" / "other.txt").read_bytes() == b"world"
def test_zip_slip_with_parent_directory_component_is_blocked(
self, tmp_path: Path
) -> None:
"""
An entry containing ``..`` that would escape the target directory
must not be written outside the extraction root, and must cause
_unzip_iter to yield an ErrorMessage.
On the old implementation (extractall(root)), this test will fail
because the file outside the root is actually created and no
ErrorMessage is yielded.
"""
zip_path = tmp_path / "zip_slip_parent.zip"
extract_root = tmp_path / "extract"
# This is the path that would be written if "../outside.txt" is
# extracted with extractall(root).
outside_target = (extract_root / ".." / "outside.txt").resolve()
members = {
"pkg/good.txt": b"ok",
# This would escape extract_root if not validated.
"../outside.txt": b"evil",
}
_make_zip(zip_path, members)
messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
# With the secure implementation, the escaping entry must be blocked
# and reported as an ErrorMessage.
err_msgs = [m for m in messages if isinstance(m, ErrorMessage)]
assert (
err_msgs
), "Expected an ErrorMessage for a Zip-Slip parent-directory attempt"
# Check that the message text indicates it was blocked. Adjust to the
# exact wording if needed.
combined_messages = " ".join(str(m.message) for m in err_msgs)
assert "Zip Slip" in combined_messages and "blocked" in combined_messages
# The escaping path must not have been written.
assert not outside_target.exists()
# The safe entry should still be extracted under the root.
assert (extract_root / "pkg" / "good.txt").read_bytes() == b"ok"
@pytest.mark.skipif(
sys.platform.startswith("win"),
reason="Absolute POSIX paths are not meaningful on Windows",
)
def test_zip_slip_with_absolute_posix_path_is_blocked(self, tmp_path: Path) -> None:
"""
An entry with an absolute POSIX path (e.g. ``/tmp/evil``) must not be
extracted as-is; it should not overwrite arbitrary filesystem paths,
and should result in an ErrorMessage.
On the old implementation (extractall(root)), this test will fail
because the absolute path gets created without any ErrorMessage.
"""
zip_path = tmp_path / "zip_slip_abs_posix.zip"
extract_root = tmp_path / "extract"
# Choose a unique location in /tmp to avoid interfering with real files.
absolute_target = Path("/tmp") / f"nltk_zip_slip_test_{os.getpid()}"
try:
members = {
"pkg/good.txt": b"ok",
# This absolute path should never be created by the secure
# implementation.
str(absolute_target): b"evil",
}
_make_zip(zip_path, members)
messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
err_msgs = [m for m in messages if isinstance(m, ErrorMessage)]
assert (
err_msgs
), "Expected an ErrorMessage for absolute-path Zip-Slip attempt"
combined_messages = " ".join(str(m.message) for m in err_msgs)
assert "Zip Slip" in combined_messages and "blocked" in combined_messages
# Absolute path must not be created by the secure implementation.
assert not absolute_target.exists()
# Safe entry must still be extracted under extract_root.
assert (extract_root / "pkg" / "good.txt").read_bytes() == b"ok"
finally:
# Best-effort cleanup if the implementation under test behaves
# incorrectly and creates the file.
if absolute_target.exists():
try:
absolute_target.unlink()
except OSError:
# Ignore cleanup errors: file may not exist or be locked.
pass
def test_entries_resolved_outside_root_are_blocked_via_symlink(
self, tmp_path: Path
) -> None:
"""
If there is a pre-existing symlink below the extraction root that
points outside the root, writing through that symlink should not
be allowed to escape the root.
This test documents the desired hardening to defend against this
class of attacks.
"""
if not hasattr(os, "symlink"):
pytest.skip("Symlinks not supported on this platform")
zip_path = tmp_path / "zip_slip_symlink.zip"
extract_root = tmp_path / "extract"
outside_dir = tmp_path / "outside_dir"
outside_dir.mkdir()
outside_target = outside_dir / "evil.txt"
members = {
"pkg/good.txt": b"ok",
"dir_link/evil.txt": b"evil",
}
_make_zip(zip_path, members)
extract_root.mkdir()
# Pre-existing symlink inside extract_root pointing *outside* it.
os.symlink(outside_dir, extract_root / "dir_link")
messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
# Desired property:
assert not outside_target.exists()
assert (extract_root / "pkg" / "good.txt").read_bytes() == b"ok"
# Should yield an ErrorMessage for the blocked symlink escape
assert any(isinstance(m, ErrorMessage) for m in messages)
def test_bad_zipfile_yields_errormessage(self, tmp_path: Path) -> None:
"""
A corrupt or non-zip file should cause _unzip_iter to yield an
ErrorMessage instead of raising an unhandled exception.
"""
zip_path = tmp_path / "not_a_zip.txt"
zip_path.write_bytes(b"this is not a zip archive")
extract_root = tmp_path / "extract"
messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
assert any(isinstance(m, ErrorMessage) for m in messages)
# If the implementation chooses to create the root directory at all,
# it should not leave partially extracted content.
if extract_root.exists():
assert not any(extract_root.iterdir())
def test_unzip_iter_verbose_writes_to_stdout(self, capsys, tmp_path: Path) -> None:
"""
When verbose=True, _unzip_iter should write a status line to stdout.
This checks that existing user-visible behaviour is preserved.
"""
zip_path = tmp_path / "verbose.zip"
extract_root = tmp_path / "extract"
members = {"pkg/file.txt": b"data"}
_make_zip(zip_path, members)
_run_unzip_iter(zip_path, extract_root, verbose=True)
captured = capsys.readouterr()
assert "Unzipping" in captured.out