You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

87 lines
2.8 KiB

import zipfile
import pytest
import nltk.data as data
def test_normalize_rejects_no_protocol_traversal():
    """Protocol-less names containing ``..`` traversal must raise ValueError."""
    traversal_names = ("../../etc/passwd", "../relative/../etc/passwd")
    for name in traversal_names:
        with pytest.raises(ValueError):
            data.normalize_resource_url(name)
def test_normalize_rejects_no_protocol_backslashes():
    """Backslash-separated traversal without a protocol must raise ValueError."""
    backslash_name = r"..\..\etc\passwd"
    with pytest.raises(ValueError):
        data.normalize_resource_url(backslash_name)
def test_normalize_allows_package_paths():
    """An ordinary package-style name must still normalize to an ``nltk:`` URL."""
    normalized = data.normalize_resource_url("corpora/brown")
    has_nltk_scheme = normalized.startswith("nltk:")
    assert has_nltk_scheme, "Package-style paths should be treated as 'nltk:' URLs"
def test_find_rejects_traversal_direct_call():
    """Defense-in-depth: ``find()`` called directly must refuse a traversal name."""
    traversal_name = "../../etc/passwd"
    with pytest.raises(ValueError):
        data.find(traversal_name)
def test_find_rejects_traversal_that_becomes_unsafe_after_normalization():
    """
    Defense-in-depth: reject names whose traversal only surfaces once collapsed.

    The name below starts with an innocuous ``foo`` segment; collapsing the
    ``..`` components leaves "../etc/passwd", so ``find()`` must still raise.
    """
    hidden_traversal = "foo/../../etc/passwd"
    with pytest.raises(ValueError):
        data.find(hidden_traversal)
def test_normalize_rejects_no_protocol_absolute_posix_path():
    """An absolute POSIX path given without a protocol must raise ValueError."""
    absolute_name = "/etc/passwd"
    with pytest.raises(ValueError):
        data.normalize_resource_url(absolute_name)
def test_normalize_rejects_no_protocol_windows_drive_letter_paths():
    """
    Drive-letter paths must raise ValueError on every platform.

    Both the backslash and the forward-slash spelling are exercised without
    any platform gate, so the check is not limited to Windows runs.
    """
    drive_letter_names = (
        r"C:\etc\passwd",
        # Forward-slash form: run on all platforms (per review suggestion).
        "C:/etc/passwd",
    )
    for name in drive_letter_names:
        with pytest.raises(ValueError):
            data.normalize_resource_url(name)
def test_normalize_rejects_no_protocol_dotdot_only():
    """The bare name ``..`` on its own must raise ValueError."""
    parent_only = ".."
    with pytest.raises(ValueError):
        data.normalize_resource_url(parent_only)
def test_find_zip_split_is_non_greedy(tmp_path):
    """
    Look up a zip member whose own name contains a second ``.zip`` segment.

    Builds ``a.zip`` holding the entry ``b.zip/c.txt`` and checks that the
    pointer returned by ``find()`` yields the text written to that entry.
    """
    archive = tmp_path / "a.zip"
    with zipfile.ZipFile(archive, "w") as zf:
        zf.writestr("b.zip/c.txt", "ok")

    pointer = data.find("a.zip/b.zip/c.txt", paths=[str(tmp_path)])
    with pointer.open() as stream:
        content = stream.read()

    # The stream may hand back bytes or text; normalise before comparing.
    text = content.decode("utf-8") if isinstance(content, bytes) else content
    assert text == "ok"