Skip to content

Commit

Permalink
ROB: Tolerate streams without length field (#1717)
Browse files Browse the repository at this point in the history
The /Length field is normally required, but Acrobat Reader and other PDF readers tolerate streams that omit it.

Closes #1715
  • Loading branch information
pubpub-zz committed Mar 18, 2023
1 parent 1c7a6eb commit 1d98969
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 10 deletions.
6 changes: 3 additions & 3 deletions pypdf/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,10 @@ def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes:
tok = stream.read(16)
if not tok:
return name
m = regex.search(tok)
m = regex.search(name + tok)
if m is not None:
name += tok[: m.start()]
stream.seek(m.start() - len(tok), 1)
stream.seek(m.start() - (len(name) + len(tok)), 1)
name = (name + tok)[: m.start()]
break
name += tok
return name
Expand Down
15 changes: 13 additions & 2 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,15 +447,26 @@ def read_unsized_from_steam(
stream.seek(-1, 1)
# this is a stream object, not a dictionary
if SA.LENGTH not in data:
raise PdfStreamError("Stream length not defined")
if pdf is not None and pdf.strict:
raise PdfStreamError("Stream length not defined")
else:
logger_warning(
f"Stream length not defined @pos={stream.tell()}", __name__
)
data[NameObject(SA.LENGTH)] = NumberObject(-1)
length = data[SA.LENGTH]
if isinstance(length, IndirectObject):
t = stream.tell()
assert pdf is not None # hint for mypy
length = pdf.get_object(length)
stream.seek(t, 0)
pstart = stream.tell()
data["__streamdata__"] = stream.read(length)
if length > 0:
data["__streamdata__"] = stream.read(length)
else:
data["__streamdata__"] = read_until_regex(
stream, re.compile(b"endstream")
)
e = read_non_whitespace(stream)
ndstream = stream.read(8)
if (e + ndstream) != b"endstream":
Expand Down
15 changes: 10 additions & 5 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,17 +380,22 @@ def test_dictionaryobject_read_from_stream_stream_no_newline():


@pytest.mark.parametrize(("strict"), [(True), (False)])
def test_dictionaryobject_read_from_stream_stream_no_stream_length(strict):
stream = BytesIO(b"<< /S /GoTo >>stream\n")
def test_dictionaryobject_read_from_stream_stream_no_stream_length(strict, caplog):
stream = BytesIO(b"<< /S /GoTo >>stream\n123456789endstream abcd")

class Tst: # to replace pdf
strict = False

pdf = Tst()
pdf.strict = strict
with pytest.raises(PdfReadError) as exc:
DictionaryObject.read_from_stream(stream, pdf)
assert exc.value.args[0] == "Stream length not defined"
if strict:
with pytest.raises(PdfReadError) as exc:
DictionaryObject.read_from_stream(stream, pdf)
assert exc.value.args[0] == "Stream length not defined"
else:
o = DictionaryObject.read_from_stream(stream, pdf)
assert "Stream length not defined" in caplog.text
assert o.get_data() == b"123456789"


@pytest.mark.parametrize(
Expand Down

0 comments on commit 1d98969

Please sign in to comment.