diff --git a/pypdf/_utils.py b/pypdf/_utils.py index ae3885e1f..55cf7cb12 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -178,10 +178,10 @@ def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes: tok = stream.read(16) if not tok: return name - m = regex.search(tok) + m = regex.search(name + tok) if m is not None: - name += tok[: m.start()] - stream.seek(m.start() - len(tok), 1) + stream.seek(m.start() - (len(name) + len(tok)), 1) + name = (name + tok)[: m.start()] break name += tok return name diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 824dc16d4..c6b660337 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -447,7 +447,13 @@ def read_unsized_from_steam( stream.seek(-1, 1) # this is a stream object, not a dictionary if SA.LENGTH not in data: - raise PdfStreamError("Stream length not defined") + if pdf is not None and pdf.strict: + raise PdfStreamError("Stream length not defined") + else: + logger_warning( + f"Stream length not defined @pos={stream.tell()}", __name__ + ) + data[NameObject(SA.LENGTH)] = NumberObject(-1) length = data[SA.LENGTH] if isinstance(length, IndirectObject): t = stream.tell() @@ -455,7 +461,12 @@ def read_unsized_from_steam( length = pdf.get_object(length) stream.seek(t, 0) pstart = stream.tell() - data["__streamdata__"] = stream.read(length) + if length > 0: + data["__streamdata__"] = stream.read(length) + else: + data["__streamdata__"] = read_until_regex( + stream, re.compile(b"endstream") + ) e = read_non_whitespace(stream) ndstream = stream.read(8) if (e + ndstream) != b"endstream": diff --git a/tests/test_generic.py b/tests/test_generic.py index b8910a5c0..ef6b6eac9 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -380,17 +380,22 @@ def test_dictionaryobject_read_from_stream_stream_no_newline(): @pytest.mark.parametrize(("strict"), [(True), (False)]) -def test_dictionaryobject_read_from_stream_stream_no_stream_length(strict): - stream = BytesIO(b"<< /S /GoTo >>stream\n") +def test_dictionaryobject_read_from_stream_stream_no_stream_length(strict, caplog): + stream = BytesIO(b"<< /S /GoTo >>stream\n123456789endstream abcd") class Tst: # to replace pdf strict = False pdf = Tst() pdf.strict = strict - with pytest.raises(PdfReadError) as exc: - DictionaryObject.read_from_stream(stream, pdf) - assert exc.value.args[0] == "Stream length not defined" + if strict: + with pytest.raises(PdfReadError) as exc: + DictionaryObject.read_from_stream(stream, pdf) + assert exc.value.args[0] == "Stream length not defined" + else: + o = DictionaryObject.read_from_stream(stream, pdf) + assert "Stream length not defined" in caplog.text + assert o.get_data() == b"123456789" @pytest.mark.parametrize(