diff --git a/pypi/data-processing/src/file_handle/pdf_handle.py b/pypi/data-processing/src/file_handle/pdf_handle.py index 83ba6dcf3..d839fe202 100644 --- a/pypi/data-processing/src/file_handle/pdf_handle.py +++ b/pypi/data-processing/src/file_handle/pdf_handle.py @@ -85,7 +85,7 @@ def handle( all_document_for_process = [] for document in documents: chunck_id = ulid.ulid() - content = document.page_content.replace("\n", "") + content = document.page_content.replace("\n", "").replace("\x00", "") chunk_insert_item = { "id": chunck_id, "document_id": self._document_id, diff --git a/pypi/data-processing/src/file_handle/web_handle.py b/pypi/data-processing/src/file_handle/web_handle.py index 6488d1571..b5315e7ee 100644 --- a/pypi/data-processing/src/file_handle/web_handle.py +++ b/pypi/data-processing/src/file_handle/web_handle.py @@ -66,7 +66,7 @@ async def web_manipulate( all_document_for_process = [] for document in documents: chunck_id = ulid.ulid() - content = document.page_content.replace("\n", "") + content = document.page_content.replace("\n", "").replace("\x00", "") chunk_insert_item = { "id": chunck_id, "document_id": document_id, diff --git a/pypi/data-processing/src/file_handle/word_handle.py b/pypi/data-processing/src/file_handle/word_handle.py index 383c60d31..2e203e595 100644 --- a/pypi/data-processing/src/file_handle/word_handle.py +++ b/pypi/data-processing/src/file_handle/word_handle.py @@ -68,7 +68,7 @@ def docx_manipulate( all_document_for_process = [] for document in documents: chunck_id = ulid.ulid() - content = document.page_content.replace("\n", "") + content = document.page_content.replace("\n", "").replace("\x00", "") chunk_insert_item = { "id": chunck_id, "document_id": document_id,