langchain-ai · eyurtsev · Mar 23, 2024 · Mar 22, 2024 · Mar 22, 2024 · Mar 22, 2024
diff --git a/backend/server/extraction_runnable.py b/backend/server/extraction_runnable.py
@@ -57,10 +57,13 @@ def validate_schema(cls, v: Any) -> Dict[str, Any]:
         return v
 
 
-class ExtractResponse(TypedDict):
+class ExtractResponse(TypedDict, total=False):
     """Response body for the extract endpoint."""
 
     data: List[Any]
+    # content_too_long will be set to True if the content was too long
+    # and had to be truncated
+    content_too_long: Optional[bool]
 
 
 def _cast_example_to_dict(example: Example) -> Dict[str, Any]:
@@ -203,13 +206,19 @@ async def extract_entire_document(
         for text in texts
     ]
 
-    if settings.MAX_CHUNKS >= 1:
-        # Limit the number of chunks to process
+    # Limit the number of chunks to process (MAX_CHUNKS < 1 means no limit)
+    if 1 <= settings.MAX_CHUNKS < len(extraction_requests):
+        content_too_long = True
         extraction_requests = extraction_requests[: settings.MAX_CHUNKS]
+    else:
+        content_too_long = False
 
     # Run extractions which may potentially yield duplicate results
     extract_responses: List[ExtractResponse] = await extraction_runnable.abatch(
         extraction_requests, {"max_concurrency": settings.MAX_CONCURRENCY}
     )
     # Deduplicate the results
-    return deduplicate(extract_responses)
+    return {
+        "data": deduplicate(extract_responses)["data"],
+        "content_too_long": content_too_long,
+    }
diff --git a/backend/tests/unit_tests/api/test_api_extract.py b/backend/tests/unit_tests/api/test_api_extract.py
@@ -83,7 +83,10 @@ async def test_extract_from_file() -> None:
             headers=headers,
         )
         assert response.status_code == 200
-        assert response.json() == {"data": ["Test Conte"]}
+        assert response.json() == {
+            "data": ["Test Conte"],
+            "content_too_long": False,
+        }
 
         # Vary chat model
         response = await client.post(
@@ -97,7 +100,10 @@ async def test_extract_from_file() -> None:
             headers=headers,
         )
         assert response.status_code == 200
-        assert response.json() == {"data": ["Test Conte"]}
+        assert response.json() == {
+            "data": ["Test Conte"],
+            "content_too_long": False,
+        }
 
         # Test retrieval
         response = await client.post(
@@ -110,7 +116,9 @@ async def test_extract_from_file() -> None:
             headers=headers,
         )
         assert response.status_code == 200
-        assert response.json() == {"data": ["Test Conte"]}
+        assert response.json() == {
+            "data": ["Test Conte"],
+        }
 
         # We'll use multi-form data here.
         # Create a named temporary file
@@ -129,7 +137,7 @@ async def test_extract_from_file() -> None:
             )
 
         assert response.status_code == 200, response.text
-        assert response.json() == {"data": ["This is a "]}
+        assert response.json() == {"data": ["This is a "], "content_too_long": False}
 
 
 @patch(
@@ -191,4 +199,7 @@ async def test_extract_from_large_file() -> None:
                         headers=headers,
                     )
         assert response.status_code == 200
-        assert response.json() == {"data": ["a"]}
+        assert response.json() == {
+            "data": ["a"],
+            "content_too_long": True,
+        }