[wip] more updates for multipart inline convert of data/link represen…

…tation
crim-ca · Oct 3, 2024 · 28c9d77 · 28c9d77
1 parent 8968c0b
commit 28c9d77
Show file tree

Hide file tree

Showing 5 changed files with 127 additions and 51 deletions.
diff --git a/tests/functional/test_wps_package.py b/tests/functional/test_wps_package.py
@@ -3568,6 +3568,34 @@ def remove_result_multipart_variable(results):
         results = re.sub(r"Last-Modified: .*\r\n", "", results)
         return results.strip()
 
+    @staticmethod
+    def fix_result_multipart_indent(results):
+        # type: (str) -> str
+        """
+        Remove indented whitespace from multipart literal contents.
+
+        This behaves similarly to :func:`inspect.cleandoc`, but handles cases were the nested part contents could
+        themselves contain newlines, leading to inconsistent indents for some lines when injected by string formating,
+        and causing :func:`inspect.cleandoc` to fail removing any indent.
+
+        Also, automatically applies ``\r\n`` characters correction which are critical in parsing multipart contents.
+        This is done to consider that literal newlines will include or not the ``\r`` depending on the OS running tests.
+
+        .. warning::
+            This should be used only for literal test string (i.e.: expected value) for comparison against the result.
+            Result contents obtained from the response should be compared as-is, without any fix for strict checks.
+        """
+        if results.startswith("\n "):
+            results = results[1:]
+        res_dedent = results.lstrip()
+        res_indent = len(results) - len(res_dedent)
+        res_spaces = " " * res_indent
+        res_dedent = res_dedent.replace(f"\n{res_spaces}", "\r\n")  # indented line
+        res_dedent = res_dedent.replace(f"\n\r\n", "\r\n\r\n")  # empty line (header/body separator)
+        res_dedent = res_dedent.replace("\r\r", "\r")  # in case windows
+        res_dedent = res_dedent.rstrip("\n ")  # last line often indented less because of closing multiline string
+        return res_dedent
+
     def test_execute_single_output_prefer_header_return_representation_literal(self):
         proc = "EchoResultsTester"
         p_id = self.fully_qualified_test_process_name(proc)
@@ -3605,8 +3633,10 @@ def test_execute_single_output_prefer_header_return_representation_literal(self)
         assert results.text == "test"
         outputs = self.app.get(f"/jobs/{job_id}/outputs", params={"schema": JobInputsOutputsSchema.OGC_STRICT})
         assert outputs.content_type.startswith(ContentType.APP_JSON)
-        assert outputs.json == {
-            "output_data": {"value": "test"},
+        assert outputs.json["outputs"] == {
+            "output_data": {
+                "value": "test"
+            },
         }
 
     def test_execute_single_output_prefer_header_return_representation_complex(self):
@@ -3648,7 +3678,7 @@ def test_execute_single_output_prefer_header_return_representation_complex(self)
         assert results.text == output_json
         outputs = self.app.get(f"/jobs/{job_id}/outputs", params={"schema": JobInputsOutputsSchema.OGC_STRICT})
         assert outputs.content_type.startswith(ContentType.APP_JSON)
-        assert outputs.json == {
+        assert outputs.json["outputs"] == {
             "output_json": {
                 "href": f"{out_url}/{job_id}/output_json/result.json",
                 "type": ContentType.APP_JSON,
@@ -3692,8 +3722,10 @@ def test_execute_single_output_prefer_header_return_minimal_literal(self):
         assert results.text == "test"
         outputs = self.app.get(f"/jobs/{job_id}/outputs", params={"schema": JobInputsOutputsSchema.OGC_STRICT})
         assert outputs.content_type.startswith(ContentType.APP_JSON)
-        assert outputs.json == {
-            "output_data": "test",
+        assert outputs.json["outputs"] == {
+            "output_data": {
+                "value": "test"
+            },
         }
 
     def test_execute_single_output_prefer_header_return_minimal_complex(self):
@@ -3744,7 +3776,7 @@ def test_execute_single_output_prefer_header_return_minimal_complex(self):
         ), "Filtered outputs should not be found in results response links."
         outputs = self.app.get(f"/jobs/{job_id}/outputs", params={"schema": JobInputsOutputsSchema.OGC_STRICT})
         assert outputs.content_type.startswith(ContentType.APP_JSON)
-        assert outputs.json == {
+        assert outputs.json["outputs"] == {
             "output_json": {
                 "href": f"{out_url}/{job_id}/output_json/result.json",
                 "type": ContentType.APP_JSON,
@@ -3790,7 +3822,9 @@ def test_execute_single_output_response_raw_value_literal(self):
         outputs = self.app.get(f"/jobs/{job_id}/outputs", params={"schema": JobInputsOutputsSchema.OGC_STRICT})
         assert outputs.content_type.startswith(ContentType.APP_JSON)
         assert outputs.json["outputs"] == {
-            "output_data": "test",
+            "output_data": {
+                "value": "test"
+            },
         }
 
     def test_execute_single_output_response_raw_value_complex(self):
@@ -3889,7 +3923,9 @@ def test_execute_single_output_response_raw_reference_literal(self):
         outputs = self.app.get(f"/jobs/{job_id}/outputs", params={"schema": JobInputsOutputsSchema.OGC_STRICT})
         assert outputs.content_type.startswith(ContentType.APP_JSON)
         assert outputs.json["outputs"] == {
-            "output_data": "test",
+            "output_data": {
+                "value": "test"
+            },
         }
 
     def test_execute_single_output_response_raw_reference_complex(self):
@@ -4004,15 +4040,15 @@ def test_execute_single_output_multipart_accept_data(self):
         assert ContentType.MULTIPART_MIXED in results.content_type
         boundary = parse_kvp(results.headers["Content-Type"])["boundary"][0]
         output_json = repr_json({"data": "test"}, separators=(",", ":"), force_string=True)
-        results_body = inspect.cleandoc(f"""
+        results_body = self.fix_result_multipart_indent(f"""
             --{boundary}
             Content-Type: {ContentType.APP_JSON}
             Content-Location: {out_url}/{job_id}/output_json/result.json
             Content-ID: <output_json@{job_id}>
 
             {output_json}
             --{boundary}--
-        """).replace("\n", "\r\n")
+        """)
         results_text = self.remove_result_multipart_variable(results.text)
         assert results_text == results_body
         outputs = self.app.get(f"/jobs/{job_id}/outputs", params={"schema": JobInputsOutputsSchema.OGC_STRICT})
@@ -4077,7 +4113,7 @@ def test_execute_single_output_multipart_accept_link(self):
         results = resp
         assert ContentType.MULTIPART_MIXED in results.content_type
         boundary = parse_kvp(results.headers["Content-Type"])["boundary"][0]
-        results_body = inspect.cleandoc(f"""
+        results_body = self.fix_result_multipart_indent(f"""
             --{boundary}
             Content-Disposition: attachment; name="output_json"; filename="result.json"
             Content-Type: {ContentType.APP_JSON}
@@ -4086,7 +4122,7 @@ def test_execute_single_output_multipart_accept_link(self):
             Content-Length: 0
 
             --{boundary}--
-        """).replace("\n", "\r\n")
+        """)
         results_text = self.remove_result_multipart_variable(results.text)
         assert results_text == results_body
         outputs = self.app.get(f"/jobs/{job_id}/outputs", params={"schema": JobInputsOutputsSchema.OGC_STRICT})
@@ -4151,15 +4187,15 @@ def test_execute_single_output_multipart_accept_alt_format(self):
         assert ContentType.MULTIPART_MIXED in results.content_type
         boundary = parse_kvp(results.headers["Content-Type"])["boundary"][0]
         output_json_as_yaml = yaml.safe_dump({"data": "test"})
-        results_body = inspect.cleandoc(f"""
+        results_body = self.fix_result_multipart_indent(f"""
             --{boundary}
             Content-Type: {ContentType.APP_YAML}
             Content-ID: <output_json@{job_id}>
             Content-Length: 12
 
             {output_json_as_yaml}
             --{boundary}--
-        """).replace("\n", "\r\n")
+        """)
         results_text = self.remove_result_multipart_variable(results.text)
         assert results.content_type.startswith(ContentType.MULTIPART_MIXED)
         assert results_text == results_body
@@ -4182,7 +4218,7 @@ def test_execute_single_output_multipart_accept_alt_format(self):
 
     # FIXME: implement (https://github.com/crim-ca/weaver/pull/548)
     @pytest.mark.xfail(reason="not implemented")
-    def test_execute_single_output_response_document_alt_format(self):
+    def test_execute_single_output_response_document_alt_format_yaml(self):
         proc = "EchoResultsTester"
         p_id = self.fully_qualified_test_process_name(proc)
         body = self.retrieve_payload(proc, "deploy", local=True)
@@ -4227,15 +4263,15 @@ def test_execute_single_output_response_document_alt_format(self):
         assert ContentType.MULTIPART_MIXED in results.content_type
         boundary = parse_kvp(results.headers["Content-Type"])["boundary"][0]
         output_json_as_yaml = yaml.safe_dump({"data": "test"})
-        results_body = inspect.cleandoc(f"""
+        results_body = self.fix_result_multipart_indent(f"""
             --{boundary}
             Content-Type: {ContentType.APP_YAML}
             Content-ID: <output_json@{job_id}>
             Content-Length: 12
 
             {output_json_as_yaml}
             --{boundary}--
-        """).replace("\n", "\r\n")
+        """)
         results_text = self.remove_result_multipart_variable(results.text)
         assert results.content_type.startswith(ContentType.MULTIPART_MIXED)
         assert results_text == results_body
@@ -4461,7 +4497,7 @@ def test_execute_multi_output_multipart_accept(self, multipart_header):
 
         results = self.app.get(f"/jobs/{job_id}/results")
         boundary = parse_kvp(results.headers["Content-Type"])["boundary"][0]
-        results_body = inspect.cleandoc(f"""
+        results_body = self.fix_result_multipart_indent(f"""
             --{boundary}
             Content-Disposition: attachment; name="output_data"
             Content-Type: {ContentType.TEXT_PLAIN}
@@ -4477,7 +4513,7 @@ def test_execute_multi_output_multipart_accept(self, multipart_header):
             Content-Length: 0
 
             --{boundary}--
-        """).replace("\n", "\r\n")
+        """)
         results_text = self.remove_result_multipart_variable(results.text)
         assert results.content_type.startswith(ContentType.MULTIPART_MIXED)
         assert results_text == results_body
@@ -4611,10 +4647,10 @@ def test_execute_multi_output_prefer_header_return_representation(self):
         out_url = get_wps_output_url(self.settings)
         results = self.app.get(f"/jobs/{job_id}/results")
         boundary = parse_kvp(results.headers["Content-Type"])["boundary"][0]
-        output_json = repr_json({"data": "test"}, separators=(",", ":"), force_string=True)
-        results_body = inspect.cleandoc(f"""
+        output_json = repr_json({"data": "test"}, indent=None, separators=(",", ":"), force_string=True)
+        results_body = self.fix_result_multipart_indent(f"""
             --{boundary}
-            Content-Disposition: attachment; filename="output_data.txt"; name="output_data"
+            Content-Disposition: attachment; name="output_data"; filename="output_data.txt"
             Content-Type: {ContentType.TEXT_PLAIN}
             Content-ID: <output_data@{job_id}>
             Content-Length: 4
@@ -4629,7 +4665,7 @@ def test_execute_multi_output_prefer_header_return_representation(self):
 
             {output_json}
             --{boundary}--
-        """).replace("\n", "\r\n")
+        """)
         results_text = self.remove_result_multipart_variable(results.text)
         assert results.content_type.startswith(ContentType.MULTIPART_MIXED)
         assert results_text == results_body
@@ -4683,7 +4719,7 @@ def test_execute_multi_output_response_raw_value(self):
         results = self.app.get(f"/jobs/{job_id}/results")
         boundary = parse_kvp(results.headers["Content-Type"])["boundary"][0]
         output_json = repr_json({"data": "test"}, separators=(",", ":"), force_string=True)
-        results_body = inspect.cleandoc(f"""
+        results_body = self.fix_result_multipart_indent(f"""
             --{boundary}
             Content-Disposition: attachment; name="output_data"
             Content-Type: {ContentType.TEXT_PLAIN}
@@ -4700,7 +4736,7 @@ def test_execute_multi_output_response_raw_value(self):
 
             {output_json}
             --{boundary}--
-        """).replace("\n", "\r\n")
+        """)
         assert results.content_type.startswith(ContentType.MULTIPART_MIXED)
         assert results.text == results_body
         outputs = self.app.get(f"/jobs/{job_id}/outputs", params={"schema": JobInputsOutputsSchema.OGC_STRICT})
@@ -4838,7 +4874,7 @@ def test_execute_multi_output_response_raw_reference_accept_multipart(self):
 
         results = self.app.get(f"/jobs/{job_id}/results")
         boundary = parse_kvp(results.headers["Content-Type"])["boundary"][0]
-        results_body = inspect.cleandoc(f"""
+        results_body = self.fix_result_multipart_indent(f"""
             --{boundary}
             Content-Disposition: attachment; name="output_data"; filename="output_data.txt"
             Content-Type: {ContentType.TEXT_PLAIN}
@@ -4854,7 +4890,7 @@ def test_execute_multi_output_response_raw_reference_accept_multipart(self):
             Content-Length: 0
 
             --{boundary}--
-        """).replace("\n", "\r\n")
+        """)
         results_text = self.remove_result_multipart_variable(results.text)
         assert results.content_type.startswith(ContentType.MULTIPART_MIXED)
         assert results_text == results_body
@@ -4908,8 +4944,8 @@ def test_execute_multi_output_response_raw_mixed(self):
         out_url = get_wps_output_url(self.settings)
         results = self.app.get(f"/jobs/{job_id}/results")
         boundary = parse_kvp(results.headers["Content-Type"])["boundary"][0]
-        output_json = repr_json({"data": "test"}, separators=(",", ":"), force_string=True)
-        results_body = inspect.cleandoc(f"""
+        output_json = repr_json({"data": "test"}, indent=None, separators=(",", ":"), force_string=True)
+        results_body = self.fix_result_multipart_indent(f"""
             --{boundary}
             Content-Disposition: attachment; name="output_data"
             Content-Type: {ContentType.TEXT_PLAIN}
@@ -4933,7 +4969,7 @@ def test_execute_multi_output_response_raw_mixed(self):
 
             {output_json}
             --{boundary}--
-        """).replace("\n", "\r\n")
+        """)
         results_text = self.remove_result_multipart_variable(results.text)
         assert results.content_type.startswith(ContentType.MULTIPART_MIXED)
         assert results_text == results_body
@@ -5060,11 +5096,11 @@ def test_execute_multi_output_prefer_header_return_minimal_override_transmission
         out_url = get_wps_output_url(self.settings)
         results = self.app.get(f"/jobs/{job_id}/results")
         results_json = self.remove_result_format(results.json)
-        output_json = repr_json({"data": "test"}, separators=(",", ":"), force_string=True)
+        output_json = repr_json({"data": "test"}, indent=None, separators=(",", ":"), force_string=True)
         assert results.content_type.startswith(ContentType.APP_JSON)
         assert results_json == {
             "output_data": {
-                "href": f"{out_url}/{job_id}/output_text/result.txt",
+                "href": f"{out_url}/{job_id}/output_data/output_data.txt",
                 "type": ContentType.TEXT_PLAIN,
             },
             "output_json": {

diff --git a/weaver/cli.py b/weaver/cli.py
@@ -51,7 +51,7 @@
     import_target,
     load_file,
     null,
-    parse_kvp,
+    parse_link_header,
     request_extra,
     setup_loggers
 )
@@ -1691,23 +1691,21 @@ def _download_references(self, outputs, out_links, out_dir, job_id, auth=None):
         # download links from headers
         LOGGER.debug("%s outputs in results link headers.", "Processing" if len(out_links) else "No")
         for _, link_header in ResponseHeaders(out_links).items():
-            link, params = link_header.split(";", 1)
-            href = link.strip("<>")
-            params = parse_kvp(params, multi_value_sep=None, accumulate_keys=False)
-            ctype = (params.get("type") or [None])[0]  # type: str
-            rel = params["rel"][0].split(".")
+            link = parse_link_header(link_header)
+            rel = link["rel"].rsplit(".", 1)
             output = rel[0]
             is_array = len(rel) > 1 and str.isnumeric(rel[1])
-            ref_path = fetch_reference(href, out_dir, auth=auth,
+            ref_path = fetch_reference(link["href"], out_dir, auth=auth,
                                        out_method=OutputMethod.COPY, out_listing=False)
-            value = {"href": href, "type": ctype, "path": ref_path, "source": "link"}  # type: ExecutionResultObjectRef
+            link = cast("ExecutionResultObjectRef", link)
+            link.update({"path": ref_path, "source": "link"})
             if output in outputs:
                 if isinstance(outputs[output], dict):  # in case 'rel="<output>.<index>"' was not employed
-                    outputs[output] = [outputs[output], value]
+                    outputs[output] = [outputs[output], link]
                 else:
-                    outputs[output].append(value)
+                    outputs[output].append(link)
             else:
-                outputs[output] = [value] if is_array else value
+                outputs[output] = [link] if is_array else link
         return outputs
 
     def results(self,

diff --git a/weaver/typedefs.py b/weaver/typedefs.py
@@ -113,7 +113,7 @@
     JSON = Union[Dict[str, Union[_JSON, _JsonItem]], List[Union[_JSON, _JsonItem]], AnyValueType]
 
     Link = TypedDict("Link", {
-        "title": str,
+        "title": NotRequired[str],
         "rel": Required[str],
         "href": Required[str],
         "hreflang": NotRequired[str],
@@ -349,7 +349,8 @@ class CWL_SchemaName(Protocol):
     WPS_OutputAsRefMediaType = Tuple[str, Optional[bool], Optional[str]]    # (output_id, as_ref, mime_type)
     WPS_OutputRequested = Union[WPS_OutputAsRef, WPS_OutputAsRefMediaType]
 
-    KVP_Item = Union[ValueType, Sequence[ValueType]]
+    KVP_Value = Optional[str]
+    KVP_Item = Union[KVP_Value, Sequence[KVP_Value]]
     KVP_Container = Union[Sequence[Tuple[str, KVP_Item]], Dict[str, KVP_Item]]
     KVP = Dict[str, List[KVP_Item]]
 

diff --git a/weaver/utils.py b/weaver/utils.py
@@ -1321,9 +1321,9 @@ def get_href_headers(
             if os.path.splitext(path)[-1] in ["", "."]:
                 f_ext = get_extension(f_type, dot=True)
                 path = f"{path}{f_ext}"
-            content_disposition_params = f"filename=\"{os.path.basename(path)}\""
-            if content_name:
-                content_disposition_params += f"; name=\"{content_name}\""
+            # set name, then filename, to align with order employed by requests-toolbelt multipart class
+            content_disposition_params = f"name=\"{content_name}\"; "if content_name else ""
+            content_disposition_params += f"filename=\"{os.path.basename(path)}\""
             headers["Content-Disposition"] = f"{content_disposition_type}; {content_disposition_params}"
     f_current = get_file_header_datetime(now())
     headers["Date"] = f_current
@@ -1371,6 +1371,23 @@ def make_link_header(
     return link
 
 
+def parse_link_header(link_header):
+    # type: (str) -> Link
+    """
+    Parses the parameters of the ``Link`` header.
+    """
+    url, params = link_header.split(";", 1)
+    href = url.strip("<>")
+    params = parse_kvp(params, multi_value_sep=None, accumulate_keys=False)
+    ctype = (params.pop("type", None) or [None])[0]
+    rel = str(params.pop("rel")[0])
+    link = {"href": href, "rel": rel}  # type: Link
+    if ctype and isinstance(ctype, str):
+        link["type"] = ctype
+    link.update({param: value[0] for param, value in params.items() if value})
+    return link
+
+
 def get_base_url(url):
     # type: (str) -> str
     """