diff --git a/newsfragments/705.feature b/newsfragments/705.feature new file mode 100644 index 000000000..562128534 --- /dev/null +++ b/newsfragments/705.feature @@ -0,0 +1,10 @@ +The template handling mechanism is extended so that a template with a +single ``#`` is expanded to match non-zero-padded sequential numbers. +For example, ``image_#.cbf`` will match ``image_1.cbf``, ``image_2.cbf``, +..., ``image_10.cbf`` and so on. + +Using a single ``#`` to match up to 10 images *within* a zero-padded +sequence continues to work as before. For example, +``dials.import template=insulin_1_01#.img`` will match the files +``insulin_1_010.img``, ``insulin_1_011.img``, ..., ``insulin_1_019.img``, +and no others. diff --git a/src/dxtbx/imageset.py b/src/dxtbx/imageset.py index 564ced681..9061387fa 100644 --- a/src/dxtbx/imageset.py +++ b/src/dxtbx/imageset.py @@ -1,5 +1,7 @@ from __future__ import annotations +import natsort + import boost_adaptbx.boost.python import dxtbx.format.image # noqa: F401, import dependency for unpickling @@ -45,7 +47,9 @@ ) -def _expand_template(template: str, indices: Iterable[int]) -> list[str]: +def _expand_template_to_sorted_filenames( + template: str, indices: Iterable[int] +) -> list[str]: """Expand a template string to a list of filenames. Args: @@ -55,7 +59,13 @@ def _expand_template(template: str, indices: Iterable[int]) -> list[str]: pfx = template.split("#")[0] sfx = template.split("#")[-1] count = template.count("#") - return [f"{pfx}{index:0{count}}{sfx}" for index in indices] + if count == 1: + # Special handling for a template with a single "#", which does not + # assume a zero-padded index. 
+ filenames = [f"{pfx}{index}{sfx}" for index in indices] + else: + filenames = [f"{pfx}{index:0{count}}{sfx}" for index in indices] + return natsort.natsorted(filenames) class MemReader: @@ -449,7 +459,7 @@ def from_template( # Set the image range indices = range(image_range[0], image_range[1] + 1) - filenames = _expand_template(template, indices) + filenames = _expand_template_to_sorted_filenames(template, indices) else: if "master" not in template: raise ValueError("Invalid template") @@ -486,7 +496,7 @@ def _create_imageset(filelist, check_headers): # Get the template format if "#" in template: - filenames = sorted(_expand_template(template, indices)) + filenames = _expand_template_to_sorted_filenames(template, indices) else: filenames = [template] @@ -503,7 +513,7 @@ def _create_sequence(filelist, check_headers): # Expand the template if necessary if "#" in template: - filenames = sorted(_expand_template(template, indices)) + filenames = _expand_template_to_sorted_filenames(template, indices) else: filenames = [template] @@ -564,7 +574,7 @@ def make_sequence( # Get the template format if "#" in template: - filenames = sorted(_expand_template(template, indices)) + filenames = _expand_template_to_sorted_filenames(template, indices) else: filenames = [template] diff --git a/src/dxtbx/model/experiment_list.py b/src/dxtbx/model/experiment_list.py index 257515649..299831335 100644 --- a/src/dxtbx/model/experiment_list.py +++ b/src/dxtbx/model/experiment_list.py @@ -915,8 +915,6 @@ def from_templates(templates, **kwargs): f"Image file {filenames[0]} appears to be a '{type(format_class).__name__}', but this is an abstract Format" ) else: - index = slice(*template_string_number_index(template)) - image_range = kwargs.get("image_range") if image_range: first, last = image_range @@ -926,7 +924,13 @@ def from_templates(templates, **kwargs): if not kwargs.get("allow_incomplete_sequences", False): if "#" in template: # Check all images in range are present - if allowed - 
all_numbers = {int(f[index]) for f in filenames} + i0, i1 = template_string_number_index(template) + prefix = template[:i0] + suffix = template[i1:] + all_numbers = { + int(f.replace(prefix, "").replace(suffix, "")) + for f in filenames + } missing = set(range(first, last + 1)) - all_numbers if missing: raise ValueError( diff --git a/src/dxtbx/model/scan_helpers.py b/src/dxtbx/model/scan_helpers.py index 0eec00025..56b1ec4e6 100644 --- a/src/dxtbx/model/scan_helpers.py +++ b/src/dxtbx/model/scan_helpers.py @@ -14,10 +14,11 @@ r"([0-9]{2,12})\.(.*)", r"(.*)\.([0-9]{2,12})_(.*)", r"(.*)\.([0-9]{2,12})(.*)", + r"(.*)\.([0-9]{1,12})([^0]*)_(.*)", r"(.*)\.([0-9]{1})(.*)", ] -joiners = [".", "_", "", ""] +joiners = [".", "_", "", "_", ""] compiled_patterns = [re.compile(pattern) for pattern in patterns] @@ -38,6 +39,10 @@ def template_regex(filename): exten = "." + groups[0][::-1] digits = groups[1][::-1] prefix = groups[2][::-1] + joiners[j] + elif len(groups) == 4: + exten = "." + groups[0][::-1] + digits = groups[1][::-1] + prefix = groups[3][::-1] + joiners[j] + groups[2][::-1] else: exten = "" digits = groups[0][::-1] diff --git a/src/dxtbx/sequence_filenames.py b/src/dxtbx/sequence_filenames.py index d6c056408..420c3f108 100644 --- a/src/dxtbx/sequence_filenames.py +++ b/src/dxtbx/sequence_filenames.py @@ -5,6 +5,8 @@ from collections import defaultdict from glob import glob +import natsort + def template_regex(filename): """Works out a template from a filename. 
@@ -181,6 +183,12 @@ def replace_template_format_with_hash(match):
 
 def template_string_to_glob_expr(template):
     """Convert the template to a glob expression."""
+    if template.count("#") == 1:
+        # A lone "#" stands for an index of any width (no zero padding), so
+        # glob permissively; locate_files_matching_template_string() then
+        # discards names where "*" matched something other than digits.
+        # See https://github.com/cctbx/dxtbx/issues/646
+        return template.replace("#", "*")
     return template.replace("#", "[0-9]")
 
 
@@ -191,7 +199,18 @@ def template_string_number_index(template):
 
 def locate_files_matching_template_string(template):
     """Return all files matching template."""
-    return glob(template_string_to_glob_expr(template))
+    matches = glob(template_string_to_glob_expr(template))
+    if template.count("#") != 1:
+        return matches
+    # A single "#" is globbed as "*", which also matches non-digits, so keep
+    # only names whose basename is exactly <prefix><digits><suffix>. The
+    # literal parts are escaped so that neither regex metacharacters (the "."
+    # before the extension) nor zeros in the prefix affect the match.
+    i0, i1 = template_string_number_index(template)
+    prefix = os.path.basename(template[:i0])
+    suffix = template[i1:]
+    patt = re.compile(re.escape(prefix) + "[0-9]+" + re.escape(suffix))
+    return [p for p in matches if patt.fullmatch(os.path.basename(p))]
 
 
 def template_image_range(template):
@@ -199,19 +218,21 @@ def template_image_range(template):
 
     # Find the files matching the template
     filenames = locate_files_matching_template_string(template)
-    filenames = sorted(filenames)
+    filenames = natsort.natsorted(filenames)
 
     # Check that the template matches some files
     if len(filenames) == 0:
         raise ValueError(f"Template {template} doesn't match any files.")
 
-    # Get the templete format
-    index = slice(*template_string_number_index(template))
-
     # Get the first and last indices
     if "#" in template:
-        first = int(filenames[0][index])
-        last = int(filenames[-1][index])
+        # Slice the index out by position rather than with str.replace(),
+        # which would strip every occurrence of the prefix/suffix text and
+        # could corrupt the number. A single "#" index has variable width.
+        i0, i1 = template_string_number_index(template)
+        n_suffix = len(template) - i1
+        first = int(filenames[0][i0 : len(filenames[0]) - n_suffix])
+        last = int(filenames[-1][i0 : len(filenames[-1]) - n_suffix])
     else:  # template is one file
         first, last = 0, 0
 
diff --git a/tests/test_sequence_filenames.py b/tests/test_sequence_filenames.py
index ed351916e..52be526b2 100644
--- a/tests/test_sequence_filenames.py
+++ b/tests/test_sequence_filenames.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
+import shutil
+
 import pytest
 
-from dxtbx.sequence_filenames import template_regex
+from dxtbx.sequence_filenames import template_image_range, template_regex
 
 
 @pytest.mark.parametrize(
@@ -15,7 +17,38 @@
         ("foo_bar_002.img1000", "foo_bar_###.img1000", 2),
         ("foo_bar_00005.img", "foo_bar_#####.img", 5),
         ("image0010", "image####", 10),
+        ("foo_123_1_1.rodhypix", "foo_123_1_#.rodhypix", 1),  # Rigaku-style
     ],
 )
 def test_template_regex(filename, template, digits):
     assert template_regex(filename) == (template, digits)
+
+
+def test_template_image_range(dials_data):
+    template = str(dials_data("insulin", pathlib=True) / "insulin_1_###.img")
+    assert template_image_range(template) == (1, 45)
+
+
+def test_template_image_range_non_zero_padded(dials_data, tmp_path):
+    images = sorted(dials_data("insulin", pathlib=True).glob("insulin_1_0[0-1]*"))
+    # symlink if possible, copy if necessary
+    for i, image in enumerate(images, start=1):
+        target = tmp_path / f"insulin_1_{i}.img"
+        try:
+            target.symlink_to(image)
+        except OSError:
+            shutil.copy(image, target)
+
+    template = str(tmp_path / "insulin_1_#.img")
+    assert template_image_range(template) == (1, 19)
+
+
+def test_template_image_range_zero_in_prefix(tmp_path):
+    # Regression test: a "0" in the fixed part of the name must not stop a
+    # single-"#" template from matching, and files where the "#" position
+    # holds non-digits must be ignored.
+    for name in ("run0a_1.img", "run0a_2.img", "run0a_10.img", "run0a_x.img"):
+        (tmp_path / name).touch()
+
+    template = str(tmp_path / "run0a_#.img")
+    assert template_image_range(template) == (1, 10)