From 44d1a1e0e9b2a9045f3a3bd9289428239f011520 Mon Sep 17 00:00:00 2001
From: toonn <toonn@toonn.io>
Date: Wed, 21 Sep 2022 22:45:24 +0200
Subject: [PATCH 1/4] util: Allow path separators and spaces in filenames

The path separators are necessary if we want to be able to organize
comics into books or chapters or categories.

The spaces are nice to have. I assume they haven't been allowed because
they require escaping in a terminal, but any modern shell provides
assistance for that during tab completion. Disallowing spaces here means
they can never be used. Instead, a method could be added to an
appropriate class to sanitize filenames and replace spaces by default if
that is actually desired.
---
 dosagelib/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dosagelib/util.py b/dosagelib/util.py
index 2005f1e76b..5482084adb 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -415,7 +415,7 @@ def getFilename(name):
     """Get a filename from given name without dangerous or incompatible
     characters."""
     # first replace all illegal chars
-    name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
+    name = re.sub(r"[^0-9a-zA-Z_ /\-\.\\]", "_", name)
     # then remove double dots and underscores
     while ".." in name:
         name = name.replace('..', '.')

From de2c599b53b729b2937d9d5ddfb29c5de6b12748 Mon Sep 17 00:00:00 2001
From: toonn <toonn@toonn.io>
Date: Wed, 21 Sep 2022 22:49:38 +0200
Subject: [PATCH 2/4] comic: Allow filename to be a path

If the filename is a path we need to ensure all its components exist to
avoid errors later when we try to create the file at that path.
---
 dosagelib/comic.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/dosagelib/comic.py b/dosagelib/comic.py
index 20374c1263..8f8704cb95 100644
--- a/dosagelib/comic.py
+++ b/dosagelib/comic.py
@@ -144,7 +144,10 @@ def _exist_err(self, fn):
     def _fnbase(self, basepath):
         '''Determine the target base name of this comic file and make sure the
         directory exists.'''
-        comicdir = self.scraper.get_download_dir(basepath)
+        comicpath = os.path.join(
+            self.scraper.get_download_dir(basepath), self.filename
+        )
+        comicdir = os.path.dirname(comicpath)
         if not os.path.isdir(comicdir):
             os.makedirs(comicdir)
-        return os.path.join(comicdir, self.filename)
+        return comicpath

From 2d2c1918fc87e5c7e5fd880ef58a660988ccb85b Mon Sep 17 00:00:00 2001
From: toonn <toonn@toonn.io>
Date: Wed, 21 Sep 2022 22:53:26 +0200
Subject: [PATCH 3/4] DresdenCodak: Fix and improve scraper

The scraper was broken due to the site layout changing. The site lacks a
consistent structure, so there are many special cases to deal with.

As the comic is separated into two storylines, one finished and one
ongoing, plus a series of one-offs that don't fit in either storyline,
I've put each of the three series into its own subdirectory.
---
 dosagelib/plugins/d.py | 75 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py
index f7a2e1933c..a3634922d7 100644
--- a/dosagelib/plugins/d.py
+++ b/dosagelib/plugins/d.py
@@ -329,17 +329,78 @@ class DreamKeepersPrelude(_ParserScraper):
 
 
 class DresdenCodak(_ParserScraper):
-    url = 'http://dresdencodak.com/'
-    startUrl = url + 'cat/comic/'
-    firstStripUrl = url + '2007/02/08/pom/'
-    imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]'
+    from datetime import datetime
+
+    url = "https://dresdencodak.com/"
+    firstStripUrl = url + "2005/06/08/the-tomorrow-man/"
+    imageSearch = '(//section[d:class("entry-content")]//img[d:class("size-full") and not (contains(@alt, "revious") or contains(@alt,"irst") or contains(@alt,"ext"))])[1]'
+    textSearch = '//section[d:class("entry-content")]//p[(4 < position()) and (position() < (last() - 1))]'
+    textOptional = True
     prevSearch = '//a[img[contains(@src, "prev")]]'
     latestSearch = '//a[d:class("tc-grid-bg-link")]'
     starter = indirectStarter
 
-    # Blog and comic are mixed...
-    def shouldSkipUrl(self, url, data):
-        return not data.xpath(self.imageSearch)
+    # Haven't found a better way to distinguish whether or not a page is part
+    # of Hob than by the date prefix.
+    date_format = "%Y-%m-%d"
+    hob_start = datetime.strptime("2007-02-08", date_format)
+    hob_end = datetime.strptime("2008-10-22", date_format)
+
+    pagenumber_re = compile("(?:[0-9]+-)*[^0-9]+_([0-9]+)(?:a|-1|_001)?\.jpg$")
+
+    def getPrevUrl(self, url, data):
+        # Fix skipping newest One-Off
+        if url == self.url + "2010/06/03/dark-science-01/":
+            newurl = self.url + "category/oneoffs/"
+            return self.fetchUrl(
+                newurl, self.getPage(newurl), self.latestSearch
+            )
+        return super(DresdenCodak, self).getPrevUrl(url, data)
+
+    def namer(self, image_url, page_url):
+        import os.path
+
+        filename = image_url.rsplit("/", 1)[-1]
+        # The archives are divided into three parts:
+        # Dark Science, Hob and One-Offs
+        if filename.startswith("ds"):
+            filename = filename[:2] + "_" + filename[2:]
+        elif filename == "84_new.jpg":
+            # Single anomalous page
+            filename = "ds_84.jpg"
+        elif filename == "cyborg_time.jpg":
+            filename = os.path.join("Dark Science", "84b.jpg")
+        elif "act_4" in filename:
+            filename = os.path.join("Dark Science", "80b.jpg")
+        elif "act_3" in filename:
+            filename = os.path.join("Dark Science", "38b.jpg")
+        elif "act_2" in filename:
+            filename = os.path.join("Dark Science", "18b.jpg")
+
+        if filename.startswith("ds_") or "-dark_science_" in filename:
+            # Dark Science
+            import re
+
+            pagenumber = re.match(self.pagenumber_re, filename).group(1)
+            filename = os.path.join(
+                "Dark Science", "{0:0>3}".format(pagenumber)
+            )
+        elif "/" not in filename:
+            # Hob
+            from datetime import datetime
+
+            date_prefix = page_url.rsplit("/", 5)[-5:-2]
+            date = datetime(*(int(i) for i in date_prefix))
+            if self.hob_start <= date <= self.hob_end:
+                filename = os.path.join("Hob", filename)
+            else:
+                # One-Offs
+                year_day_prefix = date.strftime("%Y-%m-%d")
+                filename = os.path.join(
+                    "One-Offs", "{0}-{1}".format(year_day_prefix, filename)
+                )
+
+        return filename
 
 
 class DrFun(_ParserScraper):

From fb0ea00bcb272d7d889a9114d61d05cf1368c6b2 Mon Sep 17 00:00:00 2001
From: toonn <toonn@toonn.io>
Date: Thu, 29 Dec 2022 00:52:40 +0100
Subject: [PATCH 4/4] DresdenCodak: New pages can have the resolution in the
 name

---
 dosagelib/plugins/d.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py
index a3634922d7..2ab1a5b116 100644
--- a/dosagelib/plugins/d.py
+++ b/dosagelib/plugins/d.py
@@ -346,7 +346,9 @@ class DresdenCodak(_ParserScraper):
     hob_start = datetime.strptime("2007-02-08", date_format)
     hob_end = datetime.strptime("2008-10-22", date_format)
 
-    pagenumber_re = compile("(?:[0-9]+-)*[^0-9]+_([0-9]+)(?:a|-1|_001)?\.jpg$")
+    pagenumber_re = compile(
+        "(?:[0-9]+-)*[^0-9]+_([0-9]+)(?:a|b|-1|_001|-[0-9]+x[0-9]+)?\.jpg$"
+    )
 
     def getPrevUrl(self, url, data):
         # Fix skipping newest One-Off