From 44d1a1e0e9b2a9045f3a3bd9289428239f011520 Mon Sep 17 00:00:00 2001 From: toonn Date: Wed, 21 Sep 2022 22:45:24 +0200 Subject: [PATCH 1/4] util: Allow path separators and spaces in filenames The path separators are necessary if we want to be able to organize comics into books or chapters or categories. The spaces are nice to have. I assume they haven't been allowed because they require escaping in a terminal but any modern terminal provides assistance for that during tab completion. Disallowing spaces here means they can never be used. Instead a method could be added to an appropriate class to sanitize filenames and replace spaces by default if that is actually desired. --- dosagelib/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dosagelib/util.py b/dosagelib/util.py index 2005f1e76b..5482084adb 100644 --- a/dosagelib/util.py +++ b/dosagelib/util.py @@ -415,7 +415,7 @@ def getFilename(name): """Get a filename from given name without dangerous or incompatible characters.""" # first replace all illegal chars - name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name) + name = re.sub(r"[^0-9a-zA-Z_ /\-\.\\]", "_", name) # then remove double dots and underscores while ".." in name: name = name.replace('..', '.') From de2c599b53b729b2937d9d5ddfb29c5de6b12748 Mon Sep 17 00:00:00 2001 From: toonn Date: Wed, 21 Sep 2022 22:49:38 +0200 Subject: [PATCH 2/4] comic: Allow filename to be a path If the filename is a path we need to ensure all its components exist to avoid errors later when we try to create the file at that path. 
--- dosagelib/comic.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dosagelib/comic.py b/dosagelib/comic.py index 20374c1263..8f8704cb95 100644 --- a/dosagelib/comic.py +++ b/dosagelib/comic.py @@ -144,7 +144,10 @@ def _exist_err(self, fn): def _fnbase(self, basepath): '''Determine the target base name of this comic file and make sure the directory exists.''' - comicdir = self.scraper.get_download_dir(basepath) + comicpath = os.path.join( + self.scraper.get_download_dir(basepath), self.filename + ) + comicdir = os.path.dirname(comicpath) if not os.path.isdir(comicdir): os.makedirs(comicdir) - return os.path.join(comicdir, self.filename) + return comicpath From 2d2c1918fc87e5c7e5fd880ef58a660988ccb85b Mon Sep 17 00:00:00 2001 From: toonn Date: Wed, 21 Sep 2022 22:53:26 +0200 Subject: [PATCH 3/4] DresdenCodak: Fix and improve scraper The scraper was broken due to the site layout changing. The structure is lacking so there are many unique cases to deal with. As the comic is separated into two storylines, one finished and one ongoing, and a series of one-offs that don't fit in either storyline I've put each of the series into its own subdirectory. 
--- dosagelib/plugins/d.py | 75 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py index f7a2e1933c..a3634922d7 100644 --- a/dosagelib/plugins/d.py +++ b/dosagelib/plugins/d.py @@ -329,17 +329,78 @@ class DreamKeepersPrelude(_ParserScraper): class DresdenCodak(_ParserScraper): - url = 'http://dresdencodak.com/' - startUrl = url + 'cat/comic/' - firstStripUrl = url + '2007/02/08/pom/' - imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]' + from datetime import datetime + + url = "https://dresdencodak.com/" + firstStripUrl = url + "2005/06/08/the-tomorrow-man/" + imageSearch = '(//section[d:class("entry-content")]//img[d:class("size-full") and not (contains(@alt, "revious") or contains(@alt,"irst") or contains(@alt,"ext"))])[1]' + textSearch = '//section[d:class("entry-content")]//p[(4 < position()) and (position() < (last() - 1))]' + textOptional = True prevSearch = '//a[img[contains(@src, "prev")]]' latestSearch = '//a[d:class("tc-grid-bg-link")]' starter = indirectStarter - # Blog and comic are mixed... - def shouldSkipUrl(self, url, data): - return not data.xpath(self.imageSearch) + # Haven't found a better way to distinguish whether or not a page is part + # of Hob than by the date prefix. 
+ date_format = "%Y-%m-%d" + hob_start = datetime.strptime("2007-02-08", date_format) + hob_end = datetime.strptime("2008-10-22", date_format) + + pagenumber_re = compile("(?:[0-9]+-)*[^0-9]+_([0-9]+)(?:a|-1|_001)?\.jpg$") + + def getPrevUrl(self, url, data): + # Fix skipping newest One-Off + if url == self.url + "2010/06/03/dark-science-01/": + newurl = self.url + "category/oneoffs/" + return self.fetchUrl( + newurl, self.getPage(newurl), self.latestSearch + ) + return super(DresdenCodak, self).getPrevUrl(url, data) + + def namer(self, image_url, page_url): + import os.path + + filename = image_url.rsplit("/", 1)[-1] + # The archives are divided into three parts: + # Dark Science, Hob and One-Offs + if filename.startswith("ds"): + filename = filename[:2] + "_" + filename[2:] + elif filename == "84_new.jpg": + # Single anomalous page + filename = "ds_84.jpg" + elif filename == "cyborg_time.jpg": + filename = os.path.join("Dark Science", "84b.jpg") + elif "act_4" in filename: + filename = os.path.join("Dark Science", "80b.jpg") + elif "act_3" in filename: + filename = os.path.join("Dark Science", "38b.jpg") + elif "act_2" in filename: + filename = os.path.join("Dark Science", "18b.jpg") + + if filename.startswith("ds_") or "-dark_science_" in filename: + # Dark Science + import re + + pagenumber = re.match(self.pagenumber_re, filename).group(1) + filename = os.path.join( + "Dark Science", "{0:0>3}".format(pagenumber) + ) + elif "/" not in filename: + # Hob + from datetime import datetime + + date_prefix = page_url.rsplit("/", 5)[-5:-2] + date = datetime(*(int(i) for i in date_prefix)) + if self.hob_start <= date <= self.hob_end: + filename = os.path.join("Hob", filename) + else: + # One-Offs + year_day_prefix = date.strftime("%Y-%m-%d") + filename = os.path.join( + "One-Offs", "{0}-{1}".format(year_day_prefix, filename) + ) + + return filename class DrFun(_ParserScraper): From fb0ea00bcb272d7d889a9114d61d05cf1368c6b2 Mon Sep 17 00:00:00 2001 From: toonn Date: Thu, 
29 Dec 2022 00:52:40 +0100 Subject: [PATCH 4/4] DresdenCodak: New pages can have the resolution in the name --- dosagelib/plugins/d.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py index a3634922d7..2ab1a5b116 100644 --- a/dosagelib/plugins/d.py +++ b/dosagelib/plugins/d.py @@ -346,7 +346,9 @@ class DresdenCodak(_ParserScraper): hob_start = datetime.strptime("2007-02-08", date_format) hob_end = datetime.strptime("2008-10-22", date_format) - pagenumber_re = compile("(?:[0-9]+-)*[^0-9]+_([0-9]+)(?:a|-1|_001)?\.jpg$") + pagenumber_re = compile( + r"(?:[0-9]+-)*[^0-9]+_([0-9]+)(?:a|b|-1|_001|-[0-9]+x[0-9]+)?\.jpg$" + ) def getPrevUrl(self, url, data): # Fix skipping newest One-Off