diff --git a/dosagelib/comic.py b/dosagelib/comic.py
index 20374c1263..8f8704cb95 100644
--- a/dosagelib/comic.py
+++ b/dosagelib/comic.py
@@ -144,7 +144,11 @@ def _exist_err(self, fn):
     def _fnbase(self, basepath):
         '''Determine the target base name of this comic file and make sure the
         directory exists.'''
-        comicdir = self.scraper.get_download_dir(basepath)
+        # The file name may include sub-directories; create the full parent.
+        comicpath = os.path.join(
+            self.scraper.get_download_dir(basepath), self.filename
+        )
+        comicdir = os.path.dirname(comicpath)
         if not os.path.isdir(comicdir):
             os.makedirs(comicdir)
-        return os.path.join(comicdir, self.filename)
+        return comicpath
diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py
index f7a2e1933c..2ab1a5b116 100644
--- a/dosagelib/plugins/d.py
+++ b/dosagelib/plugins/d.py
@@ -329,17 +329,86 @@ class DreamKeepersPrelude(_ParserScraper):
 
 
 class DresdenCodak(_ParserScraper):
-    url = 'http://dresdencodak.com/'
-    startUrl = url + 'cat/comic/'
-    firstStripUrl = url + '2007/02/08/pom/'
-    imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]'
+    from datetime import datetime
+
+    url = "https://dresdencodak.com/"
+    firstStripUrl = url + "2005/06/08/the-tomorrow-man/"
+    imageSearch = '(//section[d:class("entry-content")]//img[d:class("size-full") and not (contains(@alt, "revious") or contains(@alt,"irst") or contains(@alt,"ext"))])[1]'
+    textSearch = '//section[d:class("entry-content")]//p[(4 < position()) and (position() < (last() - 1))]'
+    textOptional = True
     prevSearch = '//a[img[contains(@src, "prev")]]'
     latestSearch = '//a[d:class("tc-grid-bg-link")]'
     starter = indirectStarter
 
-    # Blog and comic are mixed...
-    def shouldSkipUrl(self, url, data):
-        return not data.xpath(self.imageSearch)
+    # Haven't found a better way to distinguish whether or not a page is part
+    # of Hob than by the date prefix.
+    date_format = "%Y-%m-%d"
+    hob_start = datetime.strptime("2007-02-08", date_format)
+    hob_end = datetime.strptime("2008-10-22", date_format)
+
+    pagenumber_re = compile(
+        r"(?:[0-9]+-)*[^0-9]+_([0-9]+)(?:a|b|-1|_001|-[0-9]+x[0-9]+)?\.jpg$"
+    )
+
+    def getPrevUrl(self, url, data):
+        """Return the previous page URL, detouring through the One-Offs."""
+        # Fix skipping newest One-Off
+        if url == self.url + "2010/06/03/dark-science-01/":
+            newurl = self.url + "category/oneoffs/"
+            return self.fetchUrl(
+                newurl, self.getPage(newurl), self.latestSearch
+            )
+        return super(DresdenCodak, self).getPrevUrl(url, data)
+
+    def namer(self, image_url, page_url):
+        """Sort images into "Dark Science", "Hob" and "One-Offs" folders.
+
+        Returns a path relative to the download directory.
+        """
+        import os.path
+
+        filename = image_url.rsplit("/", 1)[-1]
+        # The archives are divided into three parts:
+        # Dark Science, Hob and One-Offs
+        if filename.startswith("ds"):
+            filename = filename[:2] + "_" + filename[2:]
+        elif filename == "84_new.jpg":
+            # Single anomalous page
+            filename = "ds_84.jpg"
+        elif filename == "cyborg_time.jpg":
+            filename = os.path.join("Dark Science", "84b.jpg")
+        elif "act_4" in filename:
+            filename = os.path.join("Dark Science", "80b.jpg")
+        elif "act_3" in filename:
+            filename = os.path.join("Dark Science", "38b.jpg")
+        elif "act_2" in filename:
+            filename = os.path.join("Dark Science", "18b.jpg")
+
+        if filename.startswith("ds_") or "-dark_science_" in filename:
+            # Dark Science
+            import re
+
+            pagenumber = re.match(self.pagenumber_re, filename).group(1)
+            filename = os.path.join(
+                "Dark Science", "{0:0>3}".format(pagenumber)
+            )
+        elif os.sep not in filename:
+            # No sub-directory assigned yet; os.path.join() uses os.sep.
+            # Hob
+            from datetime import datetime
+
+            date_prefix = page_url.rsplit("/", 5)[-5:-2]
+            date = datetime(*(int(i) for i in date_prefix))
+            if self.hob_start <= date <= self.hob_end:
+                filename = os.path.join("Hob", filename)
+            else:
+                # One-Offs
+                year_day_prefix = date.strftime("%Y-%m-%d")
+                filename = os.path.join(
+                    "One-Offs", "{0}-{1}".format(year_day_prefix, filename)
+                )
+
+        return filename
 
 
 class DrFun(_ParserScraper):
diff --git a/dosagelib/util.py b/dosagelib/util.py
index 2005f1e76b..5482084adb 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -415,7 +415,7 @@ def getFilename(name):
     """Get a filename from given name without dangerous or incompatible
     characters."""
     # first replace all illegal chars
-    name = re.sub(r"[^0-9a-zA-Z_\-\.]", "_", name)
+    name = re.sub(r"[^0-9a-zA-Z_ /\-\.\\]", "_", name)
     # then remove double dots and underscores
     while ".." in name:
         name = name.replace('..', '.')