merge #6050: [wikimedia] add 'wiki' extractor
mikf committed Aug 25, 2024
2 parents 65cae11 + 968c04a commit 4b286e8
Showing 4 changed files with 111 additions and 41 deletions.
12 changes: 12 additions & 0 deletions docs/configuration.rst
@@ -4559,6 +4559,18 @@ Description
Download video files.


extractor.wikimedia.limit
-------------------------
Type
``integer``
Default
``50``
Description
Number of results to return in a single API query.

The value must be between 10 and 500.
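A minimal sketch of setting this option in a gallery-dl configuration file (assuming the usual JSON config layout, and that the base-category section applies to all wikimedia-based sites):

    {
        "extractor": {
            "wikimedia": {
                "limit": 100
            }
        }
    }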


extractor.ytdl.cmdline-args
---------------------------
Type
124 changes: 83 additions & 41 deletions gallery_dl/extractor/wikimedia.py
@@ -17,45 +17,19 @@ class WikimediaExtractor(BaseExtractor):
"""Base class for wikimedia extractors"""
basecategory = "wikimedia"
filename_fmt = "{filename} ({sha1[:8]}).{extension}"
directory_fmt = ("{category}", "{page}")
archive_fmt = "{sha1}"
request_interval = (1.0, 2.0)

def __init__(self, match):
BaseExtractor.__init__(self, match)
path = match.group(match.lastindex)

if self.category == "wikimedia":
self.category = self.root.split(".")[-2]
elif self.category in ("fandom", "wikigg"):
self.category = "{}-{}".format(
self.category, self.root.partition(".")[0].rpartition("/")[2])

if path.startswith("wiki/"):
path = path[5:]

pre, sep, _ = path.partition(":")
prefix = pre.lower() if sep else None

self.title = path = text.unquote(path)
if prefix:
self.subcategory = prefix

if prefix == "category":
self.params = {
"generator": "categorymembers",
"gcmtitle" : path,
"gcmtype" : "file",
}
elif prefix == "file":
self.params = {
"titles" : path,
}
else:
self.params = {
"generator": "images",
"titles" : path,
}
self.per_page = self.config("limit", 50)

def _init(self):
api_path = self.config_instance("api-path")
@@ -67,6 +41,22 @@ def _init(self):
else:
self.api_url = self.root + "/api.php"

@staticmethod
def prepare(image):
"""Adjust the content of a image object"""
image["metadata"] = {
m["name"]: m["value"]
for m in image["metadata"] or ()}
image["commonmetadata"] = {
m["name"]: m["value"]
for m in image["commonmetadata"] or ()}

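# "canonicaltitle" has the form "File:Some name.jpg"; strip the
# namespace prefix and split the extension off the remainder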
filename = image["canonicaltitle"]
image["filename"], _, image["extension"] = \
filename.partition(":")[2].rpartition(".")
image["date"] = text.parse_datetime(
image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")

def items(self):
for info in self._pagination(self.params):
try:
@@ -75,20 +65,7 @@ def items(self):
self.log.debug("Missing 'imageinfo' for %s", info)
continue

image["metadata"] = {
m["name"]: m["value"]
for m in image["metadata"] or ()}
image["commonmetadata"] = {
m["name"]: m["value"]
for m in image["commonmetadata"] or ()}

filename = image["canonicaltitle"]
image["filename"], _, image["extension"] = \
filename.partition(":")[2].rpartition(".")
image["date"] = text.parse_datetime(
image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
image["page"] = self.title

self.prepare(image)
yield Message.Directory, image
yield Message.Url, image["url"], image

@@ -110,6 +87,17 @@ def _pagination(self, params):
while True:
data = self.request(url, params=params).json()

# ref: https://www.mediawiki.org/wiki/API:Errors_and_warnings
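# (illustrative response shapes per the MediaWiki documentation:
#  an error arrives as  {"error": {"code": "...", "info": "..."}},
#  warnings arrive as   {"warnings": {"<module>": {"*": "..."}}})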
error = data.get("error")
if error:
self.log.error("%s: %s", error["code"], error["info"])
return
# MediaWiki will emit warnings for non-fatal mistakes, such as an
# invalid parameter, instead of raising an error
warnings = data.get("warnings")
if warnings:
self.log.debug("MediaWiki returned warnings: %s", warnings)

try:
pages = data["query"]["pages"]
except KeyError:
@@ -181,5 +169,59 @@ class WikimediaArticleExtractor(WikimediaExtractor):
class WikimediaArticleExtractor(WikimediaExtractor):
"""Extractor for wikimedia articles"""
subcategory = "article"
directory_fmt = ("{category}", "{page}")
pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
example = "https://en.wikipedia.org/wiki/TITLE"

def __init__(self, match):
WikimediaExtractor.__init__(self, match)

path = match.group(match.lastindex)
if path.startswith("wiki/"):
path = path[5:]

pre, sep, _ = path.partition(":")
prefix = pre.lower() if sep else None
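# The prefix decides which API query is built below:
#   "Category:..." -> list the category's file members
#   "File:..."     -> query that single title
#   anything else  -> list all files used on the article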

self.title = path = text.unquote(path)
if prefix:
self.subcategory = prefix

if prefix == "category":
self.params = {
"generator": "categorymembers",
"gcmtitle" : path,
"gcmtype" : "file",
"gcmlimit" : self.per_page,
}
elif prefix == "file":
self.params = {
"titles" : path,
}
else:
self.params = {
"generator": "images",
"gimlimit" : self.per_page,
"titles" : path,
}

def prepare(self, image):
WikimediaExtractor.prepare(image)
image["page"] = self.title


class WikimediaWikiExtractor(WikimediaExtractor):
"""Extractor for all files on a MediaWiki instance"""
subcategory = "wiki"
pattern = BASE_PATTERN + r"/?$"
example = "https://en.wikipedia.org/"

def __init__(self, match):
WikimediaExtractor.__init__(self, match)

# ref: https://www.mediawiki.org/wiki/API:Allpages
self.params = {
"generator" : "allpages",
"gapnamespace": 6, # "File" namespace
"gaplimit" : self.per_page,
}
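As a usage sketch (illustrative, not part of this commit), the new extractor is selected by pointing gallery-dl at a bare wiki root URL; --range keeps a trial run small:

    gallery-dl --range 1-20 "https://en.wikipedia.org/"

This walks the wiki's "File" namespace (namespace 6) through the allpages generator instead of starting from a single article or category.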
8 changes: 8 additions & 0 deletions test/results/fandom.py
@@ -98,4 +98,12 @@
"#class" : wikimedia.WikimediaArticleExtractor,
},

{
"#url" : "https://youtube.fandom.com",
"#category": ("wikimedia", "fandom-youtube", "wiki"),
"#class" : wikimedia.WikimediaWikiExtractor,
"#range" : "1-20",
"#count" : 20,
},

)
8 changes: 8 additions & 0 deletions test/results/wikipedia.py
@@ -50,4 +50,12 @@
"#class" : wikimedia.WikimediaArticleExtractor,
},

{
"#url" : "https://en.wikipedia.org",
"#category": ("wikimedia", "wikipedia", "wiki"),
"#class" : wikimedia.WikimediaWikiExtractor,
"#range" : "1-10",
"#count" : 10,
},

)
