-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
180 lines (131 loc) · 6.34 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.edge.options import Options
import re
from multiprocessing import Pool
from tqdm import tqdm
from modules.webdriver import setup_webdriver
from modules.helpers import download
class TonariScrapper:
#Use a custom enter and exit function to ensure the chromium process is killed
#even if the programm crashes during runtime
def __enter__(self):
return self
def __init__(self):
self.manga_url = 'https://tonarinoyj.jp/episode/4855956445072905450'
self.download_location = 'D:\\OPM\\source\\'
self._chapter_links = {}
self._chapter_page_links = {}
#Get the Website base URL
self._tonarinoyj_url = self.manga_url.rsplit('/', 1)[0] + '/'
self._options = Options()
self._options.add_argument('headless')
self._options.add_argument('window-size=1920x1080')
self._options.add_argument('disable-extensions')
self.webdriver = webdriver.Edge(options=self._options)
self._get_chapters()
self._chapter_selection()
self._chapter_download()
print('Download complete.')
def _chapter_selection(self):
"""
This function will check which chapters of the given manga are available
"""
_chapters_nums = list(self._chapter_links.keys())
_chapters_nums.sort(reverse=False)
print(f'********\nFound {len(_chapters_nums)} '
f'chapters. Select chapters for download.\nOptions:')
while True:
_download_selection = input('Chapter number, all or latest\n')
if _download_selection.lower() == 'latest':
print('Downloading chapter {}.'.format(_chapters_nums[-1]))
self._get_chapter_page_links(_chapters_nums[-1])
break
elif _download_selection.lower() == 'all':
print('All {} Chapters will be downloaded.'.format(
len(list(self._chapter_links.keys()))))
for c in tqdm(_chapters_nums, unit='iB', unit_scale=True):
self._get_chapter_page_links(c)
break
elif _download_selection.isdigit():
self._get_chapter_page_links(int(_download_selection))
break
else:
print('Invalid input. Example input (without ""): "42", "all" or "latest"')
def _get_chapters(self):
"""
This function will scrap the available chapters from the tonarinoyj webseite and
split found chapters into available and unavailable manga chapters
"""
self.webdriver.get(self.manga_url)
wait = WebDriverWait(self.webdriver, 10)
_chap_list_cont = self.webdriver.find_element(By.XPATH,
'//button[@class="js-read-more-button"]')
self.webdriver.execute_script('return arguments[0].scrollIntoView();',
_chap_list_cont)
_chapter_container_loaded = wait.until(EC.visibility_of_element_located((By.XPATH,
'//span[@class="loading-text"]')))
wait.until(lambda x: 'hidden' in _chapter_container_loaded.get_attribute('class'))
while True:
try:
wait.until(EC.visibility_of_element_located(
(By.XPATH, '//button[@class="js-read-more-button"]'))).click()
except (TimeoutException, StaleElementReferenceException):
break
_chapters_available = self.webdriver.find_elements(By.XPATH,
'//*[@class=" episode" and not(contains(@class, "private episode"))] | '
'//*[@class="episode current-readable-product"]')
_chapters_private = self.webdriver.find_elements(By.XPATH,
'//*[@class="private episode"]')
regx = re.compile(r".*?\[[^\d]*(\d+)[^\d]*\].*")
_chapters_private = [regx.findall(x.text)[0] for x in _chapters_private]
for c in _chapters_available:
c_num = regx.findall(c.text)
c_url = self._tonarinoyj_url + c.get_attribute('data-id')
if c_num and c_num not in _chapters_private:
self._chapter_links.update({int(c_num[0]): c_url})
if len(_chapters_private) > 0:
print(f'Note: Chapters {_chapters_private} are private and not available.')
def _get_chapter_page_links(self, chapter_num):
"""
This function scraps the direct links to the manga pages for
the user selected manga chapters.
"""
self.webdriver.get('view-source:' + self._chapter_links[chapter_num] + '.json')
content = self.webdriver.page_source
chapter_content = self.webdriver.find_element(By.CLASS_NAME,
'line-content').text
chapter_data = json.loads(chapter_content)
chapter_pages = chapter_data['readableProduct']['pageStructure']['pages']
_page_num = 0
for page in chapter_pages:
if 'src' in page:
_page_num = _page_num + 1
_page_link = page['src']
self._chapter_page_links.update({(chapter_num, _page_num): _page_link})
def _chapter_download(self):
"""
This function is downloading the manga pages. Manga pages are stored in their
corresponding chapter folders. The download is happening in parallel with 3 parallel
downloads.
"""
input_args = [(p_url,
self.download_location + str(c_num) + '\\',
str(p_num) + '.jpeg')
for (c_num, p_num), p_url in self._chapter_page_links.items()]
with Pool(3) as pool:
pool.starmap(download,
tqdm(input_args, total=len(input_args),
unit='iB', unit_scale=True))
def __exit__(self, exc_type, exc_val, exc_tb):
self.webdriver.quit()
def main():
setup_webdriver()
with TonariScrapper() as t:
print(t)
if __name__ == '__main__':
main()