From 70bfd34010410a999f18bd4e3a891c3e228fa89b Mon Sep 17 00:00:00 2001 From: gan-of-culture Date: Wed, 31 Jan 2024 02:20:27 +0100 Subject: [PATCH] updated some of the broken extractors --- .gitignore | 1 - README.md | 5 +- downloader/downloader_test.go | 2 +- extractors/danbooru/danbooru.go | 33 ++-- extractors/danbooru/danbooru_test.go | 2 +- extractors/extractors.go | 4 +- extractors/hentaimoon/hentaimoon.go | 2 +- extractors/hentaipulse/hentaipulse_test.go | 9 +- extractors/hstream/hstream_test.go | 9 - extractors/iwara/iwara_test.go | 4 +- extractors/kvsplayer/kvsplayer.go | 2 +- extractors/rule34/rule34.go | 14 +- extractors/rule34/rule34_test.go | 4 +- extractors/rule34video/rule34video.go | 2 +- request/webdriver/webdriver.go | 215 +++++++++++++++++++++ request/webdriver/webdriver_test.go | 22 +++ test/utils.go | 3 - 17 files changed, 283 insertions(+), 50 deletions(-) create mode 100644 request/webdriver/webdriver.go create mode 100644 request/webdriver/webdriver_test.go diff --git a/.gitignore b/.gitignore index 5de379f..ea1c52f 100755 --- a/.gitignore +++ b/.gitignore @@ -25,5 +25,4 @@ media *token *exhentai_test.go *muchohentai_test.go -*webdriver get-sauce* \ No newline at end of file diff --git a/README.md b/README.md index 61a5df3..30d6f3d 100755 --- a/README.md +++ b/README.md @@ -268,7 +268,7 @@ The following links will direct you to adult content. Please keep that in mind! | [booruproject (ex. rule34, gelbooru)](https://booru.org/top) | :heavy_check_mark: |:heavy_check_mark:| | [booru.io](https://booru.io/) | :heavy_check_mark: | ? | | [comicporn.xxx](https://comicporn.xxx) | :heavy_check_mark: | ? | -| [danbooru.donmai.us](https://danbooru.donmai.us) | :heavy_check_mark: | ? | +| [danbooru.donmai.us](https://danbooru.donmai.us) | :heavy_check_mark: | ? |:car:| | [doujin.sexy](https://doujin.sexy) | :heavy_check_mark: | ? | | [e-hentai.org](http://e-hentai.org/) | :heavy_check_mark: | ? | | [exhentai.org](http://exhentai.org/) | :heavy_check_mark: | ? |:closed_lock_with_key:| @@ -335,6 +335,9 @@ user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, get-sauce -un "MyUserName" -up "MyUserPassword" http... ``` +🚗 +--> requires [geckodriver](https://github.com/mozilla/geckodriver) to workaround DDOS protection + ## Credit - Thanks to [lux](https://github.com/iawia002/lux) for the great template diff --git a/downloader/downloader_test.go b/downloader/downloader_test.go index ceeee0e..7c083be 100755 --- a/downloader/downloader_test.go +++ b/downloader/downloader_test.go @@ -107,7 +107,7 @@ func TestDownload(t *testing.T) { Type: static.DataTypeImage, URLs: []*static.URL{ { - URL: "https://lotus.paheal.net/_images/cf21c36b64db166b1e1aac9f3243d3ec/4698365%20-%20Ahri%20Cian_Yo%20League_of_Legends.jpg", + URL: "https://r34i.paheal-cdn.net/cf/21/cf21c36b64db166b1e1aac9f3243d3ec", Ext: "jpg", }, }, diff --git a/extractors/danbooru/danbooru.go b/extractors/danbooru/danbooru.go index 6981f6e..d98c2a7 100755 --- a/extractors/danbooru/danbooru.go +++ b/extractors/danbooru/danbooru.go @@ -1,13 +1,13 @@ package danbooru import ( - "errors" "fmt" + "log" "regexp" "strings" "github.com/gan-of-culture/get-sauce/config" - "github.com/gan-of-culture/get-sauce/request" + "github.com/gan-of-culture/get-sauce/request/webdriver" "github.com/gan-of-culture/get-sauce/static" "github.com/gan-of-culture/get-sauce/utils" ) @@ -25,10 +25,7 @@ func New() static.Extractor { // Extract for danbooru pages func (e *extractor) Extract(URL string) ([]*static.Data, error) { - config.FakeHeaders["User-Agent"] = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)" - defer func() { - config.FakeHeaders["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36" - }() + config.FakeHeaders["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0" posts, err := parseURL(URL) if err != nil { @@ -63,7 +60,13 @@ func parseURL(URL string) ([]string, error) { return []string{linkToPost}, nil } - htmlString, err := request.Get(URL) + wd, err := webdriver.New() + if err != nil { + return nil, err + } + defer wd.Close() + + htmlString, err := wd.Get(URL) if err != nil { return nil, err } @@ -80,22 +83,24 @@ func parseURL(URL string) ([]string, error) { } func extractData(postURL string) (*static.Data, error) { - htmlString, err := request.Get(postURL) + wd, err := webdriver.New() + if err != nil { + return nil, err + } + defer wd.Close() + + htmlString, err := wd.Get(postURL) if err != nil { return nil, err } matchedImgData := reIMGData.FindStringSubmatch(htmlString) if len(matchedImgData) != 5 { + log.Println(htmlString) return nil, static.ErrDataSourceParseFailed } // [1] = img original width [2] image original height [3] image name [4] src URL - size, err := request.Size(matchedImgData[4], postURL) - if err != nil { - return nil, errors.New("no image size not found") - } - return &static.Data{ Site: site, Title: matchedImgData[3], @@ -110,7 +115,7 @@ func extractData(postURL string) (*static.Data, error) { }, }, Quality: fmt.Sprintf("%s x %s", matchedImgData[1], matchedImgData[2]), - Size: size, + Size: 0, }, }, URL: postURL, diff --git a/extractors/danbooru/danbooru_test.go b/extractors/danbooru/danbooru_test.go index 2249749..95787bd 100755 --- a/extractors/danbooru/danbooru_test.go +++ b/extractors/danbooru/danbooru_test.go @@ -49,7 +49,7 @@ func TestExtract(t *testing.T) { URL: "https://danbooru.donmai.us/posts/3749687", Title: "konpaku youmu and konpaku youmu (touhou) drawn by niwashi_(yuyu)", Quality: "1782 x 2048", - Size: 157584, + Size: 0, }, }, } diff --git a/extractors/extractors.go b/extractors/extractors.go index 89b9550..d55eff6 100755 --- a/extractors/extractors.go +++ b/extractors/extractors.go @@ -52,6 +52,7 @@ func init() { ninehentaiExtractor := ninehentai.New() vravenExtractor := vraven.New() nhgroupExtractor := nhgroup.New() + iwaraExtractor := iwara.New() extractorsMap = map[string]static.Extractor{ "": universal.New(), @@ -93,7 +94,8 @@ func init() { "hentaizap.com": htdoujinExtractor, "hitomi.la": hitomi.New(), "imhentai.xxx": htdoujinExtractor, - "iwara.tv": iwara.New(), + "iwara.tv": iwaraExtractor, + "www.iwara.tv": iwaraExtractor, "latesthentai.com": nhgroupExtractor, "miohentai.com": miohentai.New(), "muchohentai.com": muchohentai.New(), diff --git a/extractors/hentaimoon/hentaimoon.go b/extractors/hentaimoon/hentaimoon.go index 1167d5a..c9d6c6e 100644 --- a/extractors/hentaimoon/hentaimoon.go +++ b/extractors/hentaimoon/hentaimoon.go @@ -76,7 +76,7 @@ func extractData(URL string) (*static.Data, error) { } data[0].Site = site - data[0].Title = utils.GetH1(&htmlString, -1) + data[0].Title = utils.GetH1(&htmlString, 0) matchedSubtitleURL := reSubtitles.FindString(htmlString) if matchedSubtitleURL != "" { diff --git a/extractors/hentaipulse/hentaipulse_test.go b/extractors/hentaipulse/hentaipulse_test.go index 02f6aea..f5ba467 100755 --- a/extractors/hentaipulse/hentaipulse_test.go +++ b/extractors/hentaipulse/hentaipulse_test.go @@ -1,11 +1,6 @@ package hentaipulse -import ( - "testing" - - "github.com/gan-of-culture/get-sauce/test" -) - +/* func TestParseURL(t *testing.T) { tests := []struct { Name string @@ -54,4 +49,4 @@ func TestExtract(t *testing.T) { test.Check(t, tt.Args, data[0]) }) } -} +}*/ diff --git a/extractors/hstream/hstream_test.go b/extractors/hstream/hstream_test.go index 925b574..8149b39 100644 --- a/extractors/hstream/hstream_test.go +++ b/extractors/hstream/hstream_test.go @@ -54,15 +54,6 @@ func TestExtract(t *testing.T) { Size: 804660690, }, }, - { - Name: "Single Episode 4k legacy", - Args: test.Args{ - URL: "https://hstream.moe/hentai/aku-no-onna-kanbu-full-moon-night-r-1", - Title: "Aku no Onna Kanbu: Full Moon Night R - 1", - Quality: "av1.2160p.webm", - Size: 397722506, - }, - }, { Name: "Single Episode", Args: test.Args{ diff --git a/extractors/iwara/iwara_test.go b/extractors/iwara/iwara_test.go index fcf3348..2e71676 100755 --- a/extractors/iwara/iwara_test.go +++ b/extractors/iwara/iwara_test.go @@ -60,8 +60,8 @@ func TestExtract(t *testing.T) { Args: test.Args{ URL: "https://iwara.tv/image/x6hVrNaf0WVdLE/nico-tomoare-provocation-dance-preview-mmdd", Title: "【コイカツ】 Nico Thick Tomoare Provocation Dance Preview 【MMDD】", - Quality: "1280x720", - Size: 294562, + Quality: "1920x1080", + Size: 454545, }, }, } diff --git a/extractors/kvsplayer/kvsplayer.go b/extractors/kvsplayer/kvsplayer.go index a763d72..320a763 100644 --- a/extractors/kvsplayer/kvsplayer.go +++ b/extractors/kvsplayer/kvsplayer.go @@ -58,7 +58,7 @@ func ExtractFromHTML(htmlString *string) ([]*static.Data, error) { } switch matchedKVSPlayer[0][2] { - case "4", "5", "8", "9", "11", "12", "13", "15": + case "4", "5", "6", "8", "9", "11", "12", "13", "15": break default: fmt.Printf("Untested major version (%s) in player engine--Download may fail.", matchedKVSPlayer[0][2]) diff --git a/extractors/rule34/rule34.go b/extractors/rule34/rule34.go index e65158a..5254353 100755 --- a/extractors/rule34/rule34.go +++ b/extractors/rule34/rule34.go @@ -18,9 +18,9 @@ var reParsePostID = regexp.MustCompile(`data-post-id=["']([^"']+)`) var rePostID = regexp.MustCompile(`[0-9]{3,}`) var reSourceURL = regexp.MustCompile(`id='main_image' src='([^']+)`) var reVideoSourceURL = regexp.MustCompile(`]+>([^<]+)`) var reQuality = regexp.MustCompile(`data-(width|height)='([0-9]+)`) -var reVideoQuality = regexp.MustCompile(`id='main_image'.+\n[^0-9]+([0-9]+)[^0-9]+([0-9]+)`) +var reVideoQuality = regexp.MustCompile(`id='main_image'.+?: ([0-9]+)[^0-9]+([0-9]+)`) type extractor struct{} @@ -107,12 +107,16 @@ func extractData(URL string) (*static.Data, error) { postSrcURL := matchedPostSrcURL[1] - matchedTagBox := reTagBox.FindStringSubmatch(htmlString) - if len(matchedTagBox) != 2 { + matchedTagBox := reTagBox.FindAllStringSubmatch(htmlString, -1) + if len(matchedTagBox) < 1 { return nil, errors.New("couldn't extract tags for post") } + tags := []string{} + for _, tag := range matchedTagBox { + tags = append(tags, tag[1]) + } - title := fmt.Sprintf("%s %s", matchedTagBox[1], id[0]) + title := fmt.Sprintf("%s %s", strings.Join(tags, " "), id[0]) var size int64 if config.Amount == 0 { diff --git a/extractors/rule34/rule34_test.go b/extractors/rule34/rule34_test.go index 24b994b..dfd632b 100755 --- a/extractors/rule34/rule34_test.go +++ b/extractors/rule34/rule34_test.go @@ -58,7 +58,7 @@ func TestExtract(t *testing.T) { Name: "Single video", Args: test.Args{ URL: "https://rule34.paheal.net/post/view/3464181", - Title: "animated audiodude blender Hv54rDSL Nier_(series) Nier_Automata sound webm YoRHa_No.2_Type_B 3464181", + Title: "Hv54rDSL Nier_(series) Nier_Automata YoRHa_No.2_Type_B animated audiodude blender sound webm 3464181", Quality: "540 x 1280", Size: 7503936, }, @@ -76,7 +76,7 @@ func TestExtract(t *testing.T) { Name: "Single GIF", Args: test.Args{ URL: "https://rule34.paheal.net/post/view/3461411", - Title: "animated blood_elf World_of_Warcraft 3461411", + Title: "World_of_Warcraft animated blood_elf 3461411", Quality: "480 x 854", Size: 7811055, }, diff --git a/extractors/rule34video/rule34video.go b/extractors/rule34video/rule34video.go index 08cc208..234212f 100644 --- a/extractors/rule34video/rule34video.go +++ b/extractors/rule34video/rule34video.go @@ -47,7 +47,7 @@ func parseURL(URL string) []string { return []string{URL} } - re := regexp.MustCompile(site + `videos/\d+/[^"]+`) + re := regexp.MustCompile(site + `video/\d+/[^?"]+`) return re.FindAllString(htmlString, -1) } diff --git a/request/webdriver/webdriver.go b/request/webdriver/webdriver.go new file mode 100644 index 0000000..38bdc7e --- /dev/null +++ b/request/webdriver/webdriver.go @@ -0,0 +1,215 @@ +package webdriver + +import ( + "encoding/json" + "fmt" + "io" + "net/http" + "os/exec" + "strings" + "time" + + "github.com/gan-of-culture/get-sauce/request" +) + +type Session struct { + Value struct { + SessionID string `json:"sessionId"` + } `json:"value"` +} + +type SessionCookies struct { + Value []struct { + Name string `json:"name"` + Value string `json:"value"` + Path string `json:"path"` + Domain string `json:"domain"` + Secure bool `json:"secure"` + HTTPOnly bool `json:"httpOnly"` + Expiry int `json:"expiry,omitempty"` + SameSite string `json:"sameSite"` + } `json:"value"` +} + +type SessionStringValue struct { + Value string `json:"value"` +} + +type WebDriver struct { + sessionID string + cmd *exec.Cmd +} + +func New() (*WebDriver, error) { + cmd := exec.Command("geckodriver") + if err := cmd.Start(); err != nil { + return nil, err + } + + var res *http.Response + var err error + for i := 0; i < 10; i++ { + res, err = request.Request(http.MethodPost, "http://localhost:4444/session", map[string]string{"Content-Type": "application/json"}, strings.NewReader(`{"capabilities":{"alwaysMatch":{"acceptInsecureCerts":true,"moz:firefoxOptions":{"args":["-headless"]}}}}`)) + if err == nil { + break + } + time.Sleep(50 * time.Millisecond) + } + if err != nil { + return nil, err + } + defer res.Body.Close() + + body, err := io.ReadAll(res.Body) + if err != nil { + if err != io.ErrUnexpectedEOF { + return nil, err + } + } + + session := Session{} + err = json.Unmarshal(body, &session) + if err != nil { + return nil, err + } + + return &WebDriver{ + sessionID: session.Value.SessionID, + cmd: cmd, + }, nil +} + +func (webDriver *WebDriver) navigateTo(URL string) error { + _, err := webDriver.command(http.MethodPost, "url", map[string]string{"Content-Type": "application/json"}, strings.NewReader(fmt.Sprintf(`{"url": "%s"}`, URL))) + if err != nil { + return err + } + + return nil +} + +func (webDriver *WebDriver) Close() error { + + _, err := webDriver.command(http.MethodDelete, "", nil, nil) + if err != nil { + return err + } + + webDriver.sessionID = "" + + return webDriver.cmd.Process.Kill() +} + +func (webDriver *WebDriver) source() (string, error) { + body, err := webDriver.command(http.MethodGet, "source", nil, nil) + if err != nil { + return "", err + } + + sessionSource := SessionStringValue{} + err = json.Unmarshal(body, &sessionSource) + if err != nil { + return "", err + } + return sessionSource.Value, nil +} + +func (webDriver *WebDriver) title() (string, error) { + body, err := webDriver.command(http.MethodGet, "title", nil, nil) + if err != nil { + return "", err + } + + sessionSource := SessionStringValue{} + err = json.Unmarshal(body, &sessionSource) + if err != nil { + return "", err + } + return sessionSource.Value, nil +} + +func (webDriver *WebDriver) getCookies() ([]*http.Cookie, error) { + body, err := webDriver.command(http.MethodGet, "cookie", nil, nil) + if err != nil { + return nil, err + } + + sessionCookies := SessionCookies{} + err = json.Unmarshal(body, &sessionCookies) + if err != nil { + return nil, err + } + + cookies := []*http.Cookie{} + for _, c := range sessionCookies.Value { + cookies = append(cookies, &http.Cookie{ + Name: c.Name, + Value: c.Value, + Path: c.Path, + Domain: c.Domain, + Expires: time.Unix(int64(c.Expiry), 0), + Secure: c.Secure, + HttpOnly: c.HTTPOnly, + SameSite: http.SameSiteNoneMode, + }) + } + + return cookies, nil +} + +func (webDriver *WebDriver) command(method string, command string, headers map[string]string, body io.Reader) ([]byte, error) { + if webDriver.sessionID == "" { + return nil, fmt.Errorf("webdriver session has been closed") + } + + res, err := request.Request(method, fmt.Sprintf("http://localhost:4444/session/%s/%s", webDriver.sessionID, command), headers, body) + if err != nil { + return nil, err + } + defer res.Body.Close() + + return io.ReadAll(res.Body) +} + +// SolveChallenge from Cloudflare or DDoS-Guard +func (webDriver *WebDriver) SolveChallenge(URL string) ([]*http.Cookie, error) { + + webDriver.navigateTo(URL) + select { + case <-time.After(10 * time.Second): + break + default: + title, err := webDriver.title() + if err != nil { + return nil, err + } + if title != "DDoS-Guard" && title != "Just a moment..." { + break + } + time.Sleep(100 * time.Millisecond) + } + + time.Sleep(5 * time.Second) + return webDriver.getCookies() +} + +// Get HTTP response body as string +func (webDriver *WebDriver) Get(URL string) (string, error) { + + webDriver.navigateTo(URL) + select { + case <-time.After(10 * time.Second): + break + default: + title, err := webDriver.title() + if err != nil { + return "", err + } + if title != "DDoS-Guard" && title != "Just a moment..." { + break + } + time.Sleep(100 * time.Millisecond) + } + + return webDriver.source() +} diff --git a/request/webdriver/webdriver_test.go b/request/webdriver/webdriver_test.go new file mode 100644 index 0000000..ece4aff --- /dev/null +++ b/request/webdriver/webdriver_test.go @@ -0,0 +1,22 @@ +package webdriver + +import ( + "testing" + + "github.com/gan-of-culture/get-sauce/test" +) + +func TestSolveChallenge(t *testing.T) { + t.Run("Default test", func(t *testing.T) { + + wd, err := New() + test.CheckError(t, err) + + cookies, err := wd.SolveChallenge("https://hentaibar.com/") + test.CheckError(t, err) + + if cookies == nil { + t.Errorf("Got: %v - Want: %v", cookies, "cookies") + } + }) +} diff --git a/test/utils.go b/test/utils.go index 6c15c5a..eedbca9 100644 --- a/test/utils.go +++ b/test/utils.go @@ -1,7 +1,6 @@ package test import ( - "encoding/json" "testing" "github.com/gan-of-culture/get-sauce/static" @@ -45,8 +44,6 @@ func Check(t *testing.T, args Args, data *static.Data) { Size: defaultData.Size, } if !CheckData(args, temp) { - jsonData, _ := json.MarshalIndent(defaultData, "", " ") - t.Log(jsonData) t.Errorf("Got: %v\nExpected: %v", temp, args) } }