diff --git a/README.md b/README.md
index b407134..1422343 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ go get -u github.com/theblackturtle/gospider
 ## Usage
 
 ```
-A Simple Web Spider - v1.0 by @theblackturtle
+A Simple Web Spider - v1.0.4 by @theblackturtle
 
 Usage:
   gospider [flags]
@@ -49,7 +49,9 @@ Flags:
   -a, --other-source     Find URLs from 3rd party (Archive.org, CommonCrawl.org, VirusTotal.com)
   -w, --include-subs     Include subdomains crawled from 3rd party. Default is main domain
       --debug            Turn on debug mode
+  -v, --verbose          Turn on verbose
       --no-redirect      Disable redirect
+      --version          Check version
   -h, --help             help for gospider
 ```
 
diff --git a/core/crawler.go b/core/crawler.go
index c903967..a3b6984 100644
--- a/core/crawler.go
+++ b/core/crawler.go
@@ -9,6 +9,7 @@ import (
     "github.com/spf13/cobra"
     "github.com/theblackturtle/gospider/stringset"
     "io/ioutil"
+    "net"
     "net/http"
     "net/url"
     "os"
@@ -48,8 +49,18 @@ func NewCrawler(site string, cmd *cobra.Command) *Crawler {
 
     // Setup http client
     tr := &http.Transport{
-        TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
+        DialContext: (&net.Dialer{
+            Timeout:   60 * time.Second,
+            KeepAlive: 30 * time.Second,
+            DualStack: true,
+        }).DialContext,
+        MaxIdleConns:          0,
+        IdleConnTimeout:       5 * time.Second,
+        TLSHandshakeTimeout:   10 * time.Second,
+        ExpectContinueTimeout: 5 * time.Second,
+        TLSClientConfig:       &tls.Config{InsecureSkipVerify: true},
     }
+
     // Set proxy
     proxy, _ := cmd.Flags().GetString("proxy")
     if proxy != "" {
@@ -143,16 +154,15 @@ func NewCrawler(site string, cmd *cobra.Command) *Crawler {
     }
 
     // Set url whitelist regex
-    domainRegex := "^(https?|mms|mssx|mmsh|rtsp|pnm)://([^/]+[.])?(?i:" + strings.ReplaceAll(domain, ".", "[.]") + ")(/.*)?$"
-    domainRe := regexp.MustCompile(domainRegex)
+    domainRe := regexp.MustCompile(domain)
     c.URLFilters = append(c.URLFilters, domainRe)
 
     // Set Limit Rule
     err := c.Limit(&colly.LimitRule{
-        DomainRegexp: domainRegex,
-        Parallelism:  concurrent,
-        Delay:        time.Duration(delay) * time.Second,
-        RandomDelay:  time.Duration(randomDelay) * time.Second,
+        DomainGlob:  domain,
+        Parallelism: concurrent,
+        Delay:       time.Duration(delay) * time.Second,
+        RandomDelay: time.Duration(randomDelay) * time.Second,
     })
     if err != nil {
         Logger.Errorf("Failed to set Limit Rule: %s", err)
@@ -205,7 +215,6 @@ func (crawler *Crawler) Start() {
         if !formSet.Duplicate(formUrl) {
             if crawler.domainRe.MatchString(formUrl) {
                 outputFormat := fmt.Sprintf("[form] - %s", formUrl)
-                //Logger.Info(outputFormat + "\n")
                 fmt.Println(outputFormat)
                 if crawler.Output != nil {
                     crawler.Output.WriteToFile(outputFormat)
@@ -229,7 +238,6 @@
 
         if !jsFileSet.Duplicate(jsFileUrl) {
             outputFormat := fmt.Sprintf("[javascript] - %s", jsFileUrl)
-            //Logger.Info(outputFormat + "\n")
             fmt.Println(outputFormat)
             if crawler.Output != nil {
                 crawler.Output.WriteToFile(outputFormat)
@@ -249,7 +257,6 @@
         for _, sub := range subs {
             if !subSet.Duplicate(sub) {
                 outputFormat := fmt.Sprintf("[subdomains] - %s", sub)
-                //Logger.Info(outputFormat + "\n")
                 fmt.Println(outputFormat)
                 if crawler.Output != nil {
                     crawler.Output.WriteToFile(outputFormat)
@@ -262,7 +269,6 @@
         for _, e := range aws {
             if !awsSet.Duplicate(e) {
                 outputFormat := fmt.Sprintf("[aws-s3] - %s", e)
-                //Logger.Info(outputFormat + "\n")
                 fmt.Println(outputFormat)
                 if crawler.Output != nil {
                     crawler.Output.WriteToFile(outputFormat)
@@ -270,23 +276,20 @@
             }
         }
 
-        // We will pass 404 Not Found and 429 status code
-        if response.StatusCode == 404 || response.StatusCode == 429 {
-            return
-        }
-
         // Verify which links are working
         u := response.Request.URL.String()
-        if crawler.domainRe.MatchString(u) {
-            outputFormat := fmt.Sprintf("[url] - [code-%d] - %s", response.StatusCode, u)
-            fmt.Println(outputFormat)
-            if crawler.Output != nil {
-                crawler.Output.WriteToFile(outputFormat)
-            }
+        outputFormat := fmt.Sprintf("[url] - [code-%d] - %s", response.StatusCode, u)
+        fmt.Println(outputFormat)
+        if crawler.Output != nil {
+            crawler.Output.WriteToFile(outputFormat)
         }
     })
 
     crawler.C.OnError(func(response *colly.Response, err error) {
+        // Status == 0 mean "The server IP address could not be found."
+        if response.StatusCode == 404 || response.StatusCode == 429 || response.StatusCode == 0 {
+            return
+        }
         u := response.Request.URL.String()
         outputFormat := fmt.Sprintf("[url] - [code-%d] - %s", response.StatusCode, u)
         fmt.Println(outputFormat)
@@ -336,7 +339,6 @@ func (crawler *Crawler) linkFinder(site string, jsUrl string) {
 
             // JS Regex Result
             outputFormat := fmt.Sprintf("[linkfinder] - [from: %s] - %s", jsUrl, link)
-            //Logger.Info(outputFormat + "\n")
             fmt.Println(outputFormat)
             if crawler.Output != nil {
                 crawler.Output.WriteToFile(outputFormat)
diff --git a/core/output.go b/core/output.go
index 64c4c0a..26219ac 100644
--- a/core/output.go
+++ b/core/output.go
@@ -28,3 +28,7 @@ func (o *Output) WriteToFile(msg string) {
     defer o.mu.Unlock()
     _, _ = o.f.WriteString(msg + "\n")
 }
+
+func (o *Output) Close() error {
+    return o.f.Close()
+}
diff --git a/core/sitemap.go b/core/sitemap.go
index 88f03dc..2599335 100644
--- a/core/sitemap.go
+++ b/core/sitemap.go
@@ -18,12 +18,11 @@ func ParseSiteMap(site string, depth int, output *Output, c *colly.Collector, wg
         Logger.Infof("Trying to find %s", site+path)
         _ = sitemap.ParseFromSite(site+path, func(entry sitemap.Entry) error {
             outputFormat := fmt.Sprintf("[sitemap] - %s", entry.GetLocation())
-            //Logger.Infof(outputFormat + "\n")
             fmt.Println(outputFormat)
             if output != nil {
                 output.WriteToFile(outputFormat)
             }
-            c.Visit(entry.GetLocation())
+            _ = c.Visit(entry.GetLocation())
             return nil
         })
     }
diff --git a/core/version.go b/core/version.go
index a69541b..0f54af1 100644
--- a/core/version.go
+++ b/core/version.go
@@ -3,5 +3,5 @@ package core
 const (
     CLIName = "gospider"
     AUTHOR  = "@theblackturtle"
-    VERSION = "v1.0.3"
+    VERSION = "v1.0.4"
 )
diff --git a/main.go b/main.go
index ce0733e..2bccb53 100644
--- a/main.go
+++ b/main.go
@@ -62,21 +62,23 @@ func run(cmd *cobra.Command, args []string) {
     }
 
     version, _ := cmd.Flags().GetBool("version")
-    if version{
-        fmt.Printf("Version: %s",core.VERSION)
+    if version {
+        fmt.Printf("Version: %s\n", core.VERSION)
         os.Exit(0)
     }
 
+    verbose, _ := cmd.Flags().GetBool("verbose")
+    if !verbose {
+        core.Logger.SetOutput(ioutil.Discard)
+    }
+
     isDebug, _ := cmd.Flags().GetBool("debug")
     if isDebug {
         core.Logger.SetLevel(logrus.DebugLevel)
+        core.Logger.SetOutput(os.Stdout)
     } else {
         core.Logger.SetLevel(logrus.InfoLevel)
-    }
-
-    verbose, _ := cmd.Flags().GetBool("verbose")
-    if !verbose {
-        core.Logger.SetOutput(ioutil.Discard)
+        core.Logger.SetOutput(os.Stdout)
     }
 
     // Create output folder when save file option selected
@@ -141,6 +143,7 @@ func run(cmd *cobra.Command, args []string) {
 
             crawler := core.NewCrawler(site, cmd)
             site = strings.TrimSuffix(u.String(), "/")
+
             siteWg.Add(1)
             go func() {
                 crawler.Start()
@@ -169,7 +172,6 @@ func run(cmd *cobra.Command, args []string) {
                         continue
                     }
                     outputFormat := fmt.Sprintf("[other-sources] - %s", url)
-                    //core.Logger.Info(outputFormat + "\n")
                     fmt.Println(outputFormat)
                     if crawler.Output != nil {
                         crawler.Output.WriteToFile(outputFormat)
@@ -180,6 +182,7 @@ func run(cmd *cobra.Command, args []string) {
             }
             siteWg.Wait()
             crawler.C.Wait()
+            _ = crawler.Output.Close()
         }
     }()
 }