
Commit ae57d8a

Merge branch 'develop'

2 parents: 157e66a + 629ea90

File tree: 10 files changed, +226 -25 lines

.semver

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.7.11
+1.8.0

docs/source/conf.py

Lines changed: 2 additions & 1 deletion
@@ -228,9 +228,10 @@ def linkcode_resolve(domain, info):
 
 # Always make sure current release is in releases.js
 import json
+from collections import OrderedDict
 
 releasesjs = open('../../releases.js').read().replace("var releases = ", "")
-releases = json.loads(releasesjs);
+releases = json.loads(releasesjs, object_pairs_hook=OrderedDict);
 
 releases[release] = True
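The object_pairs_hook change matters because json.loads returns a plain dict by default, and on the Python versions current at the time a plain dict did not guarantee insertion order, so rewriting releases.js could shuffle the release list. With the hook, the mapping is built from the parsed key/value pairs in document order. A standalone sketch of the difference (not part of the commit):

    import json
    from collections import OrderedDict

    raw = '{"1.8.0": true, "1.7.11": true, "1.7.10": true}'

    # Default: a plain dict; on Python < 3.7 its iteration order
    # is not guaranteed to match the document.
    plain = json.loads(raw)

    # With the hook: built from the (key, value) pairs in document
    # order, so iteration matches the order in releases.js.
    ordered = json.loads(raw, object_pairs_hook=OrderedDict)

    print(list(ordered))  # ['1.8.0', '1.7.11', '1.7.10']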

docs/source/kitchen_sink.rst

Lines changed: 16 additions & 10 deletions
@@ -45,12 +45,11 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
         pass
 
     def cb_request_on_error(queue_item, message):
-        print("A request failed with an error message.")
-        print(message)
+        print("[error] " + message)
 
     def cb_form_before_autofill(queue_item, elements, form_data):
-
         # return CrawlerActions.DO_NOT_AUTOFILL_FORM
+
         return CrawlerActions.DO_AUTOFILL_FORM
 
     def cb_form_after_autofill(queue_item, elements, form_data):
@@ -59,7 +58,7 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
     # Declare the options
     options = Options()
 
-    # Callback options
+    # Callback options (https://tijme.github.io/not-your-average-web-crawler/latest/options_callbacks.html)
     options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route.
     options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route.
     options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route.
@@ -70,7 +69,7 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
     options.callbacks.form_before_autofill = cb_form_before_autofill # Called before the crawler autofills a form. Default is a null route.
     options.callbacks.form_after_autofill = cb_form_after_autofill # Called after the crawler autofills a form. Default is a null route.
 
-    # Scope options
+    # Scope options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_scope.html)
     options.scope.protocol_must_match = False # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False.
     options.scope.subdomain_must_match = True # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
     options.scope.hostname_must_match = True # Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`). Default is True.
@@ -86,7 +85,7 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
         Request.METHOD_HEAD
     ]
 
-    # Identity options
+    # Identity options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_identity.html)
     options.identity.auth = HTTPBasicAuth('user', 'pass') # Or any other authentication (http://docs.python-requests.org/en/master/user/authentication/). Default is None.
     options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
     options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
@@ -107,14 +106,21 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
     })
 
-    # Performance options
+    # Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html)
     options.performance.max_threads = 10 # The maximum amount of simultaneous threads to use for crawling. Default is 8.
-    options.performance.request_timeout = 10 # The request timeout in seconds (throws an exception if exceeded). Default is 30.
+    options.performance.request_timeout = 15 # The request timeout in seconds (throws an exception if exceeded). Default is 30.
+
+    # Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html)
+    options.routing.minimum_threshold = 4 # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
+    options.routing.routes = [
+        # The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array.
+        "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$" # Only crawl /blog/{some-blog-alias} 4 times.
+    ]
 
-    # Misc options
+    # Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html)
     options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False.
     options.misc.verify_ssl_certificates = True # If verification is enabled all SSL certificates will be checked for validity. Default is True.
-    options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file or directory with certificates of trusted CAs. Default is None.
+    options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None.
 
     crawler = Crawler(options)
     crawler.start_with(Request("https://finnwea.com/"))
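The cb_form_before_autofill callback shown above is a decision point: whichever CrawlerActions value it returns controls whether the crawler fills the form. A minimal sketch of a conditional version (not from the commit; the exact keys available in form_data are an assumption here):

    from nyawc.CrawlerActions import CrawlerActions

    def cb_form_before_autofill(queue_item, elements, form_data):
        # Hypothetical check: skip autofill for forms that look like
        # login forms, autofill everything else.
        if "password" in form_data:
            return CrawlerActions.DO_NOT_AUTOFILL_FORM

        return CrawlerActions.DO_AUTOFILL_FORM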

docs/source/options_misc.rst

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 .. title:: Misc
 
 How to use misc options
-------------------------------
+-----------------------
 
 .. code:: python

docs/source/options_routing.rst

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+.. title:: Routing
+
+How to use routing options
+--------------------------
+
+.. code:: python
+
+    # routing_example.py
+
+    from nyawc.Options import Options
+    from nyawc.Crawler import Crawler
+    from nyawc.http.Request import Request
+
+    options = Options()
+
+    options.routing.minimum_threshold = 4
+    options.routing.routes = [
+        "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"
+    ]
+
+    crawler = Crawler(options)
+    crawler.start_with(Request("https://finnwea.com/"))
+
+Available routing options
+-------------------------
+
+Minimum threshold
+~~~~~~~~~~~~~~~~~
+
+The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
+
+For example, let's say we have these requests:
+
+.. code::
+
+    https://finnwea.com/blog/1
+    https://finnwea.com/blog/2
+    https://finnwea.com/blog/3
+    ...
+    https://finnwea.com/blog/54
+
+With the default threshold, only the first 20 of these requests will be crawled; after that the rest of the blog posts are ignored.
+
+**Please note that it will probably crawl a bit more than the minimum threshold, depending on the maximum amount of threads in use.**
+
+``options.routing.minimum_threshold = 20``
+
+Routes
+~~~~~~
+
+The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array.
+
+For example, the route below represents ``http://finnwea.com/blog/{a-variable-blog-alias}/``.
+
+.. code:: python
+
+    options.routing.routes = [
+        "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"
+    ]
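Since the whole feature hinges on the route expressions, it is worth checking a pattern against a few URLs before starting a crawl. A standalone check with Python's re module (not part of nyawc):

    import re

    # The blog route from the examples above.
    route = "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"

    urls = [
        "https://finnwea.com/blog/some-blog-alias/",   # matches
        "https://www.finnwea.com/blog/another-post/",  # matches
        "https://finnwea.com/contact/",                # does not match
    ]

    for url in urls:
        print(url, "->", bool(re.match(route, url)))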

example_extensive.py

Lines changed: 12 additions & 5 deletions
@@ -74,7 +74,7 @@ def cb_form_after_autofill(queue_item, elements, form_data):
 # Declare the options
 options = Options()
 
-# Callback options
+# Callback options (https://tijme.github.io/not-your-average-web-crawler/latest/options_callbacks.html)
 options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route.
 options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route.
 options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route.
@@ -85,7 +85,7 @@ def cb_form_after_autofill(queue_item, elements, form_data):
 options.callbacks.form_before_autofill = cb_form_before_autofill # Called before the crawler autofills a form. Default is a null route.
 options.callbacks.form_after_autofill = cb_form_after_autofill # Called after the crawler autofills a form. Default is a null route.
 
-# Scope options
+# Scope options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_scope.html)
 options.scope.protocol_must_match = False # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False.
 options.scope.subdomain_must_match = True # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
 options.scope.hostname_must_match = True # Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`). Default is True.
@@ -101,7 +101,7 @@ def cb_form_after_autofill(queue_item, elements, form_data):
     Request.METHOD_HEAD
 ]
 
-# Identity options
+# Identity options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_identity.html)
 options.identity.auth = HTTPBasicAuth('user', 'pass') # Or any other authentication (http://docs.python-requests.org/en/master/user/authentication/). Default is None.
 options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
 options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
@@ -122,11 +122,18 @@ def cb_form_after_autofill(queue_item, elements, form_data):
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
 })
 
-# Performance options
+# Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html)
 options.performance.max_threads = 10 # The maximum amount of simultaneous threads to use for crawling. Default is 8.
 options.performance.request_timeout = 15 # The request timeout in seconds (throws an exception if exceeded). Default is 30.
 
-# Misc options
+# Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html)
+options.routing.minimum_threshold = 4 # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
+options.routing.routes = [
+    # The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array.
+    "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$" # Only crawl /blog/{some-blog-alias} 4 times.
+]
+
+# Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html)
 options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False.
 options.misc.verify_ssl_certificates = True # If verification is enabled all SSL certificates will be checked for validity. Default is True.
 options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None.

nyawc/Crawler.py

Lines changed: 12 additions & 0 deletions
@@ -29,6 +29,7 @@
 import traceback
 
 from nyawc.Queue import Queue
+from nyawc.Routing import Routing
 from nyawc.QueueItem import QueueItem
 from nyawc.CrawlerThread import CrawlerThread
 from nyawc.CrawlerActions import CrawlerActions
@@ -40,6 +41,7 @@ class Crawler(object):
 
     Attributes:
         queue (:class:`nyawc.Queue`): The request/response pair queue containing everything to crawl.
+        routing (:class:`nyawc.Routing`): A class that identifies requests based on routes from the options.
        __options (:class:`nyawc.Options`): The options to use for the current crawling runtime.
        __should_spawn_new_requests (bool): If the crawler should start spawning new requests.
        __should_stop (bool): If the crawler should stop the crawling process.
@@ -59,6 +61,7 @@ def __init__(self, options):
         """
 
         self.queue = Queue(options)
+        self.routing = Routing(options)
         self.__options = options
         self.__should_spawn_new_requests = False
         self.__should_stop = False
@@ -125,9 +128,17 @@ def __spawn_new_request(self):
         """
 
         first_in_line = self.queue.get_first(QueueItem.STATUS_QUEUED)
+
         if first_in_line is None:
             return False
 
+        while self.routing.is_treshold_reached(first_in_line.request):
+            self.queue.move(first_in_line, QueueItem.STATUS_CANCELLED)
+
+            first_in_line = self.queue.get_first(QueueItem.STATUS_QUEUED)
+            if first_in_line is None:
+                return False
+
         self.__request_start(first_in_line)
         return True
 
@@ -248,6 +259,7 @@ def __request_finish(self, queue_item, new_requests, request_failed=False):
             new_queue_items = []
             self.queue.move(queue_item, QueueItem.STATUS_ERRORED)
         else:
+            self.routing.increase_route_count(queue_item.request)
             new_queue_items = self.__add_scraped_requests_to_queue(queue_item, new_requests)
             self.queue.move(queue_item, QueueItem.STATUS_FINISHED)
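The Routing helper itself is one of the ten changed files but is not shown in this excerpt. Based purely on how Crawler uses it above, a hypothetical sketch of what the two methods could look like (an assumed implementation that counts finished requests per matching route; the real class may differ):

    import re

    class Routing(object):
        """Hypothetical sketch: count crawled requests per configured route."""

        def __init__(self, options):
            self.__options = options
            self.__counts = {}

        def __find_route(self, request):
            # Return the first configured pattern that matches the URL.
            for route in self.__options.routing.routes:
                if re.match(route, request.url):
                    return route

            return None

        def increase_route_count(self, request):
            route = self.__find_route(request)
            if route is not None:
                self.__counts[route] = self.__counts.get(route, 0) + 1

        def is_treshold_reached(self, request):
            route = self.__find_route(request)
            if route is None:
                return False

            return self.__counts.get(route, 0) >= self.__options.routing.minimum_threshold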

nyawc/Options.py

Lines changed: 26 additions & 0 deletions
@@ -37,6 +37,7 @@ class Options(object):
         callbacks (:class:`nyawc.Options.OptionsCallbacks`): Can be used to define crawling callbacks.
         performance (:class:`nyawc.Options.OptionsPerformance`): Can be used to define performance options.
         identity (:class:`nyawc.Options.OptionsIdentity`): Can be used to define the identity/footprint options.
+        routing (:class:`nyawc.Options.OptionsRouting`): Can be used to define routes to ignore similar requests.
         misc (:class:`nyawc.Options.OptionsMisc`): Can be used to define the other options.
 
     """
@@ -48,6 +49,7 @@ def __init__(self):
         self.callbacks = OptionsCallbacks()
         self.performance = OptionsPerformance()
         self.identity = OptionsIdentity()
+        self.routing = OptionsRouting()
         self.misc = OptionsMisc()
 
 class OptionsScope(object):
@@ -245,6 +247,30 @@ def __init__(self):
         self.headers.update({"User-Agent": user_agent(PackageHelper.get_alias(), PackageHelper.get_version())})
         self.proxies = None
 
+class OptionsRouting(object):
+    """The OptionsRouting class can contain routes that prevent the crawler from crawling similar pages multiple times.
+
+    Attributes:
+        minimum_threshold (int): The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
+        routes (arr): The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array.
+
+    Note:
+        An example would be a news site with URLs like /news/3443, /news/2132 and /news/9475. You can add a regular expression
+        that matches this route so that only X requests matching it will be crawled (where X is the minimum threshold).
+
+    Note:
+        The crawler will only stop crawling requests of a certain route at exactly the minimum threshold if the maximum threads option is set to 1.
+        If the maximum threads option is set to a value higher than 1, the threshold will end up a bit higher, depending on the amount of threads used.
+
+    """
+
+    def __init__(self):
+        """Constructs an OptionsRouting instance."""
+
+        self.minimum_threshold = 20
+        self.routes = []
+
+
 class OptionsMisc(object):
     """The OptionsMisc class contains all kind of misc options.
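The second note is easy to see with a back-of-the-envelope simulation: the threshold is checked when a request is started, but the count only increases when a request finishes, so requests already in flight slip through. A standalone sketch (hypothetical model, assuming max_threads requests are started before the first one finishes):

    # Hypothetical simulation, not nyawc code.
    minimum_threshold = 20
    max_threads = 4

    started = finished = in_flight = 0

    while finished < minimum_threshold or in_flight > 0:
        # Start requests while the threshold has not been reached yet.
        while in_flight < max_threads and finished < minimum_threshold:
            started += 1
            in_flight += 1

        # One request finishes and is counted.
        finished += 1
        in_flight -= 1

    print(started)  # 23, i.e. threshold + (max_threads - 1)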

nyawc/QueueItem.py

Lines changed: 14 additions & 7 deletions
@@ -39,7 +39,8 @@ class QueueItem(object):
         decomposed (bool): If this queue item is decomposed.
         request (:class:`nyawc.http.Request`): The Request object.
         response (:class:`nyawc.http.Response`): The Response object.
-        response_soup (obj): The BeautifulSoup container for the response text.
+        __response_soup (obj): The BeautifulSoup container for the response text.
+        __index_hash (str): The cached index hash of this queue item, otherwise None.
 
     Note:
         A queue item will be decomposed (cached objects are deleted to free up memory) when it is
@@ -76,7 +77,8 @@ def __init__(self, request, response):
 
         self.status = QueueItem.STATUS_QUEUED
         self.decomposed = False
-        self.response_soup = None
+        self.__response_soup = None
+        self.__index_hash = None
 
         self.request = request
         self.response = response
@@ -90,15 +92,15 @@ def get_soup_response(self):
         """
 
         if self.response is not None:
-            if self.response_soup is None:
+            if self.__response_soup is None:
                 result = BeautifulSoup(self.response.text, "lxml")
 
                 if self.decomposed:
                     return result
                 else:
-                    self.response_soup = BeautifulSoup(self.response.text, "lxml")
+                    self.__response_soup = BeautifulSoup(self.response.text, "lxml")
 
-            return self.response_soup
+            return self.__response_soup
 
     def decompose(self):
         """Decompose this queue item (set cached variables to None) to free up memory.
@@ -109,8 +111,9 @@ def decompose(self):
 
         """
 
+        self.__response_soup = None
+
         self.decomposed = True
-        self.response_soup = None
 
     def get_hash(self):
         """Generate and return the dict index hash of the given queue item.
@@ -129,6 +132,9 @@ def get_hash(self):
 
         """
 
+        if self.__index_hash:
+            return self.__index_hash
+
         key = self.request.method
 
         key += URLHelper.get_protocol(self.request.url)
@@ -142,4 +148,5 @@ def get_hash(self):
         if self.request.data is not None:
             key += str(self.request.data.keys())
 
-        return key
+        self.__index_hash = key
+        return self.__index_hash
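The get_hash change is plain memoization: compute the key once, store it in a private attribute, and return the cached value on every later call. This is only safe because a queue item's request no longer changes after it is queued; a stale hash would otherwise break queue lookups. The pattern in isolation (hypothetical names, not nyawc code):

    class Example(object):
        """Hypothetical sketch of the caching pattern used by get_hash()."""

        def __init__(self, method, url):
            self.method = method
            self.url = url
            self.__index_hash = None

        def get_hash(self):
            # Serve the cached value on repeat calls.
            if self.__index_hash:
                return self.__index_hash

            # The (potentially expensive) computation runs only once;
            # afterwards the object must not change, or the cached
            # hash will be stale.
            key = self.method + self.url

            self.__index_hash = key
            return self.__index_hash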
