
Commit ae57d8a

Merge branch 'develop'

2 parents: 157e66a + 629ea90

File tree: 10 files changed, +226 -25 lines

.semver

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.7.11
+1.8.0

docs/source/conf.py

Lines changed: 2 additions & 1 deletion
@@ -228,9 +228,10 @@ def linkcode_resolve(domain, info):
 
 # Always make sure current release is in releases.js
 import json
+from collections import OrderedDict
 
 releasesjs = open('../../releases.js').read().replace("var releases = ", "")
-releases = json.loads(releasesjs);
+releases = json.loads(releasesjs, object_pairs_hook=OrderedDict);
 
 releases[release] = True
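The object_pairs_hook change matters because json.loads returns a plain dict by default, and on the Python versions current at the time a plain dict did not guarantee insertion order, so rewriting releases.js could shuffle the release list. With the hook, the mapping is built from the parsed key/value pairs in document order. A standalone sketch of the difference (not part of the commit):

    import json
    from collections import OrderedDict

    raw = '{"1.8.0": true, "1.7.11": true, "1.7.10": true}'

    # Default: a plain dict; on Python < 3.7 its iteration order
    # is not guaranteed to match the document.
    plain = json.loads(raw)

    # With the hook: built from the (key, value) pairs in document
    # order, so iteration matches the order in releases.js.
    ordered = json.loads(raw, object_pairs_hook=OrderedDict)

    print(list(ordered))  # ['1.8.0', '1.7.11', '1.7.10']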

docs/source/kitchen_sink.rst

Lines changed: 16 additions & 10 deletions
@@ -45,12 +45,11 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
         pass
 
     def cb_request_on_error(queue_item, message):
-        print("A request failed with an error message.")
-        print(message)
+        print("[error] " + message)
 
     def cb_form_before_autofill(queue_item, elements, form_data):
-
         # return CrawlerActions.DO_NOT_AUTOFILL_FORM
+
         return CrawlerActions.DO_AUTOFILL_FORM
 
     def cb_form_after_autofill(queue_item, elements, form_data):
@@ -59,7 +58,7 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
     # Declare the options
     options = Options()
 
-    # Callback options
+    # Callback options (https://tijme.github.io/not-your-average-web-crawler/latest/options_callbacks.html)
     options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route.
     options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route.
     options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route.
@@ -70,7 +69,7 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
     options.callbacks.form_before_autofill = cb_form_before_autofill # Called before the crawler autofills a form. Default is a null route.
     options.callbacks.form_after_autofill = cb_form_after_autofill # Called after the crawler autofills a form. Default is a null route.
 
-    # Scope options
+    # Scope options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_scope.html)
     options.scope.protocol_must_match = False # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False.
     options.scope.subdomain_must_match = True # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
     options.scope.hostname_must_match = True # Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`). Default is True.
@@ -86,7 +85,7 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
         Request.METHOD_HEAD
     ]
 
-    # Identity options
+    # Identity options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_identity.html)
     options.identity.auth = HTTPBasicAuth('user', 'pass') # Or any other authentication (http://docs.python-requests.org/en/master/user/authentication/). Default is None.
     options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
     options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
@@ -107,14 +106,21 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
     })
 
-    # Performance options
+    # Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html)
     options.performance.max_threads = 10 # The maximum amount of simultaneous threads to use for crawling. Default is 8.
-    options.performance.request_timeout = 10 # The request timeout in seconds (throws an exception if exceeded). Default is 30.
+    options.performance.request_timeout = 15 # The request timeout in seconds (throws an exception if exceeded). Default is 30.
+
+    # Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html)
+    options.routing.minimum_threshold = 4 # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
+    options.routing.routes = [
+        # The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array.
+        "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$" # Only crawl /blog/{some-blog-alias} 4 times.
+    ]
 
-    # Misc options
+    # Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html)
     options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False.
     options.misc.verify_ssl_certificates = True # If verification is enabled all SSL certificates will be checked for validity. Default is True.
-    options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file or directory with certificates of trusted CAs. Default is None.
+    options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None.
 
     crawler = Crawler(options)
     crawler.start_with(Request("https://finnwea.com/"))
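The cb_form_before_autofill callback shown above is a decision point: whichever CrawlerActions value it returns controls whether the crawler fills the form. A minimal sketch of a conditional version (not from the commit; the exact keys available in form_data are an assumption here):

    from nyawc.CrawlerActions import CrawlerActions

    def cb_form_before_autofill(queue_item, elements, form_data):
        # Hypothetical check: skip autofill for forms that look like
        # login forms, autofill everything else.
        if "password" in form_data:
            return CrawlerActions.DO_NOT_AUTOFILL_FORM

        return CrawlerActions.DO_AUTOFILL_FORM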

docs/source/options_misc.rst

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 .. title:: Misc
 
 How to use misc options
-------------------------------
+-----------------------
 
 .. code:: python

docs/source/options_routing.rst

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+.. title:: Routing
+
+How to use routing options
+--------------------------
+
+.. code:: python
+
+    # routing_example.py
+
+    from nyawc.Options import Options
+    from nyawc.Crawler import Crawler
+    from nyawc.http.Request import Request
+
+    options = Options()
+
+    options.routing.minimum_threshold = 4
+    options.routing.routes = [
+        "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"
+    ]
+
+    crawler = Crawler(options)
+    crawler.start_with(Request("https://finnwea.com/"))
+
+Available routing options
+-------------------------
+
+Minimum threshold
+~~~~~~~~~~~~~~~~~
+
+The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
+
+For example, let's say we have these requests:
+
+.. code::
+
+    https://finnwea.com/blog/1
+    https://finnwea.com/blog/2
+    https://finnwea.com/blog/3
+    ...
+    https://finnwea.com/blog/54
+
+With the default threshold, only the first 20 of these requests will be crawled; after that the rest of the blog posts are ignored.
+
+**Please note that it will probably crawl a bit more than the minimum threshold, depending on the maximum amount of threads in use.**
+
+``options.routing.minimum_threshold = 20``
+
+Routes
+~~~~~~
+
+The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array.
+
+For example, the route below represents ``http://finnwea.com/blog/{a-variable-blog-alias}/``.
+
+.. code:: python
+
+    options.routing.routes = [
+        "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"
+    ]
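Since the whole feature hinges on the route expressions, it is worth checking a pattern against a few URLs before starting a crawl. A standalone check with Python's re module (not part of nyawc):

    import re

    # The blog route from the examples above.
    route = "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"

    urls = [
        "https://finnwea.com/blog/some-blog-alias/",   # matches
        "https://www.finnwea.com/blog/another-post/",  # matches
        "https://finnwea.com/contact/",                # does not match
    ]

    for url in urls:
        print(url, "->", bool(re.match(route, url)))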

example_extensive.py

Lines changed: 12 additions & 5 deletions
@@ -74,7 +74,7 @@ def cb_form_after_autofill(queue_item, elements, form_data):
 # Declare the options
 options = Options()
 
-# Callback options
+# Callback options (https://tijme.github.io/not-your-average-web-crawler/latest/options_callbacks.html)
 options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route.
 options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route.
 options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route.
@@ -85,7 +85,7 @@ def cb_form_after_autofill(queue_item, elements, form_data):
 options.callbacks.form_before_autofill = cb_form_before_autofill # Called before the crawler autofills a form. Default is a null route.
 options.callbacks.form_after_autofill = cb_form_after_autofill # Called after the crawler autofills a form. Default is a null route.
 
-# Scope options
+# Scope options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_scope.html)
 options.scope.protocol_must_match = False # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False.
 options.scope.subdomain_must_match = True # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
 options.scope.hostname_must_match = True # Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`). Default is True.
@@ -101,7 +101,7 @@ def cb_form_after_autofill(queue_item, elements, form_data):
     Request.METHOD_HEAD
 ]
 
-# Identity options
+# Identity options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_identity.html)
 options.identity.auth = HTTPBasicAuth('user', 'pass') # Or any other authentication (http://docs.python-requests.org/en/master/user/authentication/). Default is None.
 options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
 options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
@@ -122,11 +122,18 @@ def cb_form_after_autofill(queue_item, elements, form_data):
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
 })
 
-# Performance options
+# Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html)
 options.performance.max_threads = 10 # The maximum amount of simultaneous threads to use for crawling. Default is 8.
 options.performance.request_timeout = 15 # The request timeout in seconds (throws an exception if exceeded). Default is 30.
 
-# Misc options
+# Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html)
+options.routing.minimum_threshold = 4 # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
+options.routing.routes = [
+    # The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array.
+    "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$" # Only crawl /blog/{some-blog-alias} 4 times.
+]
+
+# Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html)
 options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False.
 options.misc.verify_ssl_certificates = True # If verification is enabled all SSL certificates will be checked for validity. Default is True.
 options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None.

nyawc/Crawler.py

Lines changed: 12 additions & 0 deletions
@@ -29,6 +29,7 @@
 import traceback
 
 from nyawc.Queue import Queue
+from nyawc.Routing import Routing
 from nyawc.QueueItem import QueueItem
 from nyawc.CrawlerThread import CrawlerThread
 from nyawc.CrawlerActions import CrawlerActions
@@ -40,6 +41,7 @@ class Crawler(object):
 
     Attributes:
         queue (:class:`nyawc.Queue`): The request/response pair queue containing everything to crawl.
+        routing (:class:`nyawc.Routing`): A class that identifies requests based on routes from the options.
        __options (:class:`nyawc.Options`): The options to use for the current crawling runtime.
        __should_spawn_new_requests (bool): If the crawler should start spawning new requests.
        __should_stop (bool): If the crawler should stop the crawling process.
@@ -59,6 +61,7 @@ def __init__(self, options):
         """
 
         self.queue = Queue(options)
+        self.routing = Routing(options)
         self.__options = options
         self.__should_spawn_new_requests = False
         self.__should_stop = False
@@ -125,9 +128,17 @@ def __spawn_new_request(self):
         """
 
         first_in_line = self.queue.get_first(QueueItem.STATUS_QUEUED)
+
         if first_in_line is None:
             return False
 
+        while self.routing.is_treshold_reached(first_in_line.request):
+            self.queue.move(first_in_line, QueueItem.STATUS_CANCELLED)
+
+            first_in_line = self.queue.get_first(QueueItem.STATUS_QUEUED)
+            if first_in_line is None:
+                return False
+
         self.__request_start(first_in_line)
         return True
 
@@ -248,6 +259,7 @@ def __request_finish(self, queue_item, new_requests, request_failed=False):
             new_queue_items = []
             self.queue.move(queue_item, QueueItem.STATUS_ERRORED)
         else:
+            self.routing.increase_route_count(queue_item.request)
             new_queue_items = self.__add_scraped_requests_to_queue(queue_item, new_requests)
             self.queue.move(queue_item, QueueItem.STATUS_FINISHED)
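The Routing helper itself is one of the ten changed files but is not shown in this excerpt. Based purely on how Crawler uses it above, a hypothetical sketch of what the two methods could look like (an assumed implementation that counts finished requests per matching route; the real class may differ):

    import re

    class Routing(object):
        """Hypothetical sketch: count crawled requests per configured route."""

        def __init__(self, options):
            self.__options = options
            self.__counts = {}

        def __find_route(self, request):
            # Return the first configured pattern that matches the URL.
            for route in self.__options.routing.routes:
                if re.match(route, request.url):
                    return route

            return None

        def increase_route_count(self, request):
            route = self.__find_route(request)
            if route is not None:
                self.__counts[route] = self.__counts.get(route, 0) + 1

        def is_treshold_reached(self, request):
            route = self.__find_route(request)
            if route is None:
                return False

            return self.__counts.get(route, 0) >= self.__options.routing.minimum_threshold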

nyawc/Options.py

Lines changed: 26 additions & 0 deletions
@@ -37,6 +37,7 @@ class Options(object):
         callbacks (:class:`nyawc.Options.OptionsCallbacks`): Can be used to define crawling callbacks.
         performance (:class:`nyawc.Options.OptionsPerformance`): Can be used to define performance options.
         identity (:class:`nyawc.Options.OptionsIdentity`): Can be used to define the identity/footprint options.
+        routing (:class:`nyawc.Options.OptionsRouting`): Can be used to define routes to ignore similar requests.
         misc (:class:`nyawc.Options.OptionsMisc`): Can be used to define the other options.
 
     """
@@ -48,6 +49,7 @@ def __init__(self):
         self.callbacks = OptionsCallbacks()
         self.performance = OptionsPerformance()
         self.identity = OptionsIdentity()
+        self.routing = OptionsRouting()
         self.misc = OptionsMisc()
 
 class OptionsScope(object):
@@ -245,6 +247,30 @@ def __init__(self):
         self.headers.update({"User-Agent": user_agent(PackageHelper.get_alias(), PackageHelper.get_version())})
         self.proxies = None
 
+class OptionsRouting(object):
+    """The OptionsRouting class can contain routes that prevent the crawler from crawling similar pages multiple times.
+
+    Attributes:
+        minimum_threshold (int): The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
+        routes (arr): The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array.
+
+    Note:
+        An example would be a news site with URLs like /news/3443, /news/2132 and /news/9475. You can add a regular expression
+        that matches this route so that only X requests matching it will be crawled (where X is the minimum threshold).
+
+    Note:
+        The crawler will only stop crawling requests of a certain route at exactly the minimum threshold if the maximum threads option is set to 1.
+        If the maximum threads option is set to a value higher than 1, the threshold will end up a bit higher, depending on the amount of threads used.
+
+    """
+
+    def __init__(self):
+        """Constructs an OptionsRouting instance."""
+
+        self.minimum_threshold = 20
+        self.routes = []
+
+
 class OptionsMisc(object):
     """The OptionsMisc class contains all kind of misc options.
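The second note is easy to see with a back-of-the-envelope simulation: the threshold is checked when a request is started, but the count only increases when a request finishes, so requests already in flight slip through. A standalone sketch (hypothetical model, assuming max_threads requests are started before the first one finishes):

    # Hypothetical simulation, not nyawc code.
    minimum_threshold = 20
    max_threads = 4

    started = finished = in_flight = 0

    while finished < minimum_threshold or in_flight > 0:
        # Start requests while the threshold has not been reached yet.
        while in_flight < max_threads and finished < minimum_threshold:
            started += 1
            in_flight += 1

        # One request finishes and is counted.
        finished += 1
        in_flight -= 1

    print(started)  # 23, i.e. threshold + (max_threads - 1)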

nyawc/QueueItem.py

Lines changed: 14 additions & 7 deletions
@@ -39,7 +39,8 @@ class QueueItem(object):
         decomposed (bool): If this queue item is decomposed.
         request (:class:`nyawc.http.Request`): The Request object.
         response (:class:`nyawc.http.Response`): The Response object.
-        response_soup (obj): The BeautifulSoup container for the response text.
+        __response_soup (obj): The BeautifulSoup container for the response text.
+        __index_hash (str): The cached index hash of this queue item, otherwise None.
 
     Note:
         A queue item will be decomposed (cached objects are deleted to free up memory) when it is
@@ -76,7 +77,8 @@ def __init__(self, request, response):
 
         self.status = QueueItem.STATUS_QUEUED
         self.decomposed = False
-        self.response_soup = None
+        self.__response_soup = None
+        self.__index_hash = None
 
         self.request = request
         self.response = response
@@ -90,15 +92,15 @@ def get_soup_response(self):
         """
 
         if self.response is not None:
-            if self.response_soup is None:
+            if self.__response_soup is None:
                 result = BeautifulSoup(self.response.text, "lxml")
 
                 if self.decomposed:
                     return result
                 else:
-                    self.response_soup = BeautifulSoup(self.response.text, "lxml")
+                    self.__response_soup = BeautifulSoup(self.response.text, "lxml")
 
-            return self.response_soup
+            return self.__response_soup
 
     def decompose(self):
         """Decompose this queue item (set cached variables to None) to free up memory.
@@ -109,8 +111,9 @@ def decompose(self):
 
         """
 
+        self.__response_soup = None
+
         self.decomposed = True
-        self.response_soup = None
 
     def get_hash(self):
         """Generate and return the dict index hash of the given queue item.
@@ -129,6 +132,9 @@ def get_hash(self):
 
         """
 
+        if self.__index_hash:
+            return self.__index_hash
+
         key = self.request.method
 
         key += URLHelper.get_protocol(self.request.url)
@@ -142,4 +148,5 @@ def get_hash(self):
         if self.request.data is not None:
             key += str(self.request.data.keys())
 
-        return key
+        self.__index_hash = key
+        return self.__index_hash
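The get_hash change is plain memoization: compute the key once, store it in a private attribute, and return the cached value on every later call. This is only safe because a queue item's request no longer changes after it is queued; a stale hash would otherwise break queue lookups. The pattern in isolation (hypothetical names, not nyawc code):

    class Example(object):
        """Hypothetical sketch of the caching pattern used by get_hash()."""

        def __init__(self, method, url):
            self.method = method
            self.url = url
            self.__index_hash = None

        def get_hash(self):
            # Serve the cached value on repeat calls.
            if self.__index_hash:
                return self.__index_hash

            # The (potentially expensive) computation runs only once;
            # afterwards the object must not change, or the cached
            # hash will be stale.
            key = self.method + self.url

            self.__index_hash = key
            return self.__index_hash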
