Skip to content

Commit 2d204ef

Browse files
authored
Merge pull request #114 from J-CPelletier/fix-supported-comics
Fix Supported Comics
2 parents 8a9edb0 + 6392b84 commit 2d204ef

File tree

7 files changed

+228
-132
lines changed

7 files changed

+228
-132
lines changed

poetry.lock

Lines changed: 70 additions & 112 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "webcomix"
3-
version = "3.11.5"
3+
version = "3.12.0"
44
description = "Webcomic downloader"
55
authors = ["Jean-Christophe Pelletier <[email protected]>"]
66
readme = "README.md"
@@ -33,6 +33,7 @@ scrapy-splash = "^0.10.0"
3333
scrapy-fake-useragent = "^1.4.4"
3434
pytest-rerunfailures = "^11.1.2"
3535
docker = "^7.1.0"
36+
cloudscraper = "^1.2.71"
3637

3738
[tool.poetry.dev-dependencies]
3839
pytest = "^7.4.4"

webcomix/comic.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,16 @@
2626
"DOWNLOADER_MIDDLEWARES": {
2727
"scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
2828
"scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
29-
"scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 400,
30-
"scrapy_fake_useragent.middleware.RetryUserAgentMiddleware": 401,
29+
"scrapy_fake_useragent.middleware.RandomUserAgentMiddleware": 500,
30+
"scrapy_fake_useragent.middleware.RetryUserAgentMiddleware": 501,
31+
"webcomix.scrapy.custom_cloudflare_middleware.CustomCloudflareMiddleware": 543,
3132
},
3233
"FAKEUSERAGENT_PROVIDERS": [
3334
"scrapy_fake_useragent.providers.FakeUserAgentProvider",
3435
"scrapy_fake_useragent.providers.FakerProvider",
3536
"scrapy_fake_useragent.providers.FixedUserAgentProvider",
3637
],
37-
"FAKEUSERAGENT_FALLBACK": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
38+
"FAKEUSERAGENT_FALLBACK": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
3839
}
3940

4041

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import cloudscraper
2+
from scrapy.http import HtmlResponse
3+
4+
5+
class CustomCloudflareMiddleware:
    """Scrapy downloader middleware that retries Cloudflare-blocked
    responses (HTTP 403/503) by re-fetching the URL through cloudscraper.
    """

    # Shared cloudscraper session, created once at class-definition time.
    # NOTE(review): tests patch this class attribute directly, so its name
    # is part of the middleware's interface.
    cloudflare_scraper = cloudscraper.create_scraper()

    def process_response(self, request, response, spider):
        """Pass non-blocked responses through unchanged; on a 403/503,
        fetch the page with cloudscraper and return the result wrapped
        in a fresh HtmlResponse.
        """
        url = request.url
        if response.status in (403, 503):
            spider.logger.info(
                "Cloudflare detected. Using cloudscraper on URL: %s", url
            )
            scraped = self.cloudflare_scraper.get(url)
            # Wrap the scraped HTML so downstream Scrapy code sees a
            # normal (200) HtmlResponse decoded as UTF-8.
            return HtmlResponse(url=url, body=scraped.text, encoding="utf-8")
        return response
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
import pytest
2+
from scrapy.http import HtmlResponse, Request
3+
from webcomix.scrapy.custom_cloudflare_middleware import CustomCloudflareMiddleware
4+
5+
6+
AN_URL = "https://example.com/comic"
7+
CLOUDFLARE_HTML = "<html><body>Cloudflare protected content</body></html>"
8+
9+
10+
@pytest.fixture
def middleware():
    """Provide a fresh middleware instance for each test."""
    instance = CustomCloudflareMiddleware()
    return instance
13+
14+
15+
@pytest.fixture
def spider(mocker):
    """Mock spider exposing a mock ``logger`` for call assertions."""
    fake_spider = mocker.Mock()
    fake_spider.logger = mocker.Mock()
    return fake_spider
20+
21+
22+
@pytest.fixture
def test_request():
    """A Scrapy request pointing at the sample comic URL."""
    return Request(url=AN_URL)
25+
26+
27+
def test_middleware_returns_response_when_status_200(middleware, test_request, spider):
    """An ordinary 200 response must pass through untouched and unlogged."""
    ok_response = HtmlResponse(AN_URL, status=200, body=b"<html></html>")

    returned = middleware.process_response(test_request, ok_response, spider)

    assert returned is ok_response
    spider.logger.info.assert_not_called()
34+
35+
36+
def test_middleware_returns_response_when_status_404(middleware, test_request, spider):
    """A 404 is not a Cloudflare status, so it must pass through untouched."""
    not_found = HtmlResponse(AN_URL, status=404, body=b"<html></html>")

    returned = middleware.process_response(test_request, not_found, spider)

    assert returned is not_found
    spider.logger.info.assert_not_called()
43+
44+
45+
def test_middleware_uses_cloudscraper_when_status_403(
    mocker, middleware, test_request, spider
):
    """A 403 triggers a cloudscraper fetch whose HTML becomes the new response."""
    blocked = HtmlResponse(AN_URL, status=403, body=b"<html></html>")
    scraper_mock = mocker.patch.object(CustomCloudflareMiddleware, "cloudflare_scraper")
    scraper_mock.get.return_value = mocker.Mock(text=CLOUDFLARE_HTML)

    outcome = middleware.process_response(test_request, blocked, spider)

    scraper_mock.get.assert_called_once_with(AN_URL)
    assert isinstance(outcome, HtmlResponse)
    assert outcome.url == AN_URL
    assert CLOUDFLARE_HTML in outcome.text
    spider.logger.info.assert_called_once()
61+
62+
63+
def test_middleware_uses_cloudscraper_when_status_503(
    mocker, middleware, test_request, spider
):
    """A 503 is treated the same as a 403: re-fetched via cloudscraper."""
    blocked = HtmlResponse(AN_URL, status=503, body=b"<html></html>")
    scraper_mock = mocker.patch.object(CustomCloudflareMiddleware, "cloudflare_scraper")
    scraper_mock.get.return_value = mocker.Mock(text=CLOUDFLARE_HTML)

    outcome = middleware.process_response(test_request, blocked, spider)

    scraper_mock.get.assert_called_once_with(AN_URL)
    assert isinstance(outcome, HtmlResponse)
    assert outcome.url == AN_URL
    spider.logger.info.assert_called_once()
78+
79+
80+
def test_middleware_logs_cloudflare_detection(mocker, middleware, test_request, spider):
    """Cloudflare detection is logged exactly once, with the offending URL."""
    blocked = HtmlResponse(AN_URL, status=403, body=b"<html></html>")
    scraper_mock = mocker.patch.object(CustomCloudflareMiddleware, "cloudflare_scraper")
    scraper_mock.get.return_value = mocker.Mock(text=CLOUDFLARE_HTML)

    middleware.process_response(test_request, blocked, spider)

    spider.logger.info.assert_called_once_with(
        "Cloudflare detected. Using cloudscraper on URL: %s", AN_URL
    )
92+
93+
94+
def test_middleware_returns_htmlresponse_with_utf8_encoding(
    mocker, middleware, test_request, spider
):
    """Replacement responses built from cloudscraper output decode as UTF-8."""
    blocked = HtmlResponse(AN_URL, status=503, body=b"<html></html>")
    scraper_mock = mocker.patch.object(CustomCloudflareMiddleware, "cloudflare_scraper")
    scraper_mock.get.return_value = mocker.Mock(text=CLOUDFLARE_HTML)

    outcome = middleware.process_response(test_request, blocked, spider)

    assert outcome.encoding == "utf-8"
106+
107+
108+
def test_middleware_handles_different_urls(mocker, middleware, spider):
    """The scraper is invoked with whatever URL the blocked request carries."""
    other_url = "https://different-site.com/page"
    other_request = Request(other_url)
    blocked = HtmlResponse(other_url, status=403, body=b"<html></html>")
    scraper_mock = mocker.patch.object(CustomCloudflareMiddleware, "cloudflare_scraper")
    scraper_mock.get.return_value = mocker.Mock(text=CLOUDFLARE_HTML)

    outcome = middleware.process_response(other_request, blocked, spider)

    scraper_mock.get.assert_called_once_with(other_url)
    assert outcome.url == other_url

webcomix/supported_comics.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
"Nedroid": {
2222
"name": "Nedroid",
2323
"start_url": "https://nedroid.com/?1",
24-
"comic_image_selector": "//img[@class='comic']/@src",
24+
"comic_image_selector": "//img[@class='comic_img']/@src",
2525
"next_page_selector": "//a[text()='NEXT>']/@href",
2626
},
2727
"JL8": {
@@ -121,12 +121,6 @@
121121
"next_page_selector": "//a[@class='cc-next']/@href",
122122
"single_page": True,
123123
},
124-
"MissingMonday": {
125-
"name": "MissingMonday",
126-
"start_url": "https://www.missingmondaycomic.com/comic/chapter-01-page-01",
127-
"comic_image_selector": "//img[@id='cc-comic']/@src",
128-
"next_page_selector": "//a[@class='cc-next']/@href",
129-
},
130124
"StarTrip": {
131125
"name": "StarTrip",
132126
"start_url": "https://www.startripcomic.com/comic/chapter-1-cover",

webcomix/tests/test_comic_availability.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,19 @@
77

88

99
# TODO: Handle 403 errors
10-
supported_comics_ignored = {
11-
k: v
12-
for k, v in supported_comics.items()
13-
if not (
14-
(k == "TheAbominableCharlesChristopher" or k == "Lackadaisy")
15-
and os.environ.get("CI", False)
16-
)
17-
}
10+
# supported_comics_ignored = {
11+
# k: v
12+
# for k, v in supported_comics.items()
13+
# if not (
14+
# (k == "TheAbominableCharlesChristopher" or k == "Lackadaisy")
15+
# and os.environ.get("CI", False)
16+
# )
17+
# }
1818

1919

2020
@pytest.mark.flaky(reruns=5, reruns_delay=60)
2121
@pytest.mark.slow
22-
@pytest.mark.parametrize("comic_name", supported_comics_ignored.keys())
22+
@pytest.mark.parametrize("comic_name", supported_comics.keys())
2323
def test_supported_comics(comic_name):
2424
comic = Comic(**supported_comics[comic_name], debug=True)
2525
first_pages = comic.verify_xpath()

0 commit comments

Comments
 (0)