38 changes: 36 additions & 2 deletions webcomix/comic.py
@@ -192,8 +192,42 @@ def save_image_filename(
             # No file extension (only dot in url is domain name)
             return str(page)
 
-        parsed_filepath = urlparse(url).path
-        file_extension = parsed_filepath[parsed_filepath.rindex(".") :]
+        # Check for the file extension in the URL path first, then in the query
+        # string. Some comics identify content through query parameters, in which
+        # case the path contains neither a filename nor an extension and
+        # rindex(".") raises ValueError. So: try the path first, then fall back
+        # to the query, broken down into its component parameters.
+        file_extension = ""
+        parts = urlparse(url)
+
+        try:
+            parsed_filepath = parts.path
+            file_extension = parsed_filepath[parsed_filepath.rindex(".") :]
+        except ValueError:
+            if "?" in url:
+                if "&" in parts.query:
+                    parsed_queries = parts.query.split("&")
+                    for current_query in parsed_queries:
+                        try:
+                            file_extension = current_query[current_query.rindex(".") :]
+                            break
+                        except ValueError:
+                            # No extension in this parameter; keep looking
+                            continue
+                else:
+                    try:
+                        file_extension = parts.query[parts.query.rindex(".") :]
+                    except ValueError:
+                        print(
+                            "File extension unknown; setting as '.unknown' to preserve data"
+                        )
+                        file_extension = ".unknown"
+            else:
+                # Worst case: the extension can't be identified, so fall back to
+                # '.unknown' to still save the file for later evaluation
+                print("File extension unknown; setting as '.unknown' to preserve data")
+                file_extension = ".unknown"

Comment on lines +202 to +230

Owner:

1. We'd probably want to extract this into a different method (like a get_file_path that returns both the file path and the extension); a rough sketch follows after this list.
2. We could also use the parse_qs(...) function available in the urllib.parse module. Here's an example of what the results would look like:

>>> from urllib.parse import urlparse, parse_qs
>>> parts = urlparse("https://quantumvibe.com/disppageV3?story=qv&file=/simages/qv/qv1-001.jpg")
>>> parse_qs(parts.query)
{'story': ['qv'], 'file': ['/simages/qv/qv1-001.jpg']}

3. We should also add unit tests for the extracted method.
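
To make suggestions 1 and 2 concrete, here is a minimal sketch of what the extracted helper could look like, assuming the get_file_path name floated above (the exact name, signature, and '.unknown' fallback are placeholders, not the final implementation):

from urllib.parse import urlparse, parse_qs


def get_file_path(url):
    """Return (file_path, file_extension) for an image URL.

    Checks the URL path first, then every query parameter value, and falls
    back to '.unknown' when no extension can be found.
    """
    parts = urlparse(url)
    # Candidate strings that might carry the filename: the path itself,
    # then each query parameter value (e.g. file=/simages/qv/qv1-001.jpg)
    candidates = [parts.path]
    for values in parse_qs(parts.query).values():
        candidates.extend(values)
    for candidate in candidates:
        if "." in candidate:
            return candidate, candidate[candidate.rindex("."):]
    return parts.path, ".unknown"

For the QuantumVibe URL above this would return ('/simages/qv/qv1-001.jpg', '.jpg'), while a plain path URL takes its extension from the path, matching the try-the-path-first behaviour in the diff.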

Contributor (Author):

Ah yeah, parse_qs would probably be more efficient; I didn't catch that in the urlparse documentation.

I'll break this out in the next couple of days due to #job.

         if title_present:
             return "{}-{}{}".format(comic_name, page, file_extension)
         else:
12 changes: 12 additions & 0 deletions webcomix/supported_comics.py
@@ -259,4 +259,16 @@
"comic_image_selector": "//img[@id='comicimage']/@src",
"next_page_selector": "//a[@rel='next']//@href",
},
"InkBlot": {
"name": "InkBlot",
"start_url": "https://www.inkboltcomic.com/index.php?page=1",
"comic_image_selector": "//img[@alt='Comic Image']/@src",
"next_page_selector": "//a[img[@alt='Next Page']]//@href",
},
"QuantumVibe": {
"name": "QuantumVibe",
"start_url": "https://quantumvibe.com/strip?page=1",
"comic_image_selector": "//a[contains(@href, 'strip?page=')]//img[contains(@src, 'disppage')]/@src",
"next_page_selector": "//a[img[contains(@src, 'nav/NextStrip2.gif')]]/@href",
},
}
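
Not part of the diff: a quick way to sanity-check a new entry's XPath selectors before submitting it is to evaluate them against the start page, for example with requests and lxml (an illustrative sketch only, not webcomix's own validation path):

import requests
from lxml import html

# The QuantumVibe entry added above
entry = {
    "start_url": "https://quantumvibe.com/strip?page=1",
    "comic_image_selector": "//a[contains(@href, 'strip?page=')]//img[contains(@src, 'disppage')]/@src",
    "next_page_selector": "//a[img[contains(@src, 'nav/NextStrip2.gif')]]/@href",
}

tree = html.fromstring(requests.get(entry["start_url"]).content)
print("comic image matches:", tree.xpath(entry["comic_image_selector"]))
print("next page matches:  ", tree.xpath(entry["next_page_selector"]))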