From 38e69748453c5fcb2514291a3fb496bda35e366f Mon Sep 17 00:00:00 2001 From: Robert Felten Date: Wed, 22 Feb 2017 11:23:31 +0100 Subject: [PATCH 1/7] #292: added testcase to reproduce issue --- testing/test_dumpgenerator_offline.py | 54 +++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 testing/test_dumpgenerator_offline.py diff --git a/testing/test_dumpgenerator_offline.py b/testing/test_dumpgenerator_offline.py new file mode 100644 index 00000000..eb7bf4a9 --- /dev/null +++ b/testing/test_dumpgenerator_offline.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# This file is part of the wikiteam project. +# +# Copyright (C) 2017 Robert Felten - https://github.com/rfelten/ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +import sys +import os +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) # q&d import hack, sorry +import unittest +from dumpgenerator import truncateFilename + +# This file is intended to test offline functionality of the dumpgenerator.py. +# For all other tests see test_dumpgenerator.py + + +class TestDumpgeneratorOffline(unittest.TestCase): + + def setUp(self): + pass + + def tearDown(self): + pass + + def test_truncateFilename(self): + """ Test if truncFilename() obey other['filenamelimit']""" + other = dict() # FIXME: get from dumpgenerator, but code base is a pre-OO mess + other['filenamelimit'] = 100 + fn = u"Assortiment de différentes préparation à bases de légumes et féculents, bien sur servit avec de l'injara.JPG" + self.assertEqual(len(fn), 108) + self.assertEqual(len(fn.encode("utf-8")), 113) # chars like 'è' will extend length + + fn_trunc = truncateFilename(other=other, filename=fn) + self.assertLessEqual(len(fn_trunc), other['filenamelimit'], + "trunced filename '%s' len of %d exceed limit of %d." % + (fn_trunc, len(fn_trunc), other['filenamelimit'])) + +if __name__ == '__main__': + unittest.main() From b64513e8d18ef07ff30ddef2e2584f1da53ae9ef Mon Sep 17 00:00:00 2001 From: Robert Felten Date: Wed, 22 Feb 2017 12:15:10 +0100 Subject: [PATCH 2/7] #292: added more testcases --- testing/test_dumpgenerator_offline.py | 41 ++++++++++++++++++++------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/testing/test_dumpgenerator_offline.py b/testing/test_dumpgenerator_offline.py index eb7bf4a9..9d1c1d0a 100644 --- a/testing/test_dumpgenerator_offline.py +++ b/testing/test_dumpgenerator_offline.py @@ -32,23 +32,44 @@ class TestDumpgeneratorOffline(unittest.TestCase): def setUp(self): - pass + other = dict() # FIXME: get from dumpgenerator, but code base is a pre-OO mess + other['filenamelimit'] = 100 + + self.other = other def tearDown(self): pass - def test_truncateFilename(self): - """ Test if truncFilename() obey other['filenamelimit']""" - other = dict() # FIXME: get from dumpgenerator, but code base is a pre-OO mess - other['filenamelimit'] = 100 + def helper_truncateFilename(self, fn): + fn_trunc = truncateFilename(other=self.other, filename=fn) + self.assertLessEqual(len(fn_trunc), self.other['filenamelimit'], + "trunced filename '%s' len of %d exceed limit of %d." % ( + fn_trunc, len(fn_trunc), self.other['filenamelimit'])) + + def test_truncateFilename1(self): + """ Test if truncFilename() obey other['filenamelimit'] - real world example 1""" fn = u"Assortiment de différentes préparation à bases de légumes et féculents, bien sur servit avec de l'injara.JPG" self.assertEqual(len(fn), 108) - self.assertEqual(len(fn.encode("utf-8")), 113) # chars like 'è' will extend length + self.assertEqual(len(fn.encode("utf-8")), 113) # chars like 'è' will extend length - this is maybe unexpected + self.helper_truncateFilename(fn) + + def test_truncateFilename2(self): + """ Test if truncFilename() obey other['filenamelimit'] - longest valid name w/o file extension""" + fn = "A" * self.other['filenamelimit'] + self.helper_truncateFilename(fn) + + def test_truncateFilename3(self): + """ Test if truncFilename() obey other['filenamelimit'] - longest valid name w/ file extension""" + fn = "A" * self.other['filenamelimit'] + fn = fn[:-4] + ".jpg" + self.helper_truncateFilename(fn) + + def test_truncateFilename4(self): + """ Test if truncFilename() obey other['filenamelimit'] - longest valid name w/ file extension""" + fn = "A" * (self.other['filenamelimit'] / 2) + fn = fn[:-4] + ".jpg" + self.helper_truncateFilename(fn) - fn_trunc = truncateFilename(other=other, filename=fn) - self.assertLessEqual(len(fn_trunc), other['filenamelimit'], - "trunced filename '%s' len of %d exceed limit of %d." % - (fn_trunc, len(fn_trunc), other['filenamelimit'])) if __name__ == '__main__': unittest.main() From 10f13373251f23f8f6e1fc303854e9f75eb08801 Mon Sep 17 00:00:00 2001 From: Robert Felten Date: Wed, 22 Feb 2017 12:45:41 +0100 Subject: [PATCH 3/7] #292: refactoring: truncate filename logic at one place --- dumpgenerator.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index a045ace5..807783db 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -69,9 +69,15 @@ def getVersion(): def truncateFilename(other={}, filename=''): - """ Truncate filenames when downloading images with large filenames """ - return filename[:other['filenamelimit']] + \ + """ Truncate filenames if needed """ + # truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash + # limit). Later .desc is added to filename, so better 100 as max) + if len(filename) < other['filenamelimit']: + return filename + filename2 = filename[:other['filenamelimit']] + \ md5(filename.encode('utf-8')).hexdigest() + '.' + filename.split('.')[-1] + print 'Filename is too long, truncating. Now it is:', filename2 + return filename2 def delay(config={}, session=None): @@ -1097,13 +1103,8 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): delay(config=config, session=session) # saving file - # truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash - # limit). Later .desc is added to filename, so better 100 as max) filename2 = urllib.unquote(filename) - if len(filename2) > other['filenamelimit']: - # split last . (extension) and then merge - filename2 = truncateFilename(other=other, filename=filename2) - print 'Filename is too long, truncating. Now it is:', filename2 + filename2 = truncateFilename(other=other, filename=filename2) filename3 = u'%s/%s' % (imagepath, filename2) imagefile = open(filename3, 'wb') r = requests.get(url=url) From 2f826ef13a225180e788e1882e5dff5c3066df9b Mon Sep 17 00:00:00 2001 From: Robert Felten Date: Wed, 22 Feb 2017 13:39:56 +0100 Subject: [PATCH 4/7] #292: fixed #292 --- dumpgenerator.py | 29 ++++++++++++++++++--------- testing/test_dumpgenerator_offline.py | 1 + 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 807783db..7b0e5834 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -69,15 +69,23 @@ def getVersion(): def truncateFilename(other={}, filename=''): - """ Truncate filenames if needed """ - # truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash - # limit). Later .desc is added to filename, so better 100 as max) - if len(filename) < other['filenamelimit']: + """ Truncate filename if longer than other['filenamelimit'] """ + if len(filename.encode('utf-8')) < other['filenamelimit']: return filename - filename2 = filename[:other['filenamelimit']] + \ - md5(filename.encode('utf-8')).hexdigest() + '.' + filename.split('.')[-1] - print 'Filename is too long, truncating. Now it is:', filename2 - return filename2 + fileext = filename.split('.') + if len(fileext) == 1: + fileext = "" + else: + fileext = '.' + fileext[-1] + # make room for md5, file extension and imagesdescext + trunc = other['filenamelimit'] - 32 - len(fileext) - len(other['imagesdescext']) + assert (trunc > 0) + while len(filename[:trunc].encode('utf-8')) > other['filenamelimit']: + print trunc, filename[:trunc], filename[:trunc].encode('utf-8'), + trunc -= 1 + trunked_fn = filename[:trunc] + md5(filename.encode('utf-8')).hexdigest() + fileext + print 'Filename is too long, truncating. Now it is:', trunked_fn + return trunked_fn def delay(config={}, session=None): @@ -1124,7 +1132,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): text=u'The page "%s" was missing in the wiki (probably deleted)' % (title.decode('utf-8')) ) - f = open('%s/%s.desc' % (imagepath, filename2), 'w') + f = open('%s/%s%s' % (imagepath, filename2, other['imagesdescext']), 'w') # Banner featuring SG1, SGA, SGU teams if not re.search(r'', xmlfiledesc): # failure when retrieving desc? then save it as empty .desc @@ -1501,7 +1509,8 @@ def getParameters(params=[]): 'resume': args.resume, 'filenamelimit': 100, # do not change 'force': args.force, - 'session': session + 'session': session, + 'imagesdescext': '.desc' } # calculating path, if not defined by user with --path= diff --git a/testing/test_dumpgenerator_offline.py b/testing/test_dumpgenerator_offline.py index 9d1c1d0a..5d3e9b92 100644 --- a/testing/test_dumpgenerator_offline.py +++ b/testing/test_dumpgenerator_offline.py @@ -34,6 +34,7 @@ class TestDumpgeneratorOffline(unittest.TestCase): def setUp(self): other = dict() # FIXME: get from dumpgenerator, but code base is a pre-OO mess other['filenamelimit'] = 100 + other['imagesdescext'] = '.desc' self.other = other From f92bf8bcb6ab9bc5cfaba64e497cfca83ebc1c22 Mon Sep 17 00:00:00 2001 From: Robert Felten Date: Wed, 22 Feb 2017 17:42:03 +0100 Subject: [PATCH 5/7] #292: make parameter type explicit, removed debug print --- dumpgenerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 7b0e5834..533801eb 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -70,6 +70,7 @@ def getVersion(): def truncateFilename(other={}, filename=''): """ Truncate filename if longer than other['filenamelimit'] """ + filename = unicode(filename) if len(filename.encode('utf-8')) < other['filenamelimit']: return filename fileext = filename.split('.') @@ -81,7 +82,6 @@ def truncateFilename(other={}, filename=''): trunc = other['filenamelimit'] - 32 - len(fileext) - len(other['imagesdescext']) assert (trunc > 0) while len(filename[:trunc].encode('utf-8')) > other['filenamelimit']: - print trunc, filename[:trunc], filename[:trunc].encode('utf-8'), trunc -= 1 trunked_fn = filename[:trunc] + md5(filename.encode('utf-8')).hexdigest() + fileext print 'Filename is too long, truncating. Now it is:', trunked_fn From 6c2cbbb170041b844273da409be591653696ba56 Mon Sep 17 00:00:00 2001 From: Robert Felten Date: Wed, 22 Feb 2017 17:43:37 +0100 Subject: [PATCH 6/7] #292: added another testcase --- testing/test_dumpgenerator_offline.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/testing/test_dumpgenerator_offline.py b/testing/test_dumpgenerator_offline.py index 5d3e9b92..903251d0 100644 --- a/testing/test_dumpgenerator_offline.py +++ b/testing/test_dumpgenerator_offline.py @@ -35,7 +35,6 @@ def setUp(self): other = dict() # FIXME: get from dumpgenerator, but code base is a pre-OO mess other['filenamelimit'] = 100 other['imagesdescext'] = '.desc' - self.other = other def tearDown(self): @@ -66,11 +65,18 @@ def test_truncateFilename3(self): self.helper_truncateFilename(fn) def test_truncateFilename4(self): - """ Test if truncFilename() obey other['filenamelimit'] - longest valid name w/ file extension""" + """ Test if truncFilename() obey other['filenamelimit'] - valid name w/ file extension""" fn = "A" * (self.other['filenamelimit'] / 2) fn = fn[:-4] + ".jpg" self.helper_truncateFilename(fn) + def test_truncateFilename5(self): + """ Test if truncFilename() obey other['filenamelimit'] - longest valid name w/ file extension (unicode)""" + fn = u"è" * self.other['filenamelimit'] + fn = fn[:-4] + ".jpg" + self.helper_truncateFilename(fn) + + if __name__ == '__main__': unittest.main() From d63871260e117d90c53e907a3f1353bbb13b03c4 Mon Sep 17 00:00:00 2001 From: Robert Felten Date: Wed, 22 Feb 2017 17:47:44 +0100 Subject: [PATCH 7/7] changed filenamelimit to 140 after bug fix filename truncation. see #292 --- dumpgenerator.py | 2 +- testing/test_dumpgenerator_offline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 533801eb..b5b63bb3 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -1507,7 +1507,7 @@ def getParameters(params=[]): other = { 'resume': args.resume, - 'filenamelimit': 100, # do not change + 'filenamelimit': 140, # encryptfs reduce the filename limit from 255 to ~148 chars :/ 'force': args.force, 'session': session, 'imagesdescext': '.desc' diff --git a/testing/test_dumpgenerator_offline.py b/testing/test_dumpgenerator_offline.py index 903251d0..0bbe7ddc 100644 --- a/testing/test_dumpgenerator_offline.py +++ b/testing/test_dumpgenerator_offline.py @@ -33,7 +33,7 @@ class TestDumpgeneratorOffline(unittest.TestCase): def setUp(self): other = dict() # FIXME: get from dumpgenerator, but code base is a pre-OO mess - other['filenamelimit'] = 100 + other['filenamelimit'] = 140 # encryptfs reduce the filename limit from 255 to ~148 chars :/ other['imagesdescext'] = '.desc' self.other = other