Skip to content

Commit 02cff71

Browse files
authored
Merge pull request #5 from ekorkut/ekorkut1_hashsign
Hash sign in urls is not encoded and more strict error check for order and number of metadata columns
2 parents 1c62ec0 + d68bbd4 commit 02cff71

21 files changed

+378
-22
lines changed

pycsvw/csvw.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@
2828

2929
from . import nt_serializer
3030
from .csvw_exceptions import NoDefaultOrValueUrlError, \
31-
BothDefaultAndValueUrlError, BothLangAndDatatypeError, RiotWarning, RiotError
31+
BothDefaultAndValueUrlError, BothLangAndDatatypeError, \
32+
VirtualColumnPrecedesNonVirtualColumn, RiotWarning, RiotError
3233

3334
READ_PERMISSIONS = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
3435

@@ -92,6 +93,16 @@ def _read_metadata(handle):
9293

9394
col["aboutUrl"] = col.get("aboutUrl", None)
9495
col["suppressOutput"] = col.get("suppressOutput", False)
96+
# Non-virtual columns should precede virtual columns
97+
virtual_seen_yet = False
98+
for col in table_schema["columns"]:
99+
if not virtual_seen_yet and col["virtual"]:
100+
virtual_seen_yet = True
101+
continue
102+
if virtual_seen_yet and not col["virtual"]:
103+
raise VirtualColumnPrecedesNonVirtualColumn(
104+
"Non-virtual column {} comes after a virtual column. "
105+
"All virtual columns should come after non-virtual columns.".format(col))
95106

96107
return out
97108

pycsvw/csvw_exceptions.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,22 @@ class NoDefaultOrValueUrlError(Exception):
2020
pass
2121

2222

23+
class NumberOfNonVirtualColumnsMismatch(Exception):
24+
"""
25+
The exception thrown when the number of non-virtual columns in metadata
26+
does not match the number of columns in the CSV file.
27+
"""
28+
pass
29+
30+
31+
class VirtualColumnPrecedesNonVirtualColumn(Exception):
32+
"""
33+
The exception thrown when a virtual column precedes a non-virtual column
34+
in the specified metadata.
35+
"""
36+
pass
37+
38+
2339
class BothDefaultAndValueUrlError(Exception):
2440
"""
2541
The exception thrown when a virtual column specifies both

pycsvw/nt_serializer.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616

1717
from .generator_utils import process_dates_times, DATATYPE_MAP, read_csv
1818
from .csvw_exceptions import NullValueException, BothValueAndLiteralError, \
19-
BothValueAndDatatypeError, NoValueOrLiteralError, InvalidItemError
19+
BothValueAndDatatypeError, NoValueOrLiteralError, InvalidItemError, \
20+
NumberOfNonVirtualColumnsMismatch
2021
from .rdf_utils import is_null_value, get_column_map, get_subject_for_cell, \
2122
get_predicate_for_cell, apply_all_subs
2223

@@ -271,6 +272,7 @@ def serialize(tables, md_tables, custom_prefixes, output_obj):
271272
'prefixes': custom_prefixes
272273
}
273274
}
275+
num_nonvirtual_columns = sum([1 for x in metadata["tableSchema"]["columns"] if not x["virtual"]])
274276
# Read the csv file fresh after rewinding the file
275277
table_file_obj = tables[table_url]
276278
table_file_obj.seek(0)
@@ -279,4 +281,10 @@ def serialize(tables, md_tables, custom_prefixes, output_obj):
279281
next(table_csv_reader) # Ignore header
280282

281283
for row_num, row in enumerate(table_csv_reader):
284+
if len(row) != num_nonvirtual_columns:
285+
raise NumberOfNonVirtualColumnsMismatch(
286+
"The number of non-virtual columns in metadata, {}, "
287+
"do not match with the number of columns in row {}, {}, "
288+
"of the csv file '{}'.".format(
289+
num_nonvirtual_columns, row_num + 1, len(row), table_url))
282290
write_row(output_obj, str(row_num + 1), row, table_info)

pycsvw/rdf_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from .csvw_exceptions import NullValueException, MissingColumnError, FailedSubstitutionError
1818

1919

20-
SUB_PATTERN = re.compile(r'{(\w+)}')
20+
SUB_PATTERN = re.compile(r'{([A-Za-z0-9_\-# /:]+)}')
2121

2222

2323
def is_null_value(val, null_values):
@@ -60,7 +60,7 @@ def apply_sub(url, row, column_name_to_sub, column_info, quote_sub=True):
6060

6161
rep_before = "{" + column_name_to_sub + "}"
6262
if quote_sub:
63-
return url.replace(rep_before, quote(rep_after.encode('utf-8'), safe=':/'))
63+
return url.replace(rep_before, quote(rep_after.encode('utf-8'), safe=':/#'))
6464
else:
6565
return url.replace(rep_before, rep_after)
6666

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
setup(
1515
name='pycsvw',
16-
version="1.0.1",
16+
version="1.0.2",
1717
description='Generate JSON and RDF from csv files with metadata',
1818
url='https://github.com/bloomberg/pycsvw',
1919
author='Dev Ramudit, Erman Korkut',
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"@context": "http://www.w3.org/ns/csvw",
3+
"url": "http://example.org/simple.csv",
4+
"tableSchema": {
5+
"columns": [{
6+
"titles": "t1"
7+
},{
8+
"titles": "t2"
9+
}]
10+
}
11+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"@context": "http://www.w3.org/ns/csvw",
3+
"url": "http://example.org/simple.csv",
4+
"tableSchema": {
5+
"columns": [{
6+
"titles": "t1"
7+
},{
8+
"titles": "t2"
9+
},{
10+
"titles": "t3"
11+
}, {
12+
"titles": "t4"
13+
}
14+
]
15+
}
16+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"@context": "http://www.w3.org/ns/csvw",
3+
"url": "http://example.org/simple.csv",
4+
"tableSchema": {
5+
"columns": [{
6+
"titles": "t1"
7+
},
8+
{
9+
"name": "v1",
10+
"virtual": true,
11+
"aboutUrl": "owl:sub-{_row}",
12+
"propertyUrl": "owl:obj-{_row}",
13+
"valueUrl": "owl:pred-{_row}"
14+
}, {
15+
"titles": "t2"
16+
}]
17+
}
18+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
expense,description,amount
2+
taxi,from conference to hotel,20
3+
fee,conference registration fee,50
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# Copyright 2017 Bloomberg Finance L.P.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
12+
"""
13+
This test covers the cases where the CSV file and metadata work with URL-safe characters such as #, / and :
14+
"""
15+
16+
from rdflib import ConjunctiveGraph, Literal, URIRef, Namespace
17+
18+
from pycsvw import CSVW
19+
20+
PRE_NS = Namespace('http://www.example.org/')
21+
22+
23+
def test_url_safe_chars():
24+
25+
csvw = CSVW(csv_path="tests/url_special_chars.csv",
26+
metadata_path="tests/url_special_chars.csv-metadata.json")
27+
rdf_output = csvw.to_rdf()
28+
29+
g = ConjunctiveGraph()
30+
g.parse(data=rdf_output, format="turtle")
31+
32+
# Check subjects
33+
sub1 = URIRef('http://www.example.org/c#1/chash2/chash3/chash4/chash5/chash6')
34+
literals = [Literal('c#1'), Literal('chash2'), Literal('chash3'),
35+
Literal('chash4'), Literal('chash6'), Literal('chash5')]
36+
verify_non_virtual_columns(sub1, g, literals)
37+
verify_virtual_columns(sub1, g, '#/:- _r1', '#/:-%20_r1')
38+
39+
sub2 = URIRef('http://www.example.org/c/1/c/2/c/3/c/4/c/5/c/6')
40+
literals = [Literal('c/1'), Literal('c/2'), Literal('c/3'),
41+
Literal('c/4'), Literal('c/6'), Literal('c/5')]
42+
verify_non_virtual_columns(sub2, g, literals)
43+
verify_virtual_columns(sub2, g, '/#:- _r2','/#:-%20_r2')
44+
45+
sub3 = URIRef('http://www.example.org/c:1/c:2/c:3/c:4/c:5/c:6')
46+
literals = [Literal('c:1'), Literal('c:2'), Literal('c:3'),
47+
Literal('c:4'), Literal('c:6'), Literal('c:5')]
48+
verify_non_virtual_columns(sub3, g, literals)
49+
verify_virtual_columns(sub3, g, ':#/-_ r3', ':#/-_%20r3')
50+
51+
sub4 = URIRef('http://www.example.org/c-1/c-2/c-3/c-4/c-5/c-6')
52+
literals = [Literal('c-1'), Literal('c-2'), Literal('c-3'),
53+
Literal('c-4'), Literal('c-6'), Literal('c-5')]
54+
verify_non_virtual_columns(sub4, g, literals)
55+
verify_virtual_columns(sub4, g, '-/#_ :r4', '-/#_%20:r4')
56+
57+
sub5 = URIRef('http://www.example.org/c%201/c%202/c%203/c%204/c%205/c%206')
58+
literals = [Literal('c 1'), Literal('c 2'), Literal('c 3'),
59+
Literal('c 4'), Literal('c 6'), Literal('c 5')]
60+
verify_non_virtual_columns(sub5, g, literals)
61+
verify_virtual_columns(sub5, g, ' -/#:_r5', '%20-/#:_r5')
62+
63+
sub6 = URIRef('http://www.example.org/c_1/c_2/c_3/c_4/c_5/c_6')
64+
literals = [Literal('c_1'), Literal('c_2'), Literal('c_3'),
65+
Literal('c_4'), Literal('c_6'), Literal('c_5')]
66+
verify_non_virtual_columns(sub6, g, literals)
67+
verify_virtual_columns(sub6, g, '_ /:#r6', '_%20/:#r6')
68+
69+
70+
def verify_virtual_columns(sub, g, orig_value_str, encoded_value_str):
71+
v1_triples = list(g.triples((sub, PRE_NS['v1p{}'.format(encoded_value_str)], None)))
72+
assert len(v1_triples) == 1
73+
assert "v1p{}".format(encoded_value_str) in str(v1_triples[0][1])
74+
assert orig_value_str == str(v1_triples[0][2])
75+
v2_triples = list(g.triples((sub, PRE_NS['v2p{}'.format(encoded_value_str)], None)))
76+
assert len(v2_triples) == 1
77+
assert "v2p{}".format(encoded_value_str) in str(v2_triples[0][1])
78+
assert 'v2v{}'.format(encoded_value_str) in str(v2_triples[0][2])
79+
80+
# Standalone virtual column
81+
standalone_sub = URIRef('http://www.example.org/v3s{}'.format(encoded_value_str))
82+
v3_triples = list(g.triples((standalone_sub, None, None)))
83+
assert len(v3_triples) == 1
84+
assert "v3p{}".format(encoded_value_str) in str(v3_triples[0][1])
85+
assert 'v3v{}'.format(encoded_value_str) in str(v3_triples[0][2])
86+
87+
88+
def verify_non_virtual_columns(sub, g, literals):
89+
all_triples = list(g.triples((sub, None, None)))
90+
assert len(all_triples) == 9
91+
assert (sub, PRE_NS['t#p'], literals[0]) in all_triples
92+
assert (sub, PRE_NS['t/p'], literals[1]) in all_triples
93+
assert (sub, PRE_NS['t:p'], literals[2]) in all_triples
94+
assert (sub, PRE_NS['t-p'], literals[3]) in all_triples
95+
assert (sub, PRE_NS['t_p'], literals[4]) in all_triples
96+
# Space value
97+
space_triple = list(g.triples((sub, PRE_NS['t%20p'], None)))
98+
assert len(space_triple) == 1
99+
assert "%20" in str(space_triple[0][1])
100+
assert " " not in str(space_triple[0][1])
101+
assert literals[5] == space_triple[0][2]
102+
103+
104+
105+
106+
107+
108+
109+

0 commit comments

Comments
 (0)