Skip to content

Commit 8f1f43c

Browse files
committed
fetch 10 rows before and after + ignore the first row, to avoid partial rows
1 parent 9f95cbc commit 8f1f43c

File tree

1 file changed

+22
-13
lines changed

1 file changed

+22
-13
lines changed

src/csv.ts

Lines changed: 22 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ export type CSVDataFrame = DataFrame<Metadata>;
2323

2424
const defaultChunkSize = 50 * 1024; // 50 KB, same as Papaparse default
2525
const defaultMaxCachedBytes = 20 * 1024 * 1024; // 20 MB
26+
const paddingRows = 10; // fetch a bit before and after the requested range, to avoid cutting rows
2627

2728
interface Params {
2829
url: string;
@@ -365,6 +366,7 @@ export async function csvDataFrame({
365366
// TODO(SL): Update the average size of a row?
366367
// For now, we keep it constant, to provide stability - otherwise empty rows appear after the update
367368
// cache.averageRowBytes = getAverageRowBytes(cache);
369+
// eventTarget.dispatchEvent(new CustomEvent("resolve")); // to refresh the table
368370
});
369371

370372
// TODO(SL): evict old rows (or only cell contents?) if needed
@@ -468,8 +470,13 @@ function fetchRange({
468470
}): Promise<void> {
469471
checkSignal(signal);
470472

471-
let cursor = start;
473+
const firstChunkOffset = Math.max(
474+
cache.serial.end, // don't fetch known rows again
475+
Math.floor(start - paddingRows * cache.averageRowBytes) // fetch a bit before, to ensure we get a complete first row
476+
);
477+
let cursor = firstChunkOffset;
472478
let isFirstStep = true;
479+
const endCursor = Math.ceil(end + paddingRows * cache.averageRowBytes); // fetch a bit after, just in case the average is not accurate
473480

474481
return new Promise<void>((resolve, reject) => {
475482
Papa.parse<string[]>(cache.url, {
@@ -481,15 +488,24 @@ function fetchRange({
481488
delimiter: cache.header.delimiter,
482489
newline: cache.header.newline,
483490
chunkSize: cache.chunkSize,
484-
firstChunkOffset: start, // custom option, only available in the modified Papaparse @severo_tests/papaparse
491+
firstChunkOffset, // custom option, only available in the modified Papaparse @severo_tests/papaparse
485492
step: ({ data, meta }, parser) => {
486493
if (signal?.aborted) {
487494
parser.abort();
488495
return;
489496
}
490497

491-
const parsedRow = { start: cursor, end: start + meta.cursor, data };
492-
cursor = start + meta.cursor;
498+
const parsedRow = {
499+
start: cursor,
500+
end: firstChunkOffset + meta.cursor,
501+
data,
502+
};
503+
cursor = parsedRow.end;
504+
505+
if (isFirstStep) {
506+
isFirstStep = false;
507+
return; // ignore the first row, because we cannot know if it's partial or complete
508+
}
493509

494510
if (meta.delimiter !== cache.header.delimiter) {
495511
reject(
@@ -507,12 +523,12 @@ function fetchRange({
507523
}
508524

509525
// add the row to the cache
510-
if (addParsedRowToCache({ cache, parsedRow, isFirstStep })) {
526+
if (addParsedRowToCache({ cache, parsedRow })) {
511527
// send an event for the new row
512528
eventTarget.dispatchEvent(new CustomEvent("resolve"));
513529
}
514530

515-
if (cursor >= end) {
531+
if (cursor >= endCursor) {
516532
// abort the parsing, we have enough rows for now
517533
parser.abort();
518534
return;
@@ -537,17 +553,10 @@ function isEmpty(data: string[]): boolean {
537553
function addParsedRowToCache({
538554
cache,
539555
parsedRow,
540-
isFirstStep,
541556
}: {
542557
cache: Cache;
543558
parsedRow: ParsedRow;
544-
isFirstStep: boolean; // to handle the case where we start in the middle of a row
545559
}): boolean {
546-
if (isFirstStep && parsedRow.data.length < cache.header.data.length) {
547-
// the first parsed row is partial, we ignore it, it must be part of the previous row
548-
return false;
549-
}
550-
551560
// TODO(SL): optimize
552561
const inserted = !isEmpty(parsedRow.data);
553562
const allRanges = [cache.serial, ...cache.random];

0 commit comments

Comments (0)