Skip to content

Commit d78c126

Browse files
committed
Initially store at least the first 50 rows, and keep going until the end of the first chunk
1 parent c592277 commit d78c126

File tree

2 files changed

+38
-2
lines changed

2 files changed

+38
-2
lines changed

src/dataframe.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import { CSVCache } from './cache'
1717
import { checkNonNegativeInteger } from './helpers.js'
1818

1919
const defaultChunkSize = 100 * 1024 // 100 KB
20-
const defaultInitialRowCount = 500
20+
const defaultInitialRowCount = 50
2121
// const paddingRowCount = 20 // fetch a bit before and after the requested range, to avoid cutting rows
2222

2323
interface Params {
@@ -215,6 +215,7 @@ export async function csvDataFrame(params: Params): Promise<CSVDataFrame> {
215215

216216
// Store the new row in the cache
217217
const isEmpty = isEmptyLine(result.row)
218+
// v8 ignore else -- @preserve
218219
if (!cache.isStored({ byteOffset: result.meta.byteOffset })) {
219220
cache.store({
220221
cells: isEmpty ? undefined : result.row,
@@ -227,6 +228,10 @@ export async function csvDataFrame(params: Params): Promise<CSVDataFrame> {
227228
eventTarget.dispatchEvent(new CustomEvent('resolve'))
228229
}
229230
}
231+
else {
232+
// defensive check: the row should not already be stored at this point
233+
console.debug('Row already stored, should not happen, skipping', { byteOffset: result.meta.byteOffset, row })
234+
}
230235
if (!isEmpty) {
231236
row++
232237
}
@@ -295,7 +300,7 @@ async function initializeCSVCachefromURL({ url, byteLength, chunkSize, initialRo
295300
cache = CSVCache.fromHeader({ header: result, byteLength })
296301
continue
297302
}
298-
else if (cache.rowCount >= initialRowCount) {
303+
else if (cache.rowCount >= initialRowCount && result.meta.byteOffset > 0.9 * chunkSize) {
299304
// enough rows for now
300305
break
301306
}

test/dataframe.test.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,27 @@ describe('csvDataFrame', () => {
5050
url,
5151
byteLength: fileSize,
5252
initialRowCount: 2,
53+
chunkSize: 5,
5354
})
5455
expect(df.getCell({ row: 1, column: 'b' })).toStrictEqual({ value: '5' })
5556
expect(df.getCell({ row: 2, column: 'b' })).toBeUndefined()
5657
revoke()
5758
})
5859

60+
it('should fetch more than initial rows when specified if the chunk size is bigger', async () => {
61+
const text = 'a,b,c\n1,2,3\n4,5,6\n7,8,9\n'
62+
const { url, revoke, fileSize } = toURL(text, { withNodeWorkaround: true })
63+
const df = await csvDataFrame({
64+
url,
65+
byteLength: fileSize,
66+
initialRowCount: 2,
67+
chunkSize: 500,
68+
})
69+
expect(df.getCell({ row: 1, column: 'b' })).toStrictEqual({ value: '5' })
70+
expect(df.getCell({ row: 2, column: 'b' })).not.toBeUndefined()
71+
revoke()
72+
})
73+
5974
it.each([
6075
{ text: 'a,b,c\n1111,2222,3333\nn44,55,66\n77,88,99\n', expectedRows: 2 },
6176
{ text: 'a,b,c\n11,22,33\n44,55,66\n77,88,99\n', expectedRows: 3 },
@@ -66,6 +81,7 @@ describe('csvDataFrame', () => {
6681
url,
6782
byteLength: fileSize,
6883
initialRowCount: 1,
84+
chunkSize: 5,
6985
})
7086
// with only one row loaded, the average row size is not accurate enough to estimate the number of rows
7187
expect(df.numRows).toBe(expectedRows) // the estimate is not perfect
@@ -80,6 +96,7 @@ describe('csvDataFrame', () => {
8096
url,
8197
byteLength: fileSize,
8298
initialRowCount: 0,
99+
chunkSize: 5,
83100
})
84101
expect(df.getCell({ row: 1, column: 'b' })).toBeUndefined()
85102
revoke()
@@ -170,6 +187,7 @@ describe('csvDataFrame', () => {
170187
url,
171188
byteLength: fileSize,
172189
initialRowCount: 2,
190+
chunkSize: 5,
173191
})
174192
expect(df.getCell({ row: 2, column: 'a' })).toBeUndefined()
175193
revoke()
@@ -182,6 +200,7 @@ describe('csvDataFrame', () => {
182200
url,
183201
byteLength: fileSize,
184202
initialRowCount: 2,
203+
chunkSize: 5,
185204
})
186205
expect(df.getCell({ row: 5, column: 'a' })).toBeUndefined()
187206
revoke()
@@ -242,6 +261,7 @@ describe('csvDataFrame', () => {
242261
url,
243262
byteLength: fileSize,
244263
initialRowCount: 2,
264+
chunkSize: 5,
245265
})
246266
expect(df.getRowNumber({ row: 2 })).toBeUndefined()
247267
revoke()
@@ -254,6 +274,7 @@ describe('csvDataFrame', () => {
254274
url,
255275
byteLength: fileSize,
256276
initialRowCount: 2,
277+
chunkSize: 5,
257278
})
258279
expect(df.getRowNumber({ row: 5 })).toBeUndefined()
259280
revoke()
@@ -290,6 +311,7 @@ describe('csvDataFrame', () => {
290311
url,
291312
byteLength: fileSize,
292313
initialRowCount: 2,
314+
chunkSize: 5,
293315
})
294316
expect(df.getCell({ row: 2, column: 'a' })).toBeUndefined()
295317
await df.fetch?.({ rowStart: 2, rowEnd: 5 })
@@ -306,6 +328,7 @@ describe('csvDataFrame', () => {
306328
url,
307329
byteLength: fileSize,
308330
initialRowCount: 1,
331+
chunkSize: 5,
309332
})
310333
expect(df.getCell({ row: 1, column: 'a' })).toBeUndefined()
311334
await df.fetch?.({ rowStart: 1, rowEnd: 10 })
@@ -366,6 +389,7 @@ describe('csvDataFrame', () => {
366389
url,
367390
byteLength: fileSize,
368391
initialRowCount: 2,
392+
chunkSize: 5,
369393
})
370394
expect(df.getCell({ row: 2, column: 'a' })).toBeUndefined()
371395
await df.fetch?.({ rowStart: 2, rowEnd: 10 })
@@ -380,6 +404,7 @@ describe('csvDataFrame', () => {
380404
url,
381405
byteLength: fileSize,
382406
initialRowCount: 2,
407+
chunkSize: 5,
383408
})
384409
expect(df.getCell({ row: 3, column: 'a' })).toBeUndefined()
385410
await df.fetch?.({ rowStart: 3, rowEnd: 4 })
@@ -396,6 +421,7 @@ describe('csvDataFrame', () => {
396421
url,
397422
byteLength: fileSize,
398423
initialRowCount: 1,
424+
chunkSize: 5,
399425
})
400426
expect(df.getCell({ row: 2, column: 'a' })).toBeUndefined()
401427
await df.fetch?.({ rowStart: 2, rowEnd: 5 })
@@ -412,6 +438,7 @@ describe('csvDataFrame', () => {
412438
url,
413439
byteLength: fileSize,
414440
initialRowCount: 1,
441+
chunkSize: 5,
415442
})
416443
expect(df.getCell({ row: 1, column: 'a' })).toBeUndefined()
417444
await df.fetch?.({ rowStart: 3, rowEnd: 5 })
@@ -455,6 +482,7 @@ describe('csvDataFrame', () => {
455482
url,
456483
byteLength: fileSize,
457484
initialRowCount: 1,
485+
chunkSize: 5,
458486
})
459487
expect(df.getCell({ row: 0, column: 'a' })).toStrictEqual({ value: '111111' })
460488
expect(df.getCell({ row: 1, column: 'a' })).toBeUndefined()
@@ -484,6 +512,7 @@ describe('csvDataFrame', () => {
484512
url,
485513
byteLength: fileSize,
486514
initialRowCount: 1,
515+
chunkSize: 5,
487516
})
488517
expect(df.getCell({ row: 0, column: 'a' })).toStrictEqual({ value: '111111' })
489518
expect(df.getCell({ row: 1, column: 'a' })).toBeUndefined()
@@ -514,6 +543,7 @@ describe('csvDataFrame', () => {
514543
url,
515544
byteLength: fileSize,
516545
initialRowCount: 2,
546+
chunkSize: 5,
517547
})
518548

519549
let resolveEventCount = 0
@@ -541,6 +571,7 @@ describe('csvDataFrame', () => {
541571
url,
542572
byteLength: fileSize,
543573
initialRowCount: 1,
574+
chunkSize: 5,
544575
})
545576
expect(df.getCell({ row: 0, column: 'a' })).toStrictEqual({ value: '1' })
546577
expect(df.getCell({ row: 1, column: 'a' })).toBeUndefined()

0 commit comments

Comments
 (0)