Skip to content

Commit a110142

Browse files
authored
Update the estimated number of rows (#22)
* update the number of rows on new estimate * remove unneeded check
1 parent 50e6b39 commit a110142

File tree

5 files changed

+87
-31
lines changed

5 files changed

+87
-31
lines changed

package-lock.json

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
},
1717
"dependencies": {
1818
"cosovo": "0.2.0",
19-
"hightable": "0.22.2",
19+
"hightable": "0.24.1",
2020
"react": "19.2.0",
2121
"react-dom": "19.2.0"
2222
},

src/Page.tsx

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { type DataFrame, HighTable } from 'hightable'
2-
import { type ReactNode } from 'react'
2+
import { type ReactNode, useEffect, useState } from 'react'
33

44
import Loading from './Loading.js'
55

@@ -28,6 +28,23 @@ export default function Page({
2828
setError,
2929
iframe = false,
3030
}: PageProps): ReactNode {
31+
const [numRowsEstimate, setNumRowsEstimate] = useState<{ numRows: number, isEstimate: boolean } | undefined>(undefined)
32+
useEffect(() => {
33+
if (!df) return
34+
/**
35+
* Handle num rows update event
36+
*/
37+
function onNumRowsUpdate() {
38+
if (!df) return
39+
setNumRowsEstimate({ numRows: df.numRows, isEstimate: df.metadata?.isNumRowsEstimated ?? false })
40+
}
41+
onNumRowsUpdate()
42+
df.eventTarget?.addEventListener('numrowschange', onNumRowsUpdate)
43+
return () => {
44+
df.eventTarget?.removeEventListener('numrowschange', onNumRowsUpdate)
45+
}
46+
}, [df])
47+
3148
return (
3249
<>
3350
{iframe ? '' : <div className="top-header">{name}</div>}
@@ -37,14 +54,14 @@ export default function Page({
3754
{formatFileSize(byteLength)}
3855
</span>
3956
)}
40-
{df
57+
{numRowsEstimate
4158
? (
4259
<span>
43-
{df.numRows.toLocaleString()}
60+
{numRowsEstimate.numRows.toLocaleString()}
4461
{' '}
4562
row
46-
{df.numRows > 1 ? 's' : ''}
47-
{df.metadata?.isNumRowsEstimated ? ' (estimated)' : ''}
63+
{numRowsEstimate.numRows > 1 ? 's' : ''}
64+
{numRowsEstimate.isEstimate ? ' (estimated)' : ''}
4865
</span>
4966
)
5067
: null}

src/cache.ts

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
import type { Newline, ParseResult } from 'cosovo'
2+
import { createEventTarget } from 'hightable'
23

34
import { checkNonNegativeInteger } from './helpers.js'
45

6+
export interface CSVCacheEvents {
7+
'num-rows-estimate-updated': {
8+
numRows: number
9+
isEstimate: boolean
10+
}
11+
}
12+
513
/**
614
* A byte range in a CSV file, with the parsed rows
715
*/
@@ -192,6 +200,14 @@ export class CSVCache {
192200
* The average number of bytes per row, used for estimating row positions
193201
*/
194202
#averageRowByteCount: number | undefined = undefined
203+
/**
204+
* The estimated number of rows in the CSV file
205+
*/
206+
#numRowsEstimate: { numRows: number, isEstimate: boolean } = { numRows: 0, isEstimate: true }
207+
/**
208+
* An event target to emit events
209+
*/
210+
#eventTarget = createEventTarget<CSVCacheEvents>()
195211

196212
constructor({ columnNames, headerByteCount, byteLength, delimiter, newline }: { columnNames: string[], headerByteCount?: number, byteLength: number, delimiter: string, newline: Newline }) {
197213
headerByteCount ??= 0
@@ -267,6 +283,22 @@ export class CSVCache {
267283
return this.#newline
268284
}
269285

286+
/**
287+
* Get an estimate of the total number of rows in the CSV file
288+
* @returns The estimated number of rows, and whether that number is an estimate
289+
*/
290+
get numRowsEstimate(): { numRows: number, isEstimate: boolean } {
291+
return this.#numRowsEstimate
292+
}
293+
294+
/**
295+
* Get the event target to listen to cache events
296+
* @returns The event target
297+
*/
298+
get eventTarget(): ReturnType<typeof createEventTarget<CSVCacheEvents>> {
299+
return this.#eventTarget
300+
}
301+
270302
/**
271303
* Update the average row byte count based on the cached rows
272304
*/
@@ -275,10 +307,11 @@ export class CSVCache {
275307
const rowCount = this.#serial.rowCount + this.#random.reduce((sum, range) => sum + range.rowCount, 0)
276308
if (rowCount === 0) {
277309
this.#averageRowByteCount = undefined
278-
return
279310
}
280-
const averageRowByteCount = rowByteCount / rowCount
281-
this.#averageRowByteCount = averageRowByteCount
311+
else {
312+
this.#averageRowByteCount = rowByteCount / rowCount
313+
}
314+
this.#updateNumRowsEstimate()
282315
}
283316

284317
/**
@@ -322,19 +355,19 @@ export class CSVCache {
322355
}
323356

324357
/**
325-
* Get an estimate of the total number of rows in the CSV file
326-
* @returns The estimated number of rows and if it's an estimate
358+
* Update the estimated number of rows in the CSV file
327359
*/
328-
get numRowsEstimate(): { numRows: number, isEstimate: boolean } {
360+
#updateNumRowsEstimate(): void {
329361
const averageRowByteCount = this.averageRowByteCount
330362
const numRows = this.allRowsCached
331363
? this.rowCount
332364
: averageRowByteCount === 0 || averageRowByteCount === undefined
333365
? 0
334366
: Math.round((this.#byteLength - this.headerByteCount) / averageRowByteCount)
335-
return {
336-
numRows,
337-
isEstimate: !this.allRowsCached,
367+
const isEstimate = !this.allRowsCached
368+
if (this.#numRowsEstimate.numRows !== numRows || this.#numRowsEstimate.isEstimate !== isEstimate) {
369+
this.#numRowsEstimate = { numRows, isEstimate }
370+
this.#eventTarget.dispatchEvent(new CustomEvent('num-rows-estimate-updated'))
338371
}
339372
}
340373

src/dataframe.ts

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ interface Params {
2727
initialRowCount?: number // number of rows to fetch at dataframe creation
2828
}
2929

30+
// Note that when a 'numrowschange' event is sent, the isNumRowsEstimated flag is also updated if needed
3031
export type CSVDataFrame = DataFrame<{ isNumRowsEstimated: boolean }>
3132

3233
/**
@@ -45,10 +46,12 @@ export async function csvDataFrame(params: Params): Promise<CSVDataFrame> {
4546

4647
const eventTarget = createEventTarget<DataFrameEvents>()
4748
const cache = await initializeCSVCachefromURL({ url, byteLength, chunkSize, initialRowCount })
48-
const { numRows, isEstimate } = cache.numRowsEstimate
49-
const metadata = {
50-
isNumRowsEstimated: isEstimate,
51-
}
49+
cache.eventTarget.addEventListener('num-rows-estimate-updated', () => {
50+
// propagate event
51+
eventTarget.dispatchEvent(new CustomEvent('numrowschange'))
52+
})
53+
// no need to remove the listener, as the cache has the same lifetime as the dataframe
54+
5255
const columnDescriptors: DataFrame['columnDescriptors'] = cache.columnNames.map(name => ({ name }))
5356

5457
/**
@@ -141,24 +144,21 @@ export async function csvDataFrame(params: Params): Promise<CSVDataFrame> {
141144
}): Promise<void> {
142145
checkSignal(signal)
143146

147+
// until the CSV is fully loaded, we don't know the exact number of rows
148+
const numRows = cache.allRowsCached ? cache.rowCount : Infinity
144149
validateFetchParams({
145150
rowStart,
146151
rowEnd,
147152
columns,
148153
orderBy,
149154
data: {
150-
numRows: Infinity, // we don't (always) know the exact number of rows yet
155+
numRows,
151156
columnDescriptors,
152157
},
153158
})
154159

155160
if (cache.allRowsCached) {
156161
// all rows are already cached
157-
if (rowEnd > cache.rowCount) {
158-
// requested rows are beyond the end of the file
159-
throw new Error(`Requested rows are beyond the end of the file: ${rowEnd} > ${cache.rowCount}`)
160-
}
161-
// else nothing to do
162162
return
163163
}
164164

@@ -262,8 +262,14 @@ export async function csvDataFrame(params: Params): Promise<CSVDataFrame> {
262262
}
263263

264264
return {
265-
metadata,
266-
numRows,
265+
metadata: {
266+
get isNumRowsEstimated() {
267+
return cache.numRowsEstimate.isEstimate
268+
},
269+
},
270+
get numRows() {
271+
return cache.numRowsEstimate.numRows
272+
},
267273
columnDescriptors,
268274
getCell,
269275
getRowNumber,

0 commit comments

Comments
 (0)