diff --git a/Sources/CodexBarCore/Vendored/CostUsage/CostUsageJsonl.swift b/Sources/CodexBarCore/Vendored/CostUsage/CostUsageJsonl.swift index c5132737a..5105f9759 100644 --- a/Sources/CodexBarCore/Vendored/CostUsage/CostUsageJsonl.swift +++ b/Sources/CodexBarCore/Vendored/CostUsage/CostUsageJsonl.swift @@ -23,15 +23,24 @@ enum CostUsageJsonl { try handle.seek(toOffset: UInt64(startOffset)) } - var buffer = Data() - buffer.reserveCapacity(64 * 1024) - var current = Data() current.reserveCapacity(4 * 1024) var lineBytes = 0 var truncated = false var bytesRead: Int64 = 0 + func appendSegment(_ segment: Data.SubSequence) { + guard !segment.isEmpty else { return } + lineBytes += segment.count + guard !truncated else { return } + if lineBytes > maxLineBytes || lineBytes > prefixBytes { + truncated = true + current.removeAll(keepingCapacity: true) + return + } + current.append(contentsOf: segment) + } + func flushLine() { guard lineBytes > 0 else { return } let line = Line(bytes: current, wasTruncated: truncated) @@ -49,23 +58,14 @@ enum CostUsageJsonl { } bytesRead += Int64(chunk.count) - buffer.append(chunk) - - while true { - guard let nl = buffer.firstIndex(of: 0x0A) else { break } - let linePart = buffer[.. maxLineBytes || lineBytes > prefixBytes { - truncated = true - current.removeAll(keepingCapacity: true) - } else { - current.append(contentsOf: linePart) - } - } + var segmentStart = chunk.startIndex + while let nl = chunk[segmentStart...].firstIndex(of: 0x0A) { + appendSegment(chunk[segmentStart..= 5.0) + } +} + +private struct JsonlScanSummary: Equatable { + let lineCount: Int + let truncatedCount: Int + let payloadByteCount: Int + let endOffset: Int64 +} + +private typealias JsonlScanner = ( + _ fileURL: URL, + _ offset: Int64, + _ maxLineBytes: Int, + _ prefixBytes: Int, + _ onLine: (CostUsageJsonl.Line) -> Void) throws -> Int64 + +private func makeBenchmarkFixture(line: String, lineCount: Int) -> Data { + let lineBytes = Data(line.utf8) + var data = Data() + data.reserveCapacity((lineBytes.count + 1) * lineCount) + for _ in 0.. JsonlScanSummary +{ + var lineCount = 0 + var truncatedCount = 0 + var payloadByteCount = 0 + + let endOffset = try scanner(fileURL, 0, maxLineBytes, prefixBytes) { line in + lineCount += 1 + payloadByteCount += line.bytes.count + if line.wasTruncated { + truncatedCount += 1 + } + } + + return JsonlScanSummary( + lineCount: lineCount, + truncatedCount: truncatedCount, + payloadByteCount: payloadByteCount, + endOffset: endOffset) +} + +private func fastestScanDurationNanoseconds( + runs: Int, + fileURL: URL, + maxLineBytes: Int, + prefixBytes: Int, + scanner: JsonlScanner) throws -> UInt64 +{ + var fastest = UInt64.max + for _ in 0.. Void) throws + -> Int64 +{ + let handle = try FileHandle(forReadingFrom: fileURL) + defer { try? handle.close() } + + let startOffset = max(0, offset) + if startOffset > 0 { + try handle.seek(toOffset: UInt64(startOffset)) + } + + var buffer = Data() + buffer.reserveCapacity(64 * 1024) + + var current = Data() + current.reserveCapacity(4 * 1024) + var lineBytes = 0 + var truncated = false + var bytesRead: Int64 = 0 + + func flushLine() { + guard lineBytes > 0 else { return } + onLine(.init(bytes: current, wasTruncated: truncated)) + current.removeAll(keepingCapacity: true) + lineBytes = 0 + truncated = false + } + + while true { + let chunk = try handle.read(upToCount: 256 * 1024) ?? Data() + if chunk.isEmpty { + flushLine() + break + } + + bytesRead += Int64(chunk.count) + buffer.append(chunk) + + while true { + guard let nl = buffer.firstIndex(of: 0x0A) else { break } + let linePart = buffer[.. maxLineBytes || lineBytes > prefixBytes { + truncated = true + current.removeAll(keepingCapacity: true) + } else { + current.append(contentsOf: linePart) + } + } + + flushLine() + } + } + + return startOffset + bytesRead +} diff --git a/Tests/CodexBarTests/CostUsageScannerTests.swift b/Tests/CodexBarTests/CostUsageScannerTests.swift index 98985bd6a..ecad8208d 100644 --- a/Tests/CodexBarTests/CostUsageScannerTests.swift +++ b/Tests/CodexBarTests/CostUsageScannerTests.swift @@ -838,6 +838,60 @@ struct CostUsageScannerTests { #expect(report.data[0].outputTokens == 15) #expect(report.data[0].totalTokens == 45) } + + @Test + func jsonlScannerHandlesLinesAcrossReadChunks() throws { + let env = try CostUsageTestEnvironment() + defer { env.cleanup() } + + let fileURL = env.root.appendingPathComponent("large-lines.jsonl", isDirectory: false) + let largeLine = String(repeating: "x", count: 300_000) + let contents = "\(largeLine)\nsmall\n" + try contents.write(to: fileURL, atomically: true, encoding: .utf8) + + var scanned: [(count: Int, truncated: Bool)] = [] + let endOffset = try CostUsageJsonl.scan( + fileURL: fileURL, + maxLineBytes: 400_000, + prefixBytes: 400_000) + { line in + scanned.append((line.bytes.count, line.wasTruncated)) + } + + #expect(endOffset == Int64(Data(contents.utf8).count)) + #expect(scanned.count == 2) + #expect(scanned[0].count == 300_000) + #expect(scanned[0].truncated == false) + #expect(scanned[1].count == 5) + #expect(scanned[1].truncated == false) + } + + @Test + func jsonlScannerMarksPrefixLimitedLinesAsTruncated() throws { + let env = try CostUsageTestEnvironment() + defer { env.cleanup() } + + let fileURL = env.root.appendingPathComponent("truncated-lines.jsonl", isDirectory: false) + let shortLine = "ok" + let longLine = String(repeating: "a", count: 2000) + let contents = "\(shortLine)\n\(longLine)\n" + try contents.write(to: fileURL, atomically: true, encoding: .utf8) + + var scanned: [CostUsageJsonl.Line] = [] + _ = try CostUsageJsonl.scan( + fileURL: fileURL, + maxLineBytes: 10000, + prefixBytes: 64) + { line in + scanned.append(line) + } + + #expect(scanned.count == 2) + #expect(String(data: scanned[0].bytes, encoding: .utf8) == "ok") + #expect(scanned[0].wasTruncated == false) + #expect(scanned[1].bytes.isEmpty) + #expect(scanned[1].wasTruncated == true) + } } private struct CostUsageTestEnvironment {