fix: remap session JSONL chunk line numbers to original source positions (#12102)

* fix: remap session JSONL chunk line numbers to original source positions

buildSessionEntry() flattens JSONL messages into plain text before
chunkMarkdown() assigns line numbers. The stored startLine/endLine
values therefore reference positions in the flattened text, not the
original JSONL file.

- Add lineMap to SessionFileEntry tracking which JSONL line each
  extracted message came from
- Add remapChunkLines() to translate chunk positions back to original
  JSONL lines after chunking
- Guard remap with source === "sessions" to prevent misapplication
- Include lineMap in content hash so existing sessions get re-indexed

Fixes #12044

* memory: dedupe session JSONL parsing

---------

Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
Marcus Castro
2026-02-10 21:09:24 -03:00
committed by GitHub
parent 424d2dddf5
commit 45488e4ec9
5 changed files with 197 additions and 122 deletions

View File

@@ -50,10 +50,17 @@ import {
type MemoryChunk,
type MemoryFileEntry,
parseEmbedding,
remapChunkLines,
runWithConcurrency,
} from "./internal.js";
import { searchKeyword, searchVector } from "./manager-search.js";
import { ensureMemoryIndexSchema } from "./memory-schema.js";
import {
buildSessionEntry,
listSessionFilesForAgent,
sessionPathForFile,
type SessionFileEntry,
} from "./session-files.js";
import { loadSqliteVecExtension } from "./sqlite-vec.js";
import { requireNodeSqlite } from "./sqlite.js";
@@ -66,15 +73,6 @@ type MemoryIndexMeta = {
vectorDims?: number;
};
type SessionFileEntry = {
path: string;
absPath: string;
mtimeMs: number;
size: number;
hash: string;
content: string;
};
type MemorySyncProgressState = {
completed: number;
total: number;
@@ -1147,8 +1145,8 @@ export class MemoryIndexManager implements MemorySearchManager {
needsFullReindex: boolean;
progress?: MemorySyncProgressState;
}) {
const files = await this.listSessionFiles();
const activePaths = new Set(files.map((file) => this.sessionPathForFile(file)));
const files = await listSessionFilesForAgent(this.agentId);
const activePaths = new Set(files.map((file) => sessionPathForFile(file)));
const indexAll = params.needsFullReindex || this.sessionsDirtyFiles.size === 0;
log.debug("memory sync: indexing session files", {
files: files.length,
@@ -1177,7 +1175,7 @@ export class MemoryIndexManager implements MemorySearchManager {
}
return;
}
const entry = await this.buildSessionEntry(absPath);
const entry = await buildSessionEntry(absPath);
if (!entry) {
if (params.progress) {
params.progress.completed += 1;
@@ -1545,113 +1543,6 @@ export class MemoryIndexManager implements MemorySearchManager {
.run(META_KEY, value);
}
private async listSessionFiles(): Promise<string[]> {
const dir = resolveSessionTranscriptsDirForAgent(this.agentId);
try {
const entries = await fs.readdir(dir, { withFileTypes: true });
return entries
.filter((entry) => entry.isFile())
.map((entry) => entry.name)
.filter((name) => name.endsWith(".jsonl"))
.map((name) => path.join(dir, name));
} catch {
return [];
}
}
private sessionPathForFile(absPath: string): string {
return path.join("sessions", path.basename(absPath)).replace(/\\/g, "/");
}
private normalizeSessionText(value: string): string {
return value
.replace(/\s*\n+\s*/g, " ")
.replace(/\s+/g, " ")
.trim();
}
private extractSessionText(content: unknown): string | null {
if (typeof content === "string") {
const normalized = this.normalizeSessionText(content);
return normalized ? normalized : null;
}
if (!Array.isArray(content)) {
return null;
}
const parts: string[] = [];
for (const block of content) {
if (!block || typeof block !== "object") {
continue;
}
const record = block as { type?: unknown; text?: unknown };
if (record.type !== "text" || typeof record.text !== "string") {
continue;
}
const normalized = this.normalizeSessionText(record.text);
if (normalized) {
parts.push(normalized);
}
}
if (parts.length === 0) {
return null;
}
return parts.join(" ");
}
private async buildSessionEntry(absPath: string): Promise<SessionFileEntry | null> {
try {
const stat = await fs.stat(absPath);
const raw = await fs.readFile(absPath, "utf-8");
const lines = raw.split("\n");
const collected: string[] = [];
for (const line of lines) {
if (!line.trim()) {
continue;
}
let record: unknown;
try {
record = JSON.parse(line);
} catch {
continue;
}
if (
!record ||
typeof record !== "object" ||
(record as { type?: unknown }).type !== "message"
) {
continue;
}
const message = (record as { message?: unknown }).message as
| { role?: unknown; content?: unknown }
| undefined;
if (!message || typeof message.role !== "string") {
continue;
}
if (message.role !== "user" && message.role !== "assistant") {
continue;
}
const text = this.extractSessionText(message.content);
if (!text) {
continue;
}
const label = message.role === "user" ? "User" : "Assistant";
collected.push(`${label}: ${text}`);
}
const content = collected.join("\n");
return {
path: this.sessionPathForFile(absPath),
absPath,
mtimeMs: stat.mtimeMs,
size: stat.size,
hash: hashText(content),
content,
};
} catch (err) {
log.debug(`Failed reading session file ${absPath}: ${String(err)}`);
return null;
}
}
private estimateEmbeddingTokens(text: string): number {
if (!text) {
return 0;
@@ -2318,6 +2209,9 @@ export class MemoryIndexManager implements MemorySearchManager {
const chunks = chunkMarkdown(content, this.settings.chunking).filter(
(chunk) => chunk.text.trim().length > 0,
);
if (options.source === "sessions" && "lineMap" in entry) {
remapChunkLines(chunks, entry.lineMap);
}
const embeddings = this.batch.enabled
? await this.embedChunksWithBatch(chunks, entry, options.source)
: await this.embedChunksInBatches(chunks);