Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- Preserve regex and case sensitivity query parameters when loading more search results. [#972](https://github.com/sourcebot-dev/sourcebot/pull/972)
- Fixed page navigation failing after Next.js 16 upgrade by removing `router.refresh()` calls immediately following `router.push()`. [#974](https://github.com/sourcebot-dev/sourcebot/pull/974)
- Add filesystem-first GC scan to remove orphaned repo directories and index shards that have no corresponding database record. [#973](https://github.com/sourcebot-dev/sourcebot/pull/973)


## [4.13.1] - 2026-02-28

Expand Down
2 changes: 1 addition & 1 deletion packages/backend/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ const repoIndexManager = new RepoIndexManager(prisma, settings, redis, promClien
const configManager = new ConfigManager(prisma, connectionManager, env.CONFIG_PATH);

connectionManager.startScheduler();
repoIndexManager.startScheduler();
await repoIndexManager.startScheduler();

if (env.EXPERIMENT_EE_PERMISSION_SYNC_ENABLED === 'true' && !hasEntitlement('permission-syncing')) {
logger.error('Permission syncing is not supported in current plan. Please contact team@sourcebot.dev for assistance.');
Expand Down
75 changes: 71 additions & 4 deletions packages/backend/src/repoIndexManager.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
import * as Sentry from '@sentry/node';
import { PrismaClient, Repo, RepoIndexingJobStatus, RepoIndexingJobType } from "@sourcebot/db";
import { createLogger, env, getRepoPath, Logger, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared";
import { createLogger, env, getRepoPath, Logger, getRepoIdFromPath, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared";
import { DelayedError, Job, Queue, Worker } from "bullmq";
import { existsSync } from 'fs';
import { readdir, rm } from 'fs/promises';
import { Redis } from 'ioredis';
import micromatch from 'micromatch';
import Redlock, { ExecutionError } from 'redlock';
import { INDEX_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js';
import { INDEX_CACHE_DIR, REPOS_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from './constants.js';
import { cloneRepository, fetchRepository, getBranches, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch, getTags, isPathAValidGitRepoRoot, isRepoEmpty, unsetGitConfig, upsertGitConfig } from './git.js';
import { captureEvent } from './posthog.js';
import { PromClient } from './promClient.js';
import { RepoWithConnections, Settings } from "./types.js";
import { getAuthCredentialsForRepo, getShardPrefix, measure, setIntervalAsync } from './utils.js';
import { getAuthCredentialsForRepo, getRepoIdFromShardFileName, getShardPrefix, measure, setIntervalAsync } from './utils.js';
import { cleanupTempShards, indexGitRepository } from './zoekt.js';

const LOG_TAG = 'repo-index-manager';
Expand Down Expand Up @@ -96,8 +96,10 @@ export class RepoIndexManager {
});
}

public startScheduler() {
public async startScheduler() {
logger.debug('Starting scheduler');
// Clean up any orphaned disk resources on startup
await this.cleanupOrphanedDiskResources();
this.interval = setIntervalAsync(async () => {
await this.scheduleIndexJobs();
await this.scheduleCleanupJobs();
Expand Down Expand Up @@ -637,6 +639,71 @@ export class RepoIndexManager {
}
}

// Scans the repos and index directories on disk and removes any entries
// that have no corresponding Repo record in the database. This handles
// edge cases where the DB and disk resources are out of sync.
private async cleanupOrphanedDiskResources() {
// --- Repo directories ---
// Dirs are named by repoId: DATA_CACHE_DIR/repos/<repoId>/
if (existsSync(REPOS_CACHE_DIR)) {
const entries = await readdir(REPOS_CACHE_DIR);
const repoIdToPath = new Map<number, string>();
for (const entry of entries) {
const repoPath = `${REPOS_CACHE_DIR}/${entry}`;
const repoId = getRepoIdFromPath(repoPath);
if (repoId !== undefined) {
repoIdToPath.set(repoId, repoPath);
}
}

if (repoIdToPath.size > 0) {
const existingRepos = await this.db.repo.findMany({
where: { id: { in: [...repoIdToPath.keys()] } },
select: { id: true },
});
const existingIds = new Set(existingRepos.map(r => r.id));
for (const [repoId, repoPath] of repoIdToPath) {
if (!existingIds.has(repoId)) {
logger.info(`Removing orphaned repo directory with no DB record: ${repoPath}`);
await rm(repoPath, { recursive: true, force: true });
}
}
}
}

// --- Index shards ---
// Shard files are prefixed with <orgId>_<repoId>: DATA_CACHE_DIR/index/<orgId>_<repoId>_*.zoekt
if (existsSync(INDEX_CACHE_DIR)) {
const entries = await readdir(INDEX_CACHE_DIR);
const repoIdToShards = new Map<number, string[]>();
for (const entry of entries) {
const repoId = getRepoIdFromShardFileName(entry);
if (repoId !== undefined) {
const shards = repoIdToShards.get(repoId) ?? [];
shards.push(entry);
repoIdToShards.set(repoId, shards);
}
}

if (repoIdToShards.size > 0) {
const existingRepos = await this.db.repo.findMany({
where: { id: { in: [...repoIdToShards.keys()] } },
select: { id: true },
});
const existingIds = new Set(existingRepos.map(r => r.id));
for (const [repoId, shards] of repoIdToShards) {
if (!existingIds.has(repoId)) {
for (const entry of shards) {
const shardPath = `${INDEX_CACHE_DIR}/${entry}`;
logger.info(`Removing orphaned index shard with no DB record: ${shardPath}`);
await rm(shardPath, { force: true });
}
}
}
}
}
}

public async dispose() {
if (this.interval) {
clearInterval(this.interval);
Expand Down
8 changes: 8 additions & 0 deletions packages/backend/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ export const getShardPrefix = (orgId: number, repoId: number) => {
return `${orgId}_${repoId}`;
}

// Extracts the repo id from a zoekt shard file name. Shards are named
// "<orgId>_<repoId>_..." (see getShardPrefix), so the repo id is the
// second numeric component. Returns undefined for file names that do not
// follow the shard naming scheme.
export const getRepoIdFromShardFileName = (fileName: string): number | undefined => {
    const shardPrefixPattern = /^(\d+)_(\d+)_/;
    const components = shardPrefixPattern.exec(fileName);
    if (components === null) {
        return undefined;
    }
    return Number(components[2]);
}

export const fetchWithRetry = async <T>(
fetchFn: () => Promise<T>,
identifier: string,
Expand Down
1 change: 1 addition & 0 deletions packages/shared/src/index.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export {
loadJsonFile,
getConfigSettings,
getRepoPath,
getRepoIdFromPath,
} from "./utils.js";
export * from "./constants.js";
export {
Expand Down
5 changes: 5 additions & 0 deletions packages/shared/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,11 @@ export const getConfigSettings = async (configPath?: string): Promise<ConfigSett
}
}

// Extracts the repo id from a repo directory path. Repo directories are
// named exactly by their numeric repo id (DATA_CACHE_DIR/repos/<repoId>),
// so only a purely-numeric basename is accepted.
//
// Note: a bare parseInt would partially parse names like "42-backup" to 42,
// wrongly attributing an unrelated directory to a repo — dangerous for
// callers that delete directories based on the returned id — so the
// basename is validated with a strict digits-only match first.
export const getRepoIdFromPath = (repoPath: string): number | undefined => {
    const basename = path.basename(repoPath);
    if (!/^\d+$/.test(basename)) {
        return undefined;
    }
    return parseInt(basename, 10);
}

export const getRepoPath = (repo: Repo): { path: string, isReadOnly: boolean } => {
// If we are dealing with a local repository, then use that as the path.
// Mark as read-only since we aren't guaranteed to have write access to the local filesystem.
Expand Down