From d2c0cb03b838c5ad727b79c12186a5154f2e8c61 Mon Sep 17 00:00:00 2001 From: Dhruv Kumar Date: Thu, 7 May 2026 12:03:19 +0530 Subject: [PATCH] Optimize OptiSim.algorithm() --- selector/methods/distance.py | 78 +++++++++++++++++------------------- 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/selector/methods/distance.py b/selector/methods/distance.py index 3bc44520..9df1bb88 100644 --- a/selector/methods/distance.py +++ b/selector/methods/distance.py @@ -341,8 +341,9 @@ def algorithm(self, x, max_size) -> Union[List, Iterable]: Parameters ---------- - x: ndarray of shape (n_samples, n_features) - Feature matrix of `n_samples` samples in `n_features` dimensional feature space. + x: ndarray of shape (n_samples, n_features) or (n_samples, n_samples) + Feature matrix of `n_samples` samples in `n_features` dimensional feature space, + or a pre-computed symmetric pairwise distance matrix. max_size : int Maximum number of samples to select. @@ -352,59 +353,54 @@ def algorithm(self, x, max_size) -> Union[List, Iterable]: List of indices of selected sample indices. 
""" + n_samples = len(x) + is_dist_matrix = (x.ndim == 2 and x.shape[0] == x.shape[1] and np.allclose(np.diag(x), 0)) + # set up reference index selected = get_initial_selection(x=x, x_dist=None, ref_index=self.ref_index, fun_dist=None) count = len(selected) - # establish a kd-tree for nearest-neighbor lookup - tree = spatial.KDTree(x) - # use a random number generator that will be used to randomly select points rng = np.random.default_rng(seed=self.random_seed) + bv = np.zeros(n_samples, dtype=bool) - n_samples = len(x) - # bv will serve as a mask to discard points within radius r of previously selected points - bv = np.zeros(n_samples) - candidates = list(range(n_samples)) - # determine which points are within radius r of initial point - # note: workers=-1 uses all available processors/CPUs - index_remove = tree.query_ball_point( - x[self.ref_index], self.r, eps=self.eps, p=self.p, workers=-1 - ) - # exclude points within radius r of initial point from list of candidates using bv mask - for idx in index_remove: - bv[idx] = 1 - candidates = np.ma.array(candidates, mask=bv) + # Initialize min_dists: minimum distance from each point to any selected point. 
+ min_dists = np.full(n_samples, np.inf) + + if is_dist_matrix: + # Path A: Distance matrix path + for idx in selected: + min_dists = np.minimum(min_dists, x[idx]) + bv |= x[idx] <= self.r + else: + # Path B: Raw feature path + for idx in selected: + dists = np.linalg.norm(x - x[idx], ord=self.p if np.isfinite(self.p) else np.inf, axis=1) + min_dists = np.minimum(min_dists, dists) + bv |= dists <= self.r + + candidates = np.where(~bv)[0] # while there are still remaining candidates to be selected - # compressed returns all the non-masked data as a 1-D array - while len(candidates.compressed()) > 0: - # randomly select samples from list of candidates - try: - sublist = rng.choice(candidates.compressed(), size=self.k, replace=False) - except ValueError: - sublist = candidates.compressed() - - # create a new kd-tree for nearest neighbor lookup with candidates - new_tree = spatial.KDTree(x[selected]) - # query the kd-tree for nearest neighbors to selected samples - # note: workers=-1 uses all available processors/CPUs - search, _ = new_tree.query(x[sublist], eps=self.eps, p=self.p, workers=-1) - # identify the nearest neighbor with the largest distance from previously selected samples - best_idx = sublist[np.argmax(search)] + while len(candidates) > 0: + k = min(self.k, len(candidates)) + sublist = rng.choice(candidates, size=k, replace=False) + + best_idx = sublist[np.argmax(min_dists[sublist])] selected.append(best_idx) count += 1 if count > max_size: - # do this if you have reached the maximum number of points selected return selected - # eliminate all samples within radius r of the selected sample - index_remove = tree.query_ball_point( - x[best_idx], self.r, eps=self.eps, p=self.p, workers=-1 - ) - for idx in index_remove: - bv[idx] = 1 - candidates = np.ma.array(candidates, mask=bv) + if is_dist_matrix: + min_dists = np.minimum(min_dists, x[best_idx]) + bv |= x[best_idx] <= self.r + else: + new_dists = np.linalg.norm(x - x[best_idx], ord=self.p if 
np.isfinite(self.p) else np.inf, axis=1) + min_dists = np.minimum(min_dists, new_dists) + bv |= new_dists <= self.r + + candidates = np.where(~bv)[0] return selected