Source code for archepy.init.furthest_sum

"""
FurthestSum initialization (CPU).

Picks an initial set of archetype indices by greedily selecting the column
that is farthest (in summed Euclidean distance) from the points already
chosen. Faithful port of ``FurthestSum.m`` from the upstream MATLAB toolbox.

For GPU acceleration on large problems, see :func:`archepy.init.furthest_sum_gpu`.
"""

from __future__ import annotations

from collections.abc import Iterable

import numpy as np


[docs] def furthest_sum( K: np.ndarray, noc: int, i: int | Iterable[int], exclude: Iterable[int] | None = None, *, treat_as_kernel: bool | None = None, one_based: bool = False, ) -> list[int]: """ Greedy FurthestSum selection of ``noc`` candidate archetype indices. Parameters ---------- K : ndarray Either a data matrix of shape ``(D, N)`` (observations as columns) or a symmetric kernel matrix of shape ``(N, N)``. noc : int Number of indices to select. i : int or iterable of int Initial seed index/indices. exclude : iterable of int, optional Indices that must not be selected. treat_as_kernel : bool, optional If ``None`` (default), auto-detect: a square symmetric ``K`` is treated as a kernel; otherwise as a data matrix. Pass ``False`` explicitly for large square data matrices to skip the symmetry check. one_based : bool, default False If ``True``, treat input/output indices as 1-based (MATLAB style). Returns ------- list[int] Selected indices. """ A = np.asarray(K) if A.ndim != 2: raise ValueError("K must be 2D.") if treat_as_kernel is None: treat_as_kernel = (A.shape[0] == A.shape[1]) and np.allclose(A, A.T, atol=1e-10) if treat_as_kernel: N = A.shape[0] else: # Data matrix: D x N — observations are columns. N = A.shape[1] def to0(idx: Iterable[int] | int) -> list[int]: if isinstance(idx, (int, np.integer)): return [int(idx) - 1 if one_based else int(idx)] return [int(x) - 1 if one_based else int(x) for x in idx] selected: list[int] = to0(i) if len(selected) == 0: raise ValueError("Initial seed `i` must contain at least one index.") rolling_idx = selected[0] banned = np.zeros(N, dtype=bool) if exclude is not None: banned[to0(exclude)] = True banned[np.clip(selected, 0, N - 1)] = True sum_dist = np.zeros(N, dtype=float) if treat_as_kernel: Kdiag = np.diag(A).astype(float) def add_from(seed: int): d2 = Kdiag - 2.0 * A[seed, :] + Kdiag[seed] np.maximum(d2, 0.0, out=d2) np.sqrt(d2, out=d2) return d2 def remove_from(seed: int): return add_from(seed) else: X = A norms2 = np.sum(X * X, axis=0) def add_from(seed: int): Kq = np.dot(X[:, seed].T, X) d2 = norms2 - 2.0 * Kq + norms2[seed] np.maximum(d2, 0.0, out=d2) np.sqrt(d2, out=d2) return d2 def remove_from(seed: int): return add_from(seed) for k in range(1, noc + 10 + 1): if k > (noc - 1) and len(selected) > 0: to_remove = selected[0] sum_dist -= remove_from(to_remove) banned[to_remove] = False selected.pop(0) sum_dist += add_from(rolling_idx) candidates = np.where(~banned)[0] if candidates.size == 0: break farthest_idx = candidates[np.argmax(sum_dist[candidates])] rolling_idx = farthest_idx selected.append(farthest_idx) banned[farthest_idx] = True if len(selected) > noc: selected = selected[-noc:] elif len(selected) < noc: fill_needed = noc - len(selected) more = np.setdiff1d(np.arange(N), np.array(selected), assume_unique=False) selected.extend(list(more[:fill_needed])) if one_based: return [s + 1 for s in selected] return selected