13 #include "Clustering.h" 
   22 #include "FaissAssert.h" 
   23 #include "IndexFlat.h" 
   30     verbose(false), spherical(false),
 
   32     frozen_centroids(false),
 
   33     min_points_per_centroid(39),
 
   34     max_points_per_centroid(256),
 
   48 static double imbalance_factor (
int n, 
int k, 
long *assign) {
 
   49     std::vector<int> hist(k, 0);
 
   50     for (
int i = 0; i < n; i++)
 
   53     double tot = 0, uf = 0;
 
   55     for (
int i = 0 ; i < k ; i++) {
 
   57         uf += hist[i] * (double) hist[i];
 
   59     uf = uf * k / (tot * tot);
 
   68     FAISS_THROW_IF_NOT_FMT (nx >= k,
 
   69              "Number of training points (%ld) should be at least " 
   70              "as large as number of clusters (%ld)", nx, k);
 
   76     for (
size_t i = 0; i < nx * 
d; i++) {
 
   77       FAISS_THROW_IF_NOT_MSG (finite (x_in[i]),
 
   78                         "input contains NaN's or Inf's");
 
   81     const float *x = x_in;
 
   86             printf(
"Sampling a subset of %ld / %ld for training\n",
 
   87                    k * max_points_per_centroid, nx);
 
   88         std::vector<int> perm (nx);
 
   89         rand_perm (perm.data (), nx, 
seed);
 
   91         float * x_new = 
new float [nx * 
d];
 
   92         for (idx_t i = 0; i < nx; i++)
 
   93             memcpy (x_new + i * d, x + perm[i] * d, 
sizeof(x_new[0]) * 
d);
 
   98                  "WARNING clustering %ld points to %ld centroids: " 
   99                  "please provide at least %ld training points\n",
 
  100                  nx, k, idx_t(k) * min_points_per_centroid);
 
  106             printf(
"Number of training points (%ld) same as number of " 
  107                    "clusters, just copying\n", nx);
 
  111         memcpy (
centroids.data(), x_in, 
sizeof (*x_in) * d * 
k);
 
  117         printf(
"Clustering %d points in %ldD to %ld clusters, " 
  118                "redo %d times, %d iterations\n",
 
  124     idx_t * assign = 
new idx_t[nx];
 
  126     float * dis = 
new float[nx];
 
  130     float best_err = 1e50;
 
  131     std::vector<float> best_obj;
 
  132     std::vector<float> best_centroids;
 
  136     FAISS_THROW_IF_NOT_MSG (
 
  138        "size of provided input centroids not a multiple of dimension");
 
  140     size_t n_input_centroids = 
centroids.size() / 
d;
 
  142     if (verbose && n_input_centroids > 0) {
 
  143         printf (
"  Using %zd centroids provided as input (%sfrozen)\n",
 
  147     double t_search_tot = 0;
 
  149         printf(
"  Preprocessing in %.2f s\n",
 
  154     for (
int redo = 0; redo < 
nredo; redo++) {
 
  156         if (verbose && nredo > 1) {
 
  157             printf(
"Outer iteration %d / %d\n", redo, nredo);
 
  163         std::vector<int> perm (nx);
 
  165         rand_perm (perm.data(), nx, 
seed + 1 + redo * 15486557L);
 
  166         for (
int i = n_input_centroids; i < 
k ; i++)
 
  167             memcpy (&
centroids[i * d], x + perm[i] * d,
 
  176         FAISS_THROW_IF_NOT (index.
ntotal == 0);
 
  179         for (
int i = 0; i < 
niter; i++) {
 
  181             index.
search (nx, x, 1, dis, assign);
 
  185             for (
int j = 0; j < nx; j++)
 
  194                 printf (
"  Iteration %d (%.2f s, search %.2f s): " 
  195                         "objective=%g imbalance=%.3f nsplit=%d       \r",
 
  198                         err, imbalance_factor (nx, k, assign),
 
  210             assert (index.
ntotal == 0);
 
  213         if (verbose) printf(
"\n");
 
  215             if (err < best_err) {
 
  217                     printf (
"Objective improved: keep new clusters\n");
 
  237     clus.verbose = d * n * k > (1L << 30);
 
  240     clus.
train (n, x, index);
 
  241     memcpy(centroids, clus.
centroids.data(), 
sizeof(*centroids) * d * k);
 
  242     return clus.
obj.back();
 
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n, size_t k_frozen)
int niter
clustering iterations 
int nredo
redo clustering this many times and keep the clustering with the lowest objective
ClusteringParameters()
sets reasonable defaults 
virtual void reset()=0
removes all elements from the database. 
Clustering(int d, int k)
the only mandatory parameters are k and d 
virtual void train(idx_t n, const float *x)
int seed
seed for the random number generator 
bool frozen_centroids
use the centroids provided as input and do not change them during iterations 
int min_points_per_centroid
minimum number of training points per centroid; with fewer than k * min_points_per_centroid training points you get a warning
virtual void add(idx_t n, const float *x)=0
float kmeans_clustering(size_t d, size_t n, size_t k, const float *x, float *centroids)
idx_t ntotal
total number of indexed vectors
double getmillisecs()
ms elapsed since some arbitrary epoch 
std::vector< float > centroids
centroids (k * d) 
size_t d
dimension of the vectors 
virtual void search(idx_t n, const float *x, idx_t k, float *distances, idx_t *labels) const =0
bool update_index
update index after each iteration? 
virtual void train(idx_t n, const float *x, faiss::Index &index)
Index is used during the assignment stage. 
bool is_trained
set if the Index does not require training, or if training is done already 
bool spherical
do we want normalized centroids? 
int max_points_per_centroid
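maximum number of training points per centroid, to limit the size of the dataset; larger training sets are subsampled to k * max_points_per_centroid points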