diff -Nru faiss-1.7.3/benchs/bench_all_ivf/bench_all_ivf.py faiss-1.7.4/benchs/bench_all_ivf/bench_all_ivf.py --- faiss-1.7.3/benchs/bench_all_ivf/bench_all_ivf.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_all_ivf/bench_all_ivf.py 2023-04-19 13:18:30.000000000 +0000 @@ -3,15 +3,20 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import argparse import os import sys import time -import pdb -import numpy as np + import faiss -import argparse -import datasets -from datasets import sanitize +import numpy as np + +try: + import datasets_fb as datasets +except ModuleNotFoundError: + import datasets_oss as datasets + +sanitize = datasets.sanitize ###################################################### @@ -262,8 +267,7 @@ print("Getting centroids from", args.get_centroids_from) src_index = faiss.read_index(args.get_centroids_from) src_quant = faiss.downcast_index(src_index.quantizer) - centroids = faiss.vector_to_array(src_quant.xb) - centroids = centroids.reshape(-1, d) + centroids = src_quant.reconstruct_n() print(" centroid table shape", centroids.shape) if isinstance(vec_transform, faiss.VectorTransform): @@ -333,7 +337,7 @@ xq = sanitize(ds.get_queries()) gt = ds.get_groundtruth(k=args.k) -assert gt.shape[1] == args.k, pdb.set_trace() +assert gt.shape[1] == args.k if args.searchthreads != -1: print("Setting nb of threads to", args.searchthreads) diff -Nru faiss-1.7.3/benchs/bench_all_ivf/cmp_with_scann.py faiss-1.7.4/benchs/bench_all_ivf/cmp_with_scann.py --- faiss-1.7.3/benchs/bench_all_ivf/cmp_with_scann.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_all_ivf/cmp_with_scann.py 2023-04-19 13:18:30.000000000 +0000 @@ -75,28 +75,37 @@ k = args.k nrun = args.nrun - if args.lib == "faiss": + if not os.path.exists(cache_dir + "xb.npy"): # prepare cache - import faiss from datasets import load_dataset - ds = load_dataset(args.db, download=args.download) print(ds) - if not os.path.exists(cache_dir + "xb.npy"): - # store for SCANN - os.system(f"rm -rf {cache_dir}; mkdir -p {cache_dir}") - tosave = dict( - # xt = ds.get_train(10), - xb = ds.get_database(), - xq = ds.get_queries(), - gt = ds.get_groundtruth() - ) - for name, v in tosave.items(): - fname = cache_dir + "/" + name + ".npy" - print("save", fname) - np.save(fname, v) - - open(cache_dir + "metric", "w").write(ds.metric) + # store for SCANN + os.system(f"rm -rf {cache_dir}; mkdir -p {cache_dir}") + tosave = dict( + xb = ds.get_database(), + xq = ds.get_queries(), + gt = ds.get_groundtruth() + ) + for name, v in tosave.items(): + fname = cache_dir + "/" + name + ".npy" + print("save", fname) + np.save(fname, v) + + open(cache_dir + "metric", "w").write(ds.metric) + + dataset = {} + for kn in "xb xq gt".split(): + fname = cache_dir + "/" + kn + ".npy" + print("load", fname) + dataset[kn] = np.load(fname) + xb = dataset["xb"] + xq = dataset["xq"] + gt = dataset["gt"] + distance_measure = open(cache_dir + "metric").read() + + if args.lib == "faiss": + import faiss name1_to_metric = { "IP": faiss.METRIC_INNER_PRODUCT, @@ -106,14 +115,10 @@ index_fname = cache_dir + "index.faiss" if not os.path.exists(index_fname): index = faiss_make_index( - ds.get_database(), name1_to_metric[ds.metric], index_fname) + xb, name1_to_metric[distance_measure], index_fname) else: index = faiss.read_index(index_fname) - xb = ds.get_database() - xq = ds.get_queries() - gt = ds.get_groundtruth() - faiss_eval_search( index, xq, xb, nprobe_tab, 
pre_reorder_k_tab, k, gt, nrun, args.measure @@ -122,32 +127,22 @@ if args.lib == "scann": from scann.scann_ops.py import scann_ops_pybind - dataset = {} - for kn in "xb xq gt".split(): - fname = cache_dir + "/" + kn + ".npy" - print("load", fname) - dataset[kn] = np.load(fname) name1_to_name2 = { "IP": "dot_product", "L2": "squared_l2" } - distance_measure = name1_to_name2[open(cache_dir + "metric").read()] - - xb = dataset["xb"] - xq = dataset["xq"] - gt = dataset["gt"] scann_dir = cache_dir + "/scann1.1.1_serialized" if os.path.exists(scann_dir + "/scann_config.pb"): searcher = scann_ops_pybind.load_searcher(scann_dir) else: - searcher = scann_make_index(xb, distance_measure, scann_dir, 0) + searcher = scann_make_index(xb, name1_to_name2[distance_measure], scann_dir, 0) scann_dir = cache_dir + "/scann1.1.1_serialized_reorder" if os.path.exists(scann_dir + "/scann_config.pb"): searcher_reo = scann_ops_pybind.load_searcher(scann_dir) else: - searcher_reo = scann_make_index(xb, distance_measure, scann_dir, 100) + searcher_reo = scann_make_index(xb, name1_to_name2[distance_measure], scann_dir, 100) scann_eval_search( searcher, searcher_reo, @@ -256,7 +251,6 @@ # index.by_residual = False print("train") - # index.train(ds.get_train()) index.train(xb[:250000]) print("add") index.add(xb) diff -Nru faiss-1.7.3/benchs/bench_all_ivf/datasets_oss.py faiss-1.7.4/benchs/bench_all_ivf/datasets_oss.py --- faiss-1.7.3/benchs/bench_all_ivf/datasets_oss.py 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/benchs/bench_all_ivf/datasets_oss.py 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,136 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
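For reference, the centroid-extraction change in bench_all_ivf.py above (faiss.vector_to_array + reshape replaced by reconstruct_n) boils down to the following minimal sketch; the factory string, sizes and random data are placeholders rather than values from the benchmark:

```
import faiss
import numpy as np

d = 32
index = faiss.index_factory(d, "IVF16,Flat")
index.train(np.random.rand(2000, d).astype('float32'))

# the coarse quantizer of an IVF index is a flat index holding the centroids
quantizer = faiss.downcast_index(index.quantizer)

# reconstruct all stored centroids as an (nlist, d) float32 array
centroids = quantizer.reconstruct_n(0, quantizer.ntotal)
print(centroids.shape)  # (16, 32)
```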
+ +""" +Common functions to load datasets and compute their ground-truth +""" + +import time +import numpy as np +import faiss + +from faiss.contrib import datasets as faiss_datasets + +print("path:", faiss_datasets.__file__) + +faiss_datasets.dataset_basedir = '/checkpoint/matthijs/simsearch/' + +def sanitize(x): + return np.ascontiguousarray(x, dtype='float32') + + +################################################################# +# Dataset +################################################################# + +class DatasetCentroids(faiss_datasets.Dataset): + + def __init__(self, ds, indexfile): + self.d = ds.d + self.metric = ds.metric + self.nq = ds.nq + self.xq = ds.get_queries() + + # get the xb set + src_index = faiss.read_index(indexfile) + src_quant = faiss.downcast_index(src_index.quantizer) + centroids = faiss.vector_to_array(src_quant.xb) + self.xb = centroids.reshape(-1, self.d) + self.nb = self.nt = len(self.xb) + + def get_queries(self): + return self.xq + + def get_database(self): + return self.xb + + def get_train(self, maxtrain=None): + return self.xb + + def get_groundtruth(self, k=100): + return faiss.knn( + self.xq, self.xb, k, + faiss.METRIC_L2 if self.metric == 'L2' else faiss.METRIC_INNER_PRODUCT + )[1] + + + + + + +def load_dataset(dataset='deep1M', compute_gt=False, download=False): + + print("load data", dataset) + + if dataset == 'sift1M': + return faiss_datasets.DatasetSIFT1M() + + elif dataset.startswith('bigann'): + + dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1]) + + return faiss_datasets.DatasetBigANN(nb_M=dbsize) + + elif dataset.startswith("deep_centroids_"): + ncent = int(dataset[len("deep_centroids_"):]) + centdir = "/checkpoint/matthijs/bench_all_ivf/precomputed_clusters" + return DatasetCentroids( + faiss_datasets.DatasetDeep1B(nb=1000000), + f"{centdir}/clustering.dbdeep1M.IVF{ncent}.faissindex" + ) + + elif dataset.startswith("deep"): + + szsuf = dataset[4:] + if szsuf[-1] == 'M': + dbsize = 10 ** 6 * int(szsuf[:-1]) + elif szsuf == '1B': + dbsize = 10 ** 9 + elif szsuf[-1] == 'k': + dbsize = 1000 * int(szsuf[:-1]) + else: + assert False, "did not recognize suffix " + szsuf + return faiss_datasets.DatasetDeep1B(nb=dbsize) + + elif dataset == "music-100": + return faiss_datasets.DatasetMusic100() + + elif dataset == "glove": + return faiss_datasets.DatasetGlove(download=download) + + else: + assert False + + +################################################################# +# Evaluation +################################################################# + + +def evaluate_DI(D, I, gt): + nq = gt.shape[0] + k = I.shape[1] + rank = 1 + while rank <= k: + recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) + print("R@%d: %.4f" % (rank, recall), end=' ') + rank *= 10 + + +def evaluate(xq, gt, index, k=100, endl=True): + t0 = time.time() + D, I = index.search(xq, k) + t1 = time.time() + nq = xq.shape[0] + print("\t %8.4f ms per query, " % ( + (t1 - t0) * 1000.0 / nq), end=' ') + rank = 1 + while rank <= k: + recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) + print("R@%d: %.4f" % (rank, recall), end=' ') + rank *= 10 + if endl: + print() + return D, I diff -Nru faiss-1.7.3/benchs/bench_all_ivf/datasets.py faiss-1.7.4/benchs/bench_all_ivf/datasets.py --- faiss-1.7.3/benchs/bench_all_ivf/datasets.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_all_ivf/datasets.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,137 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -Common functions to load datasets and compute their ground-truth -""" - -import time -import numpy as np -import faiss - -from faiss.contrib import datasets as faiss_datasets - -print("path:", faiss_datasets.__file__) - -faiss_datasets.dataset_basedir = '/checkpoint/matthijs/simsearch/' - -def sanitize(x): - return np.ascontiguousarray(x, dtype='float32') - - -################################################################# -# Dataset -################################################################# - -class DatasetCentroids(faiss_datasets.Dataset): - - def __init__(self, ds, indexfile): - self.d = ds.d - self.metric = ds.metric - self.nq = ds.nq - self.xq = ds.get_queries() - - # get the xb set - src_index = faiss.read_index(indexfile) - src_quant = faiss.downcast_index(src_index.quantizer) - centroids = faiss.vector_to_array(src_quant.xb) - self.xb = centroids.reshape(-1, self.d) - self.nb = self.nt = len(self.xb) - - def get_queries(self): - return self.xq - - def get_database(self): - return self.xb - - def get_train(self, maxtrain=None): - return self.xb - - def get_groundtruth(self, k=100): - return faiss.knn( - self.xq, self.xb, k, - faiss.METRIC_L2 if self.metric == 'L2' else faiss.METRIC_INNER_PRODUCT - )[1] - - - - - - -def load_dataset(dataset='deep1M', compute_gt=False, download=False): - - print("load data", dataset) - - if dataset == 'sift1M': - return faiss_datasets.DatasetSIFT1M() - - elif dataset.startswith('bigann'): - - dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1]) - - return faiss_datasets.DatasetBigANN(nb_M=dbsize) - - elif dataset.startswith("deep_centroids_"): - ncent = int(dataset[len("deep_centroids_"):]) - centdir = "/checkpoint/matthijs/bench_all_ivf/precomputed_clusters" - return DatasetCentroids( - faiss_datasets.DatasetDeep1B(nb=1000000), - f"{centdir}/clustering.dbdeep1M.IVF{ncent}.faissindex" - ) - - - elif dataset.startswith("deep"): - - szsuf = dataset[4:] - if szsuf[-1] == 'M': - dbsize = 10 ** 6 * int(szsuf[:-1]) - elif szsuf == '1B': - dbsize = 10 ** 9 - elif szsuf[-1] == 'k': - dbsize = 1000 * int(szsuf[:-1]) - else: - assert False, "did not recognize suffix " + szsuf - return faiss_datasets.DatasetDeep1B(nb=dbsize) - - elif dataset == "music-100": - return faiss_datasets.DatasetMusic100() - - elif dataset == "glove": - return faiss_datasets.DatasetGlove(download=download) - - else: - assert False - - -################################################################# -# Evaluation -################################################################# - - -def evaluate_DI(D, I, gt): - nq = gt.shape[0] - k = I.shape[1] - rank = 1 - while rank <= k: - recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) - print("R@%d: %.4f" % (rank, recall), end=' ') - rank *= 10 - - -def evaluate(xq, gt, index, k=100, endl=True): - t0 = time.time() - D, I = index.search(xq, k) - t1 = time.time() - nq = xq.shape[0] - print("\t %8.4f ms per query, " % ( - (t1 - t0) * 1000.0 / nq), end=' ') - rank = 1 - while rank <= k: - recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) - print("R@%d: %.4f" % (rank, recall), end=' ') - rank *= 10 - if endl: - print() - return D, I diff -Nru faiss-1.7.3/benchs/bench_big_batch_ivf.py faiss-1.7.4/benchs/bench_big_batch_ivf.py --- faiss-1.7.3/benchs/bench_big_batch_ivf.py 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/benchs/bench_big_batch_ivf.py 2023-04-19 13:18:30.000000000 +0000 
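The recall figures printed by the evaluate helpers in datasets_oss.py above are 1-recall@rank: the fraction of queries whose first ground-truth neighbor appears among the top `rank` results. A toy illustration with made-up arrays:

```
import numpy as np

gt = np.array([[3], [7], [1]])   # first true neighbor of each query
I = np.array([[3, 5, 9],         # top-3 search results per query
              [2, 7, 8],
              [4, 6, 0]])

for rank in (1, 3):
    recall = (I[:, :rank] == gt[:, :1]).sum() / float(len(gt))
    print("R@%d: %.4f" % (rank, recall))  # R@1: 0.3333, R@3: 0.6667
```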
@@ -0,0 +1,109 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import time + +import faiss + +import numpy as np + +from faiss.contrib.datasets import SyntheticDataset +from faiss.contrib.ivf_tools import big_batch_search + +parser = argparse.ArgumentParser() + + +def aa(*args, **kwargs): + group.add_argument(*args, **kwargs) + + +group = parser.add_argument_group('dataset options') +aa('--dim', type=int, default=64) +aa('--size', default="S") + +group = parser.add_argument_group('index options') +aa('--nlist', type=int, default=100) +aa('--factory_string', default="", help="overrides nlist") +aa('--k', type=int, default=10) +aa('--nprobe', type=int, default=5) +aa('--nt', type=int, default=-1, help="nb search threads") +aa('--method', default="pairwise_distances", help="") + +args = parser.parse_args() +print("args:", args) + +if args.size == "S": + ds = SyntheticDataset(32, 2000, 4000, 1000) +elif args.size == "M": + ds = SyntheticDataset(32, 20000, 40000, 10000) +elif args.size == "L": + ds = SyntheticDataset(32, 200000, 400000, 100000) +else: + raise RuntimeError(f"dataset size {args.size} not supported") + +nlist = args.nlist +nprobe = args.nprobe +k = args.k + + +def tic(name): + global tictoc + tictoc = (name, time.time()) + print(name, end="\r", flush=True) + + +def toc(): + global tictoc + name, t0 = tictoc + dt = time.time() - t0 + print(f"{name}: {dt:.3f} s") + return dt + + +print(f"dataset {ds}, {nlist=:} {nprobe=:} {k=:}") + +if args.factory_string == "": + factory_string = f"IVF{nlist},Flat" +else: + factory_string = args.factory_string + +print(f"instantiate {factory_string}") +index = faiss.index_factory(ds.d, factory_string) + +if args.factory_string != "": + nlist = index.nlist + +print("nlist", nlist) + +tic("train") +index.train(ds.get_train()) +toc() + +tic("add") +index.add(ds.get_database()) +toc() + +if args.nt != -1: + print("setting nb of threads to", args.nt) + faiss.omp_set_num_threads(args.nt) + +tic("reference search") +index.nprobe +index.nprobe = nprobe +Dref, Iref = index.search(ds.get_queries(), k) +t_ref = toc() + +tic("block search") +Dnew, Inew = big_batch_search( + index, ds.get_queries(), + k, method=args.method, verbose=10 +) +t_tot = toc() + +assert (Inew != Iref).sum() / Iref.size < 1e-4 +np.testing.assert_almost_equal(Dnew, Dref, decimal=4) + +print(f"total block search time {t_tot:.3f} s, speedup {t_ref / t_tot:.3f}x") diff -Nru faiss-1.7.3/benchs/bench_cppcontrib_sa_decode.cpp faiss-1.7.4/benchs/bench_cppcontrib_sa_decode.cpp --- faiss-1.7.3/benchs/bench_cppcontrib_sa_decode.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_cppcontrib_sa_decode.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + #include #include diff -Nru faiss-1.7.3/benchs/bench_gpu_1bn.py faiss-1.7.4/benchs/bench_gpu_1bn.py --- faiss-1.7.3/benchs/bench_gpu_1bn.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_gpu_1bn.py 2023-04-19 13:18:30.000000000 +0000 @@ -13,7 +13,7 @@ import faiss import re -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool from datasets import ivecs_read #################################################################### diff -Nru faiss-1.7.3/benchs/bench_gpu_sift1m.py faiss-1.7.4/benchs/bench_gpu_sift1m.py --- faiss-1.7.3/benchs/bench_gpu_sift1m.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_gpu_sift1m.py 2023-04-19 13:18:30.000000000 +0000 @@ -85,7 +85,8 @@ for lnprobe in range(10): nprobe = 1 << lnprobe - index.setNumProbes(nprobe) + index.nprobe + index.nprobe = nprobe t, r = evaluate(index, xq, gt, 100) print("nprobe=%4d %.3f ms recalls= %.4f %.4f %.4f" % (nprobe, t, r[1], r[10], r[100])) diff -Nru faiss-1.7.3/benchs/bench_hamming_computer.cpp faiss-1.7.4/benchs/bench_hamming_computer.cpp --- faiss-1.7.3/benchs/bench_hamming_computer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_hamming_computer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -9,6 +9,8 @@ #include #include +#include + #include #include #include @@ -30,6 +32,114 @@ } } +template +void hamming_func_test( + const uint8_t* const x1, + const uint8_t* const x2, + const size_t n1, + const size_t n2, + uint64_t& sumv, + uint64_t& xorv) { + constexpr size_t CODE_SIZE_IN_BYTES = CODE_SIZE_IN_BITS / 8; + + double t0 = faiss::getmillisecs(); + + uint64_t sumx = 0; + uint64_t xorx = 0; + + const size_t nruns = 10; + for (size_t irun = 0; irun < 10; irun++) { +#pragma omp parallel reduction(+ : sumx, xorx) + { +#pragma omp for + for (size_t i = 0; i < n1; i++) { + uint64_t local_sum = 0; + uint64_t local_xor = 0; + + const uint64_t* data1_ptr = + (const uint64_t*)(x1 + i * CODE_SIZE_IN_BYTES); + + for (size_t j = 0; j < n2; j++) { + const uint64_t* data2_ptr = + (const uint64_t*)(x2 + j * CODE_SIZE_IN_BYTES); + + uint64_t code = faiss::hamming( + data1_ptr, data2_ptr); + local_sum += code; + local_xor ^= code; + } + + sumx += local_sum; + xorx ^= local_xor; + } + } + } + + sumv = sumx; + xorv = xorx; + + double t1 = faiss::getmillisecs(); + printf("hamming<%d>: %.3f msec, %" PRIX64 ", %" PRIX64 "\n", + CODE_SIZE_IN_BITS, + (t1 - t0) / nruns, + sumx, + xorx); +} + +template +void hamming_computer_test( + const uint8_t* const x1, + const uint8_t* const x2, + const size_t n1, + const size_t n2, + uint64_t& sumv, + uint64_t& xorv) { + constexpr size_t CODE_SIZE_IN_BYTES = CODE_SIZE_IN_BITS / 8; + + double t0 = faiss::getmillisecs(); + + uint64_t sumx = 0; + uint64_t xorx = 0; + + const size_t nruns = 10; + for (size_t irun = 0; irun < nruns; irun++) { + sumx = 0; + xorx = 0; + +#pragma omp parallel reduction(+ : sumx, xorx) + { +#pragma omp for + for (size_t i = 0; i < n1; i++) { + uint64_t local_sum = 0; + uint64_t local_xor = 0; + + const uint8_t* data1_ptr = x1 + i * CODE_SIZE_IN_BYTES; + HammingComputerT hc(data1_ptr, CODE_SIZE_IN_BYTES); + + for (size_t j = 0; j < n2; j++) { + const uint8_t* data2_ptr = x2 + j * CODE_SIZE_IN_BYTES; + uint64_t code = hc.hamming(data2_ptr); + local_sum += code; + local_xor ^= code; + } + + sumx += local_sum; + xorx ^= local_xor; + } + } + } + + sumv = sumx; + xorv = xorx; + + double t1 = faiss::getmillisecs(); + printf("HammingComputer<%zd>: %.3f msec, %" PRIX64 ", %" PRIX64 "\n", + 
CODE_SIZE_IN_BYTES, + (t1 - t0) / nruns, + sumx, + xorx); +} + int main() { size_t n = 4 * 1000 * 1000; @@ -89,5 +199,57 @@ printf("Hamming_M8 implem: %.3f ms\n", tot_t2 / nrun); printf("Hamming_M4 implem: %.3f ms\n", tot_t3 / nrun); } + + // evaluate various hamming<>() function calls + const size_t MAX_HAMMING_FUNC_CODE_SIZE = 512; + + const size_t n1 = 65536; + const size_t n2 = 16384; + + std::vector x1(n1 * MAX_HAMMING_FUNC_CODE_SIZE / 8); + std::vector x2(n2 * MAX_HAMMING_FUNC_CODE_SIZE / 8); + byte_rand(x1.data(), x1.size(), 12345); + byte_rand(x2.data(), x2.size(), 23456); + + // These two values serve as a kind of CRC. + uint64_t sumx = 0; + uint64_t xorx = 0; + hamming_func_test<64>(x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_func_test<128>(x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_func_test<256>(x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_func_test<384>(x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_func_test<512>(x1.data(), x2.data(), n1, n2, sumx, xorx); + + // evaluate various HammingComputerXX + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + + // evaluate various GenHammingDistanceComputerXX + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + return 0; } diff -Nru faiss-1.7.3/benchs/bench_hnsw.py faiss-1.7.4/benchs/bench_hnsw.py --- faiss-1.7.3/benchs/bench_hnsw.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_hnsw.py 2023-04-19 13:18:30.000000000 +0000 @@ -17,7 +17,7 @@ k = int(sys.argv[1]) -todo = sys.argv[1:] +todo = sys.argv[2:] print("load data") diff -Nru faiss-1.7.3/benchs/bench_hybrid_cpu_gpu.py faiss-1.7.4/benchs/bench_hybrid_cpu_gpu.py --- faiss-1.7.3/benchs/bench_hybrid_cpu_gpu.py 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/benchs/bench_hybrid_cpu_gpu.py 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,606 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os +import pickle +import time +from multiprocessing.pool import ThreadPool + +import faiss +import numpy as np + +try: + from faiss.contrib.datasets_fb import dataset_from_name +except ImportError: + from faiss.contrib.datasets import dataset_from_name + +from faiss.contrib.evaluation import OperatingPointsWithRanges +from faiss.contrib.ivf_tools import replace_ivf_quantizer + +################################################################# +# Preassigned search functions +################################################################# + + +def search_preassigned(xq, k, index, quantizer, batch_size=0): + """ + Explicitly call the coarse quantizer and the search_preassigned + on the index. 
+ """ + n, d = xq.shape + nprobe = index.nprobe + if batch_size == 0: + batch_size = n + 1 + D = np.empty((n, k), dtype='float32') + I = np.empty((n, k), dtype='int64') + for i0 in range(0, n, batch_size): + Dq, Iq = quantizer.search(xq[i0:i0 + batch_size], nprobe) + D[i0:i0 + batch_size], I[i0:i0 + batch_size] = \ + index.search_preassigned(xq[i0:i0 + batch_size], k, Iq, Dq) + return D, I + + +def tiled_search_preassigned(xq, k, index, quantizer, batch_size=32768): + """ + Explicitly call the coarse quantizer and the search_preassigned + on the index. Allow overlapping between coarse quantization and + scanning the inverted lists. + """ + n, d = xq.shape + + # prepare a thread that will run the quantizer + qq_pool = ThreadPool(1) + nprobe = index.nprobe + + def coarse_quant(i0): + if i0 >= n: + return None + i1 = min(i0 + batch_size, n) + return quantizer.search(xq[i0:i1], nprobe) + + D = np.empty((n, k), dtype='float32') + I = np.empty((n, k), dtype='int64') + qq = coarse_quant(0) + + for i0 in range(0, n, batch_size): + i1 = min(i0 + batch_size, n) + qq_next = qq_pool.apply_async(coarse_quant, (i0 + batch_size, )) + Dq, Iq = qq + index.search_preassigned( + xq[i0:i1], k, Iq=Iq, Dq=Dq, I=I[i0:i1], D=D[i0:i1]) + qq = qq_next.get() + + qq_pool.close() + return D, I + + +################################################################# +# IVF index objects with a separate coarse quantizer +################################################################# + +class SeparateCoarseQuantizationIndex: + """ + Separately manage the coarse quantizer and the IVF index. + """ + + def __init__(self, quantizer, index, bs=-1, seq_tiling=False): + self.index = index + self.index_ivf = extract_index_ivf(index) + if isinstance(self.index_ivf, faiss.IndexIVF): + self.index_ivf.parallel_mode + self.index_ivf.parallel_mode = 3 + + self.quantizer = quantizer + assert self.quantizer.d == self.index_ivf.d + # populate quantizer if it was not done before + if quantizer.ntotal > 0: + assert quantizer.ntotal == self.index_ivf.nlist + else: + centroids = self.index_ivf.quantizer.reconstruct_n() + print(f"adding centroids size {centroids.shape} to quantizer") + quantizer.train(centroids) + quantizer.add(centroids) + self.bs = bs + self.seq_tiling = seq_tiling + + def search(self, xq, k): + # perform coarse quantization + if isinstance(self.index, faiss.IndexPreTransform): + # print("applying pre-transform") + assert self.index.chain.size() == 1 + xq = self.index.chain.at(0).apply(xq) + if self.bs <= 0: + # non batched + nprobe = self.index_ivf.nprobe + Dq, Iq = self.quantizer.search(xq, nprobe) + + return self.index_ivf.search_preassigned(xq, k, Iq, Dq) + if self.seq_tiling: + return search_preassigned( + xq, k, self.index_ivf, self.quantizer, self.bs) + else: + return tiled_search_preassigned( + xq, k, self.index_ivf, self.quantizer, self.bs) + + +class ShardedGPUIndex: + """ + Multiple GPU indexes, each on its GPU, with a common coarse quantizer. 
+ The Python version of IndexShardsIVF + """ + def __init__(self, quantizer, index, bs=-1, seq_tiling=False): + self.quantizer = quantizer + self.cpu_index = index + if isinstance(index, faiss.IndexPreTransform): + index = faiss.downcast_index(index.index) + ngpu = index.count() + self.pool = ThreadPool(ngpu) + self.bs = bs + if bs > 0: + self.q_pool = ThreadPool(1) + + def __del__(self): + self.pool.close() + if self.bs > 0: + self.q_pool.close() + + def search(self, xq, k): + nq = len(xq) + # perform coarse quantization + index = self.cpu_index + if isinstance(self.cpu_index, faiss.IndexPreTransform): + assert index.chain.size() == 1 + xq = self.cpu_index.chain.at(0).apply(xq) + index = faiss.downcast_index(index.index) + ngpu = index.count() + sub_index_0 = faiss.downcast_index(index.at(0)) + nprobe = sub_index_0.nprobe + + Dall = np.empty((ngpu, nq, k), dtype='float32') + Iall = np.empty((ngpu, nq, k), dtype='int64') + bs = self.bs + if bs <= 0: + + Dq, Iq = self.quantizer.search(xq, nprobe) + + def do_search(rank): + gpu_index = faiss.downcast_index(index.at(rank)) + Dall[rank], Iall[rank] = gpu_index.search_preassigned( + xq, k, Iq, Dq) + list(self.pool.map(do_search, range(ngpu))) + else: + qq_pool = self.q_pool + bs = self.bs + + def coarse_quant(i0): + if i0 >= nq: + return None + return self.quantizer.search(xq[i0:i0 + bs], nprobe) + + def do_search(rank, i0, qq): + gpu_index = faiss.downcast_index(index.at(rank)) + Dq, Iq = qq + Dall[rank, i0:i0 + bs], Iall[rank, i0:i0 + bs] = \ + gpu_index.search_preassigned(xq[i0:i0 + bs], k, Iq, Dq) + + qq = coarse_quant(0) + + for i0 in range(0, nq, bs): + qq_next = qq_pool.apply_async(coarse_quant, (i0 + bs, )) + list(self.pool.map( + lambda rank: do_search(rank, i0, qq), + range(ngpu) + )) + qq = qq_next.get() + + return faiss.merge_knn_results(Dall, Iall) + + +def extract_index_ivf(index): + """ extract the IVF sub-index from the index, supporting GpuIndexes + as well """ + try: + return faiss.extract_index_ivf(index) + except RuntimeError: + if index.__class__ == faiss.IndexPreTransform: + index = faiss.downcast_index(index.index) + if isinstance(index, faiss.GpuIndexIVF): + return index + raise RuntimeError(f"could not extract IVF index from {index}") + + +def set_index_parameter(index, name, val): + """ + Index parameter setting that works on the index lookalikes defined above + """ + if index.__class__ == SeparateCoarseQuantizationIndex: + if name == "nprobe": + set_index_parameter(index.index_ivf, name, val) + elif name.startswith("quantizer_"): + set_index_parameter( + index.quantizer, name[name.find("_") + 1:], val) + else: + raise RuntimeError() + return + + if index.__class__ == ShardedGPUIndex: + if name == "nprobe": + set_index_parameter(index.cpu_index, name, val) + elif name.startswith("quantizer_"): + set_index_parameter( + index.quantizer, name[name.find("_") + 1:], val) + else: + raise RuntimeError() + return + + # then it's a Faiss index + index = faiss.downcast_index(index) + + if isinstance(index, faiss.IndexPreTransform): + set_index_parameter(index.index, name, val) + elif isinstance(index, faiss.IndexShardsIVF): + if name != "nprobe" and name.startswith("quantizer_"): + set_index_parameter( + index.quantizer, name[name.find("_") + 1:], val) + else: + for i in range(index.count()): + sub_index = index.at(i) + set_index_parameter(sub_index, name, val) + elif (isinstance(index, faiss.IndexShards) or + isinstance(index, faiss.IndexReplicas)): + for i in range(index.count()): + sub_index = index.at(i) + 
set_index_parameter(sub_index, name, val) + elif name.startswith("quantizer_"): + index_ivf = extract_index_ivf(index) + set_index_parameter( + index_ivf.quantizer, name[name.find("_") + 1:], val) + elif name == "efSearch": + index.hnsw.efSearch + index.hnsw.efSearch = int(val) + elif name == "nprobe": + index_ivf = extract_index_ivf(index) + index_ivf.nprobe + index_ivf.nprobe = int(val) + else: + raise RuntimeError(f"could not set param {name} on {index}") + + +##################################################################### +# Driver routine +##################################################################### + + +def main(): + parser = argparse.ArgumentParser() + + def aa(*args, **kwargs): + group.add_argument(*args, **kwargs) + + group = parser.add_argument_group('dataset options') + aa('--nq', type=int, default=int(10e5), + help="nb queries (queries will be duplicated if below that number") + aa('--db', default='bigann10M', help='dataset') + + group = parser.add_argument_group('index options') + aa('--indexname', default="", help="override index name") + aa('--mmap', default=False, action='store_true', help='mmap index') + aa('--shard_type', default=1, type=int, help="set type of sharding") + aa('--useFloat16', default=False, action='store_true', + help='GPU cloner options') + aa('--useFloat16CoarseQuantizer', default=False, action='store_true', + help='GPU cloner options') + aa('--usePrecomputed', default=False, action='store_true', + help='GPU cloner options') + group = parser.add_argument_group('search options') + aa('--k', type=int, default=100) + aa('--search_type', default="cpu", + choices=[ + "cpu", "gpu", "gpu_flat_quantizer", + "cpu_flat_gpu_quantizer", "gpu_tiled", "gpu_ivf_quantizer", + "multi_gpu", "multi_gpu_flat_quantizer", + "multi_gpu_sharded", "multi_gpu_flat_quantizer_sharded", + "multi_gpu_sharded1", "multi_gpu_sharded1_flat", + "multi_gpu_sharded1_ivf", + "multi_gpu_Csharded1", "multi_gpu_Csharded1_flat", + "multi_gpu_Csharded1_ivf", + ], + help="how to search" + ) + aa('--ivf_quant_nlist', type=int, default=1024, + help="nb of invlists for IVF quantizer") + aa('--batch_size', type=int, default=-1, + help="batch size for tiled CPU / GPU computation (-1= no tiling)") + aa('--n_autotune', type=int, default=300, + help="max nb of auto-tuning steps") + aa('--nt', type=int, default=-1, help="force number of CPU threads to this") + + group = parser.add_argument_group('output options') + aa('--quiet', default=False, action="store_true") + aa('--stats', default="", help="pickle to store output stats") + + args = parser.parse_args() + print("args:", args) + + if not args.quiet: + # log some stats about the machine + os.system("grep -m1 'model name' < /proc/cpuinfo") + os.system("grep -E 'MemTotal|MemFree' /proc/meminfo") + os.system("nvidia-smi") + + print("prepare dataset", args.db) + ds = dataset_from_name(args.db) + print(ds) + + print("Faiss nb GPUs:", faiss.get_num_gpus()) + + xq = ds.get_queries() + if args.nq > len(xq): + xqx = [] + n = 0 + while n < args.nq: + xqx.append(xq[:args.nq - n]) + n += len(xqx[-1]) + print(f"increased nb queries from {len(xq)} to {n}") + xq = np.vstack(xqx) + + if args.nt != -1: + print("setting nb openmp threads to", args.nt) + faiss.omp_set_num_threads(args.nt) + + print("loading index") + + if args.mmap: + io_flag = faiss.IO_FLAG_READ_ONLY | faiss.IO_FLAG_MMAP + else: + io_flag = 0 + + print(f"load index {args.indexname} {io_flag=:x}") + index = faiss.read_index(args.indexname, io_flag) + index_ivf = 
faiss.extract_index_ivf(index) + + print("prepare index") + op = OperatingPointsWithRanges() + op.add_range( + "nprobe", [ + 2 ** i for i in range(20) + if 2 ** i < index_ivf.nlist * 0.1 and 2 ** i <= 4096 + ] + ) + + # prepare options for GPU clone + + co = faiss.GpuMultipleClonerOptions() + co.useFloat16 = args.useFloat16 + co.useFloat16CoarseQuantizer = args.useFloat16CoarseQuantizer + co.usePrecomputed = args.usePrecomputed + co.shard_type = args.shard_type + + if args.search_type == "cpu": + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + elif args.search_type == "gpu": + print("move index to 1 GPU") + res = faiss.StandardGpuResources() + index = faiss.index_cpu_to_gpu(res, 0, index, co) + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + op.restrict_range("nprobe", 2049) + elif args.search_type == "gpu_tiled": + print("move index to 1 GPU") + new_quantizer = faiss.IndexFlatL2(index_ivf.d) + quantizer_hnsw = replace_ivf_quantizer(index_ivf, new_quantizer) + res = faiss.StandardGpuResources() + index = faiss.index_cpu_to_gpu(res, 0, index, co) + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + op.restrict_range("nprobe", 2049) + index = SeparateCoarseQuantizationIndex( + quantizer_hnsw, index, bs=args.batch_size) + elif args.search_type == "gpu_ivf_quantizer": + index_ivf = faiss.extract_index_ivf(index) + centroids = index_ivf.quantizer.reconstruct_n() + replace_ivf_quantizer(index_ivf, faiss.IndexFlatL2(index_ivf.d)) + res = faiss.StandardGpuResources() + new_quantizer = faiss.index_factory( + index_ivf.d, f"IVF{args.ivf_quant_nlist},Flat") + new_quantizer.train(centroids) + new_quantizer.add(centroids) + index = SeparateCoarseQuantizationIndex( + faiss.index_cpu_to_gpu(res, 0, new_quantizer, co), + faiss.index_cpu_to_gpu(res, 0, index, co), + bs=args.batch_size, seq_tiling=True + ) + op.add_range( + "quantizer_nprobe", + [2 ** i for i in range(9)] + ) + op.restrict_range("nprobe", 1025) + elif args.search_type == "gpu_flat_quantizer": + index_ivf = faiss.extract_index_ivf(index) + new_quantizer = faiss.IndexFlatL2(index_ivf.d) + replace_ivf_quantizer(index_ivf, new_quantizer) + res = faiss.StandardGpuResources() + index = faiss.index_cpu_to_gpu(res, 0, index, co) + op.restrict_range("nprobe", 2049) + elif args.search_type == "cpu_flat_gpu_quantizer": + index_ivf = faiss.extract_index_ivf(index) + quantizer = faiss.IndexFlatL2(index_ivf.d) + res = faiss.StandardGpuResources() + quantizer = faiss.index_cpu_to_gpu(res, 0, quantizer, co) + index = SeparateCoarseQuantizationIndex( + quantizer, index, bs=args.batch_size) + op.restrict_range("nprobe", 2049) + elif args.search_type in ("multi_gpu", "multi_gpu_sharded"): + print(f"move index to {faiss.get_num_gpus()} GPU") + co.shard = "sharded" in args.search_type + index = faiss.index_cpu_to_all_gpus(index, co=co) + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + op.restrict_range("nprobe", 2049) + elif args.search_type in ( + "multi_gpu_flat_quantizer", "multi_gpu_flat_quantizer_sharded"): + index_ivf = faiss.extract_index_ivf(index) + new_quantizer = faiss.IndexFlatL2(ds.d) + replace_ivf_quantizer(index_ivf, new_quantizer) + index = faiss.index_cpu_to_all_gpus(index, co=co) + op.restrict_range("nprobe", 2049) + elif args.search_type in ( + "multi_gpu_sharded1", "multi_gpu_sharded1_flat", + "multi_gpu_sharded1_ivf"): + print(f"move index to {faiss.get_num_gpus()} GPU") + new_quantizer = faiss.IndexFlatL2(index_ivf.d) + hnsw_quantizer = 
replace_ivf_quantizer(index_ivf, new_quantizer) + co.shard + co.shard = True + gpus = list(range(faiss.get_num_gpus())) + res = [faiss.StandardGpuResources() for _ in gpus] + index = faiss.index_cpu_to_gpu_multiple_py(res, index, co, gpus) + op.restrict_range("nprobe", 2049) + if args.search_type == "multi_gpu_sharded1": + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + index = ShardedGPUIndex(hnsw_quantizer, index, bs=args.batch_size) + elif args.search_type == "multi_gpu_sharded1_ivf": + centroids = hnsw_quantizer.storage.reconstruct_n() + quantizer = faiss.index_factory( + centroids.shape[1], f"IVF{args.ivf_quant_nlist},Flat") + quantizer.train(centroids) + quantizer.add(centroids) + co.shard = False + quantizer = faiss.index_cpu_to_gpu_multiple_py( + res, quantizer, co, gpus) + index = ShardedGPUIndex(quantizer, index, bs=args.batch_size) + + op.add_range( + "quantizer_nprobe", + [2 ** i for i in range(9)] + ) + op.restrict_range("nprobe", 1025) + elif args.search_type == "multi_gpu_sharded1_flat": + quantizer = hnsw_quantizer.storage + quantizer = faiss.index_cpu_to_gpu_multiple_py( + res, quantizer, co, gpus) + index = ShardedGPUIndex(quantizer, index, bs=args.batch_size) + else: + raise RuntimeError() + elif args.search_type in ( + "multi_gpu_Csharded1", "multi_gpu_Csharded1_flat", + "multi_gpu_Csharded1_ivf"): + print(f"move index to {faiss.get_num_gpus()} GPU") + co.shard = True + co.common_ivf_quantizer + co.common_ivf_quantizer = True + op.restrict_range("nprobe", 2049) + if args.search_type == "multi_gpu_Csharded1": + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + index = faiss.index_cpu_to_all_gpus(index, co) + elif args.search_type == "multi_gpu_Csharded1_flat": + new_quantizer = faiss.IndexFlatL2(index_ivf.d) + quantizer_hnsw = replace_ivf_quantizer(index_ivf, new_quantizer) + index = faiss.index_cpu_to_all_gpus(index, co) + elif args.search_type == "multi_gpu_Csharded1_ivf": + quantizer = faiss.index_factory( + index_ivf.d, f"IVF{args.ivf_quant_nlist},Flat") + quantizer_hnsw = replace_ivf_quantizer(index_ivf, quantizer) + op.add_range( + "quantizer_nprobe", + [2 ** i for i in range(9)] + ) + index = faiss.index_cpu_to_all_gpus(index, co) + else: + raise RuntimeError() + else: + raise RuntimeError() + + totex = op.num_experiments() + rs = np.random.RandomState(123) + if totex < args.n_autotune: + experiments = rs.permutation(totex - 2) + 1 + else: + experiments = rs.randint( + totex - 2, size=args.n_autotune - 2, replace=False) + + experiments = [0, totex - 1] + list(experiments) + print(f"total nb experiments {totex}, running {len(experiments)}") + + print("perform search") + gt = ds.get_groundtruth(100) + + # piggyback on operating points so that this gets stored in the stats file + op.all_experiments = [] + op.platform = { + "loadavg": open("/proc/loadavg", "r").readlines(), + "procesor": [l for l in open("/proc/cpuinfo") if "model name" in l][0], + "GPU": list(os.popen("nvidia-smi", "r")), + "mem": open("/proc/meminfo", "r").readlines(), + "pid": os.getpid() + } + op.args = args + if args.stats: + print(f"storing stats in {args.stats} after each experiment") + + for cno in experiments: + key = op.cno_to_key(cno) + parameters = op.get_parameters(key) + print(f"{cno=:4d} {str(parameters):50}", end=": ", flush=True) + + (max_perf, min_time) = op.predict_bounds(key) + if not op.is_pareto_optimal(max_perf, min_time): + print(f"SKIP, {max_perf=:.3f} {min_time=:.3f}", ) + continue + + for name, val in parameters.items(): + 
set_index_parameter(index, name, val) + + if cno == 0: + # warmup + for _ in range(5): + D, I = index.search(xq, 100) + + t0 = time.time() + try: + D, I = index.search(xq, 100) + except RuntimeError as e: + print(f"ERROR {e}") + continue + t1 = time.time() + + recalls = {} + for rank in 1, 10, 100: + recall = (gt[:, :1] == I[:ds.nq, :rank]).sum() / ds.nq + recalls[rank] = recall + + print(f"time={t1 - t0:.3f} s recalls={recalls}") + perf = recalls[1] + op.add_operating_point(key, perf, t1 - t0) + op.all_experiments.append({ + "cno": cno, + "key": key, + "parameters": parameters, + "time": t1 - t0, + "recalls": recalls + }) + + if args.stats: + pickle.dump(op, open(args.stats, "wb")) + + +if __name__ == "__main__": + main() diff -Nru faiss-1.7.3/benchs/bench_ivf_selector.cpp faiss-1.7.4/benchs/bench_ivf_selector.cpp --- faiss-1.7.3/benchs/bench_ivf_selector.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_ivf_selector.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -25,7 +25,7 @@ */ int main() { - using idx_t = faiss::Index::idx_t; + using idx_t = faiss::idx_t; int d = 64; size_t nb = 1024 * 1024; size_t nq = 512 * 16; diff -Nru faiss-1.7.3/benchs/bench_polysemous_1bn.py faiss-1.7.4/benchs/bench_polysemous_1bn.py --- faiss-1.7.3/benchs/bench_polysemous_1bn.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_polysemous_1bn.py 2023-04-19 13:18:30.000000000 +0000 @@ -9,7 +9,7 @@ import numpy as np import re import faiss -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool from datasets import ivecs_read diff -Nru faiss-1.7.3/benchs/bench_pq_transposed_centroid_table.py faiss-1.7.4/benchs/bench_pq_transposed_centroid_table.py --- faiss-1.7.3/benchs/bench_pq_transposed_centroid_table.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/bench_pq_transposed_centroid_table.py 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
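The decoupled coarse-quantization pattern that search_preassigned() and tiled_search_preassigned() implement in bench_hybrid_cpu_gpu.py above reduces to the following self-contained sketch (synthetic data and arbitrary sizes; it assumes the 1.7.4 Python bindings, where IndexIVF.search_preassigned returns a (D, I) pair as used in that benchmark):

```
import faiss
import numpy as np

d, nb, nq, k, nprobe = 32, 10000, 100, 10, 8
xb = np.random.rand(nb, d).astype('float32')
xq = np.random.rand(nq, d).astype('float32')

index = faiss.index_factory(d, "IVF64,Flat")
index.train(xb)
index.add(xb)
index.nprobe = nprobe

# step 1: run the coarse quantizer explicitly
Dq, Iq = index.quantizer.search(xq, nprobe)
# step 2: scan only the preassigned inverted lists
D, I = index.search_preassigned(xq, k, Iq, Dq)

# matches the fused call with the same nprobe
Dref, Iref = index.search(xq, k)
np.testing.assert_allclose(D, Dref, atol=1e-5)
```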
+ import faiss import time import random diff -Nru faiss-1.7.3/benchs/distributed_ondisk/distributed_kmeans.py faiss-1.7.4/benchs/distributed_ondisk/distributed_kmeans.py --- faiss-1.7.3/benchs/distributed_ondisk/distributed_kmeans.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/distributed_ondisk/distributed_kmeans.py 2023-04-19 13:18:30.000000000 +0000 @@ -17,7 +17,7 @@ import faiss -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool from faiss.contrib import rpc from faiss.contrib.datasets import SyntheticDataset from faiss.contrib.vecs_io import bvecs_mmap, fvecs_mmap diff -Nru faiss-1.7.3/benchs/distributed_ondisk/make_index_vslice.py faiss-1.7.4/benchs/distributed_ondisk/make_index_vslice.py --- faiss-1.7.3/benchs/distributed_ondisk/make_index_vslice.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/distributed_ondisk/make_index_vslice.py 2023-04-19 13:18:30.000000000 +0000 @@ -8,7 +8,7 @@ import numpy as np import faiss import argparse -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool def ivecs_mmap(fname): a = np.memmap(fname, dtype='int32', mode='r') diff -Nru faiss-1.7.3/benchs/distributed_ondisk/merge_to_ondisk.py faiss-1.7.4/benchs/distributed_ondisk/merge_to_ondisk.py --- faiss-1.7.3/benchs/distributed_ondisk/merge_to_ondisk.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/distributed_ondisk/merge_to_ondisk.py 2023-04-19 13:18:30.000000000 +0000 @@ -6,7 +6,7 @@ import os import faiss import argparse -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool if __name__ == '__main__': diff -Nru faiss-1.7.3/benchs/distributed_ondisk/search_server.py faiss-1.7.4/benchs/distributed_ondisk/search_server.py --- faiss-1.7.3/benchs/distributed_ondisk/search_server.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/distributed_ondisk/search_server.py 2023-04-19 13:18:30.000000000 +0000 @@ -64,7 +64,7 @@ # Client implementation ############################################################ -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool import faiss import numpy as np diff -Nru faiss-1.7.3/benchs/link_and_code/bench_link_and_code.py faiss-1.7.4/benchs/link_and_code/bench_link_and_code.py --- faiss-1.7.3/benchs/link_and_code/bench_link_and_code.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/link_and_code/bench_link_and_code.py 2023-04-19 13:18:30.000000000 +0000 @@ -8,10 +8,7 @@ import sys import time import numpy as np -import re import faiss -from multiprocessing.dummy import Pool as ThreadPool -import pdb import argparse import datasets from datasets import sanitize diff -Nru faiss-1.7.3/benchs/README.md faiss-1.7.4/benchs/README.md --- faiss-1.7.3/benchs/README.md 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/benchs/README.md 2023-04-19 13:18:30.000000000 +0000 @@ -75,7 +75,7 @@ ### Getting Deep1B -The ground-truth and queries are available here +The ground-truth and queries are available here https://yadi.sk/d/11eDCm7Dsn9GA @@ -145,7 +145,7 @@ ### Experiments of the appendix -The experiments in the appendix are only in the ArXiv version of the paper (table 3). +The experiments in the appendix are only in the ArXiv version of the paper (table 3). 
``` python bench_polysemous_1bn.py SIFT1000M OPQ8_64,IMI2x13,PQ8 nprobe={1,2,4,8,16,32,64,128},ht={20,24,26,28,30} @@ -179,11 +179,11 @@ ## GPU experiments -The benchmarks below run 1 or 4 Titan X GPUs and reproduce the results of the "GPU paper". They are also a good starting point on how to use GPU Faiss. +The benchmarks below run 1 or 4 Titan X GPUs and reproduce the results of the "GPU paper". They are also a good starting point on how to use GPU Faiss. ### Search on SIFT1M -See above on how to get SIFT1M into subdirectory sift1M/. The script [`bench_gpu_sift1m.py`](bench_gpu_sift1m.py) reproduces the "exact k-NN time" plot in the ArXiv paper, and the SIFT1M numbers. +See above on how to get SIFT1M into subdirectory sift1M/. The script [`bench_gpu_sift1m.py`](bench_gpu_sift1m.py) reproduces the "exact k-NN time" plot in the ArXiv paper, and the SIFT1M numbers. The output is: ``` @@ -245,14 +245,14 @@ To get the "infinite MNIST dataset", follow the instructions on [Léon Bottou's website](http://leon.bottou.org/projects/infimnist). The script assumes the file `mnist8m-patterns-idx3-ubyte` is in subdirectory `mnist8m` -The script [`kmeans_mnist.py`](kmeans_mnist.py) produces the following output: +The script [`kmeans_mnist.py`](kmeans_mnist.py) produces the following output: ``` python kmeans_mnist.py 1 256 ... Clustering 8100000 points in 784D to 256 clusters, redo 1 times, 20 iterations Preprocessing in 7.94526 s - Iteration 19 (131.697 s, search 114.78 s): objective=1.44881e+13 imbalance=1.05963 nsplit=0 + Iteration 19 (131.697 s, search 114.78 s): objective=1.44881e+13 imbalance=1.05963 nsplit=0 final objective: 1.449e+13 total runtime: 140.615 s ``` @@ -263,7 +263,7 @@ Even on multiple GPUs, building the 1B datasets can last several hours. It is often a good idea to validate that everything is working fine on smaller datasets like SIFT1M, SIFT2M, etc. -The search results on SIFT1B in the "GPU paper" can be obtained with +The search results on SIFT1B in the "GPU paper" can be obtained with @@ -285,7 +285,7 @@ ### search on Deep1B -The same script generates the GPU search results on Deep1B. +The same script generates the GPU search results on Deep1B. ``` python bench_gpu_1bn.py Deep1B OPQ20_80,IVF262144,PQ20 -nnn 10 -R 2 -ngpu 4 -altadd -noptables -tempmem $[1024*1024*1024] @@ -336,3 +336,26 @@ 999997440/1000000000 (36717.207 s, 0.6015) probe=128: 36717.309 s rank-10 intersection results: 0.6015 999997440/1000000000 (70616.392 s, 0.6047) probe=256: 70616.581 s rank-10 intersection results: 0.6047 ``` + +# Additional benchmarks + +This directory also contains certain additional benchmarks (and serve as an additional source of examples of how to use the FAISS code). +Certain tests / benchmarks might be outdated. 
+ +* bench_6bit_codec.cpp - tests vector codecs for SQ6 quantization on a synthetic dataset +* bench_cppcontrib_sa_decode.cpp - benchmarks specialized kernels for vector codecs for PQ, IVFPQ and Resudial+PQ on a synthetic dataset +* bench_for_interrupt.py - evaluates the impact of the interrupt callback handler (which can be triggered from Python code) +* bench_hamming_computer.cpp - specialized implementations for Hamming distance computations +* bench_heap_replace.cpp - benchmarks different implementations of certain calls for a Heap data structure +* bench_hnsw.py - benchmarks HNSW in combination with other ones for SIFT1M dataset +* bench_index_flat.py - benchmarks IndexFlatL2 on a synthetic dataset +* bench_index_pq.py - benchmarks PQ on SIFT1M dataset +* bench_ivf_fastscan_single_query.py - benchmarks a single query for different nprobe levels for IVF{nlist},PQ{M}x4fs on BIGANN dataset +* bench_ivf_fastscan.py - compares IVF{nlist},PQ{M}x4fs against other indices on SIFT1M dataset +* bench_ivf_selector.cpp - checks the possible overhead when using faiss::IDSelectorAll interface +* bench_pairwise_distances.py - benchmarks pairwise distance computation between two synthetic datasets +* bench_partition.py - benchmarks partitioning functions +* bench_pq_tables.py - benchmarks ProductQuantizer.compute_inner_prod_tables() and ProductQuantizer.compute_distance_tables() calls +* bench_quantizer.py - benchmarks various quantizers for SIFT1M, Deep1B, BigANN datasets +* bench_scalar_quantizer.py - benchmarks IVF+SQ on a Sift1M dataset +* bench_vector_ops.py - benchmarks dot product and distances computations on a synthetic dataset diff -Nru faiss-1.7.3/c_api/example_c.c faiss-1.7.4/c_api/example_c.c --- faiss-1.7.3/c_api/example_c.c 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/example_c.c 2023-04-19 13:18:30.000000000 +0000 @@ -17,6 +17,7 @@ #include "Index_c.h" #include "clone_index_c.h" #include "error_c.h" +#include "impl/AuxIndexStructures_c.h" #include "index_factory_c.h" #include "index_io_c.h" @@ -92,6 +93,84 @@ free(I); free(D); } + { // search xb first 5 but search parameters of id range [50, 100] + idx_t* I = malloc(k * nq * sizeof(idx_t)); + float* D = malloc(k * nq * sizeof(float)); + FaissIDSelectorRange* sel = NULL; + FAISS_TRY(faiss_IDSelectorRange_new(&sel, 50, 100)); + FaissSearchParameters* params = NULL; + FAISS_TRY(faiss_SearchParameters_new(¶ms, sel)); + FAISS_TRY( + faiss_Index_search_with_params(index, nq, xq, k, params, D, I)); + printf("Searching w/ IDSelectorRange [50,100]\n"); + printf("I=\n"); + for (int i = 0; i < 5; i++) { + for (int j = 0; j < k; j++) + printf("%5lld (d=%2.3f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } + free(I); + free(D); + faiss_SearchParameters_free(params); + faiss_IDSelectorRange_free(sel); + } + + { // search xb first 5 but search parameters of id range [20,40] OR + // [45,60] + idx_t* I = malloc(k * nq * sizeof(idx_t)); + float* D = malloc(k * nq * sizeof(float)); + FaissIDSelectorRange* lhs_sel = NULL; + FAISS_TRY(faiss_IDSelectorRange_new(&lhs_sel, 20, 40)); + FaissIDSelectorRange* rhs_sel = NULL; + FAISS_TRY(faiss_IDSelectorRange_new(&rhs_sel, 45, 60)); + FaissIDSelectorOr* sel = NULL; + FAISS_TRY(faiss_IDSelectorOr_new(&sel, lhs_sel, rhs_sel)); + FaissSearchParameters* params = NULL; + FAISS_TRY(faiss_SearchParameters_new(¶ms, sel)); + FAISS_TRY( + faiss_Index_search_with_params(index, nq, xq, k, params, D, I)); + printf("Searching w/ IDSelectorRange [20,40] OR [45,60] \n"); + printf("I=\n"); + for (int i = 0; i < 
5; i++) { + for (int j = 0; j < k; j++) + printf("%5lld (d=%2.3f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } + free(I); + free(D); + faiss_SearchParameters_free(params); + faiss_IDSelectorRange_free(lhs_sel); + faiss_IDSelectorRange_free(rhs_sel); + faiss_IDSelector_free(sel); + } + { // search xb first 5 but search parameters of id range [20,40] AND + // [15,35] = [20,35] + idx_t* I = malloc(k * nq * sizeof(idx_t)); + float* D = malloc(k * nq * sizeof(float)); + FaissIDSelectorRange* lhs_sel = NULL; + FAISS_TRY(faiss_IDSelectorRange_new(&lhs_sel, 20, 40)); + FaissIDSelectorRange* rhs_sel = NULL; + FAISS_TRY(faiss_IDSelectorRange_new(&rhs_sel, 15, 35)); + FaissIDSelectorAnd* sel = NULL; + FAISS_TRY(faiss_IDSelectorAnd_new(&sel, lhs_sel, rhs_sel)); + FaissSearchParameters* params = NULL; + FAISS_TRY(faiss_SearchParameters_new(¶ms, sel)); + FAISS_TRY( + faiss_Index_search_with_params(index, nq, xq, k, params, D, I)); + printf("Searching w/ IDSelectorRange [20,40] AND [15,35] = [20,35]\n"); + printf("I=\n"); + for (int i = 0; i < 5; i++) { + for (int j = 0; j < k; j++) + printf("%5lld (d=%2.3f) ", I[i * k + j], D[i * k + j]); + printf("\n"); + } + free(I); + free(D); + faiss_SearchParameters_free(params); + faiss_IDSelectorRange_free(lhs_sel); + faiss_IDSelectorRange_free(rhs_sel); + faiss_IDSelector_free(sel); + } printf("Saving index to disk...\n"); FAISS_TRY(faiss_write_index_fname(index, "example.index")); diff -Nru faiss-1.7.3/c_api/faiss_c.h faiss-1.7.4/c_api/faiss_c.h --- faiss-1.7.3/c_api/faiss_c.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/faiss_c.h 2023-04-19 13:18:30.000000000 +0000 @@ -34,6 +34,12 @@ #define FAISS_DECLARE_INDEX_DOWNCAST(clazz) \ Faiss##clazz* faiss_##clazz##_cast(FaissIndex*); +/// Declare a dynamic downcast operation from a base `FaissSearchParameters*` +/// pointer type to a more specific search parameters type. The function returns +/// the same pointer if the downcast is valid, and `NULL` otherwise. 
+#define FAISS_DECLARE_SEARCH_PARAMETERS_DOWNCAST(clazz) \ + Faiss##clazz* faiss_##clazz##_cast(FaissSearchParameters*); + /// Declare a getter for the field `name` in class `clazz`, /// of return type `ty` #define FAISS_DECLARE_GETTER(clazz, ty, name) \ diff -Nru faiss-1.7.3/c_api/gpu/DeviceUtils_c.h faiss-1.7.4/c_api/gpu/DeviceUtils_c.h --- faiss-1.7.3/c_api/gpu/DeviceUtils_c.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/gpu/DeviceUtils_c.h 2023-04-19 13:18:30.000000000 +0000 @@ -11,7 +11,7 @@ #ifndef FAISS_DEVICE_UTILS_C_H #define FAISS_DEVICE_UTILS_C_H -#include +#include #include #include "../faiss_c.h" diff -Nru faiss-1.7.3/c_api/gpu/GpuAutoTune_c.cpp faiss-1.7.4/c_api/gpu/GpuAutoTune_c.cpp --- faiss-1.7.3/c_api/gpu/GpuAutoTune_c.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/gpu/GpuAutoTune_c.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -87,9 +87,9 @@ } int faiss_index_cpu_to_gpu_multiple_with_options( - FaissGpuResourcesProvider** providers_vec, + FaissGpuResourcesProvider* const* providers_vec, size_t providers_vec_size, - int* devices, + const int* devices, size_t devices_size, const FaissIndex* index, const FaissGpuMultipleClonerOptions* options, diff -Nru faiss-1.7.3/c_api/impl/AuxIndexStructures_c.cpp faiss-1.7.4/c_api/impl/AuxIndexStructures_c.cpp --- faiss-1.7.3/c_api/impl/AuxIndexStructures_c.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/impl/AuxIndexStructures_c.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -18,8 +18,12 @@ using faiss::BufferList; using faiss::DistanceComputer; using faiss::IDSelector; +using faiss::IDSelectorAnd; using faiss::IDSelectorBatch; +using faiss::IDSelectorNot; +using faiss::IDSelectorOr; using faiss::IDSelectorRange; +using faiss::IDSelectorXOr; using faiss::RangeQueryResult; using faiss::RangeSearchPartialResult; using faiss::RangeSearchResult; @@ -115,6 +119,52 @@ } CATCH_AND_HANDLE } + +int faiss_IDSelectorNot_new( + FaissIDSelectorNot** p_sel, + const FaissIDSelector* sel) { + try { + *p_sel = reinterpret_cast( + new IDSelectorNot(reinterpret_cast(sel))); + } + CATCH_AND_HANDLE +} + +int faiss_IDSelectorAnd_new( + FaissIDSelectorAnd** p_sel, + const FaissIDSelector* lhs_sel, + const FaissIDSelector* rhs_sel) { + try { + *p_sel = reinterpret_cast(new IDSelectorAnd( + reinterpret_cast(lhs_sel), + reinterpret_cast(rhs_sel))); + } + CATCH_AND_HANDLE +} + +int faiss_IDSelectorOr_new( + FaissIDSelectorOr** p_sel, + const FaissIDSelector* lhs_sel, + const FaissIDSelector* rhs_sel) { + try { + *p_sel = reinterpret_cast(new IDSelectorOr( + reinterpret_cast(lhs_sel), + reinterpret_cast(rhs_sel))); + } + CATCH_AND_HANDLE +} + +int faiss_IDSelectorXOr_new( + FaissIDSelectorXOr** p_sel, + const FaissIDSelector* lhs_sel, + const FaissIDSelector* rhs_sel) { + try { + *p_sel = reinterpret_cast(new IDSelectorXOr( + reinterpret_cast(lhs_sel), + reinterpret_cast(rhs_sel))); + } + CATCH_AND_HANDLE +} // Below are structures used only by Index implementations diff -Nru faiss-1.7.3/c_api/impl/AuxIndexStructures_c.h faiss-1.7.4/c_api/impl/AuxIndexStructures_c.h --- faiss-1.7.3/c_api/impl/AuxIndexStructures_c.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/impl/AuxIndexStructures_c.h 2023-04-19 13:18:30.000000000 +0000 @@ -82,6 +82,29 @@ size_t n, const idx_t* indices); +FAISS_DECLARE_CLASS(IDSelectorNot) +int faiss_IDSelectorNot_new( + FaissIDSelectorNot** p_sel, + const FaissIDSelector* sel); + +FAISS_DECLARE_CLASS(IDSelectorAnd) +int faiss_IDSelectorAnd_new( + FaissIDSelectorAnd** p_sel, + const 
FaissIDSelector* lhs_sel, + const FaissIDSelector* rhs_sel); + +FAISS_DECLARE_CLASS(IDSelectorOr) +int faiss_IDSelectorOr_new( + FaissIDSelectorOr** p_sel, + const FaissIDSelector* lhs_sel, + const FaissIDSelector* rhs_sel); + +FAISS_DECLARE_CLASS(IDSelectorXOr) +int faiss_IDSelectorXOr_new( + FaissIDSelectorXOr** p_sel, + const FaissIDSelector* lhs_sel, + const FaissIDSelector* rhs_sel); + // Below are structures used only by Index implementations /** List of temporary buffers used to store results before they are diff -Nru faiss-1.7.3/c_api/Index_c.cpp faiss-1.7.4/c_api/Index_c.cpp --- faiss-1.7.3/c_api/Index_c.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/Index_c.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -10,10 +10,25 @@ #include "Index_c.h" #include +#include #include "macros_impl.h" extern "C" { +DEFINE_DESTRUCTOR(SearchParameters) + +int faiss_SearchParameters_new( + FaissSearchParameters** p_sp, + FaissIDSelector* sel) { + try { + faiss::SearchParameters* params = new faiss::SearchParameters; + params->sel = reinterpret_cast(sel); + *p_sp = reinterpret_cast(params); + return 0; + } + CATCH_AND_HANDLE +} + DEFINE_DESTRUCTOR(Index) DEFINE_GETTER(Index, int, d) @@ -65,6 +80,26 @@ } CATCH_AND_HANDLE } + +int faiss_Index_search_with_params( + const FaissIndex* index, + idx_t n, + const float* x, + idx_t k, + const FaissSearchParameters* params, + float* distances, + idx_t* labels) { + try { + reinterpret_cast(index)->search( + n, + x, + k, + distances, + labels, + reinterpret_cast(params)); + } + CATCH_AND_HANDLE +} int faiss_Index_range_search( const FaissIndex* index, diff -Nru faiss-1.7.3/c_api/Index_c.h faiss-1.7.4/c_api/Index_c.h --- faiss-1.7.3/c_api/Index_c.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/Index_c.h 2023-04-19 13:18:30.000000000 +0000 @@ -39,6 +39,13 @@ METRIC_JensenShannon, } FaissMetricType; +FAISS_DECLARE_CLASS(SearchParameters) +FAISS_DECLARE_DESTRUCTOR(SearchParameters) + +int faiss_SearchParameters_new( + FaissSearchParameters** p_sp, + FaissIDSelector* sel); + /// Opaque type for referencing to an index object FAISS_DECLARE_CLASS(Index) FAISS_DECLARE_DESTRUCTOR(Index) @@ -107,6 +114,27 @@ float* distances, idx_t* labels); +/** + * query n vectors of dimension d with seach parameters to the index. + * + * return at most k vectors. If there are not enough results for a query, + * the result is padded with -1s. + * + * @param index opaque pointer to index object + * @param x input vectors to search, size n * d + * @param params input params to modify how search is done + * @param labels output labels of the NNs, size n*k + * @param distances output pairwise distances, size n*k + */ +int faiss_Index_search_with_params( + const FaissIndex* index, + idx_t n, + const float* x, + idx_t k, + const FaissSearchParameters* params, + float* distances, + idx_t* labels); + /** query n vectors of dimension d to the index. * * return all vectors with distance < radius. 
Note that many diff -Nru faiss-1.7.3/c_api/IndexIVF_c.cpp faiss-1.7.4/c_api/IndexIVF_c.cpp --- faiss-1.7.3/c_api/IndexIVF_c.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/IndexIVF_c.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -12,10 +12,50 @@ #include #include "Clustering_c.h" #include "Index_c.h" +#include "impl/AuxIndexStructures_c.h" #include "macros_impl.h" using faiss::IndexIVF; using faiss::IndexIVFStats; +using faiss::SearchParametersIVF; + +/// SearchParametersIVF definitions + +DEFINE_DESTRUCTOR(SearchParametersIVF) +DEFINE_SEARCH_PARAMETERS_DOWNCAST(SearchParametersIVF) + +int faiss_SearchParametersIVF_new(FaissSearchParametersIVF** p_sp) { + try { + SearchParametersIVF* sp = new SearchParametersIVF; + *p_sp = reinterpret_cast(sp); + } + CATCH_AND_HANDLE +} + +int faiss_SearchParametersIVF_new_with( + FaissSearchParametersIVF** p_sp, + FaissIDSelector* sel, + size_t nprobe, + size_t max_codes) { + try { + SearchParametersIVF* sp = new SearchParametersIVF; + sp->sel = reinterpret_cast(sel); + sp->nprobe = nprobe; + sp->max_codes = max_codes; + *p_sp = reinterpret_cast(sp); + } + CATCH_AND_HANDLE +} + +DEFINE_GETTER_PERMISSIVE(SearchParametersIVF, const FaissIDSelector*, sel) + +DEFINE_GETTER(SearchParametersIVF, size_t, nprobe) +DEFINE_SETTER(SearchParametersIVF, size_t, nprobe) + +DEFINE_GETTER(SearchParametersIVF, size_t, max_codes) +DEFINE_SETTER(SearchParametersIVF, size_t, max_codes) + +/// IndexIVF definitions DEFINE_DESTRUCTOR(IndexIVF) DEFINE_INDEX_DOWNCAST(IndexIVF) @@ -61,7 +101,10 @@ idx_t a2) { try { reinterpret_cast(index)->copy_subset_to( - *reinterpret_cast(other), subset_type, a1, a2); + *reinterpret_cast(other), + static_cast(subset_type), + a1, + a2); } CATCH_AND_HANDLE } diff -Nru faiss-1.7.3/c_api/IndexIVF_c.h faiss-1.7.4/c_api/IndexIVF_c.h --- faiss-1.7.3/c_api/IndexIVF_c.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/IndexIVF_c.h 2023-04-19 13:18:30.000000000 +0000 @@ -14,11 +14,27 @@ #include "Clustering_c.h" #include "Index_c.h" #include "faiss_c.h" +#include "impl/AuxIndexStructures_c.h" #ifdef __cplusplus extern "C" { #endif +FAISS_DECLARE_CLASS_INHERITED(SearchParametersIVF, SearchParameters) +FAISS_DECLARE_DESTRUCTOR(SearchParametersIVF) +FAISS_DECLARE_SEARCH_PARAMETERS_DOWNCAST(SearchParametersIVF) + +int faiss_SearchParametersIVF_new(FaissSearchParametersIVF** p_sp); +int faiss_SearchParametersIVF_new_with( + FaissSearchParametersIVF** p_sp, + FaissIDSelector* sel, + size_t nprobe, + size_t max_codes); + +FAISS_DECLARE_GETTER(SearchParametersIVF, const FaissIDSelector*, sel) +FAISS_DECLARE_GETTER_SETTER(SearchParametersIVF, size_t, nprobe) +FAISS_DECLARE_GETTER_SETTER(SearchParametersIVF, size_t, max_codes) + /** Index based on a inverted file (IVF) * * In the inverted file, the quantizer (an Index instance) provides a diff -Nru faiss-1.7.3/c_api/IndexReplicas_c.cpp faiss-1.7.4/c_api/IndexReplicas_c.cpp --- faiss-1.7.3/c_api/IndexReplicas_c.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/IndexReplicas_c.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -14,8 +14,8 @@ DEFINE_DESTRUCTOR(IndexReplicas) -DEFINE_GETTER(IndexReplicas, int, own_fields) -DEFINE_SETTER(IndexReplicas, int, own_fields) +DEFINE_GETTER(IndexReplicas, int, own_indices) +DEFINE_SETTER(IndexReplicas, int, own_indices) int faiss_IndexReplicas_new(FaissIndexReplicas** p_index, idx_t d) { try { diff -Nru faiss-1.7.3/c_api/IndexShards_c.cpp faiss-1.7.4/c_api/IndexShards_c.cpp --- faiss-1.7.3/c_api/IndexShards_c.cpp 2022-11-08 11:14:13.000000000 +0000 +++ 
faiss-1.7.4/c_api/IndexShards_c.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -14,8 +14,8 @@ DEFINE_DESTRUCTOR(IndexShards) -DEFINE_GETTER(IndexShards, int, own_fields) -DEFINE_SETTER(IndexShards, int, own_fields) +DEFINE_GETTER(IndexShards, int, own_indices) +DEFINE_SETTER(IndexShards, int, own_indices) DEFINE_GETTER(IndexShards, int, successive_ids) DEFINE_SETTER(IndexShards, int, successive_ids) diff -Nru faiss-1.7.3/c_api/INSTALL.md faiss-1.7.4/c_api/INSTALL.md --- faiss-1.7.3/c_api/INSTALL.md 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/INSTALL.md 2023-04-19 13:18:30.000000000 +0000 @@ -8,11 +8,15 @@ The full contents of the pure C API are in the ["c_api"](c_api/) folder. Please be sure to follow the instructions on [building the main C++ library](../INSTALL.md#step-1-compiling-the-c-faiss) first. -Then, enter the [c_api](c_api/) directory and run +Include `-DFAISS_ENABLE_C_API=ON` to the cmake command. - `make` +`make -C build` -This builds the dynamic library "faiss_c", containing the full implementation of Faiss and the necessary wrappers for the C interface. It does not depend on libfaiss.a or the C++ standard library. It will also build an example program `bin/example_c`. + +This builds the dynamic library "faiss_c", containing the full implementation of Faiss and the necessary wrappers for the C interface. It does not depend on libfaiss.a or the C++ standard library. + +To build the example program, you should run `make -C build example_c` at the top level of +the faiss repo. The example program will be in `build/c_api/example_c` . Using the API ------------- diff -Nru faiss-1.7.3/c_api/macros_impl.h faiss-1.7.4/c_api/macros_impl.h --- faiss-1.7.3/c_api/macros_impl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/c_api/macros_impl.h 2023-04-19 13:18:30.000000000 +0000 @@ -99,4 +99,10 @@ reinterpret_cast(index))); \ } +#define DEFINE_SEARCH_PARAMETERS_DOWNCAST(clazz) \ + Faiss##clazz* faiss_##clazz##_cast(FaissSearchParameters* sp) { \ + return reinterpret_cast(dynamic_cast( \ + reinterpret_cast(sp))); \ + } + #endif diff -Nru faiss-1.7.3/CHANGELOG.md faiss-1.7.4/CHANGELOG.md --- faiss-1.7.3/CHANGELOG.md 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/CHANGELOG.md 2023-04-19 13:18:30.000000000 +0000 @@ -9,6 +9,40 @@ the Facebook Faiss team. Feel free to add entries here if you submit a PR. ## [Unreleased] +## [1.7.4] - 2023-04-12 +### Added +- Added big batch IVF search for conducting efficient search with big batches of queries +- Checkpointing in big batch search support +- Precomputed centroids support +- Support for iterable inverted lists for eg. 
key value stores +- 64-bit indexing arithmetic support in FAISS GPU +- IndexIVFShards now handle IVF indexes with a common quantizer +- Jaccard distance support +- CodePacker for non-contiguous code layouts +- Approximate evaluation of top-k distances for ResidualQuantizer and IndexBinaryFlat +- Added support for 12-bit PQ / IVFPQ fine quantizer decoders for standalone vector codecs (faiss/cppcontrib) +- Conda packages for osx-arm64 (Apple M1) and linux-aarch64 (ARM64) architectures +- Support for Python 3.10 + +### Removed +- CUDA 10 is no longer supported in precompiled packages +- Removed Python 3.7 support for precompiled packages +- Removed constraint for using fine quantizer with no greater than 8 bits for IVFPQ, for example, now it is possible to use IVF256,PQ10x12 for a CPU index + +### Changed +- Various performance optimizations for PQ / IVFPQ for AVX2 and ARM for training (fused distance+nearest kernel), search (faster kernels for distance_to_code() and scan_list_*()) and vector encoding +- A magnitude faster CPU code for LSQ/PLSQ training and vector encoding (reworked code) +- Performance improvements for Hamming Code computations for AVX2 and ARM (reworked code) +- Improved auto-vectorization support for IP and L2 distance computations (better handling of pragmas) +- Improved ResidualQuantizer vector encoding (pooling memory allocations, avoid r/w to a temporary buffer) + +### Fixed +- HSNW bug fixed which improves the recall rate! Special thanks to zh Wang @hhy3 for this. +- Faiss GPU IVF large query batch fix +- Faiss + Torch fixes, re-enable k = 2048 +- Fix the number of distance computations to match max_codes parameter +- Fix decoding of large fast_scan blocks + ## [1.7.3] - 2022-11-3 ### Added @@ -224,7 +258,8 @@ - C bindings. - Extended tutorial to GPU indices. 
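The SearchParametersIVF constructors and accessors declared in IndexIVF_c.h above expose the per-call IVF knobs (sel, nprobe, max_codes) behind several of the changelog entries in this section. A minimal sketch of the equivalent per-call override through the Python bindings; the factory string and sizes are illustrative.

    # Override nprobe and restrict the searched ids for a single call,
    # leaving the index-wide settings untouched.
    import numpy as np
    import faiss

    d, nlist, k = 32, 100, 5
    rs = np.random.RandomState(0)
    xb = rs.rand(10000, d).astype('float32')
    xq = rs.rand(8, d).astype('float32')

    index = faiss.index_factory(d, f"IVF{nlist},Flat")
    index.train(xb)
    index.add(xb)

    sel = faiss.IDSelectorRange(0, 5000)   # only the first half of the ids
    params = faiss.SearchParametersIVF()
    params.nprobe = 16                     # applies to this call only
    params.sel = sel
    D, I = index.search(xq, k, params=params)
    assert (I < 5000).all()                # selected ids only (-1 padding also passes)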
-[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.7.2...HEAD +[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.7.4...HEAD +[1.7.4]: https://github.com/facebookresearch/faiss/compare/v1.7.3...v1.7.4 [1.7.3]: https://github.com/facebookresearch/faiss/compare/v1.7.2...v1.7.3 [1.7.2]: https://github.com/facebookresearch/faiss/compare/v1.7.1...v1.7.2 [1.7.1]: https://github.com/facebookresearch/faiss/compare/v1.7.0...v1.7.1 diff -Nru faiss-1.7.3/.circleci/config.yml faiss-1.7.4/.circleci/config.yml --- faiss-1.7.3/.circleci/config.yml 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/.circleci/config.yml 2023-04-19 13:18:30.000000000 +0000 @@ -1,12 +1,43 @@ version: 2.1 -orbs: - win: circleci/windows@4.1.1 +executors: + linux-x86_64-cpu: + docker: + - image: continuumio/miniconda3 + resource_class: large + linux-x86_64-gpu: + environment: + CONDA_ARCH: Linux-x86_64 + machine: + image: linux-cuda-11:2023.02.1 + resource_class: gpu.nvidia.medium + linux-arm64-cpu: + environment: + CONDA_ARCH: Linux-aarch64 + machine: + image: ubuntu-2004:current + resource_class: arm.medium + macosx-x86_64-cpu: + environment: + CONDA_ARCH: MacOSX-x86_64 + macos: + xcode: 11.7.0 # max supported for conda build, https://circleci.com/docs/using-macos#supported-xcode-versions + macosx-arm64-cpu: + environment: + CONDA_ARCH: MacOSX-arm64 + macos: + xcode: 14.2.0 # minimum supported for M1 + resource_class: macos.m1.large.gen1 + windows-x86_64-cpu: + machine: + image: windows-server-2019-vs2019:stable + shell: bash.exe + resource_class: windows.medium jobs: format: docker: - - image: ubuntu:20.04 + - image: ubuntu:22.04 steps: - checkout - run: @@ -27,293 +58,187 @@ exit 1 fi - build_linux: + build_conda: parameters: - opt_level: + label: type: string - default: generic - resource_class: + default: "" + cuda: type: string - default: medium - docker: - - image: beauby/faiss-circleci:cpu - resource_class: << parameters.resource_class >> + default: "" + cuda_archs: + type: string + default: "" + compiler_version: + type: string + default: "" + exec: + type: executor + executor: << parameters.exec >> environment: OMP_NUM_THREADS: 10 - MKL_THREADING_LAYER: GNU + PACKAGE_TYPE: <> + CUDA_ARCHS: <> steps: - checkout - run: - name: Build faiss library + name: Install conda command: | - cmake -B build -DBUILD_TESTING=ON -DFAISS_ENABLE_GPU=OFF \ - -DFAISS_OPT_LEVEL=<< parameters.opt_level >> \ - -DFAISS_ENABLE_C_API=ON \ - -DCMAKE_BUILD_TYPE=Release -DBLA_VENDOR=Intel10_64_dyn . 
- make -k -C build -j3 faiss + if [ -n "${CONDA_ARCH}" ] + then + curl https://repo.anaconda.com/miniconda/Miniconda3-latest-${CONDA_ARCH}.sh --output miniconda.sh + bash miniconda.sh -b -p $HOME/miniconda + ~/miniconda/bin/conda init + fi + - run: + name: Install conda build tools + command: | + conda update -y -q conda + conda install -y -q conda-build + - when: + condition: << parameters.label >> + steps: + - run: + name: Enable anaconda uploads + command: | + conda install -y -q anaconda-client + conda config --set anaconda_upload yes - when: condition: - equal: [ "avx2", << parameters.opt_level >> ] + not: << parameters.label >> steps: - run: - name: Build faiss_avx2 library - command: make -k -C build -j3 faiss_avx2 swigfaiss_avx2 - - run: - name: Test faiss library - command: | - make -C build -j3 faiss_test - export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/" - make -C build test - - run: - name: Build python extension - command: | - make -C build -j3 swigfaiss - cd build/faiss/python - python3 setup.py build - - run: - name: Test python extension - command: | - pip3 install pytest - export PYTHONPATH="$(ls -d ./build/faiss/python/build/lib*/)" - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - - store_test_results: - path: test-results - - run: - name: Build C API - command: | - make -k -C build -j faiss_c - - build_linux_conda: - docker: - - image: continuumio/miniconda3 - steps: - - checkout - - run: - name: Conda build - command: | - conda install -y -q conda-build - cd conda - conda build faiss --python 3.7 -c pytorch - - build_osx: - macos: - xcode: 14.0.0 # https://circleci.com/docs/using-macos#supported-xcode-versions - environment: - OMP_NUM_THREADS: 10 - steps: - - checkout - - run: - name: Install conda - command: | - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh --output miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - $HOME/miniconda/bin/conda install -y -q conda-build - - run: - name: Install MacOSX10.9 SDK - command: | - curl -L -o - https://github.com/phracker/MacOSX-SDKs/releases/download/10.15/MacOSX10.9.sdk.tar.xz | sudo tar xJf - -C /opt - - run: - name: Build/test - command: | - export PATH=~/miniconda/bin:$PATH - cd conda - conda build faiss --python 3.8 -c pytorch - - build_windows: - executor: - name: win/default - shell: bash.exe - steps: - - checkout - - run: - name: Build/test - command: | - conda update conda - conda install conda-build - cd conda - conda build faiss --python 3.8 -c pytorch + name: Conda build (CPU) + no_output_timeout: 30m + command: | + cd conda + conda build faiss --python 3.10 -c pytorch -c pkgs/main -c conda-forge + - when: + condition: + and: + - << parameters.label >> + - not: << parameters.cuda >> + steps: + - run: + name: Conda build (CPU) w/ anaconda upload + no_output_timeout: 30m + command: | + cd conda + conda build faiss --user pytorch --label <> -c pytorch -c pkgs/main -c conda-forge + - when: + condition: + and: + - << parameters.label >> + - << parameters.cuda >> + steps: + - run: + name: Conda build (GPU) w/ anaconda upload + no_output_timeout: 60m + command: | + sudo update-alternatives --set cuda /usr/local/cuda-<> + cd conda + conda build faiss-gpu --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ + --user pytorch --label <> -c pytorch -c nvidia -c pkgs/main -c conda-forge - build_arm: - machine: - image: ubuntu-2004:202101-01 
- resource_class: arm.medium + build_cmake: parameters: + exec: + type: executor opt_level: type: string default: generic + gpu: + type: string + default: "OFF" + executor: << parameters.exec >> environment: OMP_NUM_THREADS: 10 - CONDA_HOME: /home/circleci/miniconda3 - PYTHON: /home/circleci/miniconda3/bin/python + MKL_THREADING_LAYER: GNU steps: - checkout - run: - name: Install dependencies + name: Install conda command: | - sudo apt-get update && sudo apt-get install -y swig - wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.9.2-Linux-aarch64.sh - bash Miniconda3-py39_4.9.2-Linux-aarch64.sh -b -p $CONDA_HOME - pip3 install cmake - $CONDA_HOME/bin/conda install -y numpy scipy - $CONDA_HOME/bin/conda install -y pytorch cpuonly -c pytorch - $CONDA_HOME/bin/pip install pytest + if [ -n "${CONDA_ARCH}" ] + then + curl https://repo.anaconda.com/miniconda/Miniconda3-latest-${CONDA_ARCH}.sh --output miniconda.sh + bash miniconda.sh -b -p $HOME/miniconda + ~/miniconda/bin/conda init + fi + - when: + condition: + equal: [ "ON", << parameters.gpu >> ] + steps: + - run: + name: Configure CUDA + command: sudo update-alternatives --set cuda /usr/local/cuda-11.4 - run: - name: Build faiss library + name: Set up environment command: | - cmake -B build -DBUILD_TESTING=ON -DFAISS_ENABLE_GPU=OFF \ + conda update -y -q conda + conda install -y -q cmake=3.23.1 make swig mkl=2021 mkl-devel=2021 numpy scipy pytest gxx_linux-64 sysroot_linux-64=2.17 -c pkgs/main -c conda-forge + - run: + name: Build all targets + no_output_timeout: 30m + command: | + eval "$(conda shell.bash hook)" + conda activate + cmake -B build \ + -DBUILD_TESTING=ON \ + -DBUILD_SHARED_LIBS=OFF \ + -DFAISS_ENABLE_GPU=<< parameters.gpu >> \ -DFAISS_OPT_LEVEL=<< parameters.opt_level >> \ -DFAISS_ENABLE_C_API=ON \ + -DPYTHON_EXECUTABLE=$(which python) \ -DCMAKE_BUILD_TYPE=Release \ - -DPython_EXECUTABLE=$PYTHON . - make -k -C build -j3 faiss + -DBLA_VENDOR=Intel10_64_dyn \ + -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \ + . 
+ make -k -C build -j$(nproc) - run: - name: Test faiss library + name: C++ tests command: | - make -C build -j3 faiss_test export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/" make -C build test - run: - name: Build python extension + name: Install Python extension command: | - make -C build -j3 swigfaiss cd build/faiss/python - $PYTHON setup.py build - - run: - name: Test python extension - command: | - export PYTHONPATH="$(ls -d ./build/faiss/python/build/lib*/)" - $PYTHON -c "import faiss; assert 'NEON' in faiss.get_compile_options()" - $PYTHON -m pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - $PYTHON -m pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + python setup.py install + - when: + condition: + equal: [ "OFF", << parameters.gpu >> ] + steps: + - run: + name: Python tests (CPU only) + command: | + conda install -y -q pytorch -c pytorch + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + - when: + condition: + equal: [ "ON", << parameters.gpu >> ] + steps: + - run: + name: Python tests (CPU + GPU) + command: | + conda install -y -q pytorch pytorch-cuda -c pytorch -c nvidia + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + cp tests/common_faiss_tests.py faiss/gpu/test + pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py + pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py + - when: + condition: + equal: [ "avx2", << parameters.opt_level >> ] + steps: + - run: + name: Test avx2 loading + command: | + FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss.so + LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss_avx2.so - store_test_results: path: test-results - - run: - name: Build C API - command: | - make -k -C build -j faiss_c - - build_linux_gpu: - machine: - resource_class: gpu.nvidia.medium - image: ubuntu-2004-cuda-11.4:202110-01 - docker_layer_caching: true - steps: - - checkout - - run: - name: Build/test - command: | - docker build -t faiss -f .circleci/Dockerfile.faiss_gpu . 
- docker run --gpus all faiss make -C build test - docker run --gpus all faiss sh -c '(pwd; find)' - docker run --gpus all faiss sh -c '(cd build/faiss/python; python3 setup.py install) && cp tests/common_faiss_tests.py faiss/gpu/test && python3 -m unittest discover -s faiss/gpu/test -p "test_*"' - docker run --gpus all faiss sh -c '(cd build/faiss/python; python3 setup.py install) && cp tests/common_faiss_tests.py faiss/gpu/test && python3 -m unittest discover -s faiss/gpu/test -p "torch_*.py"' - no_output_timeout: 60m - - deploy_linux: - parameters: - label: - type: string - default: main - docker: - - image: continuumio/miniconda3 - steps: - - checkout - - run: - name: Install conda-build/anaconda-client - command: | - conda install -y -q conda-build anaconda-client - conda config --set anaconda_upload yes - - run: - name: Build packages - environment: - PACKAGE_TYPE: <> - command: | - cd conda - conda build faiss --user pytorch --label <> -c pytorch - - deploy_linux_gpu: - parameters: - label: - type: string - default: main - cuda: - type: string - cuda_archs: - type: string - compiler_version: - type: string - machine: - resource_class: gpu.nvidia.medium - image: ubuntu-2004-cuda-11.4:202110-01 - docker_layer_caching: true - steps: - - checkout - - run: - name: Build packages - command: | - docker build -t faiss -f conda/Dockerfile.cuda<> . - docker run --gpus all \ - -e PACKAGE_TYPE="<>" \ - -e CUDA_ARCHS="<>" \ - -e ANACONDA_API_TOKEN=$ANACONDA_API_TOKEN \ - faiss \ - conda build faiss-gpu --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - --user pytorch --label <> -c pytorch - no_output_timeout: 60m - - deploy_osx: - parameters: - label: - type: string - default: main - macos: - xcode: 14.0.0 - steps: - - checkout - - run: - name: Install conda - command: | - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh --output miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - $HOME/miniconda/bin/conda install -y -q conda-build anaconda-client - $HOME/miniconda/bin/conda config --set anaconda_upload yes - - run: - name: Install MacOSX10.9 SDK - command: | - curl -L -o - https://github.com/phracker/MacOSX-SDKs/releases/download/10.15/MacOSX10.9.sdk.tar.xz | sudo tar xJf - -C /opt - - run: - name: Build packages - environment: - PACKAGE_TYPE: <> - command: | - export PATH=~/miniconda/bin:$PATH - cd conda - conda build faiss --user pytorch --label <> -c pytorch - - deploy_windows: - parameters: - label: - type: string - default: main - executor: - name: win/default - shell: bash.exe - steps: - - checkout - - run: - name: Install conda-build/anaconda-client - command: | - conda update conda - conda install -y -q conda-build anaconda-client - conda config --set anaconda_upload yes - - run: - name: Build packages - environment: - PACKAGE_TYPE: <> - command: | - cd conda - conda build faiss --user pytorch --label <> -c pytorch workflows: version: 2 @@ -321,60 +246,88 @@ jobs: - format: name: Format - - build_linux: - name: Linux - - build_linux: - name: Linux (AVX2) + - build_cmake: + name: Linux x86_64 (cmake) + exec: linux-x86_64-cpu + - build_cmake: + name: Linux x86_64 AVX2 (cmake) + exec: linux-x86_64-cpu opt_level: "avx2" - resource_class: "medium+" - - build_linux_conda: - name: Linux (conda) - - build_linux_gpu: - name: Linux GPU + - build_cmake: + name: Linux x86_64 GPU (cmake) + exec: linux-x86_64-gpu + gpu: "ON" requires: - - Linux - - build_osx: - name: OSX - - build_windows: - name: Windows - - build_arm: - 
name: ARM64 - - deploy_linux: - name: Linux packages + - Linux x86_64 (cmake) + - build_conda: + name: Linux x86_64 (conda) + exec: linux-x86_64-cpu + - build_conda: + name: OSX x86_64 (conda) + exec: macosx-x86_64-cpu + - build_conda: + name: Windows x86_64 (conda) + exec: windows-x86_64-cpu + - build_conda: + name: OSX arm64 (conda) + exec: macosx-arm64-cpu + requires: + - Linux arm64 (conda) + - build_conda: + name: Linux arm64 (conda) + exec: linux-arm64-cpu + - build_conda: + name: Linux x86_64 packages + exec: linux-x86_64-cpu + label: main filters: tags: only: /^v.*/ branches: ignore: /.*/ - - deploy_linux_gpu: - name: Linux GPU packages (CUDA 10.2) - cuda: "10.2" - cuda_archs: "35;52;60;61;70;72;75" - compiler_version: "8.4" + - build_conda: + name: Linux x86_64 GPU packages (CUDA 11.4) + exec: linux-x86_64-gpu + label: main + cuda: "11.4" + cuda_archs: "60;61;70;72;75;80;86" + compiler_version: "11.2" filters: tags: only: /^v.*/ branches: ignore: /.*/ - - deploy_linux_gpu: - name: Linux GPU packages (CUDA 11.3) - cuda: "11.3" - cuda_archs: "60;61;70;72;75;80;86" - compiler_version: "9.3" + - build_conda: + name: Windows x86_64 packages + exec: windows-x86_64-cpu + label: main filters: tags: only: /^v.*/ branches: ignore: /.*/ - - deploy_windows: - name: Windows packages + - build_conda: + name: OSX x86_64 packages + exec: macosx-x86_64-cpu + label: main filters: tags: only: /^v.*/ branches: ignore: /.*/ - - deploy_osx: - name: OSX packages + - build_conda: + name: OSX arm64 packages + exec: macosx-arm64-cpu + label: main + filters: + tags: + only: /^v.*/ + branches: + ignore: /.*/ + - build_conda: + name: Linux arm64 packages + exec: linux-arm64-cpu + label: main filters: tags: only: /^v.*/ @@ -390,24 +343,30 @@ only: - main jobs: - - deploy_linux: - name: Linux nightlies - label: nightly - - deploy_linux_gpu: - name: Linux GPU nightlies (CUDA 10.2) - cuda: "10.2" - cuda_archs: "35;52;60;61;70;72;75" - compiler_version: "8.4" + - build_conda: + name: Linux x86_64 nightlies + exec: linux-x86_64-cpu label: nightly - - deploy_linux_gpu: - name: Linux GPU nightlies (CUDA 11.3) - cuda: "11.3" + - build_conda: + name: Linux x86_64 GPU nightlies (CUDA 11.4) + exec: linux-x86_64-gpu + cuda: "11.4" cuda_archs: "60;61;70;72;75;80;86" - compiler_version: "9.3" + compiler_version: "11.2" + label: nightly + - build_conda: + name: Windows x86_64 nightlies + exec: windows-x86_64-cpu + label: nightly + - build_conda: + name: OSX x86_64 nightlies + exec: macosx-x86_64-cpu label: nightly - - deploy_windows: - name: Windows nightlies + - build_conda: + name: OSX arm64 nightlies + exec: macosx-arm64-cpu label: nightly - - deploy_osx: - name: OSX nightlies + - build_conda: + name: Linux arm64 nightlies + exec: linux-arm64-cpu label: nightly diff -Nru faiss-1.7.3/.circleci/Dockerfile.cpu faiss-1.7.4/.circleci/Dockerfile.cpu --- faiss-1.7.3/.circleci/Dockerfile.cpu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/.circleci/Dockerfile.cpu 1970-01-01 00:00:00.000000000 +0000 @@ -1,11 +0,0 @@ -FROM cimg/base:stable-20.04 - -# Install python3, swig, and MKL. -RUN sudo apt-get update && \ -sudo apt-get install -y python3-dev python3-pip swig libmkl-dev - -# Install recent CMake. -RUN wget -nv -O - https://github.com/Kitware/CMake/releases/download/v3.17.1/cmake-3.17.1-Linux-x86_64.tar.gz | sudo tar xzf - --strip-components=1 -C /usr - -# Install numpy/scipy/pytorch for python tests. 
-RUN pip3 install numpy scipy torch diff -Nru faiss-1.7.3/.circleci/Dockerfile.faiss_gpu faiss-1.7.4/.circleci/Dockerfile.faiss_gpu --- faiss-1.7.3/.circleci/Dockerfile.faiss_gpu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/.circleci/Dockerfile.faiss_gpu 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -FROM nvidia/cuda:10.2-devel-ubuntu18.04 - -# Install python3, wget, and openblas. -RUN apt-get update && \ - apt-get install -y python3-dev python3-pip libopenblas-dev wget libpcre3-dev - -# Install swig 4.0.2. -RUN wget -nv -O - https://sourceforge.net/projects/swig/files/swig/swig-4.0.2/swig-4.0.2.tar.gz/download | tar zxf - && cd swig-4.0.2 && ./configure && make -j && make install - -# Install recent CMake. -RUN wget -nv -O - https://github.com/Kitware/CMake/releases/download/v3.17.1/cmake-3.17.1-Linux-x86_64.tar.gz | tar xzf - --strip-components=1 -C /usr - -# Install numpy/scipy/pytorch for python tests. -RUN pip3 install numpy scipy torch - -COPY . /faiss - -WORKDIR /faiss - -RUN cmake -B build \ - -DFAISS_ENABLE_GPU=ON \ - -DFAISS_ENABLE_C_API=ON \ - -DFAISS_ENABLE_PYTHON=ON \ - -DBUILD_TESTING=ON \ - -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \ - . - -RUN make -C build -j8 diff -Nru faiss-1.7.3/CMakeLists.txt faiss-1.7.4/CMakeLists.txt --- faiss-1.7.3/CMakeLists.txt 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/CMakeLists.txt 2023-04-19 13:18:30.000000000 +0000 @@ -4,10 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) project(faiss - VERSION 1.7.3 + VERSION 1.7.4 DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." HOMEPAGE_URL "https://github.com/facebookresearch/faiss" LANGUAGES CXX) diff -Nru faiss-1.7.3/conda/conda_build_config.yaml faiss-1.7.4/conda/conda_build_config.yaml --- faiss-1.7.3/conda/conda_build_config.yaml 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/conda/conda_build_config.yaml 2023-04-19 13:18:30.000000000 +0000 @@ -1,6 +1,4 @@ -CONDA_BUILD_SYSROOT: - - /opt/MacOSX10.9.sdk # [osx] python: - - 3.7 - 3.8 - 3.9 + - 3.10 diff -Nru faiss-1.7.3/conda/Dockerfile.cpu faiss-1.7.4/conda/Dockerfile.cpu --- faiss-1.7.3/conda/Dockerfile.cpu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/conda/Dockerfile.cpu 1970-01-01 00:00:00.000000000 +0000 @@ -1,19 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -FROM nvidia/cuda:10.2-devel-ubuntu18.04 - -RUN apt-get update && apt-get install -y wget git - -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3 -ENV PATH="/root/miniconda3/condabin:${PATH}" - -RUN conda install conda-build - -COPY ./ faiss -WORKDIR /faiss/conda - -RUN conda build faiss --no-anaconda-upload -c pytorch diff -Nru faiss-1.7.3/conda/Dockerfile.cuda10.2 faiss-1.7.4/conda/Dockerfile.cuda10.2 --- faiss-1.7.3/conda/Dockerfile.cuda10.2 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/conda/Dockerfile.cuda10.2 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -FROM nvidia/cuda:10.2-devel-centos8 - -RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* - -RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* - -RUN yum update -y --nogpgcheck - -RUN yum install -y wget git libcublas-devel-10-2 - -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3 -ENV PATH="/root/miniconda3/condabin:${PATH}" - -RUN conda install -y -q conda-build anaconda-client -RUN conda config --set anaconda_upload yes - -COPY ./ faiss -WORKDIR /faiss/conda diff -Nru faiss-1.7.3/conda/Dockerfile.cuda11.3 faiss-1.7.4/conda/Dockerfile.cuda11.3 --- faiss-1.7.3/conda/Dockerfile.cuda11.3 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/conda/Dockerfile.cuda11.3 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -FROM nvidia/cuda:11.3.1-devel-centos8 - -RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* - -RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* - -RUN yum update -y --nogpgcheck - -RUN yum install -y --nogpgcheck wget git libcublas-devel-11-3 - -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3 -ENV PATH="/root/miniconda3/condabin:${PATH}" - -RUN conda install -y -q conda-build anaconda-client -RUN conda config --set anaconda_upload yes - -COPY ./ faiss -WORKDIR /faiss/conda diff -Nru faiss-1.7.3/conda/faiss/build-lib-arm64.sh faiss-1.7.4/conda/faiss/build-lib-arm64.sh --- faiss-1.7.3/conda/faiss/build-lib-arm64.sh 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/conda/faiss/build-lib-arm64.sh 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,22 @@ +#!/bin/sh +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +set -e + + +# Build libfaiss.so +cmake -B _build \ + -DBUILD_SHARED_LIBS=ON \ + -DBUILD_TESTING=OFF \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_BUILD_TYPE=Release . + +make -C _build -j$(nproc) faiss + +cmake --install _build --prefix $PREFIX +cmake --install _build --prefix _libfaiss_stage/ diff -Nru faiss-1.7.3/conda/faiss/build-lib.sh faiss-1.7.4/conda/faiss/build-lib.sh --- faiss-1.7.3/conda/faiss/build-lib.sh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/conda/faiss/build-lib.sh 2023-04-19 13:18:30.000000000 +0000 @@ -18,7 +18,7 @@ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_BUILD_TYPE=Release . -make -C _build -j $CPU_COUNT faiss faiss_avx2 +make -C _build -j$(nproc) faiss faiss_avx2 cmake --install _build --prefix $PREFIX cmake --install _build --prefix _libfaiss_stage/ diff -Nru faiss-1.7.3/conda/faiss/build-pkg-arm64.sh faiss-1.7.4/conda/faiss/build-pkg-arm64.sh --- faiss-1.7.3/conda/faiss/build-pkg-arm64.sh 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/conda/faiss/build-pkg-arm64.sh 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,22 @@ +#!/bin/sh +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -e + + +# Build swigfaiss.so/swigfaiss_avx2.so. +cmake -B _build_python_${PY_VER} \ + -Dfaiss_ROOT=_libfaiss_stage/ \ + -DFAISS_ENABLE_GPU=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DPython_EXECUTABLE=$PYTHON \ + faiss/python + +make -C _build_python_${PY_VER} -j$(nproc) swigfaiss + +# Build actual python module. +cd _build_python_${PY_VER}/ +$PYTHON setup.py install --single-version-externally-managed --record=record.txt --prefix=$PREFIX diff -Nru faiss-1.7.3/conda/faiss/build-pkg.sh faiss-1.7.4/conda/faiss/build-pkg.sh --- faiss-1.7.3/conda/faiss/build-pkg.sh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/conda/faiss/build-pkg.sh 2023-04-19 13:18:30.000000000 +0000 @@ -16,7 +16,7 @@ -DPython_EXECUTABLE=$PYTHON \ faiss/python -make -C _build_python_${PY_VER} -j $CPU_COUNT swigfaiss swigfaiss_avx2 +make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2 # Build actual python module. cd _build_python_${PY_VER}/ diff -Nru faiss-1.7.3/conda/faiss/meta.yaml faiss-1.7.4/conda/faiss/meta.yaml --- faiss-1.7.3/conda/faiss/meta.yaml 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/conda/faiss/meta.yaml 2023-04-19 13:18:30.000000000 +0000 @@ -26,7 +26,8 @@ outputs: - name: libfaiss - script: build-lib.sh # [not win] + script: build-lib.sh # [x86_64 and not win] + script: build-lib-arm64.sh # [not x86_64] script: build-lib.bat # [win] build: string: "h{{ PKG_HASH }}_{{ number }}_cpu{{ suffix }}" @@ -35,54 +36,56 @@ requirements: build: - {{ compiler('cxx') }} + - sysroot_linux-64 =2.17 # [linux64] - llvm-openmp # [osx] - - cmake >=3.17 + - cmake =3.23.1 - make # [not win] + - mkl-devel =2021 # [x86_64] + - openblas # [not x86_64] host: - - mkl =2018 + - mkl =2021 # [x86_64] run: - - mkl >=2018 # [not win] - - mkl >=2018,<2021 # [win] + - mkl >=2021 # [win] test: requires: - conda-build commands: - test -f $PREFIX/lib/libfaiss$SHLIB_EXT # [not win] - - test -f $PREFIX/lib/libfaiss_avx2$SHLIB_EXT # [not win] + - test -f $PREFIX/lib/libfaiss_avx2$SHLIB_EXT # [x86_64 and not win] - conda inspect linkages -p $PREFIX $PKG_NAME # [not win] - conda inspect objects -p $PREFIX $PKG_NAME # [osx] - name: faiss-cpu - script: build-pkg.sh # [not win] + script: build-pkg.sh # [x86_64 and not win] + script: build-pkg-arm64.sh # [not x86_64] script: build-pkg.bat # [win] build: string: "py{{ PY_VER }}_h{{ PKG_HASH }}_{{ number }}_cpu{{ suffix }}" requirements: build: - {{ compiler('cxx') }} + - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.17 + - cmake =3.23.1 - make # [not win] host: - python {{ python }} - - numpy =1.16 # [not win] - - numpy =1.11 # [win] + - numpy >=1.16,<2 - {{ pin_subpackage('libfaiss', exact=True) }} run: - python {{ python }} - - numpy >=1.11,<2 + - numpy >=1.16,<2 - {{ pin_subpackage('libfaiss', exact=True) }} test: requires: - numpy - scipy - - pytorch # [not osx] - - pytorch <1.12.0 # [osx] + - pytorch commands: - - python -X faulthandler -m unittest discover -v -s tests -p "test_*" - - python -X faulthandler -m unittest discover -v -s tests -p "torch_*" - - sh test_cpu_dispatch.sh # [linux] + - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*" + - python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*" + - sh test_cpu_dispatch.sh # [linux64] files: - - test_cpu_dispatch.sh # [linux] + - test_cpu_dispatch.sh # [linux64] source_files: - tests/ diff -Nru faiss-1.7.3/conda/faiss-gpu/build-lib.sh faiss-1.7.4/conda/faiss-gpu/build-lib.sh --- faiss-1.7.3/conda/faiss-gpu/build-lib.sh 2022-11-08 11:14:13.000000000 +0000 +++ 
faiss-1.7.4/conda/faiss-gpu/build-lib.sh 2023-04-19 13:18:30.000000000 +0000 @@ -19,7 +19,7 @@ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_BUILD_TYPE=Release . -make -C _build -j $CPU_COUNT faiss faiss_avx2 +make -C _build -j$(nproc) faiss faiss_avx2 cmake --install _build --prefix $PREFIX cmake --install _build --prefix _libfaiss_stage/ diff -Nru faiss-1.7.3/conda/faiss-gpu/build-pkg.sh faiss-1.7.4/conda/faiss-gpu/build-pkg.sh --- faiss-1.7.3/conda/faiss-gpu/build-pkg.sh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/conda/faiss-gpu/build-pkg.sh 2023-04-19 13:18:30.000000000 +0000 @@ -16,7 +16,7 @@ -DPython_EXECUTABLE=$PYTHON \ faiss/python -make -C _build_python_${PY_VER} -j $CPU_COUNT swigfaiss swigfaiss_avx2 +make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2 # Build actual python module. cd _build_python_${PY_VER}/ diff -Nru faiss-1.7.3/conda/faiss-gpu/meta.yaml faiss-1.7.4/conda/faiss-gpu/meta.yaml --- faiss-1.7.3/conda/faiss-gpu/meta.yaml 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/conda/faiss-gpu/meta.yaml 2023-04-19 13:18:30.000000000 +0000 @@ -36,15 +36,16 @@ requirements: build: - {{ compiler('cxx') }} + - sysroot_linux-64 =2.17 # [linux64] - llvm-openmp # [osx] - - cmake >=3.18 + - cmake =3.23.1 - make # [not win] + - mkl-devel =2021 host: - - mkl =2018 + - mkl =2021 - cudatoolkit {{ cudatoolkit }} run: - - mkl >=2018 # [not win] - - mkl >=2018,<2021 # [win] + - mkl >=2021 - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} test: requires: @@ -62,16 +63,17 @@ requirements: build: - {{ compiler('cxx') }} + - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.17 + - cmake =3.23.1 - make # [not win] host: - python {{ python }} - - numpy =1.16 + - numpy >=1.16,<2 - {{ pin_subpackage('libfaiss', exact=True) }} run: - python {{ python }} - - numpy >=1.11,<2 + - numpy >=1.16,<2 - {{ pin_subpackage('libfaiss', exact=True) }} test: requires: @@ -79,9 +81,11 @@ - scipy - pytorch commands: - - python -m unittest discover tests/ + - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*" + - python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*" - cp tests/common_faiss_tests.py faiss/gpu/test - - python -m unittest discover faiss/gpu/test/ + - python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "test_*" + - python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "torch_*" - sh test_cpu_dispatch.sh # [linux] files: - test_cpu_dispatch.sh # [linux] diff -Nru faiss-1.7.3/contrib/client_server.py faiss-1.7.4/contrib/client_server.py --- faiss-1.7.3/contrib/client_server.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/contrib/client_server.py 2023-04-19 13:18:30.000000000 +0000 @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool import faiss from typing import List, Tuple diff -Nru faiss-1.7.3/contrib/clustering.py faiss-1.7.4/contrib/clustering.py --- faiss-1.7.3/contrib/clustering.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/contrib/clustering.py 2023-04-19 13:18:30.000000000 +0000 @@ -21,11 +21,14 @@ def print_nop(*arg, **kwargs): pass -def two_level_clustering(xt, nc1, nc2, clustering_niter=25, **args): +def two_level_clustering(xt, nc1, nc2, rebalance=True, clustering_niter=25, **args): """ perform 2-level clustering on a training set xt nc1 and nc2 are the number of clusters at each level, the final number of - clusters is nc1 * nc2. Additional arguments are passed to the Kmeans object + clusters is nc2. Additional arguments are passed to the Kmeans object. + + Rebalance allocates the number of sub-clusters depending on the number of + first-level assignment. """ d = xt.shape[1] @@ -33,11 +36,11 @@ log = print if verbose else print_nop - log(f"2-level clustering of {xt.shape} nb clusters = {nc1}*{nc2} = {nc1*nc2}") + log(f"2-level clustering of {xt.shape} nb 1st level clusters = {nc1} total {nc2}") log("perform coarse training") km = faiss.Kmeans( - d, nc1, verbose=True, niter=clustering_niter, + d, nc1, niter=clustering_niter, max_points_per_centroid=2000, **args ) @@ -57,10 +60,16 @@ o = assign1.argsort() del km - if type(nc2) == int: - all_nc2 = [nc2] * nc1 + if not rebalance: + # make sure the sub-clusters sum up to exactly nc2 + cc = np.arange(nc1 + 1) * nc2 // nc1 + all_nc2 = cc[1:] - cc[:-1] else: - all_nc2 = nc2 + bc_sum = np.cumsum(bc) + all_nc2 = bc_sum * nc2 // bc_sum[-1] + all_nc2[1:] -= all_nc2[:-1] + assert sum(all_nc2) == nc2 + log(f"nb 2nd-level centroids {min(all_nc2)}-{max(all_nc2)}") # train sub-clusters i0 = 0 @@ -94,16 +103,16 @@ vt = index.chain.at(i) vt.train(xt) xt = vt.apply(xt) - train_ivf_index_with_2level(index.index, xt) + train_ivf_index_with_2level(index.index, xt, **args) index.is_trained = True return assert isinstance(index, faiss.IndexIVF) assert index.metric_type == faiss.METRIC_L2 # now do 2-level clustering nc1 = int(np.sqrt(index.nlist)) - cc = np.arange(nc1 + 1) * index.nlist // nc1 - all_nc2 = cc[1:] - cc[:-1] - centroids, _ = two_level_clustering(xt, nc1, all_nc2, **args) + print("REBALANCE=", args) + + centroids, _ = two_level_clustering(xt, nc1, index.nlist, **args) index.quantizer.train(centroids) index.quantizer.add(centroids) # finish training diff -Nru faiss-1.7.3/contrib/datasets.py faiss-1.7.4/contrib/datasets.py --- faiss-1.7.3/contrib/datasets.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/contrib/datasets.py 2023-04-19 13:18:30.000000000 +0000 @@ -310,3 +310,39 @@ assert k <= 100 gt = gt[:, :k] return gt + + + +def dataset_from_name(dataset='deep1M', download=False): + """ converts a string describing a dataset to a Dataset object + Supports sift1M, bigann1M..bigann1B, deep1M..deep1B, music-100 and glove + """ + + if dataset == 'sift1M': + return DatasetSIFT1M() + + elif dataset.startswith('bigann'): + dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1]) + return DatasetBigANN(nb_M=dbsize) + + elif dataset.startswith("deep"): + + szsuf = dataset[4:] + if szsuf[-1] == 'M': + dbsize = 10 ** 6 * int(szsuf[:-1]) + elif szsuf == '1B': + dbsize = 10 ** 9 + elif szsuf[-1] == 'k': + dbsize = 1000 * int(szsuf[:-1]) + else: + assert False, "did not recognize suffix " + szsuf + return DatasetDeep1B(nb=dbsize) + + elif dataset == "music-100": 
+ return DatasetMusic100() + + elif dataset == "glove": + return DatasetGlove(download=download) + + else: + raise RuntimeError("unknown dataset " + dataset) diff -Nru faiss-1.7.3/contrib/evaluation.py faiss-1.7.4/contrib/evaluation.py --- faiss-1.7.3/contrib/evaluation.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/contrib/evaluation.py 2023-04-19 13:18:30.000000000 +0000 @@ -5,8 +5,10 @@ import numpy as np import unittest +import time +import faiss -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool ############################################################### # Simple functions to evaluate knn results @@ -264,3 +266,192 @@ (Ii_new, Di_new) = sort_by_ids(Ii_new, Di_new) np.testing.assert_array_equal(Ii_ref, Ii_new) np.testing.assert_array_almost_equal(Di_ref, Di_new, decimal=5) + + +############################################################### +# OperatingPoints functions +# this is the Python version of the AutoTune object in C++ + +class OperatingPoints: + """ + Manages a set of search parameters with associated performance and time. + Keeps the Pareto optimal points. + """ + + def __init__(self): + # list of (key, perf, t) + self.operating_points = [ + # (self.do_nothing_key(), 0.0, 0.0) + ] + self.suboptimal_points = [] + + def compare_keys(self, k1, k2): + """ return -1 if k1 > k2, 1 if k2 > k1, 0 otherwise """ + raise NotImplemented + + def do_nothing_key(self): + """ parameters to say we do noting, takes 0 time and has 0 performance""" + raise NotImplemented + + def is_pareto_optimal(self, perf_new, t_new): + for _, perf, t in self.operating_points: + if perf >= perf_new and t <= t_new: + return False + return True + + def predict_bounds(self, key): + """ predicts the bound on time and performance """ + min_time = 0.0 + max_perf = 1.0 + for key2, perf, t in self.operating_points + self.suboptimal_points: + cmp = self.compare_keys(key, key2) + if cmp > 0: # key2 > key + if t > min_time: + min_time = t + if cmp < 0: # key2 < key + if perf < max_perf: + max_perf = perf + return max_perf, min_time + + def should_run_experiment(self, key): + (max_perf, min_time) = self.predict_bounds(key) + return self.is_pareto_optimal(max_perf, min_time) + + def add_operating_point(self, key, perf, t): + if self.is_pareto_optimal(perf, t): + i = 0 + # maybe it shadows some other operating point completely? + while i < len(self.operating_points): + op_Ls, perf2, t2 = self.operating_points[i] + if perf >= perf2 and t < t2: + self.suboptimal_points.append( + self.operating_points.pop(i)) + else: + i += 1 + self.operating_points.append((key, perf, t)) + return True + else: + self.suboptimal_points.append((key, perf, t)) + return False + + +class OperatingPointsWithRanges(OperatingPoints): + """ + Set of parameters that are each picked from a discrete range of values. + An increase of each parameter is assumed to make the operation slower + and more accurate. + A key = int array of indices in the ordered set of parameters. 
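Going back to the contrib/clustering.py and contrib/datasets.py hunks above: two_level_clustering now interprets nc2 as the final number of clusters (with optional rebalancing by first-level cluster size), and dataset_from_name maps a dataset string to a Dataset object. A minimal sketch of how the two fit together, assuming the SIFT1M files are already on disk where faiss.contrib.datasets expects them.

    # Two-level k-means to bootstrap an IVF coarse quantizer (sketch).
    import faiss
    from faiss.contrib.datasets import dataset_from_name
    from faiss.contrib.clustering import (
        two_level_clustering, train_ivf_index_with_2level)

    ds = dataset_from_name("sift1M")
    xt = ds.get_train()

    # 64 first-level clusters, 4096 final centroids, rebalanced by cluster size
    centroids, _ = two_level_clustering(xt, 64, 4096, rebalance=True)
    print(centroids.shape)   # (4096, ds.d)

    # or train the coarse quantizer of an IVF index in one call
    index = faiss.index_factory(ds.d, "IVF4096,Flat")
    train_ivf_index_with_2level(index, xt)
    index.add(ds.get_database())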
+ """ + + def __init__(self): + OperatingPoints.__init__(self) + # list of (name, values) + self.ranges = [] + + def add_range(self, name, values): + self.ranges.append((name, values)) + + def compare_keys(self, k1, k2): + if np.all(k1 >= k2): + return 1 + if np.all(k2 >= k1): + return -1 + return 0 + + def do_nothing_key(self): + return np.zeros(len(self.ranges), dtype=int) + + def num_experiments(self): + return np.prod([len(values) for name, values in self.ranges]) + + def cno_to_key(self, cno): + """Convert a sequential experiment number to a key""" + k = np.zeros(len(self.ranges), dtype=int) + for i, (name, values) in enumerate(self.ranges): + k[i] = cno % len(values) + cno //= len(values) + assert cno == 0 + return k + + def get_parameters(self, k): + """Convert a key to a dictionary with parameter values""" + return { + name: values[k[i]] + for i, (name, values) in enumerate(self.ranges) + } + + def restrict_range(self, name, max_val): + """ remove too large values from a range""" + for name2, values in self.ranges: + if name == name2: + val2 = [v for v in values if v < max_val] + values[:] = val2 + return + raise RuntimeError(f"parameter {name} not found") + + +############################################################### +# Timer object + +class TimerIter: + def __init__(self, timer): + self.ts = [] + self.runs = timer.runs + self.timer = timer + if timer.nt >= 0: + faiss.omp_set_num_threads(timer.nt) + + def __next__(self): + timer = self.timer + self.runs -= 1 + self.ts.append(time.time()) + total_time = self.ts[-1] - self.ts[0] if len(self.ts) >= 2 else 0 + if self.runs == -1 or total_time > timer.max_secs: + if timer.nt >= 0: + faiss.omp_set_num_threads(timer.remember_nt) + ts = np.array(self.ts) + times = ts[1:] - ts[:-1] + if len(times) == timer.runs: + timer.times = times[timer.warmup :] + else: + # if timeout, we use all the runs + timer.times = times[:] + raise StopIteration + +class RepeatTimer: + """ + This is yet another timer object. It is adapted to Faiss by + taking a number of openmp threads to set on input. It should be called + in an explicit loop as: + + timer = RepeatTimer(warmup=1, nt=1, runs=6) + + for _ in timer: + # perform operation + + print(f"time={timer.get_ms():.1f} ± {timer.get_ms_std():.1f} ms") + + the same timer can be re-used. In that case it is reset each time it + enters a loop. It focuses on ms-scale times because for second scale + it's usually less relevant to repeat the operation. 
+ """ + def __init__(self, warmup=0, nt=-1, runs=1, max_secs=np.inf): + assert warmup < runs + self.warmup = warmup + self.nt = nt + self.runs = runs + self.max_secs = max_secs + self.remember_nt = faiss.omp_get_max_threads() + + def __iter__(self): + return TimerIter(self) + + def ms(self): + return np.mean(self.times) * 1000 + + def ms_std(self): + return np.std(self.times) * 1000 if len(self.times) > 1 else 0.0 + + def nruns(self): + """ effective number of runs (may be lower than runs - warmup due to timeout)""" + return len(self.times) diff -Nru faiss-1.7.3/contrib/exhaustive_search.py faiss-1.7.4/contrib/exhaustive_search.py --- faiss-1.7.3/contrib/exhaustive_search.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/contrib/exhaustive_search.py 2023-04-19 13:18:30.000000000 +0000 @@ -20,7 +20,7 @@ LOG.info("knn_ground_truth queries size %s k=%d" % (xq.shape, k)) t0 = time.time() nq, d = xq.shape - keep_max = metric_type == faiss.METRIC_INNER_PRODUCT + keep_max = faiss.is_similarity_metric(metric_type) rh = faiss.ResultHeap(nq, k, keep_max=keep_max) index = faiss.IndexFlat(d, metric_type) @@ -265,7 +265,7 @@ (totres, max_results)) radius, totres = apply_maxres( res_batches, min_results, - keep_max=index.metric_type == faiss.METRIC_INNER_PRODUCT + keep_max=faiss.is_similarity_metric(index.metric_type) ) t2 = time.time() t_search += t1 - t0 @@ -281,7 +281,7 @@ if clip_to_min and totres > min_results: radius, totres = apply_maxres( res_batches, min_results, - keep_max=index.metric_type == faiss.METRIC_INNER_PRODUCT + keep_max=faiss.is_similarity_metric(index.metric_type) ) nres = np.hstack([nres_i for nres_i, dis_i, ids_i in res_batches]) diff -Nru faiss-1.7.3/contrib/ivf_tools.py faiss-1.7.4/contrib/ivf_tools.py --- faiss-1.7.3/contrib/ivf_tools.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/contrib/ivf_tools.py 2023-04-19 13:18:30.000000000 +0000 @@ -3,9 +3,18 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import time +import pickle +import os +from multiprocessing.pool import ThreadPool +import threading + import numpy as np import faiss +from faiss.contrib.inspect_tools import get_invlist + + def add_preassigned(index_ivf, x, a, ids=None): """ Add elements to an IVF index, where the assignment is already computed @@ -25,7 +34,9 @@ def search_preassigned(index_ivf, xq, k, list_nos, coarse_dis=None): """ - Perform a search in the IVF index, with predefined lists to search into + Perform a search in the IVF index, with predefined lists to search into. + Supports indexes with pretransforms (as opposed to the + IndexIVF.search_preassigned, that cannot be applied with pretransform). 
""" n, d = xq.shape if isinstance(index_ivf, faiss.IndexBinaryIVF): @@ -37,26 +48,20 @@ assert d == index_ivf.d assert list_nos.shape == (n, index_ivf.nprobe) - # the coarse distances are used in IVFPQ with L2 distance and by_residual=True - # otherwise we provide dummy coarse_dis + # the coarse distances are used in IVFPQ with L2 distance and + # by_residual=True otherwise we provide dummy coarse_dis if coarse_dis is None: coarse_dis = np.zeros((n, index_ivf.nprobe), dtype=dis_type) else: assert coarse_dis.shape == (n, index_ivf.nprobe) - D = np.empty((n, k), dtype=dis_type) - I = np.empty((n, k), dtype='int64') - - sp = faiss.swig_ptr - index_ivf.search_preassigned( - n, sp(xq), k, - sp(list_nos), sp(coarse_dis), sp(D), sp(I), False) - return D, I + return index_ivf.search_preassigned(xq, k, list_nos, coarse_dis) def range_search_preassigned(index_ivf, x, radius, list_nos, coarse_dis=None): """ - Perform a range search in the IVF index, with predefined lists to search into + Perform a range search in the IVF index, with predefined lists to + search into """ n, d = x.shape if isinstance(index_ivf, faiss.IndexBinaryIVF): @@ -65,8 +70,8 @@ else: dis_type = "float32" - # the coarse distances are used in IVFPQ with L2 distance and by_residual=True - # otherwise we provide dummy coarse_dis + # the coarse distances are used in IVFPQ with L2 distance and + # by_residual=True otherwise we provide dummy coarse_dis if coarse_dis is None: coarse_dis = np.empty((n, index_ivf.nprobe), dtype=dis_type) else: @@ -89,3 +94,471 @@ dist = faiss.rev_swig_ptr(res.distances, num_results).copy() indices = faiss.rev_swig_ptr(res.labels, num_results).copy() return lims, dist, indices + + +def replace_ivf_quantizer(index_ivf, new_quantizer): + """ replace the IVF quantizer with a flat quantizer and return the + old quantizer""" + if new_quantizer.ntotal == 0: + centroids = index_ivf.quantizer.reconstruct_n() + new_quantizer.train(centroids) + new_quantizer.add(centroids) + else: + assert new_quantizer.ntotal == index_ivf.nlist + + # cleanly dealloc old quantizer + old_own = index_ivf.own_fields + index_ivf.own_fields = False + old_quantizer = faiss.downcast_index(index_ivf.quantizer) + old_quantizer.this.own(old_own) + index_ivf.quantizer = new_quantizer + + if hasattr(index_ivf, "referenced_objects"): + index_ivf.referenced_objects.append(new_quantizer) + else: + index_ivf.referenced_objects = [new_quantizer] + return old_quantizer + + +class BigBatchSearcher: + """ + Object that manages all the data related to the computation + except the actual within-bucket matching and the organization of the + computation (parallel or not) + """ + + def __init__( + self, + index, xq, k, + verbose=0, + use_float16=False): + + # verbosity + self.verbose = verbose + self.tictoc = [] + + self.xq = xq + self.index = index + self.use_float16 = use_float16 + keep_max = faiss.is_similarity_metric(index.metric_type) + self.rh = faiss.ResultHeap(len(xq), k, keep_max=keep_max) + self.t_accu = [0] * 5 + self.t_display = self.t0 = time.time() + + def start_t_accu(self): + self.t_accu_t0 = time.time() + + def stop_t_accu(self, n): + self.t_accu[n] += time.time() - self.t_accu_t0 + + def tic(self, name): + self.tictoc = (name, time.time()) + if self.verbose > 0: + print(name, end="\r", flush=True) + + def toc(self): + name, t0 = self.tictoc + dt = time.time() - t0 + if self.verbose > 0: + print(f"{name}: {dt:.3f} s") + return dt + + def report(self, l): + if self.verbose == 1 or ( + l > 1000 and time.time() < self.t_display + 1.0): + 
return + print( + f"[{time.time()-self.t0:.1f} s] list {l}/{self.index.nlist} " + f"times prep q {self.t_accu[0]:.3f} prep b {self.t_accu[1]:.3f} " + f"comp {self.t_accu[2]:.3f} res {self.t_accu[3]:.3f} " + f"wait {self.t_accu[4]:.3f}", + end="\r", flush=True + ) + self.t_display = time.time() + + def coarse_quantization(self): + self.tic("coarse quantization") + bs = 65536 + nq = len(self.xq) + q_assign = np.empty((nq, self.index.nprobe), dtype='int32') + for i0 in range(0, nq, bs): + i1 = min(nq, i0 + bs) + q_dis_i, q_assign_i = self.index.quantizer.search( + self.xq[i0:i1], self.index.nprobe) + # q_dis[i0:i1] = q_dis_i + q_assign[i0:i1] = q_assign_i + self.toc() + self.q_assign = q_assign + + def reorder_assign(self): + self.tic("bucket sort") + q_assign = self.q_assign + q_assign += 1 # move -1 -> 0 + self.bucket_lims = faiss.matrix_bucket_sort_inplace( + self.q_assign, nbucket=self.index.nlist + 1, nt=16) + self.query_ids = self.q_assign.ravel() + if self.verbose > 0: + print(' number of -1s:', self.bucket_lims[1]) + self.bucket_lims = self.bucket_lims[1:] # shift back to ignore -1s + del self.q_assign # inplace so let's forget about the old version... + self.toc() + + def prepare_bucket(self, l): + """ prepare the queries and database items for bucket l""" + t0 = time.time() + index = self.index + # prepare queries + i0, i1 = self.bucket_lims[l], self.bucket_lims[l + 1] + q_subset = self.query_ids[i0:i1] + xq_l = self.xq[q_subset] + if self.by_residual: + xq_l = xq_l - index.quantizer.reconstruct(l) + t1 = time.time() + # prepare database side + list_ids, xb_l = get_invlist(index.invlists, l) + + if self.decode_func is None: + xb_l = xb_l.ravel() + else: + xb_l = self.decode_func(xb_l) + + if self.use_float16: + xb_l = xb_l.astype('float16') + xq_l = xq_l.astype('float16') + + t2 = time.time() + self.t_accu[0] += t1 - t0 + self.t_accu[1] += t2 - t1 + return q_subset, xq_l, list_ids, xb_l + + def add_results_to_heap(self, q_subset, D, list_ids, I): + """add the bucket results to the heap structure""" + if D is None: + return + t0 = time.time() + if I is None: + I = list_ids + else: + I = list_ids[I] + self.rh.add_result_subset(q_subset, D, I) + self.t_accu[3] += time.time() - t0 + + def sizes_in_checkpoint(self): + return (self.xq.shape, self.index.nprobe, self.index.nlist) + + def write_checkpoint(self, fname, cur_list_no): + # write to temp file then move to final file + tmpname = fname + ".tmp" + pickle.dump( + { + "sizes": self.sizes_in_checkpoint(), + "cur_list_no": cur_list_no, + "rh": (self.rh.D, self.rh.I), + }, open(tmpname, "wb"), -1 + ) + os.replace(tmpname, fname) + + def read_checkpoint(self, fname): + ckp = pickle.load(open(fname, "rb")) + assert ckp["sizes"] == self.sizes_in_checkpoint() + self.rh.D[:] = ckp["rh"][0] + self.rh.I[:] = ckp["rh"][1] + return ckp["cur_list_no"] + + +class BlockComputer: + """ computation within one bucket """ + + def __init__( + self, + index, + method="knn_function", + pairwise_distances=faiss.pairwise_distances, + knn=faiss.knn): + + self.index = index + if index.__class__ == faiss.IndexIVFFlat: + index_help = faiss.IndexFlat(index.d, index.metric_type) + decode_func = lambda x: x.view("float32") + by_residual = False + elif index.__class__ == faiss.IndexIVFPQ: + index_help = faiss.IndexPQ( + index.d, index.pq.M, index.pq.nbits, index.metric_type) + index_help.pq = index.pq + decode_func = index_help.pq.decode + index_help.is_trained = True + by_residual = index.by_residual + elif index.__class__ == faiss.IndexIVFScalarQuantizer: + 
index_help = faiss.IndexScalarQuantizer( + index.d, index.sq.qtype, index.metric_type) + index_help.sq = index.sq + decode_func = index_help.sq.decode + index_help.is_trained = True + by_residual = index.by_residual + else: + raise RuntimeError(f"index type {index.__class__} not supported") + self.index_help = index_help + self.decode_func = None if method == "index" else decode_func + self.by_residual = by_residual + self.method = method + self.pairwise_distances = pairwise_distances + self.knn = knn + + def block_search(self, xq_l, xb_l, list_ids, k, **extra_args): + metric_type = self.index.metric_type + if xq_l.size == 0 or xb_l.size == 0: + D = I = None + elif self.method == "index": + faiss.copy_array_to_vector(xb_l, self.index_help.codes) + self.index_help.ntotal = len(list_ids) + D, I = self.index_help.search(xq_l, k) + elif self.method == "pairwise_distances": + # TODO implement blockwise to avoid mem blowup + D = self.pairwise_distances(xq_l, xb_l, metric=metric_type) + I = None + elif self.method == "knn_function": + D, I = self.knn(xq_l, xb_l, k, metric=metric_type, **extra_args) + + return D, I + + +def big_batch_search( + index, xq, k, + method="knn_function", + pairwise_distances=faiss.pairwise_distances, + knn=faiss.knn, + verbose=0, + threaded=0, + use_float16=False, + prefetch_threads=8, + computation_threads=0, + q_assign=None, + checkpoint=None, + checkpoint_freq=64, + start_list=0, + end_list=None, + crash_at=-1 + ): + """ + Search queries xq in the IVF index, with a search function that collects + batches of query vectors per inverted list. This can be faster than the + regular search indexes. + Supports IVFFlat, IVFPQ and IVFScalarQuantizer. + + Supports three computation methods: + method = "index": + build a flat index and populate it separately for each index + method = "pairwise_distances": + decompress codes and compute all pairwise distances for the queries + and index and add result to heap + method = "knn_function": + decompress codes and compute knn results for the queries + + threaded=0: sequential execution + threaded=1: prefetch next bucket while computing the current one + threaded>1: prefetch this many buckets at a time. + + compute_threads>1: the knn function will get an additional thread_no that + tells which worker should handle this. + + In threaded mode, the computation is tiled with the bucket perparation and + the writeback of results (useful to maximize GPU utilization). + + use_float16: convert all matrices to float16 (faster for GPU gemm) + + q_assign: override coarse assignment, should be a matrix of size nq * nprobe + + checkpointing (only for threaded > 1): + checkpoint: file where the checkpoints are stored + checkpoint_freq: when to perform checkpoinging. 
Should be a multiple of threaded + + start_list, end_list: process only a subset of invlists + """ + nprobe = index.nprobe + + assert method in ("index", "pairwise_distances", "knn_function") + + mem_queries = xq.nbytes + mem_assign = len(xq) * nprobe * np.dtype('int32').itemsize + mem_res = len(xq) * k * ( + np.dtype('int64').itemsize + + np.dtype('float32').itemsize + ) + mem_tot = mem_queries + mem_assign + mem_res + if verbose > 0: + print( + f"memory: queries {mem_queries} assign {mem_assign} " + f"result {mem_res} total {mem_tot} = {mem_tot / (1<<30):.3f} GiB" + ) + + bbs = BigBatchSearcher( + index, xq, k, + verbose=verbose, + use_float16=use_float16 + ) + + comp = BlockComputer( + index, + method=method, + pairwise_distances=pairwise_distances, + knn=knn + ) + + bbs.decode_func = comp.decode_func + bbs.by_residual = comp.by_residual + + if q_assign is None: + bbs.coarse_quantization() + else: + bbs.q_assign = q_assign + bbs.reorder_assign() + + if end_list is None: + end_list = index.nlist + + if checkpoint is not None: + assert (start_list, end_list) == (0, index.nlist) + if os.path.exists(checkpoint): + print("recovering checkpoint", checkpoint) + start_list = bbs.read_checkpoint(checkpoint) + print(" start at list", start_list) + else: + print("no checkpoint: starting from scratch") + + if threaded == 0: + # simple sequential version + + for l in range(start_list, end_list): + bbs.report(l) + q_subset, xq_l, list_ids, xb_l = bbs.prepare_bucket(l) + t0i = time.time() + D, I = comp.block_search(xq_l, xb_l, list_ids, k) + bbs.t_accu[2] += time.time() - t0i + bbs.add_results_to_heap(q_subset, D, list_ids, I) + + elif threaded == 1: + + # parallel version with granularity 1 + + def add_results_and_prefetch(to_add, l): + """ perform the addition for the previous bucket and + prefetch the next (if applicable) """ + if to_add is not None: + bbs.add_results_to_heap(*to_add) + if l < index.nlist: + return bbs.prepare_bucket(l) + + prefetched_bucket = bbs.prepare_bucket(start_list) + to_add = None + pool = ThreadPool(1) + + for l in range(start_list, end_list): + bbs.report(l) + prefetched_bucket_a = pool.apply_async( + add_results_and_prefetch, (to_add, l + 1)) + q_subset, xq_l, list_ids, xb_l = prefetched_bucket + bbs.start_t_accu() + D, I = comp.block_search(xq_l, xb_l, list_ids, k) + bbs.stop_t_accu(2) + to_add = q_subset, D, list_ids, I + bbs.start_t_accu() + prefetched_bucket = prefetched_bucket_a.get() + bbs.stop_t_accu(4) + + bbs.add_results_to_heap(*to_add) + pool.close() + else: + # run by batches with parallel prefetch and parallel comp + list_step = threaded + assert start_list % list_step == 0 + + if prefetch_threads == 0: + prefetch_map = map + else: + prefetch_pool = ThreadPool(prefetch_threads) + prefetch_map = prefetch_pool.map + + if computation_threads > 0: + comp_pool = ThreadPool(computation_threads) + + def add_results_and_prefetch_batch(to_add, l): + def add_results(to_add): + for ta in to_add: # this one cannot be run in parallel... 
+ if ta is not None: + bbs.add_results_to_heap(*ta) + if prefetch_threads == 0: + add_results(to_add) + else: + add_a = prefetch_pool.apply_async(add_results, (to_add, )) + next_lists = range(l, min(l + list_step, index.nlist)) + res = list(prefetch_map(bbs.prepare_bucket, next_lists)) + if prefetch_threads > 0: + add_a.get() + return res + + # used only when computation_threads > 1 + thread_id_to_seq_lock = threading.Lock() + thread_id_to_seq = {} + + def do_comp(bucket): + (q_subset, xq_l, list_ids, xb_l) = bucket + try: + tid = thread_id_to_seq[threading.get_ident()] + except KeyError: + with thread_id_to_seq_lock: + tid = len(thread_id_to_seq) + thread_id_to_seq[threading.get_ident()] = tid + D, I = comp.block_search(xq_l, xb_l, list_ids, k, thread_id=tid) + return q_subset, D, list_ids, I + + prefetched_buckets = add_results_and_prefetch_batch([], start_list) + to_add = [] + pool = ThreadPool(1) + prefetched_buckets_a = None + + # loop over inverted lists + for l in range(start_list, end_list, list_step): + bbs.report(l) + buckets = prefetched_buckets + prefetched_buckets_a = pool.apply_async( + add_results_and_prefetch_batch, (to_add, l + list_step)) + + bbs.start_t_accu() + + to_add = [] + if computation_threads == 0: + for q_subset, xq_l, list_ids, xb_l in buckets: + D, I = comp.block_search(xq_l, xb_l, list_ids, k) + to_add.append((q_subset, D, list_ids, I)) + else: + to_add = list(comp_pool.map(do_comp, buckets)) + + bbs.stop_t_accu(2) + + # to test checkpointing + if l == crash_at: + 1 / 0 + + bbs.start_t_accu() + prefetched_buckets = prefetched_buckets_a.get() + bbs.stop_t_accu(4) + + if checkpoint is not None: + if (l // list_step) % checkpoint_freq == 0: + print("writing checkpoint %s" % l) + bbs.write_checkpoint(checkpoint, l) + + # flush add + for ta in to_add: + bbs.add_results_to_heap(*ta) + pool.close() + if prefetch_threads != 0: + prefetch_pool.close() + if computation_threads != 0: + comp_pool.close() + + bbs.tic("finalize heap") + bbs.rh.finalize() + bbs.toc() + + return bbs.rh.D, bbs.rh.I diff -Nru faiss-1.7.3/contrib/torch_utils.py faiss-1.7.4/contrib/torch_utils.py --- faiss-1.7.3/contrib/torch_utils.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/contrib/torch_utils.py 2023-04-19 13:18:30.000000000 +0000 @@ -41,7 +41,7 @@ assert x.dtype == torch.float16 # no canonical half type in C/C++ return faiss.cast_integer_to_void_ptr( - x.storage().data_ptr() + x.storage_offset() * 4) + x.storage().data_ptr() + x.storage_offset() * 2) def swig_ptr_from_FloatTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ @@ -55,7 +55,7 @@ assert x.is_contiguous() assert x.dtype == torch.int32, 'dtype=%s' % x.dtype return faiss.cast_integer_to_int_ptr( - x.storage().data_ptr() + x.storage_offset() * 8) + x.storage().data_ptr() + x.storage_offset() * 4) def swig_ptr_from_IndicesTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ @@ -319,7 +319,10 @@ return x - def torch_replacement_reconstruct_n(self, n0, ni, x=None): + def torch_replacement_reconstruct_n(self, n0=0, ni=-1, x=None): + if ni == -1: + ni = self.ntotal + # No tensor inputs are required, but with importing this module, we # assume that the default should be torch tensors. 
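The big_batch_search function whose definition closes above groups all queries by inverted list before matching. A hedged usage sketch with small random data and the sequential (threaded=0) path, compared against the regular search (illustrative only, not part of the patch):

import numpy as np
import faiss
from faiss.contrib.ivf_tools import big_batch_search

d, k = 32, 10
xb = np.random.rand(20000, d).astype('float32')
xq = np.random.rand(500, d).astype('float32')

index = faiss.index_factory(d, "IVF64,Flat")
index.train(xb)
index.add(xb)
index.nprobe = 8

# batch the queries per inverted list, default "knn_function" method
D, I = big_batch_search(index, xq, k, method="knn_function", verbose=1)

# reference result from the regular per-query search path
Dref, Iref = index.search(xq, k)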
If we are passed a # numpy array, however, assume that the user is overriding this default @@ -490,10 +493,10 @@ handle_torch_Index(the_class) # allows torch tensor usage with bfKnn -def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2): +def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1): if type(xb) is np.ndarray: # Forward to faiss __init__.py base method - return faiss.knn_gpu_numpy(res, xq, xb, k, D, I, metric) + return faiss.knn_gpu_numpy(res, xq, xb, k, D, I, metric, device) nb, d = xb.size() if xb.is_contiguous(): @@ -570,6 +573,7 @@ args.outDistances = D_ptr args.outIndices = I_ptr args.outIndicesType = I_type + args.device = device with using_stream(res): faiss.bfKnn(res, args) @@ -579,7 +583,7 @@ torch_replace_method(faiss_module, 'knn_gpu', torch_replacement_knn_gpu, True, True) # allows torch tensor usage with bfKnn for all pairwise distances -def torch_replacement_pairwise_distance_gpu(res, xq, xb, D=None, metric=faiss.METRIC_L2): +def torch_replacement_pairwise_distance_gpu(res, xq, xb, D=None, metric=faiss.METRIC_L2, device=-1): if type(xb) is np.ndarray: # Forward to faiss __init__.py base method return faiss.pairwise_distance_gpu_numpy(res, xq, xb, D, metric) @@ -643,6 +647,7 @@ args.queryType = xq_type args.numQueries = nq args.outDistances = D_ptr + args.device = device with using_stream(res): faiss.bfKnn(res, args) diff -Nru faiss-1.7.3/debian/changelog faiss-1.7.4/debian/changelog --- faiss-1.7.3/debian/changelog 2022-12-25 19:43:45.000000000 +0000 +++ faiss-1.7.4/debian/changelog 2023-06-26 14:06:31.000000000 +0000 @@ -1,8 +1,32 @@ -faiss (1.7.3-2build1) lunar; urgency=medium +faiss (1.7.4-3) unstable; urgency=medium - * No-change rebuild with Python 3.11 as default + * Team upload. + * Drop broken Neon inlining patch - -- Graham Inggs Sun, 25 Dec 2022 19:43:45 +0000 + -- Timo Röhling Mon, 26 Jun 2023 16:06:31 +0200 + +faiss (1.7.4-2) unstable; urgency=medium + + * Team upload. + * Install missing header file + + -- Timo Röhling Sat, 24 Jun 2023 21:32:26 +0200 + +faiss (1.7.4-1) unstable; urgency=medium + + * Team upload. + + [ Debian Janitor ] + * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository-Browse. 
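The torch_utils changes above give reconstruct_n default arguments and thread a device argument through the knn_gpu and pairwise_distance_gpu wrappers. A hedged sketch: the first part runs on CPU as written; the commented knn_gpu call assumes a GPU build of faiss and hypothetical torch tensors xq_t, xb_t:

import torch
import faiss
import faiss.contrib.torch_utils  # installs the torch wrappers

index = faiss.IndexFlatL2(8)
index.add(torch.rand(100, 8))
xb_back = index.reconstruct_n()   # new defaults: n0=0, ni=ntotal
print(xb_back.shape)              # torch.Size([100, 8])

# With a GPU build, brute-force kNN can now be pinned to a device:
# res = faiss.StandardGpuResources()
# D, I = faiss.knn_gpu(res, xq_t, xb_t, 10, device=1)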
+ + [ Timo Röhling ] + * Make package cross-buildable + * Fix build with gcc-13 (Closes: #1037643) + * New upstream version 1.7.4 + * Refresh patches + * Bump Standards-Version to 4.6.2 + + -- Timo Röhling Thu, 22 Jun 2023 22:13:41 +0200 faiss (1.7.3-2) unstable; urgency=medium diff -Nru faiss-1.7.3/debian/control faiss-1.7.4/debian/control --- faiss-1.7.3/debian/control 2022-11-10 13:12:59.000000000 +0000 +++ faiss-1.7.4/debian/control 2023-06-26 14:06:08.000000000 +0000 @@ -2,7 +2,7 @@ Section: science Homepage: https://github.com/facebookresearch/faiss Priority: optional -Standards-Version: 4.6.1 +Standards-Version: 4.6.2 Vcs-Git: https://salsa.debian.org/deeplearning-team/faiss.git Vcs-Browser: https://salsa.debian.org/deeplearning-team/faiss Maintainer: Debian Deep Learning Team @@ -16,7 +16,8 @@ libblas-dev, libgtest-dev, liblapack-dev, - python3-dev, + libpython3-dev, + python3-dev:any, python3-numpy, python3-setuptools, swig diff -Nru faiss-1.7.3/debian/patches/0001-Use-system-GTest.patch faiss-1.7.4/debian/patches/0001-Use-system-GTest.patch --- faiss-1.7.3/debian/patches/0001-Use-system-GTest.patch 2022-11-17 22:31:23.000000000 +0000 +++ faiss-1.7.4/debian/patches/0001-Use-system-GTest.patch 2023-06-26 14:06:08.000000000 +0000 @@ -8,7 +8,7 @@ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt -index 7430aac..de761c5 100644 +index fa1e312..63e3556 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ option(FAISS_OPT_LEVEL "" "generic") @@ -20,10 +20,10 @@ if(FAISS_ENABLE_GPU) set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt -index d03a91c..af7f857 100644 +index 55455bd..442dfa4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt -@@ -37,18 +37,24 @@ else() +@@ -40,18 +40,24 @@ else() target_link_libraries(faiss_test PRIVATE faiss) endif() diff -Nru faiss-1.7.3/debian/patches/0003-Fix-CMake-package-export.patch faiss-1.7.4/debian/patches/0003-Fix-CMake-package-export.patch --- faiss-1.7.3/debian/patches/0003-Fix-CMake-package-export.patch 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/debian/patches/0003-Fix-CMake-package-export.patch 2023-06-26 14:06:08.000000000 +0000 @@ -0,0 +1,24 @@ +From: =?utf-8?q?Timo_R=C3=B6hling?= +Date: Tue, 8 Feb 2022 11:53:12 +0100 +Subject: Fix CMake package export + +--- + faiss/CMakeLists.txt | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt +index 1fea676..34f1283 100644 +--- a/faiss/CMakeLists.txt ++++ b/faiss/CMakeLists.txt +@@ -316,9 +316,9 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/faiss-config.cmake.in + ) + install(FILES ${PROJECT_BINARY_DIR}/cmake/faiss-config.cmake + ${PROJECT_BINARY_DIR}/cmake/faiss-config-version.cmake +- DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss ++ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/faiss + ) + + install(EXPORT faiss-targets +- DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss ++ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/faiss + ) diff -Nru faiss-1.7.3/debian/patches/0003-Force-inlining-on-simdlib-helpers.patch faiss-1.7.4/debian/patches/0003-Force-inlining-on-simdlib-helpers.patch --- faiss-1.7.3/debian/patches/0003-Force-inlining-on-simdlib-helpers.patch 2022-11-17 22:31:23.000000000 +0000 +++ faiss-1.7.4/debian/patches/0003-Force-inlining-on-simdlib-helpers.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -From: =?utf-8?q?Timo_R=C3=B6hling?= -Date: Mon, 7 Feb 2022 08:12:11 +0100 -Subject: Force 
inlining on simdlib helpers - -The helper functions are called with intrinsics as function -arguments, but intrinsics cannot have their address taken, so -these calls only work if they are inlined by the compiler. ---- - faiss/utils/simdlib_neon.h | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/faiss/utils/simdlib_neon.h b/faiss/utils/simdlib_neon.h -index 737e948..c45091a 100644 ---- a/faiss/utils/simdlib_neon.h -+++ b/faiss/utils/simdlib_neon.h -@@ -120,7 +120,7 @@ static inline std::string bin(const S& simd) { - } - - template --static inline void set1(D& d, F&& f, T t) { -+static __attribute__((always_inline)) inline void set1(D& d, F&& f, T t) { - const auto v = f(t); - d.val[0] = v; - d.val[1] = v; -@@ -143,7 +143,7 @@ static inline std::string elements_to_string(const char* fmt, const S& simd) { - } - - template --static inline T unary_func(const T& a, F&& f) { -+static __attribute__((always_inline)) inline T unary_func(const T& a, F&& f) { - T t; - t.val[0] = f(a.val[0]); - t.val[1] = f(a.val[1]); -@@ -151,7 +151,7 @@ static inline T unary_func(const T& a, F&& f) { - } - - template --static inline T binary_func(const T& a, const T& b, F&& f) { -+static __attribute__((always_inline)) inline T binary_func(const T& a, const T& b, F&& f) { - T t; - t.val[0] = f(a.val[0], b.val[0]); - t.val[1] = f(a.val[1], b.val[1]); diff -Nru faiss-1.7.3/debian/patches/0004-Fix-CMake-package-export.patch faiss-1.7.4/debian/patches/0004-Fix-CMake-package-export.patch --- faiss-1.7.3/debian/patches/0004-Fix-CMake-package-export.patch 2022-11-17 22:31:23.000000000 +0000 +++ faiss-1.7.4/debian/patches/0004-Fix-CMake-package-export.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -From: =?utf-8?q?Timo_R=C3=B6hling?= -Date: Tue, 8 Feb 2022 11:53:12 +0100 -Subject: Fix CMake package export - ---- - faiss/CMakeLists.txt | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt -index fd3ddb3..80f74b4 100644 ---- a/faiss/CMakeLists.txt -+++ b/faiss/CMakeLists.txt -@@ -292,9 +292,9 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/faiss-config.cmake.in - ) - install(FILES ${PROJECT_BINARY_DIR}/cmake/faiss-config.cmake - ${PROJECT_BINARY_DIR}/cmake/faiss-config-version.cmake -- DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss -+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/faiss - ) - - install(EXPORT faiss-targets -- DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss -+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/faiss - ) diff -Nru faiss-1.7.3/debian/patches/0004-Fix-implementation-for-32-bit-size_t.patch faiss-1.7.4/debian/patches/0004-Fix-implementation-for-32-bit-size_t.patch --- faiss-1.7.3/debian/patches/0004-Fix-implementation-for-32-bit-size_t.patch 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/debian/patches/0004-Fix-implementation-for-32-bit-size_t.patch 2023-06-26 14:06:08.000000000 +0000 @@ -0,0 +1,89 @@ +From: =?utf-8?q?Timo_R=C3=B6hling?= +Date: Wed, 9 Feb 2022 01:28:47 +0100 +Subject: Fix implementation for 32 bit size_t + +--- + faiss/index_factory.cpp | 2 +- + faiss/invlists/DirectMap.cpp | 2 +- + faiss/invlists/OnDiskInvertedLists.cpp | 7 ++++--- + faiss/utils/utils.cpp | 2 +- + 4 files changed, 7 insertions(+), 6 deletions(-) + +diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp +index 5dbf609..6660b71 100644 +--- a/faiss/index_factory.cpp ++++ b/faiss/index_factory.cpp +@@ -728,7 +728,7 @@ std::unique_ptr index_factory_sub( + } + + if (verbose) { +- printf("after () normalization: %s %ld 
parenthesis indexes d=%d\n", ++ printf("after () normalization: %s %zu parenthesis indexes d=%d\n", + description.c_str(), + parenthesis_indexes.size(), + d); +diff --git a/faiss/invlists/DirectMap.cpp b/faiss/invlists/DirectMap.cpp +index b276b76..63cf2d3 100644 +--- a/faiss/invlists/DirectMap.cpp ++++ b/faiss/invlists/DirectMap.cpp +@@ -199,7 +199,7 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) { + last_id, + ScopedCodes(invlists, list_no, last).get()); + // update hash entry for last element +- hashtable[last_id] = list_no << 32 | offset; ++ hashtable[last_id] = idx_t(list_no) << 32 | idx_t(offset); + } + invlists->resize(list_no, last); + nremove++; +diff --git a/faiss/invlists/OnDiskInvertedLists.cpp b/faiss/invlists/OnDiskInvertedLists.cpp +index 825ccfb..95bb5c9 100644 +--- a/faiss/invlists/OnDiskInvertedLists.cpp ++++ b/faiss/invlists/OnDiskInvertedLists.cpp +@@ -11,6 +11,7 @@ + + #include + ++#include + #include + + #include +@@ -325,7 +326,7 @@ void OnDiskInvertedLists::update_totsize(size_t new_size) { + + FAISS_THROW_IF_NOT_FMT( + err == 0, +- "truncate %s to %ld: %s", ++ "truncate %s to %zu: %s", + filename.c_str(), + totsize, + strerror(errno)); +@@ -524,7 +525,7 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) { + it++; + } + +- size_t inf = 1UL << 60; ++ size_t inf = (std::numeric_limits::max() - (std::numeric_limits::max() >> 1)) >> 1; + + size_t end_prev = inf; + if (it != slots.begin()) { +@@ -533,7 +534,7 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) { + end_prev = prev->offset + prev->capacity; + } + +- size_t begin_next = 1L << 60; ++ size_t begin_next = inf; + if (it != slots.end()) { + begin_next = it->offset; + } +diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp +index 894653b..6488a10 100644 +--- a/faiss/utils/utils.cpp ++++ b/faiss/utils/utils.cpp +@@ -160,7 +160,7 @@ size_t get_mem_usage_kb() { + char buf[256]; + if (!fgets(buf, 256, f)) + break; +- if (sscanf(buf, "VmRSS: %ld kB", &sz) == 1) ++ if (sscanf(buf, "VmRSS: %zu kB", &sz) == 1) + break; + } + fclose(f); diff -Nru faiss-1.7.3/debian/patches/0005-Fix-floating-point-comparisons-in-unit-tests.patch faiss-1.7.4/debian/patches/0005-Fix-floating-point-comparisons-in-unit-tests.patch --- faiss-1.7.3/debian/patches/0005-Fix-floating-point-comparisons-in-unit-tests.patch 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/debian/patches/0005-Fix-floating-point-comparisons-in-unit-tests.patch 2023-06-26 14:06:08.000000000 +0000 @@ -0,0 +1,21 @@ +From: =?utf-8?q?Timo_R=C3=B6hling?= +Date: Wed, 9 Feb 2022 01:38:34 +0100 +Subject: Fix floating point comparisons in unit tests + +--- + tests/test_lowlevel_ivf.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tests/test_lowlevel_ivf.cpp b/tests/test_lowlevel_ivf.cpp +index e28e2a9..a4d8410 100644 +--- a/tests/test_lowlevel_ivf.cpp ++++ b/tests/test_lowlevel_ivf.cpp +@@ -359,7 +359,7 @@ void test_lowlevel_access_binary(const char* index_key) { + float computed_D = scanner->distance_to_code( + xb.data() + vno * il->code_size); + +- EXPECT_EQ(computed_D, D[jj]); ++ EXPECT_FLOAT_EQ(computed_D, D[jj]); + } + } + } diff -Nru faiss-1.7.3/debian/patches/0005-Fix-implementation-for-32-bit-size_t.patch faiss-1.7.4/debian/patches/0005-Fix-implementation-for-32-bit-size_t.patch --- faiss-1.7.3/debian/patches/0005-Fix-implementation-for-32-bit-size_t.patch 2022-11-17 22:31:23.000000000 +0000 +++ 
faiss-1.7.4/debian/patches/0005-Fix-implementation-for-32-bit-size_t.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,89 +0,0 @@ -From: =?utf-8?q?Timo_R=C3=B6hling?= -Date: Wed, 9 Feb 2022 01:28:47 +0100 -Subject: Fix implementation for 32 bit size_t - ---- - faiss/index_factory.cpp | 2 +- - faiss/invlists/DirectMap.cpp | 2 +- - faiss/invlists/OnDiskInvertedLists.cpp | 7 ++++--- - faiss/utils/utils.cpp | 2 +- - 4 files changed, 7 insertions(+), 6 deletions(-) - -diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp -index 535d239..44c5bb3 100644 ---- a/faiss/index_factory.cpp -+++ b/faiss/index_factory.cpp -@@ -730,7 +730,7 @@ std::unique_ptr index_factory_sub( - } - - if (verbose) { -- printf("after () normalization: %s %ld parenthesis indexes d=%d\n", -+ printf("after () normalization: %s %zu parenthesis indexes d=%d\n", - description.c_str(), - parenthesis_indexes.size(), - d); -diff --git a/faiss/invlists/DirectMap.cpp b/faiss/invlists/DirectMap.cpp -index 6cb5202..1a5c3a6 100644 ---- a/faiss/invlists/DirectMap.cpp -+++ b/faiss/invlists/DirectMap.cpp -@@ -199,7 +199,7 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) { - last_id, - ScopedCodes(invlists, list_no, last).get()); - // update hash entry for last element -- hashtable[last_id] = list_no << 32 | offset; -+ hashtable[last_id] = idx_t(list_no) << 32 | idx_t(offset); - } - invlists->resize(list_no, last); - nremove++; -diff --git a/faiss/invlists/OnDiskInvertedLists.cpp b/faiss/invlists/OnDiskInvertedLists.cpp -index 5027c0e..8caded3 100644 ---- a/faiss/invlists/OnDiskInvertedLists.cpp -+++ b/faiss/invlists/OnDiskInvertedLists.cpp -@@ -11,6 +11,7 @@ - - #include - -+#include - #include - - #include -@@ -325,7 +326,7 @@ void OnDiskInvertedLists::update_totsize(size_t new_size) { - - FAISS_THROW_IF_NOT_FMT( - err == 0, -- "truncate %s to %ld: %s", -+ "truncate %s to %zu: %s", - filename.c_str(), - totsize, - strerror(errno)); -@@ -524,7 +525,7 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) { - it++; - } - -- size_t inf = 1UL << 60; -+ size_t inf = (std::numeric_limits::max() - (std::numeric_limits::max() >> 1)) >> 1; - - size_t end_prev = inf; - if (it != slots.begin()) { -@@ -533,7 +534,7 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) { - end_prev = prev->offset + prev->capacity; - } - -- size_t begin_next = 1L << 60; -+ size_t begin_next = inf; - if (it != slots.end()) { - begin_next = it->offset; - } -diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp -index 013bf38..175a537 100644 ---- a/faiss/utils/utils.cpp -+++ b/faiss/utils/utils.cpp -@@ -160,7 +160,7 @@ size_t get_mem_usage_kb() { - char buf[256]; - if (!fgets(buf, 256, f)) - break; -- if (sscanf(buf, "VmRSS: %ld kB", &sz) == 1) -+ if (sscanf(buf, "VmRSS: %zu kB", &sz) == 1) - break; - } - fclose(f); diff -Nru faiss-1.7.3/debian/patches/0006-Add-GTest-filter.patch faiss-1.7.4/debian/patches/0006-Add-GTest-filter.patch --- faiss-1.7.3/debian/patches/0006-Add-GTest-filter.patch 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/debian/patches/0006-Add-GTest-filter.patch 2023-06-26 14:06:08.000000000 +0000 @@ -0,0 +1,18 @@ +From: =?utf-8?q?Timo_R=C3=B6hling?= +Date: Wed, 9 Feb 2022 08:08:13 +0100 +Subject: Add GTest filter + +--- + tests/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt +index 442dfa4..c40bb1d 100644 +--- a/tests/CMakeLists.txt ++++ b/tests/CMakeLists.txt +@@ -62,4 +62,4 @@ 
target_link_libraries(faiss_test PRIVATE + + # Defines `gtest_discover_tests()`. + include(GoogleTest) +-gtest_discover_tests(faiss_test) ++gtest_discover_tests(faiss_test TEST_FILTER "${FAISS_GTEST_FILTER}") diff -Nru faiss-1.7.3/debian/patches/0006-Fix-floating-point-comparisons-in-unit-tests.patch faiss-1.7.4/debian/patches/0006-Fix-floating-point-comparisons-in-unit-tests.patch --- faiss-1.7.3/debian/patches/0006-Fix-floating-point-comparisons-in-unit-tests.patch 2022-11-17 22:31:23.000000000 +0000 +++ faiss-1.7.4/debian/patches/0006-Fix-floating-point-comparisons-in-unit-tests.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,21 +0,0 @@ -From: =?utf-8?q?Timo_R=C3=B6hling?= -Date: Wed, 9 Feb 2022 01:38:34 +0100 -Subject: Fix floating point comparisons in unit tests - ---- - tests/test_lowlevel_ivf.cpp | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tests/test_lowlevel_ivf.cpp b/tests/test_lowlevel_ivf.cpp -index 44450fc..3cb0b4e 100644 ---- a/tests/test_lowlevel_ivf.cpp -+++ b/tests/test_lowlevel_ivf.cpp -@@ -361,7 +361,7 @@ void test_lowlevel_access_binary(const char* index_key) { - float computed_D = scanner->distance_to_code( - xb.data() + vno * il->code_size); - -- EXPECT_EQ(computed_D, D[jj]); -+ EXPECT_FLOAT_EQ(computed_D, D[jj]); - } - } - } diff -Nru faiss-1.7.3/debian/patches/0007-Add-GTest-filter.patch faiss-1.7.4/debian/patches/0007-Add-GTest-filter.patch --- faiss-1.7.3/debian/patches/0007-Add-GTest-filter.patch 2022-11-17 22:31:23.000000000 +0000 +++ faiss-1.7.4/debian/patches/0007-Add-GTest-filter.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -From: =?utf-8?q?Timo_R=C3=B6hling?= -Date: Wed, 9 Feb 2022 08:08:13 +0100 -Subject: Add GTest filter - ---- - tests/CMakeLists.txt | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt -index af7f857..37eff1e 100644 ---- a/tests/CMakeLists.txt -+++ b/tests/CMakeLists.txt -@@ -59,4 +59,4 @@ target_link_libraries(faiss_test PRIVATE - - # Defines `gtest_discover_tests()`. 
- include(GoogleTest) --gtest_discover_tests(faiss_test) -+gtest_discover_tests(faiss_test TEST_FILTER "${FAISS_GTEST_FILTER}") diff -Nru faiss-1.7.3/debian/patches/0007-Fix-build-with-gcc-13.patch faiss-1.7.4/debian/patches/0007-Fix-build-with-gcc-13.patch --- faiss-1.7.3/debian/patches/0007-Fix-build-with-gcc-13.patch 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/debian/patches/0007-Fix-build-with-gcc-13.patch 2023-06-26 14:06:08.000000000 +0000 @@ -0,0 +1,20 @@ +From: =?utf-8?q?Timo_R=C3=B6hling?= +Date: Thu, 22 Jun 2023 17:51:36 +0200 +Subject: Fix build with gcc-13 + +--- + faiss/Index.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/faiss/Index.h b/faiss/Index.h +index d73a684..91fe700 100644 +--- a/faiss/Index.h ++++ b/faiss/Index.h +@@ -11,6 +11,7 @@ + #define FAISS_INDEX_H + + #include ++#include + #include + #include + #include diff -Nru faiss-1.7.3/debian/patches/0008-Install-missing-header-file.patch faiss-1.7.4/debian/patches/0008-Install-missing-header-file.patch --- faiss-1.7.3/debian/patches/0008-Install-missing-header-file.patch 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/debian/patches/0008-Install-missing-header-file.patch 2023-06-26 14:06:08.000000000 +0000 @@ -0,0 +1,20 @@ +From: =?utf-8?q?Timo_R=C3=B6hling?= +Date: Sat, 24 Jun 2023 21:29:46 +0200 +Subject: Install missing header file + +--- + faiss/CMakeLists.txt | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt +index 34f1283..847ae34 100644 +--- a/faiss/CMakeLists.txt ++++ b/faiss/CMakeLists.txt +@@ -145,6 +145,7 @@ set(FAISS_HEADERS + impl/AdditiveQuantizer.h + impl/AuxIndexStructures.h + impl/IDSelector.h ++ impl/CodePacker.h + impl/DistanceComputer.h + impl/FaissAssert.h + impl/FaissException.h diff -Nru faiss-1.7.3/debian/patches/series faiss-1.7.4/debian/patches/series --- faiss-1.7.3/debian/patches/series 2022-11-10 13:16:34.000000000 +0000 +++ faiss-1.7.4/debian/patches/series 2023-06-26 14:06:08.000000000 +0000 @@ -1,7 +1,8 @@ 0001-Use-system-GTest.patch 0002-Use-SWIGWORDSIZE32-on-32-bit-architectures.patch -0003-Force-inlining-on-simdlib-helpers.patch -0004-Fix-CMake-package-export.patch -0005-Fix-implementation-for-32-bit-size_t.patch -0006-Fix-floating-point-comparisons-in-unit-tests.patch -0007-Add-GTest-filter.patch +0003-Fix-CMake-package-export.patch +0004-Fix-implementation-for-32-bit-size_t.patch +0005-Fix-floating-point-comparisons-in-unit-tests.patch +0006-Add-GTest-filter.patch +0007-Fix-build-with-gcc-13.patch +0008-Install-missing-header-file.patch diff -Nru faiss-1.7.3/debian/upstream/metadata faiss-1.7.4/debian/upstream/metadata --- faiss-1.7.3/debian/upstream/metadata 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/debian/upstream/metadata 2023-06-22 20:03:42.000000000 +0000 @@ -0,0 +1,4 @@ +--- +Bug-Database: https://github.com/facebookresearch/faiss/issues +Bug-Submit: https://github.com/facebookresearch/faiss/issues/new +Repository-Browse: https://github.com/facebookresearch/faiss diff -Nru faiss-1.7.3/demos/demo_imi_flat.cpp faiss-1.7.4/demos/demo_imi_flat.cpp --- faiss-1.7.3/demos/demo_imi_flat.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/demos/demo_imi_flat.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -132,7 +132,7 @@ k, nq); - std::vector nns(k * nq); + std::vector nns(k * nq); std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff -Nru faiss-1.7.3/demos/demo_imi_pq.cpp faiss-1.7.4/demos/demo_imi_pq.cpp --- faiss-1.7.3/demos/demo_imi_pq.cpp 2022-11-08 
11:14:13.000000000 +0000 +++ faiss-1.7.4/demos/demo_imi_pq.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -126,7 +126,7 @@ nb); std::vector database(nb * d); - std::vector ids(nb); + std::vector ids(nb); for (size_t i = 0; i < nb; i++) { for (size_t j = 0; j < d; j++) { database[i * d + j] = distrib(rng); @@ -169,7 +169,7 @@ // - given a vector float *x, finding which k centroids are // closest to it (ie to find the nearest neighbors) can be done with // - // faiss::Index::idx_t *centroid_ids = new faiss::Index::idx_t[k]; + // faiss::idx_t *centroid_ids = new faiss::idx_t[k]; // float *distances = new float[k]; // index.quantizer->search (1, x, k, dis, centroids_ids); // @@ -184,7 +184,7 @@ k, nq); - std::vector nns(k * nq); + std::vector nns(k * nq); std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff -Nru faiss-1.7.3/demos/demo_ivfpq_indexing.cpp faiss-1.7.4/demos/demo_ivfpq_indexing.cpp --- faiss-1.7.3/demos/demo_ivfpq_indexing.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/demos/demo_ivfpq_indexing.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -118,7 +118,7 @@ k, nq); - std::vector nns(k * nq); + std::vector nns(k * nq); std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff -Nru faiss-1.7.3/demos/demo_nndescent.cpp faiss-1.7.4/demos/demo_nndescent.cpp --- faiss-1.7.3/demos/demo_nndescent.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/demos/demo_nndescent.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -58,8 +58,8 @@ } int k = 5; - std::vector nns(k * nq); - std::vector gt_nns(k * nq); + std::vector nns(k * nq); + std::vector gt_nns(k * nq); std::vector dis(k * nq); auto start = high_resolution_clock::now(); diff -Nru faiss-1.7.3/demos/demo_residual_quantizer.cpp faiss-1.7.4/demos/demo_residual_quantizer.cpp --- faiss-1.7.3/demos/demo_residual_quantizer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/demos/demo_residual_quantizer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -1,4 +1,9 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ #include #include @@ -24,7 +29,7 @@ /****************************************** * Generate a test dataset ******************************************/ - using idx_t = faiss::Index::idx_t; + using idx_t = faiss::idx_t; size_t d = 128; size_t nt = 10000; size_t nb = 10000; diff -Nru faiss-1.7.3/demos/demo_sift1M.cpp faiss-1.7.4/demos/demo_sift1M.cpp --- faiss-1.7.3/demos/demo_sift1M.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/demos/demo_sift1M.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -140,8 +140,8 @@ assert(d == d2 || !"query does not have same dimension as train set"); } - size_t k; // nb of results per query in the GT - faiss::Index::idx_t* gt; // nq * k matrix of ground-truth nearest-neighbors + size_t k; // nb of results per query in the GT + faiss::idx_t* gt; // nq * k matrix of ground-truth nearest-neighbors { printf("[%.3f s] Loading ground truth for %ld queries\n", @@ -153,7 +153,7 @@ int* gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2); assert(nq2 == nq || !"incorrect nb of ground truth entries"); - gt = new faiss::Index::idx_t[k * nq]; + gt = new faiss::idx_t[k * nq]; for (int i = 0; i < k * nq; i++) { gt[i] = gt_int[i]; } @@ -219,7 +219,7 @@ nq); // output buffers - faiss::Index::idx_t* I = new faiss::Index::idx_t[nq * k]; + faiss::idx_t* I = new faiss::idx_t[nq * k]; float* D = new float[nq * k]; index->search(nq, xq, k, D, I); diff -Nru faiss-1.7.3/demos/demo_weighted_kmeans.cpp faiss-1.7.4/demos/demo_weighted_kmeans.cpp --- faiss-1.7.3/demos/demo_weighted_kmeans.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/demos/demo_weighted_kmeans.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -155,7 +155,7 @@ faiss::IndexFlatL2 cent_index(d); cent_index.add(nc, centroids.data()); std::vector dis(n); - std::vector idx(n); + std::vector idx(n); cent_index.search( nc * 2, ccent.data(), 1, dis.data(), idx.data()); diff -Nru faiss-1.7.3/Dockerfile faiss-1.7.4/Dockerfile --- faiss-1.7.3/Dockerfile 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/Dockerfile 1970-01-01 00:00:00.000000000 +0000 @@ -1,29 +0,0 @@ -FROM nvidia/cuda:8.0-devel-centos7 - -# Install MKL -RUN yum-config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -RUN rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB -RUN yum install -y intel-mkl-2019.3-062 -ENV LD_LIBRARY_PATH /opt/intel/mkl/lib/intel64:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /opt/intel/mkl/lib/intel64:$LIBRARY_PATH -ENV LD_PRELOAD /usr/lib64/libgomp.so.1:/opt/intel/mkl/lib/intel64/libmkl_def.so:\ -/opt/intel/mkl/lib/intel64/libmkl_avx2.so:/opt/intel/mkl/lib/intel64/libmkl_core.so:\ -/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.so:/opt/intel/mkl/lib/intel64/libmkl_gnu_thread.so - -# Install necessary build tools -RUN yum install -y gcc-c++ make swig3 - -# Install necesary headers/libs -RUN yum install -y python-devel numpy - -COPY . 
/opt/faiss - -WORKDIR /opt/faiss - -# --with-cuda=/usr/local/cuda-8.0 -RUN ./configure --prefix=/usr --libdir=/usr/lib64 --without-cuda -RUN make -j $(nproc) -RUN make -C python -RUN make test -RUN make install -RUN make -C demos demo_ivfpq_indexing && ./demos/demo_ivfpq_indexing diff -Nru faiss-1.7.3/Doxyfile faiss-1.7.4/Doxyfile --- faiss-1.7.3/Doxyfile 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/Doxyfile 2023-04-19 13:18:30.000000000 +0000 @@ -786,7 +786,7 @@ # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = */impl/* +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the diff -Nru faiss-1.7.3/faiss/AutoTune.cpp faiss-1.7.4/faiss/AutoTune.cpp --- faiss-1.7.3/faiss/AutoTune.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/AutoTune.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -354,7 +355,7 @@ index = ix->index; } - if (DC(IndexIVF)) { + if (DC(IndexIVFInterface)) { { ParameterRange& pr = add_range("nprobe"); for (int i = 0; i < 13; i++) { @@ -461,6 +462,16 @@ set_index_parameter(ix->index, name, val); return; } + if (DC(IndexShardsIVF)) { + // special handling because the nprobe is set at the sub-class level + // but other params are set on the class itself + if (name.find("quantizer_") == 0 && name != "nprobe" && + name != "quantizer_nprobe") { + std::string sub_name = name.substr(strlen("quantizer_")); + set_index_parameter(ix->quantizer, sub_name, val); + return; + } + } if (DC(ThreadedIndex)) { // call on all sub-indexes auto fn = [this, name, val](int /* no */, Index* subIndex) { @@ -608,7 +619,7 @@ if (n_experiments == 0) { for (size_t cno = 0; cno < n_comb; cno++) { set_index_parameters(index, cno); - std::vector I(nq * crit.nnn); + std::vector I(nq * crit.nnn); std::vector D(nq * crit.nnn); double t0 = getmillisecs(); @@ -677,7 +688,7 @@ } set_index_parameters(index, cno); - std::vector I(nq * crit.nnn); + std::vector I(nq * crit.nnn); std::vector D(nq * crit.nnn); double t0 = getmillisecs(); @@ -688,7 +699,7 @@ do { if (thread_over_batches) { #pragma omp parallel for - for (Index::idx_t q0 = 0; q0 < nq; q0 += batchsize) { + for (idx_t q0 = 0; q0 < nq; q0 += batchsize) { size_t q1 = q0 + batchsize; if (q1 > nq) q1 = nq; diff -Nru faiss-1.7.3/faiss/AutoTune.h faiss-1.7.4/faiss/AutoTune.h --- faiss-1.7.3/faiss/AutoTune.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/AutoTune.h 2023-04-19 13:18:30.000000000 +0000 @@ -24,7 +24,6 @@ * higher is better. 
*/ struct AutoTuneCriterion { - typedef Index::idx_t idx_t; idx_t nq; ///< nb of queries this criterion is evaluated on idx_t nnn; ///< nb of NNs that the query should request idx_t gt_nnn; ///< nb of GT NNs required to evaluate criterion diff -Nru faiss-1.7.3/faiss/clone_index.cpp faiss-1.7.4/faiss/clone_index.cpp --- faiss-1.7.3/faiss/clone_index.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/clone_index.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -16,18 +16,24 @@ #include #include +#include #include #include #include +#include #include #include +#include #include #include #include #include #include #include +#include #include +#include +#include #include #include #include @@ -36,6 +42,9 @@ #include #include #include +#include + +#include namespace faiss { @@ -71,39 +80,220 @@ IndexIVF* Cloner::clone_IndexIVF(const IndexIVF* ivf) { TRYCLONE(IndexIVFPQR, ivf) TRYCLONE(IndexIVFPQ, ivf) + + TRYCLONE(IndexIVFLocalSearchQuantizer, ivf) + TRYCLONE(IndexIVFProductLocalSearchQuantizer, ivf) + TRYCLONE(IndexIVFProductResidualQuantizer, ivf) + TRYCLONE(IndexIVFResidualQuantizer, ivf) + + TRYCLONE(IndexIVFLocalSearchQuantizerFastScan, ivf) + TRYCLONE(IndexIVFProductLocalSearchQuantizerFastScan, ivf) + TRYCLONE(IndexIVFProductResidualQuantizerFastScan, ivf) + TRYCLONE(IndexIVFResidualQuantizerFastScan, ivf) + TRYCLONE(IndexIVFPQFastScan, ivf) + + TRYCLONE(IndexIVFFlatDedup, ivf) TRYCLONE(IndexIVFFlat, ivf) + + TRYCLONE(IndexIVFSpectralHash, ivf) + TRYCLONE(IndexIVFScalarQuantizer, ivf) { FAISS_THROW_MSG("clone not supported for this type of IndexIVF"); } return nullptr; } +IndexRefine* clone_IndexRefine(const IndexRefine* ir) { + TRYCLONE(IndexRefineFlat, ir) + TRYCLONE(IndexRefine, ir) { + FAISS_THROW_MSG("clone not supported for this type of IndexRefine"); + } +} + +IndexIDMap* clone_IndexIDMap(const IndexIDMap* im) { + TRYCLONE(IndexIDMap2, im) + TRYCLONE(IndexIDMap, im) { + FAISS_THROW_MSG("clone not supported for this type of IndexIDMap"); + } +} + +IndexHNSW* clone_IndexHNSW(const IndexHNSW* ihnsw) { + TRYCLONE(IndexHNSW2Level, ihnsw) + TRYCLONE(IndexHNSWFlat, ihnsw) + TRYCLONE(IndexHNSWPQ, ihnsw) + TRYCLONE(IndexHNSWSQ, ihnsw) + TRYCLONE(IndexHNSW, ihnsw) { + FAISS_THROW_MSG("clone not supported for this type of IndexHNSW"); + } +} + +IndexNNDescent* clone_IndexNNDescent(const IndexNNDescent* innd) { + TRYCLONE(IndexNNDescentFlat, innd) + TRYCLONE(IndexNNDescent, innd) { + FAISS_THROW_MSG("clone not supported for this type of IndexNNDescent"); + } +} + +IndexNSG* clone_IndexNSG(const IndexNSG* insg) { + TRYCLONE(IndexNSGFlat, insg) + TRYCLONE(IndexNSGPQ, insg) + TRYCLONE(IndexNSGSQ, insg) + TRYCLONE(IndexNSG, insg) { + FAISS_THROW_MSG("clone not supported for this type of IndexNNDescent"); + } +} + +IndexRowwiseMinMaxBase* clone_IndexRowwiseMinMax( + const IndexRowwiseMinMaxBase* irmmb) { + TRYCLONE(IndexRowwiseMinMaxFP16, irmmb) + TRYCLONE(IndexRowwiseMinMax, irmmb) { + FAISS_THROW_MSG( + "clone not supported for this type of IndexRowwiseMinMax"); + } +} + +#define TRYCAST(classname) classname* res = dynamic_cast(index) + +void reset_AdditiveQuantizerIndex(Index* index) { + auto clone_ProductQuantizers = + [](std::vector& quantizers) { + for (auto& q : quantizers) { + q = dynamic_cast(clone_Quantizer(q)); + } + }; + if (TRYCAST(IndexIVFLocalSearchQuantizerFastScan)) { + res->aq = &res->lsq; + } else if (TRYCAST(IndexIVFResidualQuantizerFastScan)) { + res->aq = &res->rq; + } else if (TRYCAST(IndexIVFProductLocalSearchQuantizerFastScan)) { + res->aq = &res->plsq; + 
clone_ProductQuantizers(res->plsq.quantizers); + } else if (TRYCAST(IndexIVFProductResidualQuantizerFastScan)) { + res->aq = &res->prq; + clone_ProductQuantizers(res->prq.quantizers); + } else if (TRYCAST(IndexIVFLocalSearchQuantizer)) { + res->aq = &res->lsq; + } else if (TRYCAST(IndexIVFResidualQuantizer)) { + res->aq = &res->rq; + } else if (TRYCAST(IndexIVFProductLocalSearchQuantizer)) { + res->aq = &res->plsq; + clone_ProductQuantizers(res->plsq.quantizers); + } else if (TRYCAST(IndexIVFProductResidualQuantizer)) { + res->aq = &res->prq; + clone_ProductQuantizers(res->prq.quantizers); + } else if (TRYCAST(IndexLocalSearchQuantizerFastScan)) { + res->aq = &res->lsq; + } else if (TRYCAST(IndexResidualQuantizerFastScan)) { + res->aq = &res->rq; + } else if (TRYCAST(IndexProductLocalSearchQuantizerFastScan)) { + res->aq = &res->plsq; + clone_ProductQuantizers(res->plsq.quantizers); + } else if (TRYCAST(IndexProductResidualQuantizerFastScan)) { + res->aq = &res->prq; + clone_ProductQuantizers(res->prq.quantizers); + } else if (TRYCAST(IndexLocalSearchQuantizer)) { + res->aq = &res->lsq; + } else if (TRYCAST(IndexResidualQuantizer)) { + res->aq = &res->rq; + } else if (TRYCAST(IndexProductLocalSearchQuantizer)) { + res->aq = &res->plsq; + clone_ProductQuantizers(res->plsq.quantizers); + } else if (TRYCAST(IndexProductResidualQuantizer)) { + res->aq = &res->prq; + clone_ProductQuantizers(res->prq.quantizers); + } else if (TRYCAST(LocalSearchCoarseQuantizer)) { + res->aq = &res->lsq; + } else if (TRYCAST(ResidualCoarseQuantizer)) { + res->aq = &res->rq; + } else { + FAISS_THROW_MSG( + "clone not supported for this type of additive quantizer index"); + } +} + +Index* clone_AdditiveQuantizerIndex(const Index* index) { + // IndexAdditiveQuantizer + TRYCLONE(IndexResidualQuantizer, index) + TRYCLONE(IndexProductResidualQuantizer, index) + TRYCLONE(IndexLocalSearchQuantizer, index) + TRYCLONE(IndexProductLocalSearchQuantizer, index) + + // IndexFastScan + TRYCLONE(IndexResidualQuantizerFastScan, index) + TRYCLONE(IndexLocalSearchQuantizerFastScan, index) + TRYCLONE(IndexProductResidualQuantizerFastScan, index) + TRYCLONE(IndexProductLocalSearchQuantizerFastScan, index) + + // AdditiveCoarseQuantizer + TRYCLONE(ResidualCoarseQuantizer, index) + TRYCLONE(LocalSearchCoarseQuantizer, index) { + FAISS_THROW_MSG( + "clone not supported for this type of additive quantizer index"); + } +} + +namespace { + +IndexHNSW* clone_HNSW(const IndexHNSW* ihnsw) { + TRYCLONE(IndexHNSWFlat, ihnsw) + TRYCLONE(IndexHNSWPQ, ihnsw) + TRYCLONE(IndexHNSWSQ, ihnsw) + return new IndexHNSW(*ihnsw); +} + +InvertedLists* clone_InvertedLists(const InvertedLists* invlists) { + if (auto* ails = dynamic_cast(invlists)) { + return new ArrayInvertedLists(*ails); + } + if (auto* bils = dynamic_cast(invlists)) { + auto* bils2 = new BlockInvertedLists(*bils); + if (bils->packer) { + auto* packerPQ4 = dynamic_cast(bils->packer); + FAISS_THROW_IF_NOT(packerPQ4); + bils2->packer = new CodePackerPQ4(*packerPQ4); + } + return bils2; + } + FAISS_THROW_FMT( + "clone not supported for this type of inverted lists %s", + typeid(*invlists).name()); +} + +} // anonymous namespace + Index* Cloner::clone_Index(const Index* index) { TRYCLONE(IndexPQ, index) TRYCLONE(IndexLSH, index) + + // IndexFlat + TRYCLONE(IndexFlat1D, index) TRYCLONE(IndexFlatL2, index) TRYCLONE(IndexFlatIP, index) TRYCLONE(IndexFlat, index) + TRYCLONE(IndexLattice, index) - TRYCLONE(IndexResidualQuantizer, index) + TRYCLONE(IndexRandom, index) + TRYCLONE(IndexPQFastScan, index) 
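The extended clone_Index path and the helper functions above broaden faiss.clone_index to wrapper types such as refine indexes, HNSW subclasses, fast-scan and additive-quantizer indexes. A hedged Python sketch with small random data (illustrative, not from the patch):

import numpy as np
import faiss

xb = np.random.rand(2000, 16).astype('float32')

hnsw = faiss.IndexHNSWFlat(16, 32)
hnsw.add(xb)
hnsw_copy = faiss.clone_index(hnsw)        # deep copy, storage included

base = faiss.IndexPQ(16, 4, 8)
refine = faiss.IndexRefineFlat(base)
refine.train(xb)
refine.add(xb)
refine_copy = faiss.clone_index(refine)    # refine wrappers are now cloneable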
+ TRYCLONE(IndexScalarQuantizer, index) TRYCLONE(MultiIndexQuantizer, index) - TRYCLONE(ResidualCoarseQuantizer, index) + if (const IndexIVF* ivf = dynamic_cast(index)) { IndexIVF* res = clone_IndexIVF(ivf); if (ivf->invlists == nullptr) { res->invlists = nullptr; - } else if ( - auto* ails = dynamic_cast( - ivf->invlists)) { - res->invlists = new ArrayInvertedLists(*ails); - res->own_invlists = true; } else { - FAISS_THROW_MSG( - "clone not supported for this type of inverted lists"); + res->invlists = clone_InvertedLists(ivf->invlists); + res->own_invlists = true; } + res->own_fields = true; res->quantizer = clone_Index(ivf->quantizer); + + if (dynamic_cast(res) || + dynamic_cast(res)) { + reset_AdditiveQuantizerIndex(res); + } return res; } else if ( const IndexPreTransform* ipt = @@ -122,19 +312,18 @@ return res; } else if ( const IndexIDMap* idmap = dynamic_cast(index)) { - const IndexIDMap2* idmap2 = dynamic_cast(index); - IndexIDMap* res = - idmap2 ? new IndexIDMap2(*idmap2) : new IndexIDMap(*idmap); + IndexIDMap* res = clone_IndexIDMap(idmap); res->own_fields = true; res->index = clone_Index(idmap->index); return res; } else if (const IndexHNSW* ihnsw = dynamic_cast(index)) { - IndexHNSW* res = new IndexHNSW(*ihnsw); + IndexHNSW* res = clone_IndexHNSW(ihnsw); res->own_fields = true; - res->storage = clone_Index(ihnsw->storage); + // make sure we don't get a GPU index here + res->storage = Cloner::clone_Index(ihnsw->storage); return res; } else if (const IndexNSG* insg = dynamic_cast(index)) { - IndexNSG* res = new IndexNSG(*insg); + IndexNSG* res = clone_IndexNSG(insg); // copy the dynamic allocated graph auto& new_graph = res->nsg.final_graph; @@ -147,7 +336,7 @@ } else if ( const IndexNNDescent* innd = dynamic_cast(index)) { - IndexNNDescent* res = new IndexNNDescent(*innd); + IndexNNDescent* res = clone_IndexNNDescent(innd); res->own_fields = true; res->storage = clone_Index(innd->storage); return res; @@ -157,11 +346,36 @@ res->q1.own_fields = true; res->q1.quantizer = clone_Index(i2l->q1.quantizer); return res; + } else if ( + const IndexRefine* ir = dynamic_cast(index)) { + IndexRefine* res = clone_IndexRefine(ir); + res->own_fields = true; + res->base_index = clone_Index(ir->base_index); + if (ir->refine_index != nullptr) { + res->own_refine_index = true; + res->refine_index = clone_Index(ir->refine_index); + } + return res; + } else if ( + const IndexRowwiseMinMaxBase* irmmb = + dynamic_cast(index)) { + IndexRowwiseMinMaxBase* res = clone_IndexRowwiseMinMax(irmmb); + res->own_fields = true; + res->index = clone_Index(irmmb->index); + } else if ( + dynamic_cast(index) || + dynamic_cast(index) || + dynamic_cast(index)) { + Index* res = clone_AdditiveQuantizerIndex(index); + reset_AdditiveQuantizerIndex(res); + return res; } else { - FAISS_THROW_MSG("clone not supported for this type of Index"); + FAISS_THROW_FMT( + "clone not supported for this Index type %s", + typeid(*index).name()); } return nullptr; -} +} // namespace Quantizer* clone_Quantizer(const Quantizer* quant) { TRYCLONE(ResidualQuantizer, quant) diff -Nru faiss-1.7.3/faiss/Clustering.cpp faiss-1.7.4/faiss/Clustering.cpp --- faiss-1.7.3/faiss/Clustering.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/Clustering.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -88,8 +88,6 @@ namespace { -using idx_t = Clustering::idx_t; - idx_t subsample_training_set( const Clustering& clus, idx_t nx, @@ -374,7 +372,7 @@ std::unique_ptr dis(new float[nx]); // remember best iteration for redo - bool lower_is_better = 
index.metric_type != METRIC_INNER_PRODUCT; + bool lower_is_better = !is_similarity_metric(index.metric_type); float best_obj = lower_is_better ? HUGE_VALF : -HUGE_VALF; std::vector best_iteration_stats; std::vector best_centroids; @@ -624,8 +622,6 @@ namespace { -using idx_t = Index::idx_t; - void copy_columns(idx_t n, idx_t d1, const float* src, idx_t d2, float* dest) { idx_t d = std::min(d1, d2); for (idx_t i = 0; i < n; i++) { diff -Nru faiss-1.7.3/faiss/Clustering.h faiss-1.7.4/faiss/Clustering.h --- faiss-1.7.3/faiss/Clustering.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/Clustering.h 2023-04-19 13:18:30.000000000 +0000 @@ -61,7 +61,6 @@ * */ struct Clustering : ClusteringParameters { - typedef Index::idx_t idx_t; size_t d; ///< dimension of the vectors size_t k; ///< nb of centroids @@ -154,7 +153,6 @@ * https://arxiv.org/abs/1509.05195 */ struct ProgressiveDimClustering : ProgressiveDimClusteringParameters { - using idx_t = Index::idx_t; size_t d; ///< dimension of the vectors size_t k; ///< nb of centroids diff -Nru faiss-1.7.3/faiss/CMakeLists.txt faiss-1.7.4/faiss/CMakeLists.txt --- faiss-1.7.3/faiss/CMakeLists.txt 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/CMakeLists.txt 2023-04-19 13:18:30.000000000 +0000 @@ -44,12 +44,14 @@ IndexRowwiseMinMax.cpp IndexScalarQuantizer.cpp IndexShards.cpp + IndexShardsIVF.cpp MatrixStats.cpp MetaIndexes.cpp VectorTransform.cpp clone_index.cpp index_factory.cpp impl/AuxIndexStructures.cpp + impl/CodePacker.cpp impl/IDSelector.cpp impl/FaissException.cpp impl/HNSW.cpp @@ -85,7 +87,11 @@ utils/partitioning.cpp utils/quantize_lut.cpp utils/random.cpp + utils/sorting.cpp utils/utils.cpp + utils/distances_fused/avx512.cpp + utils/distances_fused/distances_fused.cpp + utils/distances_fused/simdlib_based.cpp ) set(FAISS_HEADERS @@ -128,6 +134,7 @@ IndexRowwiseMinMax.h IndexScalarQuantizer.h IndexShards.h + IndexShardsIVF.h MatrixStats.h MetaIndexes.h MetricType.h @@ -163,6 +170,9 @@ impl/platform_macros.h impl/pq4_fast_scan.h impl/simd_result_handlers.h + impl/code_distance/code_distance.h + impl/code_distance/code_distance-generic.h + impl/code_distance/code_distance-avx2.h invlists/BlockInvertedLists.h invlists/DirectMap.h invlists/InvertedLists.h @@ -187,6 +197,20 @@ utils/simdlib_emulated.h utils/simdlib_neon.h utils/utils.h + utils/distances_fused/avx512.h + utils/distances_fused/distances_fused.h + utils/distances_fused/simdlib_based.h + utils/approx_topk/approx_topk.h + utils/approx_topk/avx2-inl.h + utils/approx_topk/generic.h + utils/approx_topk/mode.h + utils/approx_topk_hamming/approx_topk_hamming.h + utils/transpose/transpose-avx2-inl.h + utils/hamming_distance/common.h + utils/hamming_distance/generic-inl.h + utils/hamming_distance/hamdis-inl.h + utils/hamming_distance/neon-inl.h + utils/hamming_distance/avx2-inl.h ) if(NOT WIN32) diff -Nru faiss-1.7.3/faiss/cppcontrib/detail/CoarseBitType.h faiss-1.7.4/faiss/cppcontrib/detail/CoarseBitType.h --- faiss-1.7.3/faiss/cppcontrib/detail/CoarseBitType.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/detail/CoarseBitType.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + #pragma once #include diff -Nru faiss-1.7.3/faiss/cppcontrib/detail/UintReader.h faiss-1.7.4/faiss/cppcontrib/detail/UintReader.h --- faiss-1.7.3/faiss/cppcontrib/detail/UintReader.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/detail/UintReader.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #pragma once #include @@ -121,6 +128,72 @@ }; // reduces the number of read operations from RAM +/////////////////////////////////////////////// +// 76543210 76543210 76543210 76543210 76543210 76543210 +// 00000000 0000 +// 1111 11111111 +// 22222222 2222 +// 3333 33333333 +template +struct Uint12Reader { + static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS"); + + static intptr_t get(const uint8_t* const __restrict codes) { + // Read using 4-bytes or 2-bytes. + + constexpr intptr_t ELEMENT_TO_READ = CPOS / 4; + constexpr intptr_t SUB_ELEMENT = CPOS % 4; + + switch (SUB_ELEMENT) { + case 0: { + if (N_ELEMENTS > CPOS + 2) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6); + return (code32 & 0b0000111111111111); + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 0); + return (code16 & 0b0000111111111111); + } + } + case 1: { + if (N_ELEMENTS > CPOS + 1) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6); + return (code32 & 0b111111111111000000000000) >> 12; + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 1); + return (code16 & 0b1111111111110000) >> 4; + } + } + case 2: { + if (N_ELEMENTS > CPOS + 1) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 2); + return (code32 & 0b000011111111111100000000) >> 8; + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 3); + return (code16 & 0b0000111111111111); + } + } + case 3: { + if (N_ELEMENTS > CPOS) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 2); + return (code32 & 0b11111111111100000000000000000000) >> 20; + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 4); + return (code16 & 0b1111111111110000) >> 4; + } + } + } + } +}; + +// reduces the number of read operations from RAM template struct Uint16Reader { static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS"); @@ -174,6 +247,11 @@ }; template +struct UintReaderImplType { + using reader_type = Uint12Reader; +}; + +template struct UintReaderImplType { using reader_type = Uint16Reader; }; diff -Nru faiss-1.7.3/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h faiss-1.7.4/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h --- faiss-1.7.3/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,4 +1,10 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
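The Uint12Reader added above reads four 12-bit codes out of each 6-byte group, following the bit layout sketched in its comment, and appears to be what backs the new 12-bit COARSE_BITS/FINE_BITS support in the Level2 decoders below. A small Python illustration of that layout (assumed to mirror the reader's arithmetic; purely explanatory):

def unpack_uint12(codes: bytes, i: int) -> int:
    """Return the i-th 12-bit value; four values per 6-byte group, LSB first."""
    base, sub = (i // 4) * 6, i % 4
    b = codes[base:base + 6]
    if sub == 0:
        return b[0] | (b[1] & 0x0F) << 8
    if sub == 1:
        return (b[1] >> 4) | b[2] << 4
    if sub == 2:
        return b[3] | (b[4] & 0x0F) << 8
    return (b[4] >> 4) | b[5] << 4

# values 0xABC, 0x123, 0x456, 0x789 packed into six bytes
packed = bytes([0xBC, 0x3A, 0x12, 0x56, 0x94, 0x78])
assert [unpack_uint12(packed, i) for i in range(4)] == [0xABC, 0x123, 0x456, 0x789]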
+ */ + #ifndef LEVEL2_AVX2_INL_H #define LEVEL2_AVX2_INL_H @@ -1851,8 +1857,14 @@ } // namespace // Suitable for IVF256,PQ[1]x8 +// Subtable for IVF256,PQ[1]x10 (such as IVF256,PQ16x10np) +// Subtable for IVF256,PQ[1]x12 (such as IVF256,PQ16x12np) +// Suitable for IVF256,PQ[1]x16 (such as IVF256,PQ16x16np) // Suitable for Residual[1]x8,PQ[2]x8 -// Suitable for IVF[9-16 bit],PQ[1]x8 (such as IVF1024,PQ16np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x8 (such as IVF1024,PQ16np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x10 (such as IVF1024,PQ16x10np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x12 (such as IVF1024,PQ16x12np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x16 (such as IVF1024,PQ16x16np) // Suitable for Residual[1]x[9-16 bit],PQ[2]x[3] (such as Residual2x9,PQ8) template < intptr_t DIM, @@ -1862,11 +1874,13 @@ intptr_t FINE_BITS = 8> struct Index2LevelDecoder { static_assert( - COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 16, - "Only 8, 10 or 16 bits are currently supported for COARSE_BITS"); + COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 12 || + COARSE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for COARSE_BITS"); static_assert( - FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16, - "Only 8, 10 or 16 bits are currently supported for FINE_BITS"); + FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 12 || + FINE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for FINE_BITS"); static constexpr intptr_t dim = DIM; static constexpr intptr_t coarseSize = COARSE_SIZE; diff -Nru faiss-1.7.3/faiss/cppcontrib/sa_decode/Level2-inl.h faiss-1.7.4/faiss/cppcontrib/sa_decode/Level2-inl.h --- faiss-1.7.3/faiss/cppcontrib/sa_decode/Level2-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/sa_decode/Level2-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,4 +1,10 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #ifndef LEVEL2_INL_H #define LEVEL2_INL_H diff -Nru faiss-1.7.3/faiss/cppcontrib/sa_decode/Level2-neon-inl.h faiss-1.7.4/faiss/cppcontrib/sa_decode/Level2-neon-inl.h --- faiss-1.7.3/faiss/cppcontrib/sa_decode/Level2-neon-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/sa_decode/Level2-neon-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,4 +1,10 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + #ifndef LEVEL2_NEON_INL_H #define LEVEL2_NEON_INL_H @@ -1940,9 +1946,15 @@ } // namespace // Suitable for IVF256,PQ[1]x8 +// Subtable for IVF256,PQ[1]x10 (such as IVF256,PQ16x10np) +// Subtable for IVF256,PQ[1]x12 (such as IVF256,PQ16x12np) +// Suitable for IVF256,PQ[1]x16 (such as IVF256,PQ16x16np) // Suitable for Residual[1]x8,PQ[2]x8 -// Suitable for IVF[9-16 bit],PQ[1]x8 (such as IVF1024,PQ16np) -// Suitable for Residual1x[9-16 bit],PQ[1]x8 (such as Residual1x9,PQ8) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x8 (such as IVF1024,PQ16np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x10 (such as IVF1024,PQ16x10np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x12 (such as IVF1024,PQ16x12np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x16 (such as IVF1024,PQ16x16np) +// Suitable for Residual[1]x[9-16 bit],PQ[2]x[3] (such as Residual2x9,PQ8) template < intptr_t DIM, intptr_t COARSE_SIZE, @@ -1951,11 +1963,13 @@ intptr_t FINE_BITS = 8> struct Index2LevelDecoder { static_assert( - COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 16, - "Only 8, 10 or 16 bits are currently supported for COARSE_BITS"); + COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 12 || + COARSE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for COARSE_BITS"); static_assert( - FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16, - "Only 8, 10 or 16 bits are currently supported for FINE_BITS"); + FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 12 || + FINE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for FINE_BITS"); static constexpr intptr_t dim = DIM; static constexpr intptr_t coarseSize = COARSE_SIZE; diff -Nru faiss-1.7.3/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h faiss-1.7.4/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h --- faiss-1.7.3/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #pragma once #include diff -Nru faiss-1.7.3/faiss/cppcontrib/sa_decode/MinMax-inl.h faiss-1.7.4/faiss/cppcontrib/sa_decode/MinMax-inl.h --- faiss-1.7.3/faiss/cppcontrib/sa_decode/MinMax-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/sa_decode/MinMax-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #pragma once #include diff -Nru faiss-1.7.3/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h faiss-1.7.4/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h --- faiss-1.7.3/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,4 +1,9 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
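[Editorial note: with the Level2 decoders above now accepting 12-bit coarse and fine codes, the newly listed index families can be instantiated directly. A hedged sketch, assuming the template-parameter order documented in SaDecodeKernels.h (DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS) and the faiss::cppcontrib namespace used by these headers; the alias names are ours:]

#include <faiss/cppcontrib/SaDecodeKernels.h>

// "IVF256,PQ16x12np" on 256-dim vectors: 256 lists -> 8 coarse bits,
// 16 sub-quantizers -> fine blocks of 256 / 16 = 16 dims, 12 fine bits.
using DecoderIVF256_PQ16x12 =
        faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 8, 12>;

// "Residual4x10,PQ16x10np" on 256-dim vectors, as documented upstream:
using DecoderRes4x10_PQ16x10 =
        faiss::cppcontrib::Index2LevelDecoder<256, 64, 16, 10, 10>;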
+ */ #ifndef PQ_AVX2_INL_H #define PQ_AVX2_INL_H @@ -1488,12 +1493,14 @@ // Suitable for PQ[1]x8 // Suitable for PQ[1]x10 +// Suitable for PQ[1]x12 // Suitable for PQ[1]x16 template struct IndexPQDecoder { static_assert( - FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16, - "Only 8, 10 or 16 bits are currently supported for FINE_BITS"); + FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 12 || + FINE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for FINE_BITS"); static constexpr intptr_t dim = DIM; static constexpr intptr_t fineSize = FINE_SIZE; diff -Nru faiss-1.7.3/faiss/cppcontrib/sa_decode/PQ-inl.h faiss-1.7.4/faiss/cppcontrib/sa_decode/PQ-inl.h --- faiss-1.7.3/faiss/cppcontrib/sa_decode/PQ-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/sa_decode/PQ-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,4 +1,10 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #ifndef PQ_INL_H #define PQ_INL_H diff -Nru faiss-1.7.3/faiss/cppcontrib/sa_decode/PQ-neon-inl.h faiss-1.7.4/faiss/cppcontrib/sa_decode/PQ-neon-inl.h --- faiss-1.7.3/faiss/cppcontrib/sa_decode/PQ-neon-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/sa_decode/PQ-neon-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,4 +1,10 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #ifndef PQ_NEON_INL_H #define PQ_NEON_INL_H @@ -1322,12 +1328,14 @@ // Suitable for PQ[1]x8 // Suitable for PQ[1]x10 +// Suitable for PQ[1]x12 // Suitable for PQ[1]x16 template struct IndexPQDecoder { static_assert( - FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 16, - "Only 8, 10 or 16 bits are currently supported for FINE_BITS"); + FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 12 || + FINE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for FINE_BITS"); static constexpr intptr_t dim = DIM; static constexpr intptr_t fineSize = FINE_SIZE; diff -Nru faiss-1.7.3/faiss/cppcontrib/SaDecodeKernels.h faiss-1.7.4/faiss/cppcontrib/SaDecodeKernels.h --- faiss-1.7.3/faiss/cppcontrib/SaDecodeKernels.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/cppcontrib/SaDecodeKernels.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,4 +1,9 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
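[Editorial note: the flat PQ decoder gains the same 12-bit option in both its AVX2 and NEON variants. A hedged sketch, assuming IndexPQDecoder takes (DIM, FINE_SIZE, FINE_BITS) analogously to the two-level decoder; the alias name is ours:]

#include <faiss/cppcontrib/SaDecodeKernels.h>

// "PQ16x12" on 256-dim vectors: 16 sub-quantizers of 256 / 16 = 16 dims each,
// 12 bits per fine code.
using DecoderPQ16x12 = faiss::cppcontrib::IndexPQDecoder<256, 16, 12>;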
+ */ #pragma once @@ -11,19 +16,32 @@ // * PQ[1]x8 // Additionally, AVX2 and ARM versions support // * Residual[1]x8,PQ[2]x10 +// * Residual[1]x8,PQ[2]x12 // * Residual[1]x8,PQ[2]x16 // * Residual[1]x10,PQ[2]x10 +// * Residual[1]x10,PQ[2]x12 // * Residual[1]x10,PQ[2]x16 +// * Residual[1]x12,PQ[2]x10 +// * Residual[1]x12,PQ[2]x12 +// * Residual[1]x12,PQ[2]x16 // * Residual[1]x16,PQ[2]x10 +// * Residual[1]x16,PQ[2]x12 // * Residual[1]x16,PQ[2]x16 // * Residual1x[9-16 bit],PQ[1]x10 (such as Residual1x9,PQ16x10) // * * (use with COARSE_BITS=16) +// * Residual1x[9-16 bit],PQ[1]x12 (such as Residual1x9,PQ16x12) +// * * (use with COARSE_BITS=16) // * Residual1x[9-16 bit],PQ[1]x16 (such as Residual1x9,PQ16x16) // * * (use with COARSE_BITS=16) // * PQ[1]x10 +// * PQ[1]x12 // * PQ[1]x16 -// Unfortunately, currently Faiss does not support something like -// IVF256,PQ16x10np +// * IVF256,PQ[1]x10 (such as IVF256,PQ16x10np) +// * IVF256,PQ[1]x12 (such as IVF256,PQ16x12np) +// * IVF256,PQ[1]x16 (such as IVF256,PQ16x16np) +// * IVF[2^9-2^16 bit],PQ[1]x10 (such as IVF1024,PQ16x10np) +// * IVF[2^9-2^16 bit],PQ[1]x12 (such as IVF1024,PQ16x12np) +// * IVF[2^9-2^16 bit],PQ[1]x16 (such as IVF1024,PQ16x16np) // // The goal was to achieve the maximum performance, so the template version it // is. The provided index families share the same code for sa_decode. @@ -57,6 +75,10 @@ // decoder. // For example, "Residual4x10,PQ16x10np" for 256-dim data translates into // Index2LevelDecoder<256,64,16,10,10> +// For example, "IVF1024,PQ16x10np" for 256-dim data translates into +// Index2LevelDecoder<256,256,16,10,10>. But as there are only 1 coarse code +// element, Index2LevelDecoder<256,256,16,16,10> can be used as a faster +// decoder. // // Additional supported values for COARSE_BITS and FINE_BITS may be added later. // diff -Nru faiss-1.7.3/faiss/gpu/CMakeLists.txt faiss-1.7.4/faiss/gpu/CMakeLists.txt --- faiss-1.7.3/faiss/gpu/CMakeLists.txt 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/CMakeLists.txt 2023-04-19 13:18:30.000000000 +0000 @@ -7,7 +7,6 @@ set(FAISS_GPU_SRC GpuAutoTune.cpp GpuCloner.cpp - GpuClonerOptions.cpp GpuDistance.cu GpuIcmEncoder.cu GpuIndex.cu diff -Nru faiss-1.7.3/faiss/gpu/GpuAutoTune.cpp faiss-1.7.4/faiss/gpu/GpuAutoTune.cpp --- faiss-1.7.3/faiss/gpu/GpuAutoTune.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuAutoTune.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -33,7 +35,12 @@ void GpuParameterSpace::initialize(const Index* index) { if (DC(IndexPreTransform)) { - index = ix->index; + initialize(ix->index); + return; + } + if (DC(IndexShardsIVF)) { + ParameterSpace::initialize(index); + return; } if (DC(IndexReplicas)) { if (ix->count() == 0) @@ -53,6 +60,14 @@ break; pr.values.push_back(nprobe); } + + ParameterSpace ivf_pspace; + ivf_pspace.initialize(ix->quantizer); + + for (const ParameterRange& p : ivf_pspace.parameter_ranges) { + ParameterRange& pr = add_range("quantizer_" + p.name); + pr.values = p.values; + } } // not sure we should call the parent initializer } @@ -72,7 +87,7 @@ } if (name == "nprobe") { if (DC(GpuIndexIVF)) { - ix->setNumProbes(int(val)); + ix->nprobe = size_t(val); return; } } @@ -82,6 +97,14 @@ return; } } + + if (name.find("quantizer_") == 0) { + if (DC(GpuIndexIVF)) { + std::string sub_name = name.substr(strlen("quantizer_")); + set_index_parameter(ix->quantizer, sub_name, val); + return; + } + } // maybe normal index parameters apply? 
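[Editorial note: the GpuAutoTune.cpp hunk above mirrors the CPU-side convention: parameters of an IVF index's coarse quantizer are exposed with a quantizer_ prefix, and nprobe becomes a plain member inherited from IndexIVFInterface instead of going through setNumProbes(). A hedged usage sketch; whether any quantizer_* range exists depends on what the coarse quantizer actually is, and the parameter name below is illustrative only:]

#include <faiss/gpu/GpuAutoTune.h>
#include <faiss/gpu/GpuIndexIVF.h>

void tune_gpu_ivf(faiss::gpu::GpuIndexIVF* index) {
    faiss::gpu::GpuParameterSpace ps;
    ps.initialize(index);                        // now also lists quantizer_* ranges
    ps.set_index_parameter(index, "nprobe", 32);
    // Forwarded to the coarse quantizer as "efSearch"; only meaningful if the
    // quantizer exposes such a parameter (illustrative name).
    ps.set_index_parameter(index, "quantizer_efSearch", 64);

    index->nprobe = 16;                          // direct access replaces setNumProbes()
}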
ParameterSpace::set_index_parameter(index, name, val); diff -Nru faiss-1.7.3/faiss/gpu/GpuCloner.cpp faiss-1.7.4/faiss/gpu/GpuCloner.cpp --- faiss-1.7.3/faiss/gpu/GpuCloner.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuCloner.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -116,7 +117,6 @@ : GpuClonerOptions(options), provider(prov), device(device) {} Index* ToGpuCloner::clone_Index(const Index* index) { - using idx_t = Index::idx_t; if (auto ifl = dynamic_cast(index)) { GpuIndexFlatConfig config; config.device = device; @@ -227,8 +227,8 @@ std::vector& devices, const GpuMultipleClonerOptions& options) : GpuMultipleClonerOptions(options) { - FAISS_ASSERT(provider.size() == devices.size()); - for (int i = 0; i < provider.size(); i++) { + FAISS_THROW_IF_NOT(provider.size() == devices.size()); + for (size_t i = 0; i < provider.size(); i++) { sub_cloners.push_back(ToGpuCloner(provider[i], devices[i], options)); } } @@ -241,28 +241,43 @@ void ToGpuClonerMultiple::copy_ivf_shard( const IndexIVF* index_ivf, IndexIVF* idx2, - long n, - long i) { + idx_t n, + idx_t i) { if (shard_type == 2) { - long i0 = i * index_ivf->ntotal / n; - long i1 = (i + 1) * index_ivf->ntotal / n; + idx_t i0 = i * index_ivf->ntotal / n; + idx_t i1 = (i + 1) * index_ivf->ntotal / n; if (verbose) printf("IndexShards shard %ld indices %ld:%ld\n", i, i0, i1); - index_ivf->copy_subset_to(*idx2, 2, i0, i1); + index_ivf->copy_subset_to( + *idx2, InvertedLists::SUBSET_TYPE_ID_RANGE, i0, i1); FAISS_ASSERT(idx2->ntotal == i1 - i0); } else if (shard_type == 1) { if (verbose) printf("IndexShards shard %ld select modulo %ld = %ld\n", i, n, i); - index_ivf->copy_subset_to(*idx2, 1, n, i); + index_ivf->copy_subset_to( + *idx2, InvertedLists::SUBSET_TYPE_ID_MOD, n, i); + } else if (shard_type == 4) { + idx_t i0 = i * index_ivf->nlist / n; + idx_t i1 = (i + 1) * index_ivf->nlist / n; + if (verbose) { + printf("IndexShards %ld/%ld select lists %d:%d\n", + i, + n, + int(i0), + int(i1)); + } + index_ivf->copy_subset_to( + *idx2, InvertedLists::SUBSET_TYPE_INVLIST, i0, i1); } else { FAISS_THROW_FMT("shard_type %d not implemented", shard_type); } } Index* ToGpuClonerMultiple::clone_Index_to_shards(const Index* index) { - long n = sub_cloners.size(); + idx_t n = sub_cloners.size(); + auto index_ivf = dynamic_cast(index); auto index_ivfpq = dynamic_cast(index); auto index_ivfflat = dynamic_cast(index); auto index_ivfsq = @@ -274,16 +289,36 @@ "IndexIVFFlat, IndexIVFScalarQuantizer, " "IndexFlat and IndexIVFPQ"); + // decide what coarse quantizer the sub-indexes are going to have + const Index* quantizer = nullptr; + std::unique_ptr new_quantizer; + if (index_ivf) { + quantizer = index_ivf->quantizer; + if (common_ivf_quantizer && + !dynamic_cast(quantizer)) { + // then we flatten the coarse quantizer so that everything remains + // on GPU + new_quantizer.reset( + new IndexFlat(quantizer->d, quantizer->metric_type)); + std::vector centroids(quantizer->d * quantizer->ntotal); + quantizer->reconstruct_n(0, quantizer->ntotal, centroids.data()); + new_quantizer->add(quantizer->ntotal, centroids.data()); + quantizer = new_quantizer.get(); + } + } + std::vector shards(n); - for (long i = 0; i < n; i++) { + for (idx_t i = 0; i < n; i++) { // make a shallow copy - if (reserveVecs) + if (reserveVecs) { sub_cloners[i].reserveVecs = (reserveVecs + n - 1) / n; - + } + // note: const_casts here are harmless because the indexes build here + // are short-lived, 
translated immediately to GPU indexes. if (index_ivfpq) { faiss::IndexIVFPQ idx2( - index_ivfpq->quantizer, + const_cast(quantizer), index_ivfpq->d, index_ivfpq->nlist, index_ivfpq->code_size, @@ -297,7 +332,7 @@ shards[i] = sub_cloners[i].clone_Index(&idx2); } else if (index_ivfflat) { faiss::IndexIVFFlat idx2( - index_ivfflat->quantizer, + const_cast(quantizer), index->d, index_ivfflat->nlist, index_ivfflat->metric_type); @@ -307,7 +342,7 @@ shards[i] = sub_cloners[i].clone_Index(&idx2); } else if (index_ivfsq) { faiss::IndexIVFScalarQuantizer idx2( - index_ivfsq->quantizer, + const_cast(quantizer), index->d, index_ivfsq->nlist, index_ivfsq->sq.qtype, @@ -323,40 +358,52 @@ faiss::IndexFlat idx2(index->d, index->metric_type); shards[i] = sub_cloners[i].clone_Index(&idx2); if (index->ntotal > 0) { - long i0 = index->ntotal * i / n; - long i1 = index->ntotal * (i + 1) / n; + idx_t i0 = index->ntotal * i / n; + idx_t i1 = index->ntotal * (i + 1) / n; shards[i]->add(i1 - i0, index_flat->get_xb() + i0 * index->d); } } } bool successive_ids = index_flat != nullptr; - faiss::IndexShards* res = - new faiss::IndexShards(index->d, true, successive_ids); + faiss::IndexShards* res; + if (common_ivf_quantizer && index_ivf) { + this->shard = false; + Index* common_quantizer = clone_Index(index_ivf->quantizer); + this->shard = true; + IndexShardsIVF* idx = new faiss::IndexShardsIVF( + common_quantizer, index_ivf->nlist, true, false); + idx->own_fields = true; + idx->own_indices = true; + res = idx; + } else { + res = new faiss::IndexShards(index->d, true, successive_ids); + res->own_indices = true; + } for (int i = 0; i < n; i++) { res->add_shard(shards[i]); } - res->own_fields = true; FAISS_ASSERT(index->ntotal == res->ntotal); return res; } Index* ToGpuClonerMultiple::clone_Index(const Index* index) { - long n = sub_cloners.size(); - if (n == 1) + idx_t n = sub_cloners.size(); + if (n == 1) { return sub_cloners[0].clone_Index(index); + } if (dynamic_cast(index) || - dynamic_cast(index) || - dynamic_cast(index) || - dynamic_cast(index)) { + dynamic_cast(index) || + dynamic_cast(index) || + dynamic_cast(index)) { if (!shard) { IndexReplicas* res = new IndexReplicas(); for (auto& sub_cloner : sub_cloners) { res->addIndex(sub_cloner.clone_Index(index)); } - res->own_fields = true; + res->own_indices = true; return res; } else { return clone_Index_to_shards(index); @@ -373,8 +420,8 @@ for (int m = 0; m < pq.M; m++) { // which GPU(s) will be assigned to this sub-quantizer - long i0 = m * n / pq.M; - long i1 = pq.M <= n ? (m + 1) * n / pq.M : i0 + 1; + idx_t i0 = m * n / pq.M; + idx_t i1 = pq.M <= n ? (m + 1) * n / pq.M : i0 + 1; std::vector sub_cloners_2; sub_cloners_2.insert( sub_cloners_2.begin(), diff -Nru faiss-1.7.3/faiss/gpu/GpuCloner.h faiss-1.7.4/faiss/gpu/GpuCloner.h --- faiss-1.7.3/faiss/gpu/GpuCloner.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuCloner.h 2023-04-19 13:18:30.000000000 +0000 @@ -55,8 +55,8 @@ void copy_ivf_shard( const IndexIVF* index_ivf, IndexIVF* idx2, - long n, - long i); + idx_t n, + idx_t i); Index* clone_Index_to_shards(const Index* index); diff -Nru faiss-1.7.3/faiss/gpu/GpuClonerOptions.cpp faiss-1.7.4/faiss/gpu/GpuClonerOptions.cpp --- faiss-1.7.3/faiss/gpu/GpuClonerOptions.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuClonerOptions.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,26 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace faiss { -namespace gpu { - -GpuClonerOptions::GpuClonerOptions() - : indicesOptions(INDICES_64_BIT), - useFloat16CoarseQuantizer(false), - useFloat16(false), - usePrecomputed(false), - reserveVecs(0), - storeTransposed(false), - verbose(false) {} - -GpuMultipleClonerOptions::GpuMultipleClonerOptions() - : shard(false), shard_type(1) {} - -} // namespace gpu -} // namespace faiss diff -Nru faiss-1.7.3/faiss/gpu/GpuClonerOptions.h faiss-1.7.4/faiss/gpu/GpuClonerOptions.h --- faiss-1.7.3/faiss/gpu/GpuClonerOptions.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuClonerOptions.h 2023-04-19 13:18:30.000000000 +0000 @@ -14,41 +14,42 @@ /// set some options on how to copy to GPU struct GpuClonerOptions { - GpuClonerOptions(); - /// how should indices be stored on index types that support indices /// (anything but GpuIndexFlat*)? - IndicesOptions indicesOptions; + IndicesOptions indicesOptions = INDICES_64_BIT; /// is the coarse quantizer in float16? - bool useFloat16CoarseQuantizer; + bool useFloat16CoarseQuantizer = false; /// for GpuIndexIVFFlat, is storage in float16? /// for GpuIndexIVFPQ, are intermediate calculations in float16? - bool useFloat16; + bool useFloat16 = false; /// use precomputed tables? - bool usePrecomputed; + bool usePrecomputed = false; /// reserve vectors in the invfiles? - long reserveVecs; + long reserveVecs = 0; /// For GpuIndexFlat, store data in transposed layout? - bool storeTransposed; + bool storeTransposed = false; /// Set verbose options on the index - bool verbose; + bool verbose = false; }; struct GpuMultipleClonerOptions : public GpuClonerOptions { - GpuMultipleClonerOptions(); - /// Whether to shard the index across GPUs, versus replication /// across GPUs - bool shard; + bool shard = false; /// IndexIVF::copy_subset_to subset type - int shard_type; + int shard_type = 1; + + /// set to true if an IndexIVF is to be dispatched to multiple GPUs with a + /// single common IVF quantizer, ie. 
only the inverted lists are sharded on + /// the sub-indexes (uses an IndexShardsIVF) + bool common_ivf_quantizer = false; }; } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/GpuDistance.cu faiss-1.7.4/faiss/gpu/GpuDistance.cu --- faiss-1.7.3/faiss/gpu/GpuDistance.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuDistance.cu 2023-04-19 13:18:30.000000000 +0000 @@ -40,10 +40,28 @@ args.outIndices || args.k == -1, "bfKnn: outIndices must be provided (passed null)"); + // If the user specified a device, then ensure that it is currently set + int device = -1; + if (args.device == -1) { + // Original behavior if no device is specified, use the current CUDA + // thread local device + device = getCurrentDevice(); + } else { + // Otherwise, use the device specified in `args` + device = args.device; + + FAISS_THROW_IF_NOT_FMT( + device >= 0 && device < getNumDevices(), + "bfKnn: device specified must be -1 (current CUDA thread local device) " + "or within the range [0, %d)", + getNumDevices()); + } + + DeviceScope scope(device); + // Don't let the resources go out of scope auto resImpl = prov->getResources(); auto res = resImpl.get(); - auto device = getCurrentDevice(); auto stream = res->getDefaultStreamCurrentDevice(); auto tVectors = toDeviceTemporary( @@ -93,11 +111,11 @@ args.metricArg, tOutDistances); } else if (args.outIndicesType == IndicesDataType::I64) { - // The brute-force API only supports an interface for i32 indices only, - // so we must create an output i32 buffer then convert back - DeviceTensor tOutIntIndices( + auto tOutIndices = toDeviceTemporary( res, - makeTempAlloc(AllocType::Other, stream), + device, + (idx_t*)args.outIndices, + stream, {args.numQueries, args.k}); // Since we've guaranteed that all arguments are on device, call the @@ -115,35 +133,19 @@ args.metric, args.metricArg, tOutDistances, - tOutIntIndices, + tOutIndices, args.ignoreOutDistances); - // Convert and copy int indices out - auto tOutIndices = toDeviceTemporary( - res, - device, - (Index::idx_t*)args.outIndices, - stream, - {args.numQueries, args.k}); - - // Convert int to idx_t - convertTensor( - stream, tOutIntIndices, tOutIndices); - - // Copy back if necessary - fromDevice( - tOutIndices, (Index::idx_t*)args.outIndices, stream); + fromDevice(tOutIndices, (idx_t*)args.outIndices, stream); } else if (args.outIndicesType == IndicesDataType::I32) { - // We can use the brute-force API directly, as it takes i32 indices + // The brute-force API supports i64 indices, but our output buffer is + // i32 so we need to temporarily allocate and then convert back to i32 // FIXME: convert to int32_t everywhere? 
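[Editorial note: the GpuMultipleClonerOptions::common_ivf_quantizer flag documented in GpuClonerOptions.h above pairs with the IndexShardsIVF path added to GpuCloner.cpp: inverted lists are sharded across GPUs while a single coarse quantizer is shared. A minimal sketch, assuming a trained, populated IndexIVF on the CPU side and one GpuResourcesProvider per target device:]

#include <faiss/gpu/GpuCloner.h>
#include <faiss/gpu/GpuClonerOptions.h>
#include <vector>

faiss::Index* shard_ivf_to_gpus(
        const faiss::Index* cpu_ivf,
        std::vector<faiss::gpu::GpuResourcesProvider*>& providers, // must outlive the result
        std::vector<int>& devices) {
    faiss::gpu::GpuMultipleClonerOptions opts;
    opts.shard = true;                // split data across GPUs instead of replicating
    opts.common_ivf_quantizer = true; // one shared quantizer -> IndexShardsIVF on top
    // opts.shard_type = 4;           // optional: shard whole inverted lists
    //                                // (SUBSET_TYPE_INVLIST, added in this release)
    return faiss::gpu::index_cpu_to_gpu_multiple(providers, devices, cpu_ivf, &opts);
}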
static_assert(sizeof(int) == 4, ""); - - auto tOutIntIndices = toDeviceTemporary( + DeviceTensor tIntIndices( res, - device, - (int*)args.outIndices, - stream, + makeTempAlloc(AllocType::Other, stream), {args.numQueries, args.k}); // Since we've guaranteed that all arguments are on device, call the @@ -161,9 +163,19 @@ args.metric, args.metricArg, tOutDistances, - tOutIntIndices, + tIntIndices, args.ignoreOutDistances); + // Convert and copy int indices out + auto tOutIntIndices = toDeviceTemporary( + res, + device, + (int*)args.outIndices, + stream, + {args.numQueries, args.k}); + + convertTensor(stream, tIntIndices, tOutIntIndices); + // Copy back if necessary fromDevice(tOutIntIndices, (int*)args.outIndices, stream); } else { @@ -198,12 +210,12 @@ // innermost const float* vectors, bool vectorsRowMajor, - int numVectors, + idx_t numVectors, // A region of memory size numQueries x dims, with dims // innermost const float* queries, bool queriesRowMajor, - int numQueries, + idx_t numQueries, int dims, int k, // A region of memory size numQueries x k, with k @@ -211,7 +223,7 @@ float* outDistances, // A region of memory size numQueries x k, with k // innermost - Index::idx_t* outIndices) { + idx_t* outIndices) { std::cerr << "bruteForceKnn is deprecated; call bfKnn instead" << std::endl; GpuDistanceParams args; diff -Nru faiss-1.7.3/faiss/gpu/GpuDistance.h faiss-1.7.4/faiss/gpu/GpuDistance.h --- faiss-1.7.3/faiss/gpu/GpuDistance.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuDistance.h 2023-04-19 13:18:30.000000000 +0000 @@ -45,7 +45,8 @@ outDistances(nullptr), ignoreOutDistances(false), outIndicesType(IndicesDataType::I64), - outIndices(nullptr) {} + outIndices(nullptr), + device(-1) {} // // Search parameters @@ -76,7 +77,7 @@ const void* vectors; DistanceDataType vectorType; bool vectorsRowMajor; - int numVectors; + idx_t numVectors; /// Precomputed L2 norms for each vector in `vectors`, which can be /// optionally provided in advance to speed computation for METRIC_L2 @@ -93,7 +94,7 @@ const void* queries; DistanceDataType queryType; bool queriesRowMajor; - int numQueries; + idx_t numQueries; // // Output results @@ -112,6 +113,17 @@ /// innermost (row major). Not used if k == -1 (all pairwise distances) IndicesDataType outIndicesType; void* outIndices; + + // + // Execution information + // + + /// On which GPU device should the search run? 
+ /// -1 indicates that the current CUDA thread-local device + /// (via cudaGetDevice/cudaSetDevice) is used + /// Otherwise, an integer 0 <= device < numDevices indicates the device for + /// execution + int device; }; /// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest @@ -137,13 +149,13 @@ // dims x numVectors, with numVectors innermost const float* vectors, bool vectorsRowMajor, - int numVectors, + idx_t numVectors, // If queriesRowMajor is true, this is // numQueries x dims, with dims innermost; otherwise, // dims x numQueries, with numQueries innermost const float* queries, bool queriesRowMajor, - int numQueries, + idx_t numQueries, int dims, int k, // A region of memory size numQueries x k, with k @@ -151,7 +163,7 @@ float* outDistances, // A region of memory size numQueries x k, with k // innermost (row major) - Index::idx_t* outIndices); + idx_t* outIndices); } // namespace gpu } // namespace faiss diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexBinaryFlat.cu faiss-1.7.4/faiss/gpu/GpuIndexBinaryFlat.cu --- faiss-1.7.3/faiss/gpu/GpuIndexBinaryFlat.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexBinaryFlat.cu 2023-04-19 13:18:30.000000000 +0000 @@ -76,13 +76,6 @@ this->d = index->d; - // GPU code has 32 bit indices - FAISS_THROW_IF_NOT_FMT( - index->ntotal <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %zu indices; " - "attempting to copy CPU index with %zu parameters", - (size_t)std::numeric_limits::max(), - (size_t)index->ntotal); this->ntotal = index->ntotal; // destroy old first before allocating new @@ -117,22 +110,13 @@ } } -void GpuIndexBinaryFlat::add(faiss::IndexBinary::idx_t n, const uint8_t* x) { +void GpuIndexBinaryFlat::add(idx_t n, const uint8_t* x) { DeviceScope scope(binaryFlatConfig_.device); - validateNumVectors(n); - // To avoid multiple re-allocations, ensure we have enough storage // available data_->reserve(n, resources_->getDefaultStream(binaryFlatConfig_.device)); - // Due to GPU indexing in int32, we can't store more than this - // number of vectors on a GPU - FAISS_THROW_IF_NOT_FMT( - this->ntotal + n <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %zu indices", - (size_t)std::numeric_limits::max()); - data_->add( (const unsigned char*)x, n, @@ -149,11 +133,11 @@ } void GpuIndexBinaryFlat::search( - faiss::IndexBinary::idx_t n, + idx_t n, const uint8_t* x, - faiss::IndexBinary::idx_t k, + idx_t k, int32_t* distances, - faiss::IndexBinary::idx_t* labels, + faiss::idx_t* labels, const SearchParameters* params) const { DeviceScope scope(binaryFlatConfig_.device); auto stream = resources_->getDefaultStream(binaryFlatConfig_.device); @@ -164,7 +148,6 @@ FAISS_THROW_IF_NOT_MSG(!params, "params not implemented"); - validateNumVectors(n); validateKSelect(k); // The input vectors may be too large for the GPU, but we still @@ -178,13 +161,10 @@ binaryFlatConfig_.device, distances, stream, - {(int)n, (int)k}); + {n, k}); - // FlatIndex only supports an interface returning int indices - DeviceTensor outIntIndices( - resources_.get(), - makeTempAlloc(AllocType::Other, stream), - {(int)n, (int)k}); + auto outIndices = toDeviceTemporary( + resources_.get(), binaryFlatConfig_.device, labels, stream, {n, k}); bool usePaged = false; @@ -195,43 +175,32 @@ // -> GPU. // Currently, we don't handle the case where the output data won't // fit on the GPU (e.g., n * k is too large for the GPU memory). 
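[Editorial note: the GpuDistance.h changes earlier in this chunk give GpuDistanceParams a device field (plus idx_t-sized vector counts), so bfKnn can be pinned to a specific GPU without touching the caller's current CUDA device. A sketch with METRIC_L2, float32 inputs and 64-bit output indices; buffer allocation is left to the caller:]

#include <faiss/gpu/GpuDistance.h>
#include <faiss/gpu/StandardGpuResources.h>

void bf_knn_on_gpu1(
        faiss::gpu::StandardGpuResources& res,
        const float* xb, faiss::idx_t nb,   // database: nb x d, row major
        const float* xq, faiss::idx_t nq,   // queries:  nq x d, row major
        int d, int k,
        float* out_dist,                    // nq x k
        faiss::idx_t* out_idx) {            // nq x k
    faiss::gpu::GpuDistanceParams args;
    args.metric = faiss::METRIC_L2;
    args.dims = d;
    args.k = k;
    args.vectors = xb;
    args.vectorsRowMajor = true;
    args.numVectors = nb;
    args.queries = xq;
    args.queriesRowMajor = true;
    args.numQueries = nq;
    args.outDistances = out_dist;
    args.outIndices = out_idx;   // default outIndicesType is I64, matching idx_t
    args.device = 1;             // new: run on GPU 1; -1 keeps the current device
    faiss::gpu::bfKnn(&res, args);
}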
- size_t dataSize = (size_t)n * (this->d / 8) * sizeof(uint8_t); + size_t dataSize = n * (this->d / 8) * sizeof(uint8_t); if (dataSize >= kMinPageSize) { searchFromCpuPaged_( - n, x, k, outDistances.data(), outIntIndices.data()); + n, x, k, outDistances.data(), outIndices.data()); usePaged = true; } } if (!usePaged) { - searchNonPaged_(n, x, k, outDistances.data(), outIntIndices.data()); + searchNonPaged_(n, x, k, outDistances.data(), outIndices.data()); } - // Convert and copy int indices out - auto outIndices = toDeviceTemporary( - resources_.get(), - binaryFlatConfig_.device, - labels, - stream, - {(int)n, (int)k}); - - // Convert int to idx_t - convertTensor(stream, outIntIndices, outIndices); - // Copy back if necessary fromDevice(outDistances, distances, stream); - fromDevice(outIndices, labels, stream); + fromDevice(outIndices, labels, stream); } void GpuIndexBinaryFlat::searchNonPaged_( - int n, + idx_t n, const uint8_t* x, int k, int32_t* outDistancesData, - int* outIndicesData) const { + idx_t* outIndicesData) const { Tensor outDistances(outDistancesData, {n, k}); - Tensor outIndices(outIndicesData, {n, k}); + Tensor outIndices(outIndicesData, {n, k}); auto stream = resources_->getDefaultStream(binaryFlatConfig_.device); @@ -242,44 +211,42 @@ binaryFlatConfig_.device, const_cast(x), stream, - {n, (int)(this->d / 8)}); + {n, (this->d / 8)}); data_->query(vecs, k, outDistances, outIndices); } void GpuIndexBinaryFlat::searchFromCpuPaged_( - int n, + idx_t n, const uint8_t* x, int k, int32_t* outDistancesData, - int* outIndicesData) const { + idx_t* outIndicesData) const { Tensor outDistances(outDistancesData, {n, k}); - Tensor outIndices(outIndicesData, {n, k}); + Tensor outIndices(outIndicesData, {n, k}); - auto vectorSize = sizeof(uint8_t) * (this->d / 8); + idx_t vectorSize = sizeof(uint8_t) * (this->d / 8); // Just page without overlapping copy with compute (as GpuIndexFlat does) - int batchSize = utils::nextHighestPowerOf2( - (int)((size_t)kMinPageSize / vectorSize)); + auto batchSize = + utils::nextHighestPowerOf2(((idx_t)kMinPageSize / vectorSize)); - for (int cur = 0; cur < n; cur += batchSize) { - int num = std::min(batchSize, n - cur); + for (idx_t cur = 0; cur < n; cur += batchSize) { + auto num = std::min(batchSize, n - cur); auto outDistancesSlice = outDistances.narrowOutermost(cur, num); auto outIndicesSlice = outIndices.narrowOutermost(cur, num); searchNonPaged_( num, - x + (size_t)cur * (this->d / 8), + x + cur * (this->d / 8), k, outDistancesSlice.data(), outIndicesSlice.data()); } } -void GpuIndexBinaryFlat::reconstruct( - faiss::IndexBinary::idx_t key, - uint8_t* out) const { +void GpuIndexBinaryFlat::reconstruct(faiss::idx_t key, uint8_t* out) const { DeviceScope scope(binaryFlatConfig_.device); FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds"); diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexBinaryFlat.h faiss-1.7.4/faiss/gpu/GpuIndexBinaryFlat.h --- faiss-1.7.3/faiss/gpu/GpuIndexBinaryFlat.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexBinaryFlat.h 2023-04-19 13:18:30.000000000 +0000 @@ -53,37 +53,37 @@ /// in the index instance void copyTo(faiss::IndexBinaryFlat* index) const; - void add(faiss::IndexBinary::idx_t n, const uint8_t* x) override; + void add(faiss::idx_t n, const uint8_t* x) override; void reset() override; void search( - faiss::IndexBinary::idx_t n, + idx_t n, const uint8_t* x, - faiss::IndexBinary::idx_t k, + // faiss::IndexBinary has idx_t for k + idx_t k, int32_t* distances, - faiss::IndexBinary::idx_t* labels, 
+ faiss::idx_t* labels, const faiss::SearchParameters* params = nullptr) const override; - void reconstruct(faiss::IndexBinary::idx_t key, uint8_t* recons) - const override; + void reconstruct(faiss::idx_t key, uint8_t* recons) const override; protected: /// Called from search when the input data is on the CPU; /// potentially allows for pinned memory usage void searchFromCpuPaged_( - int n, + idx_t n, const uint8_t* x, int k, int32_t* outDistancesData, - int* outIndicesData) const; + idx_t* outIndicesData) const; void searchNonPaged_( - int n, + idx_t n, const uint8_t* x, int k, int32_t* outDistancesData, - int* outIndicesData) const; + idx_t* outIndicesData) const; protected: /// Manages streans, cuBLAS handles and scratch memory for devices diff -Nru faiss-1.7.3/faiss/gpu/GpuIndex.cu faiss-1.7.4/faiss/gpu/GpuIndex.cu --- faiss-1.7.3/faiss/gpu/GpuIndex.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndex.cu 2023-04-19 13:18:30.000000000 +0000 @@ -25,22 +25,22 @@ namespace gpu { /// Default CPU search size for which we use paged copies -constexpr size_t kMinPageSize = (size_t)256 * 1024 * 1024; +constexpr idx_t kMinPageSize = (idx_t)256 * 1024 * 1024; /// Size above which we page copies from the CPU to GPU (non-paged /// memory usage) -constexpr size_t kNonPinnedPageSize = (size_t)256 * 1024 * 1024; +constexpr idx_t kNonPinnedPageSize = (idx_t)256 * 1024 * 1024; // Default size for which we page add or search -constexpr size_t kAddPageSize = (size_t)256 * 1024 * 1024; +constexpr idx_t kAddPageSize = (idx_t)256 * 1024 * 1024; // Or, maximum number of vectors to consider per page of add or search -constexpr size_t kAddVecSize = (size_t)512 * 1024; +constexpr idx_t kAddVecSize = (idx_t)512 * 1024; // Use a smaller search size, as precomputed code usage on IVFPQ // requires substantial amounts of memory // FIXME: parameterize based on algorithm need -constexpr size_t kSearchVecSize = (size_t)32 * 1024; +constexpr idx_t kSearchVecSize = (idx_t)32 * 1024; GpuIndex::GpuIndex( std::shared_ptr resources, @@ -100,61 +100,53 @@ return minPagedSize_; } -void GpuIndex::add(Index::idx_t n, const float* x) { +void GpuIndex::add(idx_t n, const float* x) { // Pass to add_with_ids add_with_ids(n, x, nullptr); } -void GpuIndex::add_with_ids( - Index::idx_t n, - const float* x, - const Index::idx_t* ids) { +void GpuIndex::add_with_ids(idx_t n, const float* x, const idx_t* ids) { DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); - validateNumVectors(n); - if (n == 0) { // nothing to add return; } - std::vector generatedIds; + std::vector generatedIds; // Generate IDs if we need them if (!ids && addImplRequiresIDs_()) { - generatedIds = std::vector(n); + generatedIds = std::vector(n); - for (Index::idx_t i = 0; i < n; ++i) { + for (idx_t i = 0; i < n; ++i) { generatedIds[i] = this->ntotal + i; } } - addPaged_((int)n, x, ids ? ids : generatedIds.data()); + addPaged_(n, x, ids ? ids : generatedIds.data()); } -void GpuIndex::addPaged_(int n, const float* x, const Index::idx_t* ids) { +void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) { if (n > 0) { - size_t totalSize = (size_t)n * this->d * sizeof(float); + idx_t totalSize = n * this->d * sizeof(float); if (totalSize > kAddPageSize || n > kAddVecSize) { // How many vectors fit into kAddPageSize? 
- size_t maxNumVecsForPageSize = - kAddPageSize / ((size_t)this->d * sizeof(float)); + idx_t maxNumVecsForPageSize = + kAddPageSize / (this->d * sizeof(float)); // Always add at least 1 vector, if we have huge vectors - maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t)1); + maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, idx_t(1)); - size_t tileSize = std::min((size_t)n, maxNumVecsForPageSize); + auto tileSize = std::min(n, maxNumVecsForPageSize); tileSize = std::min(tileSize, kSearchVecSize); - for (size_t i = 0; i < (size_t)n; i += tileSize) { - size_t curNum = std::min(tileSize, n - i); + for (idx_t i = 0; i < n; i += tileSize) { + auto curNum = std::min(tileSize, n - i); - addPage_( - curNum, - x + i * (size_t)this->d, - ids ? ids + i : nullptr); + addPage_(curNum, x + i * this->d, ids ? ids + i : nullptr); } } else { addPage_(n, x, ids); @@ -162,7 +154,7 @@ } } -void GpuIndex::addPage_(int n, const float* x, const Index::idx_t* ids) { +void GpuIndex::addPage_(idx_t n, const float* x, const idx_t* ids) { // At this point, `x` can be resident on CPU or GPU, and `ids` may be // resident on CPU, GPU or may be null. // @@ -178,10 +170,10 @@ {n, this->d}); if (ids) { - auto indices = toDeviceTemporary( + auto indices = toDeviceTemporary( resources_.get(), config_.device, - const_cast(ids), + const_cast(ids), stream, {n}); @@ -191,15 +183,10 @@ } } -void GpuIndex::assign( - Index::idx_t n, - const float* x, - Index::idx_t* labels, - Index::idx_t k) const { +void GpuIndex::assign(idx_t n, const float* x, idx_t* labels, idx_t k) const { DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); - validateNumVectors(n); validateKSelect(k); auto stream = resources_->getDefaultStream(config_.device); @@ -207,25 +194,22 @@ // We need to create a throw-away buffer for distances, which we don't use // but which we do need for the search call DeviceTensor distances( - resources_.get(), - makeTempAlloc(AllocType::Other, stream), - {(int)n, (int)k}); + resources_.get(), makeTempAlloc(AllocType::Other, stream), {n, k}); // Forward to search search(n, x, k, distances.data(), labels); } void GpuIndex::search( - Index::idx_t n, + idx_t n, const float* x, - Index::idx_t k, + idx_t k, float* distances, - Index::idx_t* labels, + idx_t* labels, const SearchParameters* params) const { DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); - validateNumVectors(n); validateKSelect(k); if (n == 0 || k == 0) { @@ -245,14 +229,10 @@ // If we reach a point where all inputs are too big, we can add // another level of tiling. 
auto outDistances = toDeviceTemporary( - resources_.get(), - config_.device, - distances, - stream, - {(int)n, (int)k}); + resources_.get(), config_.device, distances, stream, {n, k}); - auto outLabels = toDeviceTemporary( - resources_.get(), config_.device, labels, stream, {(int)n, (int)k}); + auto outLabels = toDeviceTemporary( + resources_.get(), config_.device, labels, stream, {n, k}); bool usePaged = false; @@ -278,7 +258,7 @@ // Copy back if necessary fromDevice(outDistances, distances, stream); - fromDevice(outLabels, labels, stream); + fromDevice(outLabels, labels, stream); } void GpuIndex::search_and_reconstruct( @@ -294,11 +274,11 @@ } void GpuIndex::searchNonPaged_( - int n, + idx_t n, const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData, + idx_t* outIndicesData, const SearchParameters* params) const { auto stream = resources_->getDefaultStream(config_.device); @@ -309,40 +289,40 @@ config_.device, const_cast(x), stream, - {n, (int)this->d}); + {n, this->d}); searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData, params); } void GpuIndex::searchFromCpuPaged_( - int n, + idx_t n, const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData, + idx_t* outIndicesData, const SearchParameters* params) const { Tensor outDistances(outDistancesData, {n, k}); - Tensor outIndices(outIndicesData, {n, k}); + Tensor outIndices(outIndicesData, {n, k}); // Is pinned memory available? auto pinnedAlloc = resources_->getPinnedMemory(); - int pageSizeInVecs = - (int)((pinnedAlloc.second / 2) / (sizeof(float) * this->d)); + idx_t pageSizeInVecs = + ((pinnedAlloc.second / 2) / (sizeof(float) * this->d)); if (!pinnedAlloc.first || pageSizeInVecs < 1) { // Just page without overlapping copy with compute - int batchSize = utils::nextHighestPowerOf2( - (int)((size_t)kNonPinnedPageSize / (sizeof(float) * this->d))); + idx_t batchSize = utils::nextHighestPowerOf2( + (kNonPinnedPageSize / (sizeof(float) * this->d))); - for (int cur = 0; cur < n; cur += batchSize) { - int num = std::min(batchSize, n - cur); + for (idx_t cur = 0; cur < n; cur += batchSize) { + auto num = std::min(batchSize, n - cur); auto outDistancesSlice = outDistances.narrowOutermost(cur, num); auto outIndicesSlice = outIndices.narrowOutermost(cur, num); searchNonPaged_( num, - x + (size_t)cur * this->d, + x + cur * this->d, k, outDistancesSlice.data(), outIndicesSlice.data(), @@ -369,10 +349,6 @@ auto defaultStream = resources_->getDefaultStream(config_.device); auto copyStream = resources_->getAsyncCopyStream(config_.device); - FAISS_ASSERT( - (size_t)pageSizeInVecs * this->d <= - (size_t)std::numeric_limits::max()); - float* bufPinnedA = (float*)pinnedAlloc.first; float* bufPinnedB = bufPinnedA + (size_t)pageSizeInVecs * this->d; float* bufPinned[2] = {bufPinnedA, bufPinnedB}; @@ -382,11 +358,11 @@ DeviceTensor bufGpuA( resources_.get(), makeTempAlloc(AllocType::Other, defaultStream), - {(int)pageSizeInVecs, (int)this->d}); + {pageSizeInVecs, this->d}); DeviceTensor bufGpuB( resources_.get(), makeTempAlloc(AllocType::Other, defaultStream), - {(int)pageSizeInVecs, (int)this->d}); + {pageSizeInVecs, this->d}); DeviceTensor* bufGpus[2] = {&bufGpuA, &bufGpuB}; // Copy completion events for the pinned buffers @@ -395,26 +371,25 @@ // Execute completion events for the GPU buffers std::unique_ptr eventGpuExecuteDone[2]; - // All offsets are in terms of number of vectors; they remain within - // int bounds (as this function only handles max in vectors) + // All offsets are in terms of number 
of vectors // Current start offset for buffer 1 - int cur1 = 0; - int cur1BufIndex = 0; + idx_t cur1 = 0; + idx_t cur1BufIndex = 0; // Current start offset for buffer 2 - int cur2 = -1; - int cur2BufIndex = 0; + idx_t cur2 = -1; + idx_t cur2BufIndex = 0; // Current start offset for buffer 3 - int cur3 = -1; - int cur3BufIndex = 0; + idx_t cur3 = -1; + idx_t cur3BufIndex = 0; while (cur3 < n) { // Start async pinned -> GPU copy first (buf 2) if (cur2 != -1 && cur2 < n) { // Copy pinned to GPU - int numToCopy = std::min(pageSizeInVecs, n - cur2); + auto numToCopy = std::min(pageSizeInVecs, n - cur2); // Make sure any previous execution has completed before continuing auto& eventPrev = eventGpuExecuteDone[cur2BufIndex]; @@ -425,7 +400,7 @@ CUDA_VERIFY(cudaMemcpyAsync( bufGpus[cur2BufIndex]->data(), bufPinned[cur2BufIndex], - (size_t)numToCopy * this->d * sizeof(float), + numToCopy * this->d * sizeof(float), cudaMemcpyHostToDevice, copyStream)); @@ -438,9 +413,9 @@ cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0; } - if (cur3 != -1 && cur3 < n) { + if (cur3 != idx_t(-1) && cur3 < n) { // Process on GPU - int numToProcess = std::min(pageSizeInVecs, n - cur3); + auto numToProcess = std::min(pageSizeInVecs, n - cur3); // Make sure the previous copy has completed before continuing auto& eventPrev = eventPinnedCopyDone[cur3BufIndex]; @@ -475,7 +450,7 @@ if (cur1 < n) { // Copy CPU mem to CPU pinned - int numToCopy = std::min(pageSizeInVecs, n - cur1); + auto numToCopy = std::min(pageSizeInVecs, n - cur1); // Make sure any previous copy has completed before continuing auto& eventPrev = eventPinnedCopyDone[cur1BufIndex]; @@ -484,8 +459,8 @@ } memcpy(bufPinned[cur1BufIndex], - x + (size_t)cur1 * this->d, - (size_t)numToCopy * this->d * sizeof(float)); + x + cur1 * this->d, + numToCopy * this->d * sizeof(float)); // We pick up from here cur2 = cur1; @@ -495,18 +470,16 @@ } } -void GpuIndex::compute_residual( - const float* x, - float* residual, - Index::idx_t key) const { +void GpuIndex::compute_residual(const float* x, float* residual, idx_t key) + const { FAISS_THROW_MSG("compute_residual not implemented for this type of index"); } void GpuIndex::compute_residual_n( - Index::idx_t n, + idx_t n, const float* xs, float* residuals, - const Index::idx_t* keys) const { + const idx_t* keys) const { FAISS_THROW_MSG( "compute_residual_n not implemented for this type of index"); } diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexFlat.cu faiss-1.7.4/faiss/gpu/GpuIndexFlat.cu --- faiss-1.7.3/faiss/gpu/GpuIndexFlat.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexFlat.cu 2023-04-19 13:18:30.000000000 +0000 @@ -100,14 +100,6 @@ GpuIndex::copyFrom(index); - // GPU code has 32 bit indices - FAISS_THROW_IF_NOT_FMT( - index->ntotal <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %zu indices; " - "attempting to copy CPU index with %zu parameters", - (size_t)std::numeric_limits::max(), - (size_t)index->ntotal); - data_.reset(); data_.reset(new FlatIndex( resources_.get(), @@ -153,15 +145,14 @@ this->ntotal = 0; } -void GpuIndexFlat::train(Index::idx_t n, const float* x) { +void GpuIndexFlat::train(idx_t n, const float* x) { // nothing to do } -void GpuIndexFlat::add(Index::idx_t n, const float* x) { +void GpuIndexFlat::add(idx_t n, const float* x) { DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained"); - validateNumVectors(n); if (n == 0) { // nothing to add @@ -186,7 +177,7 @@ return false; } -void GpuIndexFlat::addImpl_(int n, const 
float* x, const Index::idx_t* ids) { +void GpuIndexFlat::addImpl_(idx_t n, const float* x, const idx_t* ids) { // current device already set // n already validated FAISS_ASSERT(data_); @@ -195,51 +186,31 @@ // We do not support add_with_ids FAISS_THROW_IF_NOT_MSG(!ids, "add_with_ids not supported"); - // Due to GPU indexing in int32, we can't store more than this - // number of vectors on a GPU - FAISS_THROW_IF_NOT_FMT( - this->ntotal + n <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %zu indices", - (size_t)std::numeric_limits::max()); - data_->add(x, n, resources_->getDefaultStream(config_.device)); this->ntotal += n; } void GpuIndexFlat::searchImpl_( - int n, + idx_t n, const float* x, int k, float* distances, - Index::idx_t* labels, + idx_t* labels, const SearchParameters* params) const { // current device already set // n/k already validated auto stream = resources_->getDefaultStream(config_.device); // Input and output data are already resident on the GPU - Tensor queries(const_cast(x), {n, (int)this->d}); + Tensor queries(const_cast(x), {n, this->d}); Tensor outDistances(distances, {n, k}); - Tensor outLabels(labels, {n, k}); - - // FlatIndex only supports int indices - DeviceTensor outIntLabels( - resources_.get(), makeTempAlloc(AllocType::Other, stream), {n, k}); + Tensor outLabels(labels, {n, k}); data_->query( - queries, - k, - metric_type, - metric_arg, - outDistances, - outIntLabels, - true); - - // Convert int to idx_t - convertTensor(stream, outIntLabels, outLabels); + queries, k, metric_type, metric_arg, outDistances, outLabels, true); } -void GpuIndexFlat::reconstruct(Index::idx_t key, float* out) const { +void GpuIndexFlat::reconstruct(idx_t key, float* out) const { DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_FMT( @@ -262,8 +233,7 @@ fromDevice(vec.data(), out, this->d, stream); } -void GpuIndexFlat::reconstruct_n(Index::idx_t i0, Index::idx_t n, float* out) - const { +void GpuIndexFlat::reconstruct_n(idx_t i0, idx_t n, float* out) const { DeviceScope scope(config_.device); if (n == 0) { @@ -271,8 +241,6 @@ return; } - validateNumVectors(n); - FAISS_THROW_IF_NOT_FMT( i0 < this->ntotal, "start index (%zu) out of bounds (ntotal %zu)", @@ -283,19 +251,10 @@ "max index requested (%zu) out of bounds (ntotal %zu)", i0 + n - 1, this->ntotal); - FAISS_THROW_IF_NOT_FMT( - n <= (Index::idx_t)std::numeric_limits::max(), - "number of vectors requested (%zu) must be less than %zu", - n, - (Index::idx_t)std::numeric_limits::max()); auto stream = resources_->getDefaultStream(config_.device); auto outDevice = toDeviceTemporary( - resources_.get(), - config_.device, - out, - stream, - {(int)n, (int)this->d}); + resources_.get(), config_.device, out, stream, {n, this->d}); FAISS_ASSERT(data_); data_->reconstruct(i0, n, outDevice); @@ -303,10 +262,8 @@ fromDevice(outDevice, out, stream); } -void GpuIndexFlat::reconstruct_batch( - Index::idx_t n, - const Index::idx_t* keys, - float* out) const { +void GpuIndexFlat::reconstruct_batch(idx_t n, const idx_t* keys, float* out) + const { DeviceScope scope(config_.device); auto stream = resources_->getDefaultStream(config_.device); @@ -315,21 +272,15 @@ return; } - validateNumVectors(n); - - auto keysDevice = toDeviceTemporary( + auto keysDevice = toDeviceTemporary( resources_.get(), config_.device, - const_cast(keys), + const_cast(keys), stream, - {(int)n}); + {n}); auto outDevice = toDeviceTemporary( - resources_.get(), - config_.device, - out, - stream, - {(int)n, (int)this->d}); + resources_.get(), 
config_.device, out, stream, {n, this->d}); FAISS_ASSERT(data_); data_->reconstruct(keysDevice, outDevice); @@ -338,18 +289,16 @@ fromDevice(outDevice, out, stream); } -void GpuIndexFlat::compute_residual( - const float* x, - float* residual, - Index::idx_t key) const { +void GpuIndexFlat::compute_residual(const float* x, float* residual, idx_t key) + const { compute_residual_n(1, x, residual, &key); } void GpuIndexFlat::compute_residual_n( - Index::idx_t n, + idx_t n, const float* xs, float* residuals, - const Index::idx_t* keys) const { + const idx_t* keys) const { DeviceScope scope(config_.device); auto stream = resources_->getDefaultStream(config_.device); @@ -358,26 +307,20 @@ return; } - validateNumVectors(n); - auto vecsDevice = toDeviceTemporary( resources_.get(), config_.device, const_cast(xs), stream, - {(int)n, (int)this->d}); - auto idsDevice = toDeviceTemporary( + {n, this->d}); + auto idsDevice = toDeviceTemporary( resources_.get(), config_.device, - const_cast(keys), + const_cast(keys), stream, - {(int)n}); + {n}); auto residualDevice = toDeviceTemporary( - resources_.get(), - config_.device, - residuals, - stream, - {(int)n, (int)this->d}); + resources_.get(), config_.device, residuals, stream, {n, this->d}); FAISS_ASSERT(data_); data_->computeResidual(vecsDevice, idsDevice, residualDevice); diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexFlat.h faiss-1.7.4/faiss/gpu/GpuIndexFlat.h --- faiss-1.7.3/faiss/gpu/GpuIndexFlat.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexFlat.h 2023-04-19 13:18:30.000000000 +0000 @@ -82,33 +82,32 @@ void reset() override; /// This index is not trained, so this does nothing - void train(Index::idx_t n, const float* x) override; + void train(idx_t n, const float* x) override; /// Overrides to avoid excessive copies - void add(Index::idx_t, const float* x) override; + void add(idx_t, const float* x) override; /// Reconstruction methods; prefer the batch reconstruct as it will /// be more efficient - void reconstruct(Index::idx_t key, float* out) const override; + void reconstruct(idx_t key, float* out) const override; /// Batch reconstruction method - void reconstruct_n(Index::idx_t i0, Index::idx_t num, float* out) - const override; + void reconstruct_n(idx_t i0, idx_t num, float* out) const override; /// Batch reconstruction method - void reconstruct_batch(Index::idx_t n, const Index::idx_t* keys, float* out) + void reconstruct_batch(idx_t n, const idx_t* keys, float* out) const override; /// Compute residual - void compute_residual(const float* x, float* residual, Index::idx_t key) + void compute_residual(const float* x, float* residual, idx_t key) const override; /// Compute residual (batch mode) void compute_residual_n( - Index::idx_t n, + idx_t n, const float* xs, float* residuals, - const Index::idx_t* keys) const override; + const idx_t* keys) const override; /// For internal access inline FlatIndex* getGpuData() { @@ -121,15 +120,15 @@ bool addImplRequiresIDs_() const override; /// Called from GpuIndex for add - void addImpl_(int n, const float* x, const Index::idx_t* ids) override; + void addImpl_(idx_t n, const float* x, const idx_t* ids) override; /// Called from GpuIndex for search void searchImpl_( - int n, + idx_t n, const float* x, int k, float* distances, - Index::idx_t* labels, + idx_t* labels, const SearchParameters* params) const override; protected: diff -Nru faiss-1.7.3/faiss/gpu/GpuIndex.h faiss-1.7.4/faiss/gpu/GpuIndex.h --- faiss-1.7.3/faiss/gpu/GpuIndex.h 2022-11-08 11:14:13.000000000 +0000 +++ 
faiss-1.7.4/faiss/gpu/GpuIndex.h 2023-04-19 13:18:30.000000000 +0000 @@ -51,30 +51,31 @@ /// `x` can be resident on the CPU or any GPU; copies are performed /// as needed /// Handles paged adds if the add set is too large; calls addInternal_ - void add(Index::idx_t, const float* x) override; + void add(idx_t, const float* x) override; /// `x` and `ids` can be resident on the CPU or any GPU; copies are /// performed as needed /// Handles paged adds if the add set is too large; calls addInternal_ - void add_with_ids(Index::idx_t n, const float* x, const Index::idx_t* ids) - override; + void add_with_ids(idx_t n, const float* x, const idx_t* ids) override; /// `x` and `labels` can be resident on the CPU or any GPU; copies are /// performed as needed void assign( - Index::idx_t n, + idx_t n, const float* x, - Index::idx_t* labels, - Index::idx_t k = 1) const override; + idx_t* labels, + // faiss::Index has idx_t for k + idx_t k = 1) const override; /// `x`, `distances` and `labels` can be resident on the CPU or any /// GPU; copies are performed as needed void search( - Index::idx_t n, + idx_t n, const float* x, - Index::idx_t k, + // faiss::Index has idx_t for k + idx_t k, float* distances, - Index::idx_t* labels, + idx_t* labels, const SearchParameters* params = nullptr) const override; /// `x`, `distances` and `labels` and `recons` can be resident on the CPU or @@ -82,6 +83,7 @@ void search_and_reconstruct( idx_t n, const float* x, + // faiss::Index has idx_t for k idx_t k, float* distances, idx_t* labels, @@ -90,16 +92,16 @@ /// Overridden to force GPU indices to provide their own GPU-friendly /// implementation - void compute_residual(const float* x, float* residual, Index::idx_t key) + void compute_residual(const float* x, float* residual, idx_t key) const override; /// Overridden to force GPU indices to provide their own GPU-friendly /// implementation void compute_residual_n( - Index::idx_t n, + idx_t n, const float* xs, float* residuals, - const Index::idx_t* keys) const override; + const idx_t* keys) const override; protected: /// Copy what we need from the CPU equivalent @@ -114,43 +116,43 @@ /// Overridden to actually perform the add /// All data is guaranteed to be resident on our device - virtual void addImpl_(int n, const float* x, const Index::idx_t* ids) = 0; + virtual void addImpl_(idx_t n, const float* x, const idx_t* ids) = 0; /// Overridden to actually perform the search /// All data is guaranteed to be resident on our device virtual void searchImpl_( - int n, + idx_t n, const float* x, int k, float* distances, - Index::idx_t* labels, + idx_t* labels, const SearchParameters* params) const = 0; private: /// Handles paged adds if the add set is too large, passes to /// addImpl_ to actually perform the add for the current page - void addPaged_(int n, const float* x, const Index::idx_t* ids); + void addPaged_(idx_t n, const float* x, const idx_t* ids); /// Calls addImpl_ for a single page of GPU-resident data - void addPage_(int n, const float* x, const Index::idx_t* ids); + void addPage_(idx_t n, const float* x, const idx_t* ids); /// Calls searchImpl_ for a single page of GPU-resident data void searchNonPaged_( - int n, + idx_t n, const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData, + idx_t* outIndicesData, const SearchParameters* params) const; /// Calls searchImpl_ for a single page of GPU-resident data, /// handling paging of the data and copies from the CPU void searchFromCpuPaged_( - int n, + idx_t n, const float* x, int k, float* 
outDistancesData, - Index::idx_t* outIndicesData, + idx_t* outIndicesData, const SearchParameters* params) const; protected: diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexIVF.cu faiss-1.7.4/faiss/gpu/GpuIndexIVF.cu --- faiss-1.7.3/faiss/gpu/GpuIndexIVF.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexIVF.cu 2023-04-19 13:18:30.000000000 +0000 @@ -24,13 +24,10 @@ int dims, faiss::MetricType metric, float metricArg, - int nlistIn, + idx_t nlistIn, GpuIndexIVFConfig config) : GpuIndex(provider->getResources(), dims, metric, metricArg, config), - nlist(nlistIn), - nprobe(1), - quantizer(nullptr), - own_fields(false), + IndexIVFInterface(nullptr, nlistIn), ivfConfig_(config) { // Only IP and L2 are supported for now if (!(metric_type == faiss::METRIC_L2 || @@ -47,12 +44,10 @@ int dims, faiss::MetricType metric, float metricArg, - int nlistIn, + idx_t nlistIn, GpuIndexIVFConfig config) : GpuIndex(provider->getResources(), dims, metric, metricArg, config), - nlist(nlistIn), - nprobe(1), - quantizer(coarseQuantizer), + IndexIVFInterface(coarseQuantizer, nlistIn), ivfConfig_(config) { FAISS_THROW_IF_NOT_MSG( quantizer, "expecting a coarse quantizer object; none provided"); @@ -113,11 +108,7 @@ verifyIVFSettings_(); } -GpuIndexIVF::~GpuIndexIVF() { - if (own_fields) { - delete quantizer; - } -} +GpuIndexIVF::~GpuIndexIVF() {} void GpuIndexIVF::verifyIVFSettings_() const { // We should always have a quantizer instance @@ -130,7 +121,7 @@ // IVF quantizer should correspond to our set of lists FAISS_THROW_IF_NOT_FMT( quantizer->ntotal == nlist, - "IVF nlist count (%d) does not match trained coarse quantizer size (%zu)", + "IVF nlist count (%zu) does not match trained coarse quantizer size (%zu)", nlist, quantizer->ntotal); } else { @@ -159,17 +150,9 @@ GpuIndex::copyFrom(index); FAISS_ASSERT(index->nlist > 0); - FAISS_THROW_IF_NOT_FMT( - index->nlist <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports %zu inverted lists", - (size_t)std::numeric_limits::max()); nlist = index->nlist; - FAISS_THROW_IF_NOT_FMT( - index->nprobe > 0 && index->nprobe <= getMaxKSelection(), - "GPU index only supports nprobe <= %zu; passed %zu", - (size_t)getMaxKSelection(), - index->nprobe); + validateNProbe(index->nprobe); nprobe = index->nprobe; // The metric type may have changed as well, so we might have to @@ -249,53 +232,41 @@ index->make_direct_map(false); } -int GpuIndexIVF::getNumLists() const { +idx_t GpuIndexIVF::getNumLists() const { return nlist; } -void GpuIndexIVF::setNumProbes(int nprobe) { - FAISS_THROW_IF_NOT_FMT( - nprobe > 0 && nprobe <= getMaxKSelection(), - "GPU index only supports nprobe <= %d; passed %d", - getMaxKSelection(), - nprobe); - this->nprobe = nprobe; -} - -int GpuIndexIVF::getNumProbes() const { - return nprobe; -} - -int GpuIndexIVF::getListLength(int listId) const { +idx_t GpuIndexIVF::getListLength(idx_t listId) const { DeviceScope scope(config_.device); FAISS_ASSERT(baseIndex_); return baseIndex_->getListLength(listId); } -std::vector GpuIndexIVF::getListVectorData(int listId, bool gpuFormat) - const { +std::vector GpuIndexIVF::getListVectorData( + idx_t listId, + bool gpuFormat) const { DeviceScope scope(config_.device); FAISS_ASSERT(baseIndex_); return baseIndex_->getListVectorData(listId, gpuFormat); } -std::vector GpuIndexIVF::getListIndices(int listId) const { +std::vector GpuIndexIVF::getListIndices(idx_t listId) const { DeviceScope scope(config_.device); FAISS_ASSERT(baseIndex_); return baseIndex_->getListIndices(listId); } -void 
GpuIndexIVF::addImpl_(int n, const float* x, const Index::idx_t* xids) { +void GpuIndexIVF::addImpl_(idx_t n, const float* x, const idx_t* xids) { // Device is already set in GpuIndex::add FAISS_ASSERT(baseIndex_); FAISS_ASSERT(n > 0); // Data is already resident on the GPU - Tensor data(const_cast(x), {n, (int)this->d}); - Tensor labels(const_cast(xids), {n}); + Tensor data(const_cast(x), {n, this->d}); + Tensor labels(const_cast(xids), {n}); // Not all vectors may be able to be added (some may contain NaNs etc) baseIndex_->addVectors(quantizer, data, labels); @@ -305,15 +276,8 @@ ntotal += n; } -void GpuIndexIVF::searchImpl_( - int n, - const float* x, - int k, - float* distances, - Index::idx_t* labels, - const SearchParameters* params) const { - // Device was already set in GpuIndex::search - Index::idx_t use_nprobe = nprobe; +int GpuIndexIVF::getCurrentNProbe_(const SearchParameters* params) const { + size_t use_nprobe = nprobe; if (params) { auto ivfParams = dynamic_cast(params); if (ivfParams) { @@ -333,16 +297,28 @@ } validateNProbe(use_nprobe); + // We use int internally for nprobe + return int(use_nprobe); +} + +void GpuIndexIVF::searchImpl_( + idx_t n, + const float* x, + int k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + // Device was already set in GpuIndex::search + int use_nprobe = getCurrentNProbe_(params); // This was previously checked FAISS_ASSERT(is_trained && baseIndex_); FAISS_ASSERT(n > 0); // Data is already resident on the GPU - Tensor queries(const_cast(x), {n, (int)this->d}); + Tensor queries(const_cast(x), {n, this->d}); Tensor outDistances(distances, {n, k}); - Tensor outLabels( - const_cast(labels), {n, k}); + Tensor outLabels(const_cast(labels), {n, k}); baseIndex_->search( quantizer, queries, use_nprobe, k, outDistances, outLabels); @@ -357,7 +333,9 @@ float* distances, idx_t* labels, bool store_pairs, - const IVFSearchParameters* params) const { + const IVFSearchParameters* params, + IndexIVFStats* stats) const { + FAISS_THROW_IF_NOT_MSG(stats == nullptr, "IVF stats not supported"); DeviceScope scope(config_.device); auto stream = resources_->getDefaultStream(config_.device); @@ -368,7 +346,6 @@ FAISS_THROW_IF_NOT_MSG(this->is_trained, "GpuIndexIVF not trained"); FAISS_ASSERT(baseIndex_); - validateNumVectors(n); validateKSelect(k); if (n == 0 || k == 0) { @@ -379,13 +356,12 @@ idx_t use_nprobe = params ? params->nprobe : this->nprobe; validateNProbe(use_nprobe); - if (params) { - FAISS_THROW_IF_NOT_FMT( - params->max_codes == 0, - "GPU IVF index does not currently support " - "SearchParametersIVF::max_codes (passed %zu, must be 0)", - params->max_codes); - } + size_t max_codes = params ? 
params->max_codes : this->max_codes; + FAISS_THROW_IF_NOT_FMT( + max_codes == 0, + "GPU IVF index does not currently support " + "SearchParametersIVF::max_codes (passed %zu, must be 0)", + max_codes); // Ensure that all data/output buffers are resident on our desired device auto vecsDevice = toDeviceTemporary( @@ -393,31 +369,27 @@ config_.device, const_cast(x), stream, - {(int)n, (int)d}); + {n, d}); auto distanceDevice = toDeviceTemporary( resources_.get(), config_.device, const_cast(centroid_dis), stream, - {(int)n, (int)use_nprobe}); + {n, use_nprobe}); - auto assignDevice = toDeviceTemporary( + auto assignDevice = toDeviceTemporary( resources_.get(), config_.device, - const_cast(assign), + const_cast(assign), stream, - {(int)n, (int)use_nprobe}); + {n, use_nprobe}); auto outDistancesDevice = toDeviceTemporary( - resources_.get(), - config_.device, - distances, - stream, - {(int)n, (int)k}); + resources_.get(), config_.device, distances, stream, {n, k}); - auto outIndicesDevice = toDeviceTemporary( - resources_.get(), config_.device, labels, stream, {(int)n, (int)k}); + auto outIndicesDevice = toDeviceTemporary( + resources_.get(), config_.device, labels, stream, {n, k}); baseIndex_->searchPreassigned( quantizer, @@ -431,7 +403,20 @@ // If the output was not already on the GPU, copy it back fromDevice(outDistancesDevice, distances, stream); - fromDevice(outIndicesDevice, labels, stream); + fromDevice(outIndicesDevice, labels, stream); +} + +void GpuIndexIVF::range_search_preassigned( + idx_t nx, + const float* x, + float radius, + const idx_t* keys, + const float* coarse_dis, + RangeSearchResult* result, + bool store_pairs, + const IVFSearchParameters* params, + IndexIVFStats* stats) const { + FAISS_THROW_MSG("range search not implemented"); } bool GpuIndexIVF::addImplRequiresIDs_() const { @@ -439,7 +424,7 @@ return true; } -void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) { +void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { DeviceScope scope(config_.device); if (n == 0) { diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexIVFFlat.cu faiss-1.7.4/faiss/gpu/GpuIndexIVFFlat.cu --- faiss-1.7.3/faiss/gpu/GpuIndexIVFFlat.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexIVFFlat.cu 2023-04-19 13:18:30.000000000 +0000 @@ -39,7 +39,7 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( GpuResourcesProvider* provider, int dims, - int nlist, + idx_t nlist, faiss::MetricType metric, GpuIndexIVFFlatConfig config) : GpuIndexIVF(provider, dims, metric, 0, nlist, config), @@ -53,7 +53,7 @@ GpuResourcesProvider* provider, Index* coarseQuantizer, int dims, - int nlist, + idx_t nlist, faiss::MetricType metric, GpuIndexIVFFlatConfig config) : GpuIndexIVF( @@ -188,15 +188,9 @@ } } -void GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { +void GpuIndexIVFFlat::train(idx_t n, const float* x) { DeviceScope scope(config_.device); - // For now, only support <= max int results - FAISS_THROW_IF_NOT_FMT( - n <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %d indices", - std::numeric_limits::max()); - // just in case someone changed our quantizer verifyIVFSettings_(); @@ -213,7 +207,7 @@ auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), - {(int)n, (int)this->d}); + {n, this->d}); trainQuantizer_(n, hostData.data()); diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexIVFFlat.h faiss-1.7.4/faiss/gpu/GpuIndexIVFFlat.h --- faiss-1.7.3/faiss/gpu/GpuIndexIVFFlat.h 2022-11-08 11:14:13.000000000 +0000 +++ 
faiss-1.7.4/faiss/gpu/GpuIndexIVFFlat.h 2023-04-19 13:18:30.000000000 +0000 @@ -44,7 +44,7 @@ GpuIndexIVFFlat( GpuResourcesProvider* provider, int dims, - int nlist, + idx_t nlist, faiss::MetricType metric = faiss::METRIC_L2, GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); @@ -54,7 +54,7 @@ GpuResourcesProvider* provider, Index* coarseQuantizer, int dims, - int nlist, + idx_t nlist, faiss::MetricType metric = faiss::METRIC_L2, GpuIndexIVFFlatConfig config = GpuIndexIVFFlatConfig()); @@ -85,7 +85,7 @@ void updateQuantizer() override; /// Trains the coarse quantizer based on the given vector data - void train(Index::idx_t n, const float* x) override; + void train(idx_t n, const float* x) override; protected: /// Our configuration options diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexIVF.h faiss-1.7.4/faiss/gpu/GpuIndexIVF.h --- faiss-1.7.3/faiss/gpu/GpuIndexIVF.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexIVF.h 2023-04-19 13:18:30.000000000 +0000 @@ -33,7 +33,7 @@ /// Base class of all GPU IVF index types. This (for now) deliberately does not /// inherit from IndexIVF, as many of the public data members and functionality /// in IndexIVF is not supported in the same manner on the GPU. -class GpuIndexIVF : public GpuIndex { +class GpuIndexIVF : public GpuIndex, public IndexIVFInterface { public: /// Version that auto-constructs a flat coarse quantizer based on the /// desired metric @@ -42,7 +42,7 @@ int dims, faiss::MetricType metric, float metricArg, - int nlist, + idx_t nlist, GpuIndexIVFConfig config = GpuIndexIVFConfig()); /// Version that takes a coarse quantizer instance. The GpuIndexIVF does not @@ -53,7 +53,7 @@ int dims, faiss::MetricType metric, float metricArg, - int nlist, + idx_t nlist, GpuIndexIVFConfig config = GpuIndexIVFConfig()); ~GpuIndexIVF() override; @@ -75,10 +75,10 @@ virtual void updateQuantizer() = 0; /// Returns the number of inverted lists we're managing - int getNumLists() const; + idx_t getNumLists() const; /// Returns the number of vectors present in a particular inverted list - int getListLength(int listId) const; + idx_t getListLength(idx_t listId) const; /// Return the encoded vector data contained in a particular inverted list, /// for debugging purposes. @@ -86,34 +86,13 @@ /// GPU-side representation. /// Otherwise, it is converted to the CPU format. /// compliant format, while the native GPU format may differ. - std::vector getListVectorData(int listId, bool gpuFormat = false) + std::vector getListVectorData(idx_t listId, bool gpuFormat = false) const; /// Return the vector indices contained in a particular inverted list, for /// debugging purposes. - std::vector getListIndices(int listId) const; + std::vector getListIndices(idx_t listId) const; - /// Sets the number of list probes per query - void setNumProbes(int nprobe); - - /// Returns our current number of list probes per query - int getNumProbes() const; - - /// Same interface as faiss::IndexIVF, in order to search a set of vectors - /// pre-quantized by the IVF quantizer. Does not include IndexIVFStats as - /// that can only be obtained on the host via a GPU d2h copy. 
- /// @param n nb of vectors to query - /// @param x query vectors, size nx * d - /// @param assign coarse quantization indices, size nx * nprobe - /// @param centroid_dis - /// distances to coarse centroids, size nx * nprobe - /// @param distance - /// output distances, size n * k - /// @param labels output labels, size n * k - /// @param store_pairs store inv list index + inv list offset - /// instead in upper/lower 32 bit of result, - /// instead of ids (used for reranking). - /// @param params used to override the object's search parameters void search_preassigned( idx_t n, const float* x, @@ -123,41 +102,41 @@ float* distances, idx_t* labels, bool store_pairs, - const SearchParametersIVF* params = nullptr) const; + const SearchParametersIVF* params = nullptr, + IndexIVFStats* stats = nullptr) const override; + + // not implemented for GPU + void range_search_preassigned( + idx_t nx, + const float* x, + float radius, + const idx_t* keys, + const float* coarse_dis, + RangeSearchResult* result, + bool store_pairs = false, + const IVFSearchParameters* params = nullptr, + IndexIVFStats* stats = nullptr) const override; protected: + /// From either the current set nprobe or the SearchParameters if available, + /// return the nprobe that we should use for the current search + int getCurrentNProbe_(const SearchParameters* params) const; void verifyIVFSettings_() const; bool addImplRequiresIDs_() const override; - void trainQuantizer_(Index::idx_t n, const float* x); + void trainQuantizer_(idx_t n, const float* x); /// Called from GpuIndex for add/add_with_ids - void addImpl_(int n, const float* x, const Index::idx_t* ids) override; + void addImpl_(idx_t n, const float* x, const idx_t* ids) override; /// Called from GpuIndex for search void searchImpl_( - int n, + idx_t n, const float* x, int k, float* distances, - Index::idx_t* labels, + idx_t* labels, const SearchParameters* params) const override; - public: - /// Exposing this like the CPU version for manipulation - ClusteringParameters cp; - - /// Exposing this like the CPU version for query - int nlist; - - /// Exposing this like the CPU version for manipulation - int nprobe; - - /// A user-pluggable coarse quantizer - Index* quantizer; - - /// Whether or not we own the coarse quantizer - bool own_fields; - protected: /// Our configuration options const GpuIndexIVFConfig ivfConfig_; diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexIVFPQ.cu faiss-1.7.4/faiss/gpu/GpuIndexIVFPQ.cu --- faiss-1.7.3/faiss/gpu/GpuIndexIVFPQ.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexIVFPQ.cu 2023-04-19 13:18:30.000000000 +0000 @@ -43,9 +43,9 @@ GpuIndexIVFPQ::GpuIndexIVFPQ( GpuResourcesProvider* provider, int dims, - int nlist, - int subQuantizers, - int bitsPerCode, + idx_t nlist, + idx_t subQuantizers, + idx_t bitsPerCode, faiss::MetricType metric, GpuIndexIVFPQConfig config) : GpuIndexIVF(provider, dims, metric, 0, nlist, config), @@ -62,9 +62,9 @@ GpuResourcesProvider* provider, Index* coarseQuantizer, int dims, - int nlist, - int subQuantizers, - int bitsPerCode, + idx_t nlist, + idx_t subQuantizers, + idx_t bitsPerCode, faiss::MetricType metric, GpuIndexIVFPQConfig config) : GpuIndexIVF( @@ -270,7 +270,7 @@ } } -void GpuIndexIVFPQ::trainResidualQuantizer_(Index::idx_t n, const float* x) { +void GpuIndexIVFPQ::trainResidualQuantizer_(idx_t n, const float* x) { // Code largely copied from faiss::IndexIVFPQ auto x_in = x; @@ -288,7 +288,7 @@ printf("computing residuals\n"); } - std::vector assign(n); + std::vector assign(n); 
quantizer->assign(n, x, assign.data()); std::vector residuals(n * d); @@ -347,15 +347,9 @@ index_->setPrecomputedCodes(quantizer, usePrecomputedTables_); } -void GpuIndexIVFPQ::train(Index::idx_t n, const float* x) { +void GpuIndexIVFPQ::train(idx_t n, const float* x) { DeviceScope scope(config_.device); - // For now, only support <= max int results - FAISS_THROW_IF_NOT_FMT( - n <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %d indices", - std::numeric_limits::max()); - // just in case someone changed us verifyPQSettings_(); verifyIVFSettings_(); @@ -373,7 +367,7 @@ auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), - {(int)n, (int)this->d}); + {n, this->d}); trainQuantizer_(n, hostData.data()); trainResidualQuantizer_(n, hostData.data()); diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexIVFPQ.h faiss-1.7.4/faiss/gpu/GpuIndexIVFPQ.h --- faiss-1.7.3/faiss/gpu/GpuIndexIVFPQ.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexIVFPQ.h 2023-04-19 13:18:30.000000000 +0000 @@ -68,9 +68,9 @@ GpuIndexIVFPQ( GpuResourcesProvider* provider, int dims, - int nlist, - int subQuantizers, - int bitsPerCode, + idx_t nlist, + idx_t subQuantizers, + idx_t bitsPerCode, faiss::MetricType metric = faiss::METRIC_L2, GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); @@ -80,9 +80,9 @@ GpuResourcesProvider* provider, Index* coarseQuantizer, int dims, - int nlist, - int subQuantizers, - int bitsPerCode, + idx_t nlist, + idx_t subQuantizers, + idx_t bitsPerCode, faiss::MetricType metric = faiss::METRIC_L2, GpuIndexIVFPQConfig config = GpuIndexIVFPQConfig()); @@ -131,7 +131,7 @@ void updateQuantizer() override; /// Trains the coarse and product quantizer based on the given vector data - void train(Index::idx_t n, const float* x) override; + void train(idx_t n, const float* x) override; public: /// Like the CPU version, we expose a publically-visible ProductQuantizer @@ -143,7 +143,7 @@ void verifyPQSettings_() const; /// Trains the PQ quantizer based on the given vector data - void trainResidualQuantizer_(Index::idx_t n, const float* x); + void trainResidualQuantizer_(idx_t n, const float* x); protected: /// Our configuration options that we were initialized with diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexIVFScalarQuantizer.cu faiss-1.7.4/faiss/gpu/GpuIndexIVFScalarQuantizer.cu --- faiss-1.7.3/faiss/gpu/GpuIndexIVFScalarQuantizer.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexIVFScalarQuantizer.cu 2023-04-19 13:18:30.000000000 +0000 @@ -39,7 +39,7 @@ GpuIndexIVFScalarQuantizer::GpuIndexIVFScalarQuantizer( GpuResourcesProvider* provider, int dims, - int nlist, + idx_t nlist, faiss::ScalarQuantizer::QuantizerType qtype, faiss::MetricType metric, bool encodeResidual, @@ -58,7 +58,7 @@ GpuResourcesProvider* provider, Index* coarseQuantizer, int dims, - int nlist, + idx_t nlist, faiss::ScalarQuantizer::QuantizerType qtype, faiss::MetricType metric, bool encodeResidual, @@ -88,6 +88,26 @@ void GpuIndexIVFScalarQuantizer::verifySQSettings_() const { FAISS_THROW_IF_NOT_MSG( isSQSupported(sq.qtype), "Unsupported scalar QuantizerType on GPU"); + + // Check the amount of shared memory per block available based on our type + // is sufficient + // This check was previously in IVFFlatScan.cu, moved here to apply upon + // index construction + if (sq.qtype == ScalarQuantizer::QuantizerType::QT_8bit || + sq.qtype == ScalarQuantizer::QuantizerType::QT_4bit) { + // There are quantization parameters per each dimension for these SQ + // 
types. These parameters are retained in shared memory for access + int maxDim = + getMaxSharedMemPerBlock(config_.device) / (sizeof(float) * 2); + + FAISS_THROW_IF_NOT_FMT( + this->d < maxDim, + "GpuIndexIVFScalarQuantizer: Insufficient shared memory " + "available on the GPU for QT_8bit or QT_4bit with %d " + "dimensions; maximum dimensions possible is %d", + this->d, + maxDim); + } } void GpuIndexIVFScalarQuantizer::reserveMemory(size_t numVecs) { @@ -197,22 +217,14 @@ } } -void GpuIndexIVFScalarQuantizer::trainResiduals_( - Index::idx_t n, - const float* x) { +void GpuIndexIVFScalarQuantizer::trainResiduals_(idx_t n, const float* x) { // The input is already guaranteed to be on the CPU sq.train_residual(n, x, quantizer, by_residual, verbose); } -void GpuIndexIVFScalarQuantizer::train(Index::idx_t n, const float* x) { +void GpuIndexIVFScalarQuantizer::train(idx_t n, const float* x) { DeviceScope scope(config_.device); - // For now, only support <= max int results - FAISS_THROW_IF_NOT_FMT( - n <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %d indices", - std::numeric_limits::max()); - // just in case someone changed us verifySQSettings_(); verifyIVFSettings_(); @@ -230,7 +242,7 @@ auto hostData = toHost( (float*)x, resources_->getDefaultStream(config_.device), - {(int)n, (int)this->d}); + {n, this->d}); trainQuantizer_(n, hostData.data()); trainResiduals_(n, hostData.data()); diff -Nru faiss-1.7.3/faiss/gpu/GpuIndexIVFScalarQuantizer.h faiss-1.7.4/faiss/gpu/GpuIndexIVFScalarQuantizer.h --- faiss-1.7.3/faiss/gpu/GpuIndexIVFScalarQuantizer.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/GpuIndexIVFScalarQuantizer.h 2023-04-19 13:18:30.000000000 +0000 @@ -42,7 +42,7 @@ GpuIndexIVFScalarQuantizer( GpuResourcesProvider* provider, int dims, - int nlist, + idx_t nlist, faiss::ScalarQuantizer::QuantizerType qtype, faiss::MetricType metric = MetricType::METRIC_L2, bool encodeResidual = true, @@ -55,7 +55,7 @@ GpuResourcesProvider* provider, Index* coarseQuantizer, int dims, - int nlist, + idx_t nlist, faiss::ScalarQuantizer::QuantizerType qtype, faiss::MetricType metric = MetricType::METRIC_L2, bool encodeResidual = true, @@ -89,14 +89,14 @@ void updateQuantizer() override; /// Trains the coarse and scalar quantizer based on the given vector data - void train(Index::idx_t n, const float* x) override; + void train(idx_t n, const float* x) override; protected: /// Validates index SQ parameters void verifySQSettings_() const; /// Called from train to handle SQ residual training - void trainResiduals_(Index::idx_t n, const float* x); + void trainResiduals_(idx_t n, const float* x); public: /// Exposed like the CPU version diff -Nru faiss-1.7.3/faiss/gpu/impl/BinaryDistance.cu faiss-1.7.4/faiss/gpu/impl/BinaryDistance.cu --- faiss-1.7.3/faiss/gpu/impl/BinaryDistance.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/BinaryDistance.cu 2023-04-19 13:18:30.000000000 +0000 @@ -27,7 +27,7 @@ const Tensor vecs, const Tensor query, Tensor outK, - Tensor outV, + Tensor outV, int k) { // A matrix tile (query, k) __shared__ BinaryType queryTile[kWarps][kLanes + 1]; // avoid bank conflict @@ -37,7 +37,7 @@ WarpSelect< int, - int, + idx_t, false, Comparator, NumWarpQ, @@ -49,16 +49,16 @@ int laneId = threadIdx.x; // Each warp handles a single query - int warpQuery = blockIdx.x * kWarps + warpId; + idx_t warpQuery = idx_t(blockIdx.x) * kWarps + warpId; bool queryInBounds = warpQuery < query.getSize(0); // Each warp loops through the entire chunk of 
vectors - for (int blockVec = 0; blockVec < vecs.getSize(0); blockVec += kLanes) { + for (idx_t blockVec = 0; blockVec < vecs.getSize(0); blockVec += kLanes) { int threadDistance = 0; // Reduction dimension - for (int blockK = 0; blockK < vecs.getSize(1); blockK += kLanes) { - int laneK = blockK + laneId; + for (idx_t blockK = 0; blockK < vecs.getSize(1); blockK += kLanes) { + idx_t laneK = blockK + laneId; bool kInBounds = laneK < vecs.getSize(1); queryTile[warpId][laneId] = @@ -68,7 +68,7 @@ #pragma unroll for (int i = 0; i < kLanes / kWarps; ++i) { int warpVec = i * kWarps + warpId; - int vec = blockVec + warpVec; + idx_t vec = blockVec + warpVec; bool vecInBounds = vec < vecs.getSize(0); vecTile[warpVec][laneId] = @@ -92,7 +92,7 @@ bool valInBounds = queryInBounds && (blockVec + laneId < vecs.getSize(0)); threadDistance = valInBounds ? threadDistance : kMaxDistance; - int id = valInBounds ? blockVec + laneId : -1; + idx_t id = valInBounds ? blockVec + laneId : idx_t(-1); heap.add(threadDistance, id); } @@ -115,7 +115,7 @@ const Tensor vecs, const Tensor query, Tensor outK, - Tensor outV, + Tensor outV, int k) { // A matrix tile (query, k) __shared__ BinaryType queryTile[kWarps][kLanes + 1]; // avoid bank conflict @@ -125,7 +125,7 @@ WarpSelect< int, - int, + idx_t, false, Comparator, NumWarpQ, @@ -138,7 +138,7 @@ // Each warp handles a single query int laneK = laneId; - int warpQuery = blockIdx.x * kWarps + warpId; + idx_t warpQuery = idx_t(blockIdx.x) * kWarps + warpId; bool kInBounds = laneK < vecs.getSize(1); bool queryInBounds = warpQuery < query.getSize(0); @@ -146,14 +146,14 @@ queryInBounds && kInBounds ? query[warpQuery][laneK] : 0; // Each warp loops through the entire chunk of vectors - for (int blockVec = 0; blockVec < vecs.getSize(0); blockVec += kLanes) { + for (idx_t blockVec = 0; blockVec < vecs.getSize(0); blockVec += kLanes) { int threadDistance = 0; // kWarps warps are responsible for loading 32 vecs #pragma unroll for (int i = 0; i < kLanes / kWarps; ++i) { int warpVec = i * kWarps + warpId; - int vec = blockVec + warpVec; + idx_t vec = blockVec + warpVec; bool vecInBounds = vec < vecs.getSize(0); vecTile[warpVec][laneId] = @@ -175,7 +175,7 @@ bool valInBounds = queryInBounds && (blockVec + laneId < vecs.getSize(0)); threadDistance = valInBounds ? threadDistance : kMaxDistance; - int id = valInBounds ? blockVec + laneId : -1; + idx_t id = valInBounds ? 
blockVec + laneId : idx_t(-1); heap.add(threadDistance, id); } @@ -192,7 +192,7 @@ Tensor& vecs, Tensor& query, Tensor& outK, - Tensor& outV, + Tensor& outV, int k, cudaStream_t stream) { dim3 grid(utils::divUp(query.getSize(0), kWarps)); @@ -233,7 +233,7 @@ Tensor& vecs, Tensor& query, Tensor& outK, - Tensor& outV, + Tensor& outV, int k, cudaStream_t stream) { dim3 grid(utils::divUp(query.getSize(0), kWarps)); @@ -273,7 +273,7 @@ Tensor& vecs, Tensor& query, Tensor& outK, - Tensor& outV, + Tensor& outV, int k, cudaStream_t stream) { FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); diff -Nru faiss-1.7.3/faiss/gpu/impl/BinaryDistance.cuh faiss-1.7.4/faiss/gpu/impl/BinaryDistance.cuh --- faiss-1.7.3/faiss/gpu/impl/BinaryDistance.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/BinaryDistance.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -16,7 +16,7 @@ Tensor& vecs, Tensor& query, Tensor& outK, - Tensor& outV, + Tensor& outV, int k, cudaStream_t stream); diff -Nru faiss-1.7.3/faiss/gpu/impl/BinaryFlatIndex.cu faiss-1.7.4/faiss/gpu/impl/BinaryFlatIndex.cu --- faiss-1.7.3/faiss/gpu/impl/BinaryFlatIndex.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/BinaryFlatIndex.cu 2023-04-19 13:18:30.000000000 +0000 @@ -13,6 +13,8 @@ namespace faiss { namespace gpu { +constexpr int kBitsPerByte = 8; + BinaryFlatIndex::BinaryFlatIndex(GpuResources* res, int dim, MemorySpace space) : resources_(res), dim_(dim), @@ -23,20 +25,25 @@ AllocType::FlatData, space, res->getDefaultStreamCurrentDevice())) { - FAISS_ASSERT(dim % 8 == 0); + // Like the CPU version, dimensions must be evenly divisible by 8 (fit into + // an integral number of bytes) + FAISS_ASSERT(dim % kBitsPerByte == 0); } /// Returns the number of vectors we contain -int BinaryFlatIndex::getSize() const { +idx_t BinaryFlatIndex::getSize() const { return vectors_.getSize(0); } -int BinaryFlatIndex::getDim() const { - return vectors_.getSize(1) * 8; +idx_t BinaryFlatIndex::getDim() const { + return vectors_.getSize(1) * kBitsPerByte; } void BinaryFlatIndex::reserve(size_t numVecs, cudaStream_t stream) { - rawData_.reserve(numVecs * (dim_ / 8) * sizeof(unsigned int), stream); + // Like the CPU version, dimensions must be evenly divisible by 8 (fit into + // an integral number of bytes) + rawData_.reserve( + numVecs * (dim_ / kBitsPerByte) * sizeof(unsigned char), stream); } Tensor& BinaryFlatIndex::getVectorsRef() { @@ -47,7 +54,7 @@ Tensor& input, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { auto stream = resources_->getDefaultStreamCurrentDevice(); runBinaryDistance(vectors_, input, outDistances, outIndices, k, stream); @@ -55,7 +62,7 @@ void BinaryFlatIndex::add( const unsigned char* data, - int numVecs, + idx_t numVecs, cudaStream_t stream) { if (numVecs == 0) { return; @@ -63,14 +70,14 @@ rawData_.append( (char*)data, - (size_t)(dim_ / 8) * numVecs * sizeof(unsigned char), + (size_t)(dim_ / kBitsPerByte) * numVecs * sizeof(unsigned char), stream, true /* reserve exactly */); num_ += numVecs; DeviceTensor vectors( - (unsigned char*)rawData_.data(), {(int)num_, (dim_ / 8)}); + (unsigned char*)rawData_.data(), {num_, (dim_ / kBitsPerByte)}); vectors_ = std::move(vectors); } diff -Nru faiss-1.7.3/faiss/gpu/impl/BinaryFlatIndex.cuh faiss-1.7.4/faiss/gpu/impl/BinaryFlatIndex.cuh --- faiss-1.7.3/faiss/gpu/impl/BinaryFlatIndex.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/BinaryFlatIndex.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -21,9 +21,9 @@ 
BinaryFlatIndex(GpuResources* res, int dim, MemorySpace space); /// Returns the number of vectors we contain - int getSize() const; + idx_t getSize() const; - int getDim() const; + idx_t getDim() const; /// Reserve storage that can contain at least this many vectors void reserve(size_t numVecs, cudaStream_t stream); @@ -35,11 +35,11 @@ Tensor& vecs, int k, Tensor& outDistances, - Tensor& outIndices); + Tensor& outIndices); /// Add vectors to ourselves; the pointer passed can be on the host /// or the device - void add(const unsigned char* data, int numVecs, cudaStream_t stream); + void add(const unsigned char* data, idx_t numVecs, cudaStream_t stream); /// Free all storage void reset(); @@ -52,7 +52,7 @@ const int dim_; /// How many vectors we have - int num_; + idx_t num_; /// The underlying expandable storage DeviceVector rawData_; diff -Nru faiss-1.7.3/faiss/gpu/impl/BroadcastSum.cu faiss-1.7.4/faiss/gpu/impl/BroadcastSum.cu --- faiss-1.7.3/faiss/gpu/impl/BroadcastSum.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/BroadcastSum.cu 2023-04-19 13:18:30.000000000 +0000 @@ -25,9 +25,9 @@ // blockIdx.x: which chunk of rows we are responsible for updating // blockIdx.y: which chunk of columns we are responsible for // updating - int rowStart = blockIdx.x * kRowsPerBlock; - int rowEnd = rowStart + kRowsPerBlock; - int colStart = blockIdx.y * blockDim.x * kColLoad; + idx_t rowStart = idx_t(blockIdx.x) * kRowsPerBlock; + idx_t rowEnd = rowStart + kRowsPerBlock; + idx_t colStart = idx_t(blockIdx.y) * blockDim.x * kColLoad; // FIXME: if we have exact multiples, don't need this bool endRow = (blockIdx.x == gridDim.x - 1); @@ -40,12 +40,12 @@ } if (endCol) { - for (int col = colStart + threadIdx.x; col < input.getSize(0); + for (idx_t col = colStart + threadIdx.x; col < input.getSize(0); col += blockDim.x) { T val = input[col]; if (endRow) { - for (int row = rowStart; row < output.getSize(0); ++row) { + for (idx_t row = rowStart; row < output.getSize(0); ++row) { T out = output[row][col]; out = Math::add(out, val); output[row][col] = out; @@ -53,7 +53,7 @@ } else { T rows[kRowUnroll]; - for (int row = rowStart; row < rowEnd; row += kRowUnroll) { + for (idx_t row = rowStart; row < rowEnd; row += kRowUnroll) { #pragma unroll for (int i = 0; i < kRowUnroll; ++i) { rows[i] = output[row + i][col]; @@ -72,7 +72,7 @@ } } } else { - int col = colStart + threadIdx.x; + idx_t col = colStart + threadIdx.x; T val[kColLoad]; @@ -82,7 +82,7 @@ } if (endRow) { - for (int row = rowStart; row < output.getSize(0); ++row) { + for (idx_t row = rowStart; row < output.getSize(0); ++row) { #pragma unroll for (int i = 0; i < kColLoad; ++i) { T out = output[row][col + i * blockDim.x]; @@ -93,7 +93,7 @@ } else { T rows[kRowUnroll * kColLoad]; - for (int row = rowStart; row < rowEnd; row += kRowUnroll) { + for (idx_t row = rowStart; row < rowEnd; row += kRowUnroll) { #pragma unroll for (int i = 0; i < kRowUnroll; ++i) { #pragma unroll @@ -134,9 +134,9 @@ // blockIdx.x: which chunk of rows we are responsible for updating // blockIdx.y: which chunk of columns we are responsible for // updating - int rowStart = blockIdx.x * kRowsPerBlock; - int rowEnd = rowStart + kRowsPerBlock; - int colStart = blockIdx.y * blockDim.x * kColLoad; + idx_t rowStart = idx_t(blockIdx.x) * kRowsPerBlock; + idx_t rowEnd = rowStart + kRowsPerBlock; + idx_t colStart = idx_t(blockIdx.y) * blockDim.x * kColLoad; // FIXME: if we have exact multiples, don't need this bool endRow = (blockIdx.x == gridDim.x - 1); @@ -149,16 +149,16 @@ } 
if (endCol) { - for (int col = colStart + threadIdx.x; col < input.getSize(0); + for (idx_t col = colStart + threadIdx.x; col < input.getSize(0); col += blockDim.x) { T val = input[col]; if (endRow) { - for (int row = rowStart; row < output.getSize(0); ++row) { + for (idx_t row = rowStart; row < output.getSize(0); ++row) { output[row][col] = val; } } else { - for (int row = rowStart; row < rowEnd; row += kRowUnroll) { + for (idx_t row = rowStart; row < rowEnd; row += kRowUnroll) { #pragma unroll for (int i = 0; i < kRowUnroll; ++i) { output[row + i][col] = val; @@ -167,7 +167,7 @@ } } } else { - int col = colStart + threadIdx.x; + idx_t col = colStart + threadIdx.x; T val[kColLoad]; @@ -177,14 +177,14 @@ } if (endRow) { - for (int row = rowStart; row < output.getSize(0); ++row) { + for (idx_t row = rowStart; row < output.getSize(0); ++row) { #pragma unroll for (int i = 0; i < kColLoad; ++i) { output[row][col + i * blockDim.x] = val[i]; } } } else { - for (int row = rowStart; row < rowEnd; row += kRowUnroll) { + for (idx_t row = rowStart; row < rowEnd; row += kRowUnroll) { #pragma unroll for (int i = 0; i < kRowUnroll; ++i) { #pragma unroll @@ -203,7 +203,7 @@ Tensor output) { __shared__ T sval; - int row = blockIdx.x; + idx_t row = blockIdx.x; if (threadIdx.x == 0) { sval = input[row]; @@ -214,7 +214,7 @@ T val = sval; // FIXME: speed up - for (int i = threadIdx.x; i < output.getSize(1); i += blockDim.x) { + for (idx_t i = threadIdx.x; i < output.getSize(1); i += blockDim.x) { T out = output[row][i]; out = Math::add(out, val); if (ZeroClamp) { @@ -244,16 +244,20 @@ auto inputV = input.template castResize(); auto outputV = output.template castResize(); - auto grid = dim3( - utils::divUp(outputV.getSize(0), kRowsPerBlock), - utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad)); + auto rowTiles = utils::divUp(outputV.getSize(0), kRowsPerBlock); + auto colTiles = + utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad); + FAISS_ASSERT(colTiles <= getMaxGridCurrentDevice().y); + auto grid = dim3(rowTiles, colTiles); sumAlongColumns <<>>(inputV, outputV); } else { - auto grid = dim3( - utils::divUp(output.getSize(0), kRowsPerBlock), - utils::divUp(output.getSize(1), threadsPerBlock * kColLoad)); + auto rowTiles = utils::divUp(output.getSize(0), kRowsPerBlock); + auto colTiles = + utils::divUp(output.getSize(1), threadsPerBlock * kColLoad); + FAISS_ASSERT(colTiles <= getMaxGridCurrentDevice().y); + auto grid = dim3(rowTiles, colTiles); sumAlongColumns <<>>(input, output); @@ -295,16 +299,20 @@ auto inputV = input.template castResize(); auto outputV = output.template castResize(); - auto grid = dim3( - utils::divUp(outputV.getSize(0), kRowsPerBlock), - utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad)); + auto rowTiles = utils::divUp(outputV.getSize(0), kRowsPerBlock); + auto colTiles = + utils::divUp(outputV.getSize(1), threadsPerBlock * kColLoad); + FAISS_ASSERT(colTiles <= getMaxGridCurrentDevice().y); + auto grid = dim3(rowTiles, colTiles); assignAlongColumns <<>>(inputV, outputV); } else { - auto grid = dim3( - utils::divUp(output.getSize(0), kRowsPerBlock), - utils::divUp(output.getSize(1), threadsPerBlock * kColLoad)); + auto rowTiles = utils::divUp(output.getSize(0), kRowsPerBlock); + auto colTiles = + utils::divUp(output.getSize(1), threadsPerBlock * kColLoad); + FAISS_ASSERT(colTiles <= getMaxGridCurrentDevice().y); + auto grid = dim3(rowTiles, colTiles); assignAlongColumns <<>>(input, output); @@ -335,8 +343,8 @@ cudaStream_t stream) { 
FAISS_ASSERT(input.getSize(0) == output.getSize(0)); - int threadsPerBlock = - std::min(output.getSize(1), getMaxThreadsCurrentDevice()); + idx_t threadsPerBlock = + std::min(output.getSize(1), (idx_t)getMaxThreadsCurrentDevice()); auto grid = dim3(output.getSize(0)); auto block = dim3(threadsPerBlock); diff -Nru faiss-1.7.3/faiss/gpu/impl/Distance.cu faiss-1.7.4/faiss/gpu/impl/Distance.cu --- faiss-1.7.3/faiss/gpu/impl/Distance.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/Distance.cu 2023-04-19 13:18:30.000000000 +0000 @@ -79,7 +79,7 @@ // Prepare norm vector ||q||^2; ||c||^2 is already pre-computed // DeviceTensor queryNorms( - res, makeTempAlloc(AllocType::Other, stream), {(int)numQueries}); + res, makeTempAlloc(AllocType::Other, stream), {numQueries}); // ||q||^2 if (computeL2) { @@ -128,7 +128,7 @@ bool queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool ignoreOutDistances) { // The # of centroids in `centroids` based on memory layout auto numCentroids = centroids.getSize(centroidsRowMajor ? 0 : 1); @@ -177,7 +177,7 @@ // Prepare norm vector ||q||^2; ||c||^2 is already pre-computed // DeviceTensor queryNorms( - res, makeTempAlloc(AllocType::Other, stream), {(int)numQueries}); + res, makeTempAlloc(AllocType::Other, stream), {numQueries}); // ||q||^2 if (computeL2) { @@ -186,8 +186,8 @@ // By default, aim to use up to 512 MB of memory for the processing, with // both number of queries and number of centroids being at least 512. - int tileRows = 0; - int tileCols = 0; + idx_t tileRows = 0; + idx_t tileCols = 0; chooseTileSize( numQueries, numCentroids, @@ -197,7 +197,7 @@ tileRows, tileCols); - int numColTiles = utils::divUp(numCentroids, tileCols); + idx_t numColTiles = utils::divUp(numCentroids, tileCols); // We can have any number of vectors to query against, even less than k, in // which case we'll return -1 for the index @@ -222,15 +222,15 @@ DeviceTensor* outDistanceBufs[2] = { &outDistanceBuf1, &outDistanceBuf2}; - DeviceTensor outIndexBuf1( + DeviceTensor outIndexBuf1( res, makeTempAlloc(AllocType::Other, stream), {tileRows, numColTiles * k}); - DeviceTensor outIndexBuf2( + DeviceTensor outIndexBuf2( res, makeTempAlloc(AllocType::Other, stream), {tileRows, numColTiles * k}); - DeviceTensor* outIndexBufs[2] = { + DeviceTensor* outIndexBufs[2] = { &outIndexBuf1, &outIndexBuf2}; auto streams = res->getAlternateStreamsCurrentDevice(); @@ -240,13 +240,13 @@ bool interrupt = false; // Tile over the input queries - for (int i = 0; i < numQueries; i += tileRows) { + for (idx_t i = 0; i < numQueries; i += tileRows) { if (interrupt || InterruptCallback::is_interrupted()) { interrupt = true; break; } - int curQuerySize = std::min(tileRows, numQueries - i); + idx_t curQuerySize = std::min(tileRows, numQueries - i); auto outDistanceView = outDistances.narrow(0, i, curQuerySize); auto outIndexView = outIndices.narrow(0, i, curQuerySize); @@ -261,14 +261,14 @@ outIndexBufs[curStream]->narrow(0, 0, curQuerySize); // Tile over the centroids - for (int j = 0; j < numCentroids; j += tileCols) { + for (idx_t j = 0; j < numCentroids; j += tileCols) { if (InterruptCallback::is_interrupted()) { interrupt = true; break; } - int curCentroidSize = std::min(tileCols, numCentroids - j); - int curColTile = j / tileCols; + auto curCentroidSize = std::min(tileCols, numCentroids - j); + auto curColTile = j / tileCols; auto centroidsView = sliceCentroids( centroids, centroidsRowMajor, j, curCentroidSize); @@ -415,7 +415,7 @@ bool 
queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool ignoreOutDistances = false) { runDistance( true, // L2 @@ -442,7 +442,7 @@ bool queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { runDistance( false, // IP res, @@ -554,7 +554,7 @@ bool queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool ignoreOutDistances) { runL2Distance( res, @@ -580,7 +580,7 @@ bool queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool ignoreOutDistances) { runL2Distance( res, @@ -605,7 +605,7 @@ bool queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { runIPDistance( res, stream, @@ -627,7 +627,7 @@ bool queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { runIPDistance( res, stream, diff -Nru faiss-1.7.3/faiss/gpu/impl/Distance.cuh faiss-1.7.4/faiss/gpu/impl/Distance.cuh --- faiss-1.7.3/faiss/gpu/impl/Distance.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/Distance.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -73,7 +73,7 @@ bool queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, // Do we care about `outDistances`? If not, we can // take shortcuts. bool ignoreOutDistances = false); @@ -88,7 +88,7 @@ bool queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool ignoreOutDistances = false); /// Calculates brute-force inner product distance between `vectors` @@ -102,7 +102,7 @@ bool queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices); + Tensor& outIndices); void runIPDistance( GpuResources* resources, @@ -113,7 +113,7 @@ bool queriesRowMajor, int k, Tensor& outDistances, - Tensor& outIndices); + Tensor& outIndices); // // General distance implementation, assumes that all arguments are on the @@ -228,6 +228,13 @@ outDistances, JensenShannonDistance(), stream); + } else if (metric == faiss::MetricType::METRIC_Jaccard) { + runGeneralDistanceKernel( + tVectorsDimInnermost, + tQueriesDimInnermost, + outDistances, + JaccardSimilarity(), + stream); } else { FAISS_THROW_FMT("unimplemented metric type %d", metric); } @@ -253,7 +260,7 @@ faiss::MetricType metric, float metricArg, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool ignoreOutDistances) { DeviceScope ds(device); // We are guaranteed that all data arguments are resident on our preferred @@ -375,6 +382,16 @@ JensenShannonDistance(), outDistances, outIndices); + } else if (metric == faiss::MetricType::METRIC_Jaccard) { + runGeneralDistance( + resources, + stream, + tVectorsDimInnermost, + tQueriesDimInnermost, + k, + JaccardSimilarity(), + outDistances, + outIndices); } else { FAISS_THROW_FMT("unimplemented metric type %d", metric); } diff -Nru faiss-1.7.3/faiss/gpu/impl/DistanceUtils.cuh faiss-1.7.4/faiss/gpu/impl/DistanceUtils.cuh --- faiss-1.7.3/faiss/gpu/impl/DistanceUtils.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/DistanceUtils.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -251,12 +251,41 @@ float dist; }; +struct JaccardSimilarity { + __host__ __device__ JaccardSimilarity() : numerator(0), denominator(0) {} + + static constexpr bool kDirection = true; // maximize + static constexpr float kIdentityData = 0; + static constexpr float kMaxDistance = -std::numeric_limits::max(); + + __host__ __device__ void handle(float a, float b) { + numerator += fmin(a, b); + 
denominator += fmax(a, b); + } + + __host__ __device__ float reduce() { + return numerator / denominator; + } + + __host__ __device__ void combine(const JaccardSimilarity& v) { + numerator += v.numerator; + denominator += v.denominator; + } + + __host__ __device__ JaccardSimilarity zero() const { + return JaccardSimilarity(); + } + + float numerator; + float denominator; +}; + template Tensor sliceCentroids( Tensor& centroids, bool centroidsRowMajor, - int startCentroid, - int num) { + idx_t startCentroid, + idx_t num) { // Row major is (num, dim) // Col major is (dim, num) if (startCentroid == 0 && @@ -272,9 +301,11 @@ __global__ void incrementIndex( Tensor indices, int k, - int increment) { - for (int i = threadIdx.x; i < k; i += blockDim.x) { - indices[blockIdx.y][blockIdx.x * k + i] += blockIdx.x * increment; + idx_t increment) { + for (idx_t i = blockIdx.y; i < indices.getSize(0); i += gridDim.y) { + for (int j = threadIdx.x; j < k; j += blockDim.x) { + indices[i][idx_t(blockIdx.x) * k + j] += blockIdx.x * increment; + } } } @@ -284,27 +315,28 @@ void runIncrementIndex( Tensor& indices, int k, - int increment, + idx_t increment, cudaStream_t stream) { - dim3 grid(indices.getSize(1) / k, indices.getSize(0)); - int block = std::min(k, 512); + // Input should be an even divisor of k + FAISS_ASSERT(indices.getSize(1) % k == 0); - // should be exact - FAISS_ASSERT(grid.x * k == indices.getSize(1)); + dim3 grid(indices.getSize(1) / k, indices.getSize(0)); + auto block = std::min(k, getMaxThreadsCurrentDevice()); incrementIndex<<>>(indices, k, increment); + CUDA_TEST_ERROR(); } // If the inner size (dim) of the vectors is small, we want a larger query tile // size, like 1024 inline void chooseTileSize( - int numQueries, - int numCentroids, + idx_t numQueries, + idx_t numCentroids, int dim, - int elementSize, + idx_t elementSize, size_t tempMemAvailable, - int& tileRows, - int& tileCols) { + idx_t& tileRows, + idx_t& tileCols) { // The matrix multiplication should be large enough to be efficient, but if // it is too large, we seem to lose efficiency as opposed to // double-streaming. Each tile size here defines 1/2 of the memory use due @@ -314,7 +346,7 @@ // prefer 768 MB of usage. Otherwise, prefer 1 GB of usage. auto totalMem = getCurrentDeviceProperties().totalGlobalMem; - int targetUsage = 0; + idx_t targetUsage = 0; if (totalMem <= ((size_t)4) * 1024 * 1024 * 1024) { targetUsage = 512 * 1024 * 1024; @@ -330,7 +362,7 @@ // If we are on float16, increase to 512. // If the k size (vec dim) of the matrix multiplication is small (<= 32), // increase to 1024. 
- int preferredTileRows = 512; + idx_t preferredTileRows = 512; if (dim <= 32) { preferredTileRows = 1024; } diff -Nru faiss-1.7.3/faiss/gpu/impl/FlatIndex.cu faiss-1.7.4/faiss/gpu/impl/FlatIndex.cu --- faiss-1.7.3/faiss/gpu/impl/FlatIndex.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/FlatIndex.cu 2023-04-19 13:18:30.000000000 +0000 @@ -47,7 +47,7 @@ } /// Returns the number of vectors we contain -int FlatIndex::getSize() const { +idx_t FlatIndex::getSize() const { if (useFloat16_) { return vectorsHalf_.getSize(0); } else { @@ -99,7 +99,7 @@ faiss::MetricType metric, float metricArg, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool exactDistance) { auto stream = resources_->getDefaultStreamCurrentDevice(); @@ -140,7 +140,7 @@ faiss::MetricType metric, float metricArg, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool exactDistance) { FAISS_ASSERT(useFloat16_); @@ -163,7 +163,7 @@ void FlatIndex::computeResidual( Tensor& vecs, - Tensor& ids, + Tensor& ids, Tensor& residuals) { if (useFloat16_) { runCalcResidual( @@ -183,8 +183,8 @@ } void FlatIndex::reconstruct( - Index::idx_t start, - Index::idx_t num, + idx_t start, + idx_t num, Tensor& vecs) { auto stream = resources_->getDefaultStreamCurrentDevice(); @@ -199,7 +199,7 @@ } void FlatIndex::reconstruct( - Tensor& ids, + Tensor& ids, Tensor& vecs) { auto stream = resources_->getDefaultStreamCurrentDevice(); @@ -213,7 +213,7 @@ } } -void FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { +void FlatIndex::add(const float* data, idx_t numVecs, cudaStream_t stream) { if (numVecs == 0) { return; } @@ -250,11 +250,11 @@ if (useFloat16_) { DeviceTensor vectors16( - (half*)rawData16_.data(), {(int)num_, dim_}); + (half*)rawData16_.data(), {num_, dim_}); vectorsHalf_ = std::move(vectors16); } else { DeviceTensor vectors32( - (float*)rawData32_.data(), {(int)num_, dim_}); + (float*)rawData32_.data(), {num_, dim_}); vectors_ = std::move(vectors32); } @@ -263,14 +263,14 @@ DeviceTensor norms( resources_, makeSpaceAlloc(AllocType::FlatData, space_, stream), - {(int)num_}); + {num_}); runL2Norm(vectorsHalf_, true, norms, true, stream); norms_ = std::move(norms); } else { DeviceTensor norms( resources_, makeSpaceAlloc(AllocType::FlatData, space_, stream), - {(int)num_}); + {num_}); runL2Norm(vectors_, true, norms, true, stream); norms_ = std::move(norms); } diff -Nru faiss-1.7.3/faiss/gpu/impl/FlatIndex.cuh faiss-1.7.4/faiss/gpu/impl/FlatIndex.cuh --- faiss-1.7.3/faiss/gpu/impl/FlatIndex.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/FlatIndex.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -30,7 +30,7 @@ bool getUseFloat16() const; /// Returns the number of vectors we contain - int getSize() const; + idx_t getSize() const; /// Returns the dimensionality of the vectors int getDim() const; @@ -50,7 +50,7 @@ faiss::MetricType metric, float metricArg, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool exactDistance); void query( @@ -59,29 +59,24 @@ faiss::MetricType metric, float metricArg, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool exactDistance); /// Compute residual for set of vectors void computeResidual( Tensor& vecs, - Tensor& ids, + Tensor& ids, Tensor& residuals); /// Gather vectors given the set of IDs - void reconstruct( - Tensor& ids, - Tensor& vecs); + void reconstruct(Tensor& ids, Tensor& vecs); /// Gather vectors given a range of IDs - void reconstruct( - Index::idx_t start, - Index::idx_t num, 
- Tensor& vecs); + void reconstruct(idx_t start, idx_t num, Tensor& vecs); /// Add vectors to ourselves; the pointer passed can be on the host /// or the device - void add(const float* data, int numVecs, cudaStream_t stream); + void add(const float* data, idx_t numVecs, cudaStream_t stream); /// Free all storage void reset(); @@ -100,7 +95,7 @@ MemorySpace space_; /// How many vectors we have - int num_; + idx_t num_; /// The underlying expandable storage for float32 data DeviceVector rawData32_; diff -Nru faiss-1.7.3/faiss/gpu/impl/GeneralDistance.cuh faiss-1.7.4/faiss/gpu/impl/GeneralDistance.cuh --- faiss-1.7.3/faiss/gpu/impl/GeneralDistance.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/GeneralDistance.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -96,12 +96,12 @@ // block y -> query // block x -> vector - int queryBlock = blockIdx.y * kWarpSize; - int queryThread = queryBlock + threadIdx.y; + idx_t queryBlock = idx_t(blockIdx.y) * kWarpSize; + idx_t queryThread = queryBlock + threadIdx.y; - int vecBlock = blockIdx.x * kWarpSize; - int vecThreadLoad = vecBlock + threadIdx.y; - int vecThreadSave = vecBlock + threadIdx.x; + idx_t vecBlock = idx_t(blockIdx.x) * kWarpSize; + idx_t vecThreadLoad = vecBlock + threadIdx.y; + idx_t vecThreadSave = vecBlock + threadIdx.x; DistanceOp acc = op.zero(); @@ -115,10 +115,10 @@ // // Interior tile // - int limit = + idx_t limit = utils::roundDown(query.getSize(1), kWarpSize * kDimMultiple); - for (int k = threadIdx.x; k < limit; k += kWarpSize * kDimMultiple) { + for (idx_t k = threadIdx.x; k < limit; k += kWarpSize * kDimMultiple) { // Load query tile #pragma unroll for (int i = 0; i < kDimMultiple; ++i) { @@ -141,7 +141,7 @@ if (limit < query.getSize(1)) { #pragma unroll for (int i = 0; i < kDimMultiple; ++i) { - int k = limit + threadIdx.x + i * kWarpSize; + idx_t k = limit + threadIdx.x + i * kWarpSize; bool kInBounds = k < query.getSize(1); queryTileBase[threadIdx.x + i * kWarpSize] = @@ -153,11 +153,11 @@ __syncthreads(); - int remainder = query.getSize(1) - limit; + idx_t remainder = query.getSize(1) - limit; // thread (y, x) does (query y, vec x) #pragma unroll - for (int i = 0; i < remainder; ++i) { + for (idx_t i = 0; i < remainder; ++i) { acc.handle( ConvertTo::to(queryTileBase[i]), ConvertTo::to(vecTile[threadIdx.x][i])); @@ -174,9 +174,9 @@ bool queryThreadInBounds = queryThread < query.getSize(0); bool vecThreadInBoundsLoad = vecThreadLoad < vec.getSize(0); bool vecThreadInBoundsSave = vecThreadSave < vec.getSize(0); - int limit = utils::roundDown(query.getSize(1), kWarpSize); + idx_t limit = utils::roundDown(query.getSize(1), kWarpSize); - for (int k = threadIdx.x; k < limit; k += kWarpSize) { + for (idx_t k = threadIdx.x; k < limit; k += kWarpSize) { // Load query tile queryTileBase[threadIdx.x] = queryThreadInBounds ? 
queryBase[k] : ConvertTo::to(0); @@ -199,7 +199,7 @@ // Handle remainder if (limit < query.getSize(1)) { - int k = limit + threadIdx.x; + idx_t k = limit + threadIdx.x; bool kInBounds = k < query.getSize(1); // Load query tile @@ -213,7 +213,7 @@ __syncthreads(); - int remainder = query.getSize(1) - limit; + idx_t remainder = query.getSize(1) - limit; // thread (y, x) does (query y, vec x) for (int i = 0; i < remainder; ++i) { @@ -244,6 +244,7 @@ dim3 grid( utils::divUp(vecs.getSize(0), kWarpSize), utils::divUp(query.getSize(0), kWarpSize)); + FAISS_ASSERT(grid.y <= getMaxGridCurrentDevice().y); dim3 block(kWarpSize, kWarpSize); generalDistance<<>>(query, vecs, op, out); @@ -258,7 +259,7 @@ int k, const DistanceOp& op, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { // The # of centroids in `centroids` based on memory layout auto numCentroids = centroids.getSize(0); @@ -295,8 +296,8 @@ // By default, aim to use up to 512 MB of memory for the processing, with // both number of queries and number of centroids being at least 512. - int tileRows = 0; - int tileCols = 0; + idx_t tileRows = 0; + idx_t tileCols = 0; chooseTileSize( numQueries, numCentroids, @@ -306,7 +307,7 @@ tileRows, tileCols); - int numColTiles = utils::divUp(numCentroids, tileCols); + auto numColTiles = utils::divUp(numCentroids, tileCols); // We can have any number of vectors to query against, even less than k, in // which case we'll return -1 for the index @@ -331,15 +332,15 @@ DeviceTensor* outDistanceBufs[2] = { &outDistanceBuf1, &outDistanceBuf2}; - DeviceTensor outIndexBuf1( + DeviceTensor outIndexBuf1( res, makeTempAlloc(AllocType::Other, stream), {tileRows, numColTiles * k}); - DeviceTensor outIndexBuf2( + DeviceTensor outIndexBuf2( res, makeTempAlloc(AllocType::Other, stream), {tileRows, numColTiles * k}); - DeviceTensor* outIndexBufs[2] = { + DeviceTensor* outIndexBufs[2] = { &outIndexBuf1, &outIndexBuf2}; auto streams = res->getAlternateStreamsCurrentDevice(); @@ -349,13 +350,13 @@ bool interrupt = false; // Tile over the input queries - for (int i = 0; i < numQueries; i += tileRows) { + for (idx_t i = 0; i < numQueries; i += tileRows) { if (interrupt || InterruptCallback::is_interrupted()) { interrupt = true; break; } - int curQuerySize = std::min(tileRows, numQueries - i); + auto curQuerySize = std::min(tileRows, numQueries - i); auto outDistanceView = outDistances.narrow(0, i, curQuerySize); auto outIndexView = outIndices.narrow(0, i, curQuerySize); @@ -368,14 +369,14 @@ outIndexBufs[curStream]->narrow(0, 0, curQuerySize); // Tile over the centroids - for (int j = 0; j < numCentroids; j += tileCols) { + for (idx_t j = 0; j < numCentroids; j += tileCols) { if (InterruptCallback::is_interrupted()) { interrupt = true; break; } - int curCentroidSize = std::min(tileCols, numCentroids - j); - int curColTile = j / tileCols; + auto curCentroidSize = std::min(tileCols, numCentroids - j); + auto curColTile = j / tileCols; auto centroidsView = sliceCentroids(centroids, true, j, curCentroidSize); diff -Nru faiss-1.7.3/faiss/gpu/impl/GpuScalarQuantizer.cuh faiss-1.7.4/faiss/gpu/impl/GpuScalarQuantizer.cuh --- faiss-1.7.3/faiss/gpu/impl/GpuScalarQuantizer.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/GpuScalarQuantizer.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -40,9 +40,9 @@ gpuTrained(DeviceTensor( res, makeDevAlloc(AllocType::Quantizer, 0), - {(int)sq.trained.size()})) { + {(idx_t)sq.trained.size()})) { HostTensor cpuTrained( - (float*)sq.trained.data(), 
{(int)sq.trained.size()}); + (float*)sq.trained.data(), {(idx_t)sq.trained.size()}); auto stream = res->getDefaultStreamCurrentDevice(); gpuTrained.copyFrom(cpuTrained, stream); @@ -81,21 +81,24 @@ } inline __device__ void initKernel(float* smem, int dim) {} - inline __device__ void decode(void* data, int vec, int d, float* out) + inline __device__ void decode(void* data, idx_t vec, int d, float* out) const { float* p = (float*)&((uint8_t*)data)[vec * bytesPerVec]; out[0] = p[d]; } - inline __device__ float decodePartial(void* data, int vec, int d, int subD) - const { + inline __device__ float decodePartial( + void* data, + idx_t vec, + int d, + int subD) const { // doesn't need implementing (kDimPerIter == 1) return 0.0f; } inline __device__ void encode( void* data, - int vec, + idx_t vec, int d, float v[kDimPerIter]) const { float* p = (float*)&((uint8_t*)data)[vec * bytesPerVec]; @@ -104,7 +107,7 @@ inline __device__ void encodePartial( void* data, - int vec, + idx_t vec, int d, int remaining, float v[kDimPerIter]) const { @@ -148,21 +151,24 @@ } inline __device__ void initKernel(float* smem, int dim) {} - inline __device__ void decode(void* data, int vec, int d, float* out) + inline __device__ void decode(void* data, idx_t vec, int d, float* out) const { half* p = (half*)&((uint8_t*)data)[vec * bytesPerVec]; out[0] = Convert()(p[d]); } - inline __device__ float decodePartial(void* data, int vec, int d, int subD) - const { + inline __device__ float decodePartial( + void* data, + idx_t vec, + int d, + int subD) const { // doesn't need implementing (kDimPerIter == 1) return 0.0f; } inline __device__ void encode( void* data, - int vec, + idx_t vec, int d, float v[kDimPerIter]) const { half* p = (half*)&((uint8_t*)data)[vec * bytesPerVec]; @@ -171,7 +177,7 @@ inline __device__ void encodePartial( void* data, - int vec, + idx_t vec, int d, int remaining, float v[kDimPerIter]) const { @@ -247,7 +253,7 @@ return vmin + (float)v * vdiff; } - inline __device__ void decode(void* data, int vec, int d, float* out) + inline __device__ void decode(void* data, idx_t vec, int d, float* out) const { MemT* p = (MemT*)&((uint8_t*)data)[vec * bytesPerVec]; MemT pv = p[d]; @@ -270,8 +276,11 @@ } } - inline __device__ float decodePartial(void* data, int vec, int d, int subD) - const { + inline __device__ float decodePartial( + void* data, + idx_t vec, + int d, + int subD) const { if (DimMultiple > 1) { // should not be called assert(false); @@ -289,7 +298,7 @@ inline __device__ void encode( void* data, - int vec, + idx_t vec, int d, float v[kDimPerIter]) const { MemT* p = (MemT*)&((uint8_t*)data)[vec * bytesPerVec]; @@ -311,7 +320,7 @@ inline __device__ void encodePartial( void* data, - int vec, + idx_t vec, int d, int remaining, float v[kDimPerIter]) const { @@ -383,7 +392,7 @@ return smemVmin[realDim] + (float)v * smemVdiff[realDim]; } - inline __device__ void decode(void* data, int vec, int d, float* out) + inline __device__ void decode(void* data, idx_t vec, int d, float* out) const { MemT* p = (MemT*)&((uint8_t*)data)[vec * bytesPerVec]; MemT pv = p[d]; @@ -407,8 +416,11 @@ } } - inline __device__ float decodePartial(void* data, int vec, int d, int subD) - const { + inline __device__ float decodePartial( + void* data, + idx_t vec, + int d, + int subD) const { if (DimMultiple > 1) { // should not be called assert(false); @@ -426,7 +438,7 @@ inline __device__ void encode( void* data, - int vec, + idx_t vec, int d, float v[kDimPerIter]) const { MemT* p = (MemT*)&((uint8_t*)data)[vec * bytesPerVec]; @@ 
-449,7 +461,7 @@ inline __device__ void encodePartial( void* data, - int vec, + idx_t vec, int d, int remaining, float v[kDimPerIter]) const { @@ -499,21 +511,24 @@ } inline __device__ void initKernel(float* smem, int dim) {} - inline __device__ void decode(void* data, int vec, int d, float* out) + inline __device__ void decode(void* data, idx_t vec, int d, float* out) const { uint8_t* p = &((uint8_t*)data)[vec * bytesPerVec]; out[0] = (float)p[d]; } - inline __device__ float decodePartial(void* data, int vec, int d, int subD) - const { + inline __device__ float decodePartial( + void* data, + idx_t vec, + int d, + int subD) const { // doesn't need implementing (kDimPerIter == 1) return 0.0f; } inline __device__ void encode( void* data, - int vec, + idx_t vec, int d, float v[kDimPerIter]) const { uint8_t* p = &((uint8_t*)data)[vec * bytesPerVec]; @@ -522,7 +537,7 @@ inline __device__ void encodePartial( void* data, - int vec, + idx_t vec, int d, int remaining, float v[kDimPerIter]) const { @@ -651,7 +666,7 @@ return vmin + (float)v * vdiff; } - inline __device__ void decode(void* data, int vec, int d, float* out) + inline __device__ void decode(void* data, idx_t vec, int d, float* out) const { uint8_t* p = &((uint8_t*)data)[vec * bytesPerVec]; uint8_t pv = p[d]; @@ -662,7 +677,7 @@ inline __device__ float decodePartial( void* data, - int vec, + idx_t vec, int d, int subD /* unused */) const { // We can only be called for a single input @@ -680,7 +695,7 @@ inline __device__ void encode( void* data, - int vec, + idx_t vec, int d, float v[kDimPerIter]) const { uint8_t* p = &((uint8_t*)data)[vec * bytesPerVec]; @@ -689,7 +704,7 @@ inline __device__ void encodePartial( void* data, - int vec, + idx_t vec, int d, int remaining, /* unused */ float v[kDimPerIter]) const { @@ -755,7 +770,7 @@ return smemVmin[realDim] + (float)v * smemVdiff[realDim]; } - inline __device__ void decode(void* data, int vec, int d, float* out) + inline __device__ void decode(void* data, idx_t vec, int d, float* out) const { uint8_t* p = &((uint8_t*)data)[vec * bytesPerVec]; uint8_t pv = p[d]; @@ -767,7 +782,7 @@ inline __device__ float decodePartial( void* data, - int vec, + idx_t vec, int d, int subD /* unused */) const { // We can only be called for a single input @@ -786,7 +801,7 @@ inline __device__ void encode( void* data, - int vec, + idx_t vec, int d, float v[kDimPerIter]) const { uint8_t* p = &((uint8_t*)data)[vec * bytesPerVec]; @@ -797,7 +812,7 @@ inline __device__ void encodePartial( void* data, - int vec, + idx_t vec, int d, int remaining, /* unused */ float v[kDimPerIter]) const { diff -Nru faiss-1.7.3/faiss/gpu/impl/IndexUtils.cu faiss-1.7.4/faiss/gpu/impl/IndexUtils.cu --- faiss-1.7.3/faiss/gpu/impl/IndexUtils.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IndexUtils.cu 2023-04-19 13:18:30.000000000 +0000 @@ -22,29 +22,21 @@ return GPU_MAX_SELECTION_K; } -void validateKSelect(Index::idx_t k) { +void validateKSelect(int k) { FAISS_THROW_IF_NOT_FMT( - k > 0 && k < (Index::idx_t)getMaxKSelection(), - "GPU index only supports min/max-K selection up to %d (requested %zu)", + k > 0 && k <= getMaxKSelection(), + "GPU index only supports min/max-K selection up to %d (requested %d)", getMaxKSelection(), k); } -void validateNProbe(Index::idx_t nprobe) { +void validateNProbe(size_t nprobe) { FAISS_THROW_IF_NOT_FMT( - nprobe > 0 && nprobe < (Index::idx_t)getMaxKSelection(), + nprobe > 0 && nprobe <= (size_t)getMaxKSelection(), "GPU IVF index only supports nprobe selection up to %d (requested 
%zu)", getMaxKSelection(), nprobe); } -void validateNumVectors(Index::idx_t n) { - FAISS_THROW_IF_NOT_FMT( - n <= (Index::idx_t)std::numeric_limits::max(), - "GPU index only supports up to %d indices (requested %zu)", - std::numeric_limits::max(), - n); -} - } // namespace gpu } // namespace faiss diff -Nru faiss-1.7.3/faiss/gpu/impl/IndexUtils.h faiss-1.7.4/faiss/gpu/impl/IndexUtils.h --- faiss-1.7.3/faiss/gpu/impl/IndexUtils.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IndexUtils.h 2023-04-19 13:18:30.000000000 +0000 @@ -20,13 +20,10 @@ int getMaxKSelection(); // Validate the k parameter for search -void validateKSelect(Index::idx_t k); +void validateKSelect(int k); // Validate the nprobe parameter for search -void validateNProbe(Index::idx_t nprobe); - -/// Validate the n (number of vectors) parameter for add, search, reconstruct -void validateNumVectors(Index::idx_t n); +void validateNProbe(size_t nprobe); } // namespace gpu } // namespace faiss diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFAppend.cu faiss-1.7.4/faiss/gpu/impl/IVFAppend.cu --- faiss-1.7.3/faiss/gpu/impl/IVFAppend.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFAppend.cu 2023-04-19 13:18:30.000000000 +0000 @@ -27,17 +27,17 @@ // Updates the device-size array of list start pointers for codes and indices __global__ void runUpdateListPointers( - Tensor listIds, - Tensor newListLength, + Tensor listIds, + Tensor newListLength, Tensor newCodePointers, Tensor newIndexPointers, - int* listLengths, + idx_t* listLengths, void** listCodes, void** listIndices) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + idx_t i = idx_t(blockIdx.x) * blockDim.x + threadIdx.x; if (i < listIds.getSize(0)) { - Index::idx_t listId = listIds[i]; + idx_t listId = listIds[i]; listLengths[listId] = newListLength[i]; listCodes[listId] = newCodePointers[i]; listIndices[listId] = newIndexPointers[i]; @@ -45,16 +45,17 @@ } void runUpdateListPointers( - Tensor& listIds, - Tensor& newListLength, + Tensor& listIds, + Tensor& newListLength, Tensor& newCodePointers, Tensor& newIndexPointers, - DeviceVector& listLengths, + DeviceVector& listLengths, DeviceVector& listCodes, DeviceVector& listIndices, cudaStream_t stream) { - int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice()); - int numBlocks = utils::divUp(listIds.getSize(0), numThreads); + auto numThreads = + std::min(listIds.getSize(0), (idx_t)getMaxThreadsCurrentDevice()); + auto numBlocks = utils::divUp(listIds.getSize(0), numThreads); dim3 grid(numBlocks); dim3 block(numThreads); @@ -73,39 +74,39 @@ // Appends new indices for vectors being added to the IVF indices lists __global__ void ivfIndicesAppend( - Tensor listIds, - Tensor listOffset, - Tensor indices, + Tensor listIds, + Tensor listOffset, + Tensor indices, IndicesOptions opt, void** listIndices) { - int vec = blockIdx.x * blockDim.x + threadIdx.x; + idx_t vec = idx_t(blockIdx.x) * blockDim.x + threadIdx.x; if (vec >= listIds.getSize(0)) { return; } - Index::idx_t listId = listIds[vec]; - int offset = listOffset[vec]; + idx_t listId = listIds[vec]; + idx_t offset = listOffset[vec]; // Add vector could be invalid (contains NaNs etc) if (listId == -1 || offset == -1) { return; } - Index::idx_t index = indices[vec]; + idx_t index = indices[vec]; if (opt == INDICES_32_BIT) { // FIXME: there could be overflow here, but where should we check this? 
((int*)listIndices[listId])[offset] = (int)index; } else if (opt == INDICES_64_BIT) { - ((Index::idx_t*)listIndices[listId])[offset] = index; + ((idx_t*)listIndices[listId])[offset] = index; } } void runIVFIndicesAppend( - Tensor& listIds, - Tensor& listOffset, - Tensor& indices, + Tensor& listIds, + Tensor& listOffset, + Tensor& indices, IndicesOptions opt, DeviceVector& listIndices, cudaStream_t stream) { @@ -114,9 +115,9 @@ opt == INDICES_64_BIT); if (opt != INDICES_CPU && opt != INDICES_IVF) { - int num = listIds.getSize(0); - int threads = std::min(num, getMaxThreadsCurrentDevice()); - int blocks = utils::divUp(num, threads); + auto num = listIds.getSize(0); + auto threads = std::min(num, (idx_t)getMaxThreadsCurrentDevice()); + auto blocks = utils::divUp(num, threads); ivfIndicesAppend<<>>( listIds, listOffset, indices, opt, listIndices.data()); @@ -131,15 +132,15 @@ template __global__ void ivfFlatAppend( - Tensor listIds, - Tensor listOffset, + Tensor listIds, + Tensor listOffset, Tensor vecs, void** listData, Codec codec) { - int vec = blockIdx.x; + idx_t vec = blockIdx.x; - Index::idx_t listId = listIds[vec]; - int offset = listOffset[vec]; + idx_t listId = listIds[vec]; + idx_t offset = listOffset[vec]; // Add vector could be invalid (contains NaNs etc) if (listId == -1 || offset == -1) { @@ -147,9 +148,10 @@ } // Handle whole encoding (only thread 0 will handle the remainder) - int limit = utils::divDown(vecs.getSize(1), Codec::kDimPerIter); + // FIXME: dimension < max int? + idx_t limit = utils::divDown(vecs.getSize(1), Codec::kDimPerIter); - int i; + idx_t i; for (i = threadIdx.x; i < limit; i += blockDim.x) { int realDim = i * Codec::kDimPerIter; float toEncode[Codec::kDimPerIter]; @@ -188,14 +190,14 @@ } void runIVFFlatAppend( - Tensor& listIds, - Tensor& listOffset, + Tensor& listIds, + Tensor& listOffset, Tensor& vecs, GpuScalarQuantizer* scalarQ, DeviceVector& listData, cudaStream_t stream) { - int dim = vecs.getSize(1); - int maxThreads = getMaxThreadsCurrentDevice(); + auto dim = vecs.getSize(1); + idx_t maxThreads = getMaxThreadsCurrentDevice(); // Each block will handle appending a single vector #define RUN_APPEND \ @@ -261,18 +263,18 @@ } __global__ void ivfpqAppend( - Tensor listIds, - Tensor listOffset, + Tensor listIds, + Tensor listOffset, Tensor encodings, void** listCodes) { - int encodingToAdd = blockIdx.x * blockDim.x + threadIdx.x; + idx_t encodingToAdd = idx_t(blockIdx.x) * blockDim.x + threadIdx.x; if (encodingToAdd >= listIds.getSize(0)) { return; } - Index::idx_t listId = listIds[encodingToAdd]; - int vectorNumInList = listOffset[encodingToAdd]; + idx_t listId = listIds[encodingToAdd]; + idx_t vectorNumInList = listOffset[encodingToAdd]; // Add vector could be invalid (contains NaNs etc) if (listId == -1 || vectorNumInList == -1) { @@ -286,19 +288,20 @@ vectorNumInList * encodings.getSize(1); // FIXME: stride with threads instead of single thread - for (int i = 0; i < encodings.getSize(1); ++i) { + for (idx_t i = 0; i < encodings.getSize(1); ++i) { codeStart[i] = encoding[i]; } } void runIVFPQAppend( - Tensor& listIds, - Tensor& listOffset, + Tensor& listIds, + Tensor& listOffset, Tensor& encodings, DeviceVector& listCodes, cudaStream_t stream) { - int threads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice()); - int blocks = utils::divUp(listIds.getSize(0), threads); + idx_t threads = + std::min(listIds.getSize(0), (idx_t)getMaxThreadsCurrentDevice()); + idx_t blocks = utils::divUp(listIds.getSize(0), threads); ivfpqAppend<<>>( listIds, 
listOffset, encodings, listCodes.data()); @@ -318,9 +321,9 @@ Tensor vecs, Tensor encodedVecs, Codec codec) { - int vec = blockIdx.x; + idx_t vec = blockIdx.x; - for (int d = threadIdx.x; d < vecs.getSize(1); d += blockDim.x) { + for (idx_t d = threadIdx.x; d < vecs.getSize(1); d += blockDim.x) { encodedVecs[vec][d] = codec.encodeNew(d, vecs[vec][d]); } } @@ -331,8 +334,9 @@ Tensor& encodedVecs, Codec codec, cudaStream_t stream) { - int threads = std::min(vecs.getSize(1), getMaxThreadsCurrentDevice()); - int blocks = vecs.getSize(0); + idx_t threads = + std::min(vecs.getSize(1), (idx_t)getMaxThreadsCurrentDevice()); + idx_t blocks = vecs.getSize(0); sqEncode<<>>(vecs, encodedVecs, codec); } @@ -344,21 +348,21 @@ __global__ void ivfInterleavedAppend( // the IDs (offset in listData) of the unique lists // being added to - Tensor uniqueLists, + Tensor uniqueLists, // For each of the list IDs in uniqueLists, the start // offset in vectorsByUniqueList for the vectors that // we are adding to that list - Tensor uniqueListVectorStart, + Tensor uniqueListVectorStart, // IDs in vecs of the vectors being added to each // unique list // The vectors (offset in vecs) added to // uniqueLists[i] is: // {vBUL[uLVS[i]], ..., vBUL[uLVS[i+1] - 1]} - Tensor vectorsByUniqueList, + Tensor vectorsByUniqueList, // For each of the list IDs in uniqueLists, the start // offset (by vector) within that list where we begin // appending - Tensor uniqueListStartOffset, + Tensor uniqueListStartOffset, // The EncodeT-sized encoded vectors Tensor encodedVecs, // The set of addresses for each of the lists @@ -369,23 +373,23 @@ int warpsPerBlock = blockDim.x / kWarpSize; // Each block is dedicated to a separate list - Index::idx_t listId = uniqueLists[blockIdx.x]; + idx_t listId = uniqueLists[blockIdx.x]; // The vecs we add to the list are at indices [vBUL[vecIdStart], // vBUL[vecIdEnd]) - int vecIdStart = uniqueListVectorStart[blockIdx.x]; + idx_t vecIdStart = uniqueListVectorStart[blockIdx.x]; // uLVS is explicitly terminated for us with one more than the number of // blocks that we have - int vecIdEnd = uniqueListVectorStart[blockIdx.x + 1]; + idx_t vecIdEnd = uniqueListVectorStart[blockIdx.x + 1]; // How many vectors we are adding to this list - int numVecsAdding = vecIdEnd - vecIdStart; + auto numVecsAdding = vecIdEnd - vecIdStart; // The first vector we are updating within the list auto listVecStart = uniqueListStartOffset[blockIdx.x]; // These are the actual vec IDs that we are adding (in vecs) - int* listVecIds = vectorsByUniqueList[vecIdStart].data(); + auto listVecIds = vectorsByUniqueList[vecIdStart].data(); // All data is written by groups of 32 vectors (to mirror the warp). // listVecStart could be in the middle of this, or even, for sub-byte @@ -396,18 +400,18 @@ // 32, but we ensure that it only operates on the group of 32 vectors. In // order to do this we need to actually start updating vectors at the next // lower multiple of 32 from listVecStart. 
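// [Editor's note, not part of the patch] Worked example of the alignment
// logic below, assuming listVecStart = 70 for illustration:
//     alignedListVecStart = utils::roundDown(70, 32) = 64
//     warp 0 covers vectors [64, 96), warp 1 covers [96, 128), ...
// Lanes whose vector id falls in [64, 70) compute laneVecAdding < 0 and are
// rejected by the `valid` test further down, so each warp still operates on a
// whole group of 32 vectors but only the newly appended ones are encoded.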
- int alignedListVecStart = utils::roundDown(listVecStart, 32); + auto alignedListVecStart = utils::roundDown(listVecStart, 32); // Each block of 32 vectors fully encodes into this many bytes constexpr int bytesPerVectorBlockDim = EncodeBits * 32 / 8; constexpr int wordsPerVectorBlockDim = bytesPerVectorBlockDim / sizeof(EncodeT); - int wordsPerVectorBlock = wordsPerVectorBlockDim * encodedVecs.getSize(1); + auto wordsPerVectorBlock = wordsPerVectorBlockDim * encodedVecs.getSize(1); EncodeT* listStart = ((EncodeT*)listData[listId]); // Each warp within the block handles a different chunk of 32 - int warpVec = alignedListVecStart + warpId * 32; + auto warpVec = alignedListVecStart + warpId * 32; // The warp data starts here EncodeT* warpData = listStart + (warpVec / 32) * wordsPerVectorBlock; @@ -420,18 +424,18 @@ // encoding, which is one per warp warpData += warpsPerBlock * wordsPerVectorBlock) { // This lane is adding this vec (if it is within bounds) - int laneVec = warpVec + laneId; + auto laneVec = warpVec + laneId; // Which vector does this correspond to in the set of vectors that we // need to add? If this is < 0, then this particular thread is not // encoding / appending a new vector - int laneVecAdding = laneVec - listVecStart; + auto laneVecAdding = laneVec - listVecStart; // We are actually adding a new vector if this is within range bool valid = laneVecAdding >= 0 && laneVecAdding < numVecsAdding; // Now, which actual vector in vecs is this? - int vecId = valid ? listVecIds[laneVecAdding] : 0; + auto vecId = valid ? listVecIds[laneVecAdding] : 0; // Each warp that has some vector data available needs to write out the // vector components @@ -448,18 +452,18 @@ } void runIVFFlatInterleavedAppend( - Tensor& listIds, - Tensor& listOffset, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, Tensor& vecs, GpuScalarQuantizer* scalarQ, DeviceVector& listData, GpuResources* res, cudaStream_t stream) { - int dim = vecs.getSize(1); + auto dim = vecs.getSize(1); #define RUN_APPEND(ENCODE_T, ENCODE_BITS, DATA) \ do { \ @@ -582,12 +586,12 @@ } void runIVFPQInterleavedAppend( - Tensor& listIds, - Tensor& listOffset, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, int bitsPerCode, Tensor& encodings, DeviceVector& listCodes, diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFAppend.cuh faiss-1.7.4/faiss/gpu/impl/IVFAppend.cuh --- faiss-1.7.3/faiss/gpu/impl/IVFAppend.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFAppend.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -18,40 +18,40 @@ /// Append user indices to IVF lists void runIVFIndicesAppend( - Tensor& listIds, - Tensor& listOffset, - Tensor& indices, + Tensor& listIds, + Tensor& listOffset, + Tensor& indices, IndicesOptions opt, DeviceVector& listIndices, cudaStream_t stream); /// Update device-side list pointers in a batch void runUpdateListPointers( - Tensor& listIds, - Tensor& newListLength, + Tensor& listIds, + Tensor& newListLength, Tensor& newCodePointers, Tensor& newIndexPointers, - DeviceVector& listLengths, + DeviceVector& listLengths, DeviceVector& listCodes, 
DeviceVector& listIndices, cudaStream_t stream); /// Append PQ codes to IVF lists (non-interleaved format) void runIVFPQAppend( - Tensor& listIds, - Tensor& listOffset, + Tensor& listIds, + Tensor& listOffset, Tensor& encodings, DeviceVector& listCodes, cudaStream_t stream); /// Append PQ codes to IVF lists (interleaved format) void runIVFPQInterleavedAppend( - Tensor& listIds, - Tensor& listOffset, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, int bitsPerCode, Tensor& encodings, DeviceVector& listCodes, @@ -59,8 +59,8 @@ /// Append SQ codes to IVF lists (non-interleaved, old format) void runIVFFlatAppend( - Tensor& listIds, - Tensor& listOffset, + Tensor& listIds, + Tensor& listOffset, Tensor& vecs, GpuScalarQuantizer* scalarQ, DeviceVector& listData, @@ -68,12 +68,12 @@ /// Append SQ codes to IVF lists (interleaved) void runIVFFlatInterleavedAppend( - Tensor& listIds, - Tensor& listOffset, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, Tensor& vecs, GpuScalarQuantizer* scalarQ, DeviceVector& listData, diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFBase.cu faiss-1.7.4/faiss/gpu/impl/IVFBase.cu --- faiss-1.7.3/faiss/gpu/impl/IVFBase.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFBase.cu 2023-04-19 13:18:30.000000000 +0000 @@ -31,7 +31,7 @@ IVFBase::IVFBase( GpuResources* resources, int dim, - int nlist, + idx_t nlist, faiss::MetricType metric, float metricArg, bool useResidual, @@ -74,7 +74,7 @@ IVFBase::~IVFBase() {} -void IVFBase::reserveMemory(size_t numVecs) { +void IVFBase::reserveMemory(idx_t numVecs) { auto stream = resources_->getDefaultStreamCurrentDevice(); auto vecsPerList = numVecs / deviceListData_.size(); @@ -93,7 +93,7 @@ // Reserve for index lists as well size_t bytesPerIndexList = vecsPerList * (indicesOptions_ == INDICES_32_BIT ? 
sizeof(int) - : sizeof(Index::idx_t)); + : sizeof(idx_t)); for (auto& list : deviceListIndices_) { list->data.reserve(bytesPerIndexList, stream); @@ -118,14 +118,14 @@ auto info = AllocInfo(AllocType::IVFLists, getCurrentDevice(), space_, stream); - for (size_t i = 0; i < numLists_; ++i) { + for (idx_t i = 0; i < numLists_; ++i) { deviceListData_.emplace_back(std::unique_ptr( new DeviceIVFList(resources_, info))); deviceListIndices_.emplace_back(std::unique_ptr( new DeviceIVFList(resources_, info))); - listOffsetToUserIndex_.emplace_back(std::vector()); + listOffsetToUserIndex_.emplace_back(std::vector()); } deviceListDataPointers_.resize(numLists_, stream); @@ -140,7 +140,7 @@ maxListLength_ = 0; } -int IVFBase::getDim() const { +idx_t IVFBase::getDim() const { return dim_; } @@ -154,14 +154,14 @@ size_t totalReclaimed = 0; - for (int i = 0; i < deviceListData_.size(); ++i) { + for (idx_t i = 0; i < deviceListData_.size(); ++i) { auto& data = deviceListData_[i]->data; totalReclaimed += data.reclaim(exact, stream); deviceListDataPointers_.setAt(i, (void*)data.data(), stream); } - for (int i = 0; i < deviceListIndices_.size(); ++i) { + for (idx_t i = 0; i < deviceListIndices_.size(); ++i) { auto& indices = deviceListIndices_[i]->data; totalReclaimed += indices.reclaim(exact, stream); @@ -176,8 +176,8 @@ } void IVFBase::updateDeviceListInfo_(cudaStream_t stream) { - std::vector listIds(deviceListData_.size()); - for (int i = 0; i < deviceListData_.size(); ++i) { + std::vector listIds(deviceListData_.size()); + for (idx_t i = 0; i < deviceListData_.size(); ++i) { listIds[i] = i; } @@ -185,14 +185,15 @@ } void IVFBase::updateDeviceListInfo_( - const std::vector& listIds, + const std::vector& listIds, cudaStream_t stream) { - HostTensor hostListsToUpdate({(int)listIds.size()}); - HostTensor hostNewListLength({(int)listIds.size()}); - HostTensor hostNewDataPointers({(int)listIds.size()}); - HostTensor hostNewIndexPointers({(int)listIds.size()}); + idx_t listSize = listIds.size(); + HostTensor hostListsToUpdate({listSize}); + HostTensor hostNewListLength({listSize}); + HostTensor hostNewDataPointers({listSize}); + HostTensor hostNewIndexPointers({listSize}); - for (int i = 0; i < listIds.size(); ++i) { + for (idx_t i = 0; i < listSize; ++i) { auto listId = listIds[i]; auto& data = deviceListData_[listId]; auto& indices = deviceListIndices_[listId]; @@ -204,11 +205,11 @@ } // Copy the above update sets to the GPU - DeviceTensor listsToUpdate( + DeviceTensor listsToUpdate( resources_, makeTempAlloc(AllocType::Other, stream), hostListsToUpdate); - DeviceTensor newListLength( + DeviceTensor newListLength( resources_, makeTempAlloc(AllocType::Other, stream), hostNewListLength); @@ -234,14 +235,14 @@ stream); } -size_t IVFBase::getNumLists() const { +idx_t IVFBase::getNumLists() const { return numLists_; } -int IVFBase::getListLength(int listId) const { +idx_t IVFBase::getListLength(idx_t listId) const { FAISS_THROW_IF_NOT_FMT( listId < numLists_, - "IVF list %d is out of bounds (%d lists total)", + "IVF list %ld is out of bounds (%ld lists total)", listId, numLists_); FAISS_ASSERT(listId < deviceListLengths_.size()); @@ -250,10 +251,10 @@ return deviceListData_[listId]->numVecs; } -std::vector IVFBase::getListIndices(int listId) const { +std::vector IVFBase::getListIndices(idx_t listId) const { FAISS_THROW_IF_NOT_FMT( listId < numLists_, - "IVF list %d is out of bounds (%d lists total)", + "IVF list %ld is out of bounds (%ld lists total)", listId, numLists_); FAISS_ASSERT(listId < 
deviceListData_.size()); @@ -267,9 +268,9 @@ auto intInd = deviceListIndices_[listId]->data.copyToHost(stream); - std::vector out(intInd.size()); + std::vector out(intInd.size()); for (size_t i = 0; i < intInd.size(); ++i) { - out[i] = (Index::idx_t)intInd[i]; + out[i] = (idx_t)intInd[i]; } return out; @@ -277,8 +278,7 @@ // The data is stored as int64 on the GPU FAISS_ASSERT(listId < deviceListIndices_.size()); - return deviceListIndices_[listId]->data.copyToHost( - stream); + return deviceListIndices_[listId]->data.copyToHost(stream); } else if (indicesOptions_ == INDICES_CPU) { // The data is not stored on the GPU FAISS_ASSERT(listId < listOffsetToUserIndex_.size()); @@ -294,15 +294,15 @@ } else { // unhandled indices type (includes INDICES_IVF) FAISS_ASSERT(false); - return std::vector(); + return std::vector(); } } -std::vector IVFBase::getListVectorData(int listId, bool gpuFormat) +std::vector IVFBase::getListVectorData(idx_t listId, bool gpuFormat) const { FAISS_THROW_IF_NOT_FMT( listId < numLists_, - "IVF list %d is out of bounds (%d lists total)", + "IVF list %ld is out of bounds (%ld lists total)", listId, numLists_); FAISS_ASSERT(listId < deviceListData_.size()); @@ -323,25 +323,15 @@ } void IVFBase::copyInvertedListsFrom(const InvertedLists* ivf) { - size_t nlist = ivf ? ivf->nlist : 0; - for (size_t i = 0; i < nlist; ++i) { - size_t listSize = ivf->list_size(i); - - // GPU index can only support max int entries per list - FAISS_THROW_IF_NOT_FMT( - listSize <= (size_t)std::numeric_limits::max(), - "GPU inverted list can only support " - "%zu entries; %zu found", - (size_t)std::numeric_limits::max(), - listSize); - + idx_t nlist = ivf ? ivf->nlist : 0; + for (idx_t i = 0; i < nlist; ++i) { addEncodedVectorsToList_( - i, ivf->get_codes(i), ivf->get_ids(i), listSize); + i, ivf->get_codes(i), ivf->get_ids(i), ivf->list_size(i)); } } void IVFBase::copyInvertedListsTo(InvertedLists* ivf) { - for (int i = 0; i < numLists_; ++i) { + for (idx_t i = 0; i < numLists_; ++i) { auto listIndices = getListIndices(i); auto listData = getListVectorData(i, false); @@ -351,10 +341,10 @@ } void IVFBase::addEncodedVectorsToList_( - int listId, + idx_t listId, const void* codes, - const Index::idx_t* indices, - size_t numVecs) { + const idx_t* indices, + idx_t numVecs) { auto stream = resources_->getDefaultStreamCurrentDevice(); // This list must already exist @@ -374,10 +364,6 @@ auto gpuListSizeInBytes = getGpuVectorsEncodingSize_(numVecs); auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); - // We only have int32 length representaz3tions on the GPU per each - // list; the length is in sizeof(char) - FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); - // Translate the codes as needed to our preferred form std::vector codesV(cpuListSizeInBytes); std::memcpy(codesV.data(), codes, cpuListSizeInBytes); @@ -395,16 +381,16 @@ deviceListDataPointers_.setAt( listId, (void*)listCodes->data.data(), stream); - deviceListLengths_.setAt(listId, (int)numVecs, stream); + deviceListLengths_.setAt(listId, numVecs, stream); // We update this as well, since the multi-pass algorithm uses it - maxListLength_ = std::max(maxListLength_, (int)numVecs); + maxListLength_ = std::max(maxListLength_, numVecs); } void IVFBase::addIndicesFromCpu_( - int listId, - const Index::idx_t* indices, - size_t numVecs) { + idx_t listId, + const idx_t* indices, + idx_t numVecs) { auto stream = resources_->getDefaultStreamCurrentDevice(); // This list must currently be empty @@ -415,9 +401,9 @@ if 
(indicesOptions_ == INDICES_32_BIT) { // Make sure that all indices are in bounds std::vector indices32(numVecs); - for (size_t i = 0; i < numVecs; ++i) { + for (idx_t i = 0; i < numVecs; ++i) { auto ind = indices[i]; - FAISS_ASSERT(ind <= (Index::idx_t)std::numeric_limits::max()); + FAISS_ASSERT(ind <= (idx_t)std::numeric_limits::max()); indices32[i] = (int)ind; } @@ -435,7 +421,7 @@ } else if (indicesOptions_ == INDICES_64_BIT) { listIndices->data.append( (uint8_t*)indices, - numVecs * sizeof(Index::idx_t), + numVecs * sizeof(idx_t), stream, true /* exact reserved size */); @@ -478,7 +464,7 @@ DeviceTensor centroids( resources_, makeSpaceAlloc(AllocType::FlatData, space_, stream), - {(int)getNumLists(), (int)getDim()}); + {getNumLists(), getDim()}); gpuData->reconstruct(0, gpuData->getSize(), centroids); @@ -506,7 +492,7 @@ DeviceTensor centroids( resources_, makeSpaceAlloc(AllocType::FlatData, space_, stream), - {(int)quantizer->ntotal, (int)quantizer->d}); + {quantizer->ntotal, quantizer->d}); centroids.copyFrom(vecs, stream); ivfCentroids_ = std::move(centroids); @@ -519,7 +505,7 @@ // Guaranteed to be on device Tensor& vecs, Tensor& distances, - Tensor& indices, + Tensor& indices, Tensor* residuals, Tensor* centroids) { auto stream = resources_->getDefaultStreamCurrentDevice(); @@ -556,7 +542,7 @@ auto cpuVecs = toHost( vecs.data(), stream, {vecs.getSize(0), vecs.getSize(1)}); auto cpuDistances = std::vector(vecs.getSize(0) * nprobe); - auto cpuIndices = std::vector(vecs.getSize(0) * nprobe); + auto cpuIndices = std::vector(vecs.getSize(0) * nprobe); coarseQuantizer->search( vecs.getSize(0), @@ -599,10 +585,10 @@ } } -int IVFBase::addVectors( +idx_t IVFBase::addVectors( Index* coarseQuantizer, Tensor& vecs, - Tensor& indices) { + Tensor& indices) { FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0)); FAISS_ASSERT(vecs.getSize(1) == dim_); @@ -617,7 +603,7 @@ {vecs.getSize(0), 1}); // We do need the closest IVF cell IDs though - DeviceTensor ivfIndices( + DeviceTensor ivfIndices( resources_, makeTempAlloc(AllocType::Other, stream), {vecs.getSize(0), 1}); @@ -648,19 +634,19 @@ // encoded vectors and indices // list id -> vectors being added - std::unordered_map> listToVectorIds; + std::unordered_map> listToVectorIds; // vector id -> which list it is being appended to - std::vector vectorIdToList(vecs.getSize(0)); + std::vector vectorIdToList(vecs.getSize(0)); // vector id -> offset in list // (we already have vector id -> list id in listIds) - std::vector listOffsetHost(ivfIndicesHost.size()); + std::vector listOffsetHost(ivfIndicesHost.size()); // Number of valid vectors that we actually add; we return this - int numAdded = 0; + idx_t numAdded = 0; - for (int i = 0; i < ivfIndicesHost.size(); ++i) { + for (idx_t i = 0; i < ivfIndicesHost.size(); ++i) { auto listId = ivfIndicesHost[i]; // Add vector could be invalid (contains NaNs etc) @@ -674,14 +660,14 @@ ++numAdded; vectorIdToList[i] = listId; - int offset = deviceListData_[listId]->numVecs; + auto offset = deviceListData_[listId]->numVecs; auto it = listToVectorIds.find(listId); if (it != listToVectorIds.end()) { offset += it->second.size(); it->second.push_back(i); } else { - listToVectorIds[listId] = std::vector{i}; + listToVectorIds[listId] = std::vector{i}; } listOffsetHost[i] = offset; @@ -694,7 +680,7 @@ } // unique lists being added to - std::vector uniqueLists; + std::vector uniqueLists; for (auto& vecs : listToVectorIds) { uniqueLists.push_back(vecs.first); @@ -705,14 +691,14 @@ // In the same order as uniqueLists, list 
the vectors being added to that // list contiguously (unique list 0 vectors ...)(unique list 1 vectors ...) // ... - std::vector vectorsByUniqueList; + std::vector vectorsByUniqueList; // For each of the unique lists, the start offset in vectorsByUniqueList - std::vector uniqueListVectorStart; + std::vector uniqueListVectorStart; // For each of the unique lists, where we start appending in that list by // the vector offset - std::vector uniqueListStartOffset; + std::vector uniqueListStartOffset; // For each of the unique lists, find the vectors which should be appended // to that list @@ -744,11 +730,11 @@ // Resize all of the lists that we are appending to for (auto& counts : listToVectorIds) { auto listId = counts.first; - int numVecsToAdd = counts.second.size(); + idx_t numVecsToAdd = counts.second.size(); auto& codes = deviceListData_[listId]; - int oldNumVecs = codes->numVecs; - int newNumVecs = codes->numVecs + numVecsToAdd; + auto oldNumVecs = codes->numVecs; + auto newNumVecs = codes->numVecs + numVecsToAdd; auto newSizeBytes = getGpuVectorsEncodingSize_(newNumVecs); codes->data.resize(newSizeBytes, stream); @@ -759,7 +745,7 @@ (indicesOptions_ == INDICES_64_BIT)) { size_t indexSize = (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) - : sizeof(Index::idx_t); + : sizeof(idx_t); indices->data.resize( indices->data.size() + numVecsToAdd * indexSize, @@ -792,17 +778,17 @@ // map. We already resized our map above. if (indicesOptions_ == INDICES_CPU) { // We need to maintain the indices on the CPU side - HostTensor hostIndices(indices, stream); + HostTensor hostIndices(indices, stream); - for (int i = 0; i < hostIndices.getSize(0); ++i) { - Index::idx_t listId = ivfIndicesHost[i]; + for (idx_t i = 0; i < hostIndices.getSize(0); ++i) { + idx_t listId = ivfIndicesHost[i]; // Add vector could be invalid (contains NaNs etc) if (listId < 0) { continue; } - int offset = listOffsetHost[i]; + auto offset = listOffsetHost[i]; FAISS_ASSERT(offset >= 0); FAISS_ASSERT(listId < listOffsetToUserIndex_.size()); diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFBase.cuh faiss-1.7.4/faiss/gpu/impl/IVFBase.cuh --- faiss-1.7.3/faiss/gpu/impl/IVFBase.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFBase.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -30,7 +30,7 @@ public: IVFBase(GpuResources* resources, int dim, - int nlist, + idx_t nlist, faiss::MetricType metric, float metricArg, bool interleavedLayout, @@ -41,31 +41,31 @@ virtual ~IVFBase(); /// Reserve GPU memory in our inverted lists for this number of vectors - void reserveMemory(size_t numVecs); + void reserveMemory(idx_t numVecs); /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info void reset(); /// Return the number of dimensions we are indexing - int getDim() const; + idx_t getDim() const; /// After adding vectors, one can call this to reclaim device memory /// to exactly the amount needed. 
Returns space reclaimed in bytes size_t reclaimMemory(); /// Returns the number of inverted lists - size_t getNumLists() const; + idx_t getNumLists() const; /// For debugging purposes, return the list length of a particular /// list - int getListLength(int listId) const; + idx_t getListLength(idx_t listId) const; /// Return the list indices of a particular list back to the CPU - std::vector getListIndices(int listId) const; + std::vector getListIndices(idx_t listId) const; /// Return the encoded vectors of a particular list back to the CPU - std::vector getListVectorData(int listId, bool gpuFormat) const; + std::vector getListVectorData(idx_t listId, bool gpuFormat) const; /// Copy all inverted lists from a CPU representation to ourselves void copyInvertedListsFrom(const InvertedLists* ivf); @@ -81,10 +81,10 @@ /// The input data must be on our current device. /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. - int addVectors( + idx_t addVectors( Index* coarseQuantizer, Tensor& vecs, - Tensor& indices); + Tensor& indices); /// Find the approximate k nearest neigbors for `queries` against /// our database @@ -94,7 +94,7 @@ int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) = 0; + Tensor& outIndices) = 0; /// Performs search when we are already given the IVF cells to look at /// (GpuIndexIVF::search_preassigned implementation) @@ -102,22 +102,22 @@ Index* coarseQuantizer, Tensor& vecs, Tensor& ivfDistances, - Tensor& ivfAssignments, + Tensor& ivfAssignments, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) = 0; protected: /// Adds a set of codes and indices to a list, with the representation /// coming from the CPU equivalent void addEncodedVectorsToList_( - int listId, + idx_t listId, // resident on the host const void* codes, // resident on the host - const Index::idx_t* indices, - size_t numVecs); + const idx_t* indices, + idx_t numVecs); /// Performs search in a CPU or GPU coarse quantizer for IVF cells, /// returning residuals as well if necessary @@ -132,7 +132,7 @@ Tensor& distances, // Output: the closest nprobe IVF cells the query vectors lie in // size (#vecs, nprobe) - Tensor& indices, + Tensor& indices, // optionally compute the residual relative to the IVF cell centroid // if passed // size (#vecs, nprobe, dim) @@ -146,30 +146,30 @@ /// vectors is encoded on the device. 
Note that due to padding this is not /// the same as the encoding size for a subset of vectors in an IVF list; /// this is the size for an entire IVF list - virtual size_t getGpuVectorsEncodingSize_(int numVecs) const = 0; - virtual size_t getCpuVectorsEncodingSize_(int numVecs) const = 0; + virtual size_t getGpuVectorsEncodingSize_(idx_t numVecs) const = 0; + virtual size_t getCpuVectorsEncodingSize_(idx_t numVecs) const = 0; /// Translate to our preferred GPU encoding virtual std::vector translateCodesToGpu_( std::vector codes, - size_t numVecs) const = 0; + idx_t numVecs) const = 0; /// Translate from our preferred GPU encoding virtual std::vector translateCodesFromGpu_( std::vector codes, - size_t numVecs) const = 0; + idx_t numVecs) const = 0; /// Append vectors to our on-device lists virtual void appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, cudaStream_t stream) = 0; /// Reclaim memory consumed on the device for our inverted lists @@ -182,14 +182,11 @@ /// For a set of list IDs, update device-side list pointer and size /// information void updateDeviceListInfo_( - const std::vector& listIds, + const std::vector& listIds, cudaStream_t stream); /// Shared function to copy indices from CPU to GPU - void addIndicesFromCpu_( - int listId, - const Index::idx_t* indices, - size_t numVecs); + void addIndicesFromCpu_(idx_t listId, const idx_t* indices, idx_t numVecs); protected: /// Collection of GPU resources that we use @@ -205,7 +202,7 @@ const int dim_; /// Number of inverted lists we maintain - const int numLists_; + const idx_t numLists_; /// Do we need to also compute residuals when processing vectors? bool useResidual_; @@ -240,10 +237,10 @@ /// Device representation of all inverted list lengths /// id -> length in number of vectors - DeviceVector deviceListLengths_; + DeviceVector deviceListLengths_; /// Maximum list length seen - int maxListLength_; + idx_t maxListLength_; struct DeviceIVFList { DeviceIVFList(GpuResources* res, const AllocInfo& info); @@ -253,7 +250,7 @@ /// The number of vectors encoded in this list, which may be unrelated /// to the above allocated data size - int numVecs; + idx_t numVecs; }; /// Device memory for each separate list, as managed by the host. @@ -266,7 +263,7 @@ /// If we are storing indices on the CPU (indicesOptions_ is /// INDICES_CPU), then this maintains a CPU-side map of what /// (inverted list id, offset) maps to which user index - std::vector> listOffsetToUserIndex_; + std::vector> listOffsetToUserIndex_; }; } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFFlat.cu faiss-1.7.4/faiss/gpu/impl/IVFFlat.cu --- faiss-1.7.3/faiss/gpu/impl/IVFFlat.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFFlat.cu 2023-04-19 13:18:30.000000000 +0000 @@ -31,7 +31,7 @@ IVFFlat::IVFFlat( GpuResources* res, int dim, - int nlist, + idx_t nlist, faiss::MetricType metric, float metricArg, bool useResidual, @@ -52,19 +52,19 @@ IVFFlat::~IVFFlat() {} -size_t IVFFlat::getGpuVectorsEncodingSize_(int numVecs) const { +size_t IVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const { if (interleavedLayout_) { // bits per scalar code - int bits = scalarQ_ ? 
scalarQ_->bits : 32 /* float */; + idx_t bits = scalarQ_ ? scalarQ_->bits : 32 /* float */; // bytes to encode a block of 32 vectors (single dimension) - int bytesPerDimBlock = bits * 32 / 8; + idx_t bytesPerDimBlock = bits * 32 / 8; // bytes to fully encode 32 vectors - int bytesPerBlock = bytesPerDimBlock * dim_; + idx_t bytesPerBlock = bytesPerDimBlock * dim_; // number of blocks of 32 vectors we have - int numBlocks = utils::divUp(numVecs, 32); + idx_t numBlocks = utils::divUp(numVecs, 32); // total size to encode numVecs return bytesPerBlock * numBlocks; @@ -76,7 +76,7 @@ } } -size_t IVFFlat::getCpuVectorsEncodingSize_(int numVecs) const { +size_t IVFFlat::getCpuVectorsEncodingSize_(idx_t numVecs) const { size_t sizePerVector = (scalarQ_ ? scalarQ_->code_size : sizeof(float) * dim_); @@ -85,7 +85,7 @@ std::vector IVFFlat::translateCodesToGpu_( std::vector codes, - size_t numVecs) const { + idx_t numVecs) const { if (!interleavedLayout_) { // same format return codes; @@ -100,7 +100,7 @@ std::vector IVFFlat::translateCodesFromGpu_( std::vector codes, - size_t numVecs) const { + idx_t numVecs) const { if (!interleavedLayout_) { // same format return codes; @@ -115,13 +115,13 @@ void IVFFlat::appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, cudaStream_t stream) { // // Append the new encodings @@ -167,13 +167,13 @@ int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { auto stream = resources_->getDefaultStreamCurrentDevice(); // These are caught at a higher level FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); - nprobe = std::min(nprobe, (int)getNumLists()); + nprobe = int(std::min(idx_t(nprobe), getNumLists())); FAISS_ASSERT(queries.getSize(1) == dim_); @@ -185,7 +185,7 @@ resources_, makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe}); - DeviceTensor coarseIndices( + DeviceTensor coarseIndices( resources_, makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe}); @@ -223,10 +223,10 @@ Index* coarseQuantizer, Tensor& vecs, Tensor& ivfDistances, - Tensor& ivfAssignments, + Tensor& ivfAssignments, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) { FAISS_ASSERT(ivfDistances.getSize(0) == vecs.getSize(0)); FAISS_ASSERT(ivfAssignments.getSize(0) == vecs.getSize(0)); @@ -286,11 +286,11 @@ void IVFFlat::searchImpl_( Tensor& queries, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, Tensor& ivfCentroids, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) { FAISS_ASSERT(storePairs == false); @@ -336,7 +336,7 @@ // FIXME: we might ultimately be calling this function with inputs // from the CPU, these are unnecessary copies if (indicesOptions_ == INDICES_CPU) { - HostTensor hostOutIndices(outIndices, stream); + HostTensor hostOutIndices(outIndices, stream); ivfOffsetToUserIndex( hostOutIndices.data(), diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFFlat.cuh faiss-1.7.4/faiss/gpu/impl/IVFFlat.cuh --- faiss-1.7.3/faiss/gpu/impl/IVFFlat.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFFlat.cuh 2023-04-19 
13:18:30.000000000 +0000 @@ -17,7 +17,7 @@ public: IVFFlat(GpuResources* resources, int dim, - int nlist, + idx_t nlist, faiss::MetricType metric, float metricArg, bool useResidual, @@ -37,7 +37,7 @@ int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) override; + Tensor& outIndices) override; /// Performs search when we are already given the IVF cells to look at /// (GpuIndexIVF::search_preassigned implementation) @@ -45,10 +45,10 @@ Index* coarseQuantizer, Tensor& vecs, Tensor& ivfDistances, - Tensor& ivfAssignments, + Tensor& ivfAssignments, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) override; protected: @@ -56,30 +56,30 @@ /// vectors is encoded on the device. Note that due to padding this is not /// the same as the encoding size for a subset of vectors in an IVF list; /// this is the size for an entire IVF list - size_t getGpuVectorsEncodingSize_(int numVecs) const override; - size_t getCpuVectorsEncodingSize_(int numVecs) const override; + size_t getGpuVectorsEncodingSize_(idx_t numVecs) const override; + size_t getCpuVectorsEncodingSize_(idx_t numVecs) const override; /// Translate to our preferred GPU encoding std::vector translateCodesToGpu_( std::vector codes, - size_t numVecs) const override; + idx_t numVecs) const override; /// Translate from our preferred GPU encoding std::vector translateCodesFromGpu_( std::vector codes, - size_t numVecs) const override; + idx_t numVecs) const override; /// Encode the vectors that we're adding and append to our IVF lists void appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, cudaStream_t stream) override; /// Shared IVF search implementation, used by both search and @@ -87,11 +87,11 @@ void searchImpl_( Tensor& queries, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, Tensor& ivfCentroids, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs); protected: diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFFlatScan.cu faiss-1.7.4/faiss/gpu/impl/IVFFlatScan.cu --- faiss-1.7.3/faiss/gpu/impl/IVFFlatScan.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFFlatScan.cu 2023-04-19 13:18:30.000000000 +0000 @@ -58,7 +58,7 @@ void* vecData, const Codec& codec, const Metric& metric, - int numVecs, + idx_t numVecs, int dim, float* distanceOut) { // How many separate loading points are there for the decoder? 
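// [Editor's note, not part of the patch] The next hunk widens the per-warp
// partition arithmetic to idx_t. For illustration, assuming
// kIVFFlatScanWarps == 4 (its value is defined elsewhere in this file):
//     numVecs = 100  ->  vecsPerWarp = utils::divUp(100, 4) = 25
//     warp 0 scans [0, 25), warp 1 [25, 50), warp 2 [50, 75), warp 3 [75, 100)
// Once a single IVF list may hold more than 2^31 - 1 vectors, products such as
// vecsPerWarp * (warpId + 1) no longer fit in a 32-bit int, hence the switch
// of vecsPerWarp / vecStart / vecEnd to idx_t.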
@@ -70,13 +70,13 @@ int laneId = threadIdx.x % kWarpSize; // getLaneId(); // Divide the set of vectors among the warps - int vecsPerWarp = utils::divUp(numVecs, kIVFFlatScanWarps); + idx_t vecsPerWarp = utils::divUp(numVecs, kIVFFlatScanWarps); - int vecStart = vecsPerWarp * warpId; - int vecEnd = min(vecsPerWarp * (warpId + 1), numVecs); + idx_t vecStart = vecsPerWarp * warpId; + idx_t vecEnd = min(vecsPerWarp * (warpId + 1), numVecs); // Walk the list of vectors for this warp - for (int vec = vecStart; vec < vecEnd; ++vec) { + for (idx_t vec = vecStart; vec < vecEnd; ++vec) { Metric dist = metric.zero(); // Scan the dimensions available that have whole units for the @@ -137,12 +137,12 @@ Tensor queries, bool useResidual, Tensor residualBase, - Tensor listIds, + Tensor listIds, void** allListData, - int* listLengths, + idx_t* listLengths, Codec codec, Metric metric, - Tensor prefixSumOffsets, + Tensor prefixSumOffsets, Tensor distance) { extern __shared__ float smem[]; @@ -151,9 +151,9 @@ // This is where we start writing out data // We ensure that before the array (at offset -1), there is a 0 value - int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); + auto outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); - Index::idx_t listId = listIds[queryId][probeId]; + idx_t listId = listIds[queryId][probeId]; // Safety guard in case NaNs in input cause no list ID to be generated if (listId == -1) { return; @@ -185,42 +185,26 @@ void runIVFFlatScanTile( GpuResources* res, Tensor& queries, - Tensor& listIds, + Tensor& listIds, DeviceVector& listData, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, + DeviceVector& listLengths, Tensor& thrustMem, - Tensor& prefixSumOffsets, + Tensor& prefixSumOffsets, Tensor& allDistances, Tensor& heapDistances, - Tensor& heapIndices, + Tensor& heapIndices, int k, + bool use64BitSelection, faiss::MetricType metricType, bool useResidual, Tensor& residualBase, GpuScalarQuantizer* scalarQ, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, cudaStream_t stream) { - int dim = queries.getSize(1); - - // Check the amount of shared memory per block available based on our type - // is sufficient - if (scalarQ && - (scalarQ->qtype == ScalarQuantizer::QuantizerType::QT_8bit || - scalarQ->qtype == ScalarQuantizer::QuantizerType::QT_4bit)) { - int maxDim = - getMaxSharedMemPerBlockCurrentDevice() / (sizeof(float) * 2); - - FAISS_THROW_IF_NOT_FMT( - dim < maxDim, - "Insufficient shared memory available on the GPU " - "for QT_8bit or QT_4bit with %d dimensions; " - "maximum dimensions possible is %d", - dim, - maxDim); - } + auto dim = queries.getSize(1); // Calculate offset lengths, so we know where to write out // intermediate results @@ -316,6 +300,7 @@ allDistances, listIds.getSize(1), k, + use64BitSelection, metricToSortDirection(metricType), heapDistances, heapIndices, @@ -333,6 +318,7 @@ prefixSumOffsets, listIds, k, + use64BitSelection, metricToSortDirection(metricType), outDistances, outIndices, @@ -341,12 +327,12 @@ void runIVFFlatScan( Tensor& queries, - Tensor& listIds, + Tensor& listIds, DeviceVector& listData, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, - int maxListLength, + DeviceVector& listLengths, + idx_t maxListLength, int k, faiss::MetricType metric, bool useResidual, @@ -355,15 +341,22 @@ // output Tensor& outDistances, // output - Tensor& outIndices, + Tensor& outIndices, GpuResources* res) { - constexpr int kMinQueryTileSize = 8; - constexpr 
int kMaxQueryTileSize = 128; - constexpr int kThrustMemSize = 16384; - - int nprobe = listIds.getSize(1); auto stream = res->getDefaultStreamCurrentDevice(); + constexpr idx_t kMinQueryTileSize = 8; + constexpr idx_t kMaxQueryTileSize = 65536; // used as blockIdx.y dimension + constexpr idx_t kThrustMemSize = 16384; + + auto nprobe = listIds.getSize(1); + + // If the maximum list length (in terms of number of vectors) times nprobe + // (number of lists) is > 2^31 - 1, then we will use 64-bit indexing in the + // selection kernels + bool use64BitSelection = + maxListLength * nprobe > idx_t(std::numeric_limits::max()); + // Make a reservation for Thrust to do its dirty work (global memory // cross-block reduction space); hopefully this is large enough. DeviceTensor thrustMem1( @@ -378,19 +371,19 @@ // We run two passes of heap selection // This is the size of the first-level heap passes - constexpr int kNProbeSplit = 8; - int pass2Chunks = std::min(nprobe, kNProbeSplit); + constexpr idx_t kNProbeSplit = 8; + idx_t pass2Chunks = std::min(nprobe, kNProbeSplit); - size_t sizeForFirstSelectPass = - pass2Chunks * k * (sizeof(float) + sizeof(int)); + idx_t sizeForFirstSelectPass = + pass2Chunks * k * (sizeof(float) + sizeof(idx_t)); // How much temporary storage we need per each query - size_t sizePerQuery = 2 * // # streams - ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets - nprobe * maxListLength * sizeof(float) + // allDistances + idx_t sizePerQuery = 2 * // # streams + ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets + nprobe * maxListLength * sizeof(float) + // allDistances sizeForFirstSelectPass); - int queryTileSize = (int)(sizeAvailable / sizePerQuery); + idx_t queryTileSize = sizeAvailable / sizePerQuery; if (queryTileSize < kMinQueryTileSize) { queryTileSize = kMinQueryTileSize; @@ -398,37 +391,31 @@ queryTileSize = kMaxQueryTileSize; } - // FIXME: we should adjust queryTileSize to deal with this, since - // indexing is in int32 - FAISS_ASSERT( - queryTileSize * nprobe * maxListLength < - std::numeric_limits::max()); - // Temporary memory buffers // Make sure there is space prior to the start which will be 0, and // will handle the boundary condition without branches - DeviceTensor prefixSumOffsetSpace1( + DeviceTensor prefixSumOffsetSpace1( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize * nprobe + 1}); - DeviceTensor prefixSumOffsetSpace2( + DeviceTensor prefixSumOffsetSpace2( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize * nprobe + 1}); - DeviceTensor prefixSumOffsets1( + DeviceTensor prefixSumOffsets1( prefixSumOffsetSpace1[1].data(), {queryTileSize, nprobe}); - DeviceTensor prefixSumOffsets2( + DeviceTensor prefixSumOffsets2( prefixSumOffsetSpace2[1].data(), {queryTileSize, nprobe}); - DeviceTensor* prefixSumOffsets[2] = { + DeviceTensor* prefixSumOffsets[2] = { &prefixSumOffsets1, &prefixSumOffsets2}; // Make sure the element before prefixSumOffsets is 0, since we // depend upon simple, boundary-less indexing to get proper results CUDA_VERIFY(cudaMemsetAsync( - prefixSumOffsetSpace1.data(), 0, sizeof(int), stream)); + prefixSumOffsetSpace1.data(), 0, sizeof(idx_t), stream)); CUDA_VERIFY(cudaMemsetAsync( - prefixSumOffsetSpace2.data(), 0, sizeof(int), stream)); + prefixSumOffsetSpace2.data(), 0, sizeof(idx_t), stream)); DeviceTensor allDistances1( res, @@ -452,23 +439,24 @@ DeviceTensor* heapDistances[2] = { &heapDistances1, &heapDistances2}; - DeviceTensor heapIndices1( + DeviceTensor heapIndices1( res, 
makeTempAlloc(AllocType::Other, stream), {queryTileSize, pass2Chunks, k}); - DeviceTensor heapIndices2( + DeviceTensor heapIndices2( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize, pass2Chunks, k}); - DeviceTensor* heapIndices[2] = {&heapIndices1, &heapIndices2}; + DeviceTensor* heapIndices[2] = { + &heapIndices1, &heapIndices2}; auto streams = res->getAlternateStreamsCurrentDevice(); streamWait(streams, {stream}); int curStream = 0; - for (int query = 0; query < queries.getSize(0); query += queryTileSize) { - int numQueriesInTile = + for (idx_t query = 0; query < queries.getSize(0); query += queryTileSize) { + auto numQueriesInTile = std::min(queryTileSize, queries.getSize(0) - query); auto prefixSumOffsetsView = @@ -504,6 +492,7 @@ heapDistancesView, heapIndicesView, k, + use64BitSelection, metric, useResidual, residualBaseView, diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFFlatScan.cuh faiss-1.7.4/faiss/gpu/impl/IVFFlatScan.cuh --- faiss-1.7.3/faiss/gpu/impl/IVFFlatScan.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFFlatScan.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -21,12 +21,12 @@ void runIVFFlatScan( Tensor& queries, - Tensor& listIds, + Tensor& listIds, DeviceVector& listData, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, - int maxListLength, + DeviceVector& listLengths, + idx_t maxListLength, int k, faiss::MetricType metric, bool useResidual, @@ -35,7 +35,7 @@ // output Tensor& outDistances, // output - Tensor& outIndices, + Tensor& outIndices, GpuResources* res); } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFInterleaved.cu faiss-1.7.4/faiss/gpu/impl/IVFInterleaved.cu --- faiss-1.7.3/faiss/gpu/impl/IVFInterleaved.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFInterleaved.cu 2023-04-19 13:18:30.000000000 +0000 @@ -18,19 +18,22 @@ template __global__ void ivfInterleavedScan2( Tensor distanceIn, - Tensor indicesIn, - Tensor listIds, + Tensor indicesIn, + Tensor listIds, int k, void** listIndices, IndicesOptions opt, bool dir, Tensor distanceOut, - Tensor indicesOut) { + Tensor indicesOut) { int queryId = blockIdx.x; constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; __shared__ float smemK[kNumWarps * NumWarpQ]; + // The BlockSelect value type is uint32_t, as we pack together which probe + // (up to nprobe - 1) and which k (up to k - 1) from each individual list + // together, and both nprobe and k are limited to GPU_MAX_SELECTION_K. __shared__ uint32_t smemV[kNumWarps * NumWarpQ]; // To avoid creating excessive specializations, we combine direction @@ -47,15 +50,15 @@ heap(kFloatMax, kMaxUInt32, smemK, smemV, k); // nprobe x k - int num = distanceIn.getSize(1) * distanceIn.getSize(2); + idx_t num = distanceIn.getSize(1) * distanceIn.getSize(2); - auto distanceBase = distanceIn[queryId].data(); - int limit = utils::roundDown(num, kWarpSize); + const float* distanceBase = distanceIn[queryId].data(); + idx_t limit = utils::roundDown(num, kWarpSize); // This will keep our negation factor float adj = dir ? 
-1 : 1; - int i = threadIdx.x; + idx_t i = threadIdx.x; for (; i < limit; i += blockDim.x) { // We represent the index as (probe id)(k) // Right now, both are limited to a maximum of 2048, but we will @@ -64,10 +67,13 @@ uint32_t curProbe = i / k; uint32_t curK = i % k; + // Since nprobe and k are limited, we can pack both of these together + // into a uint32_t uint32_t index = (curProbe << 16) | (curK & (uint32_t)0xffff); - Index::idx_t listId = listIds[queryId][curProbe]; - if (listId != -1) { + // The IDs reported from the list may be -1, if a particular IVF list + // doesn't even have k entries in it + if (listIds[queryId][curProbe] != -1) { // Adjust the value we are selecting based on the sorting order heap.addThreadQ(distanceBase[i] * adj, index); } @@ -81,7 +87,7 @@ uint32_t curK = i % k; uint32_t index = (curProbe << 16) | (curK & (uint32_t)0xffff); - Index::idx_t listId = listIds[queryId][curProbe]; + idx_t listId = listIds[queryId][curProbe]; if (listId != -1) { heap.addThreadQ(distanceBase[i] * adj, index); } @@ -96,7 +102,7 @@ auto packedIndex = smemV[i]; // We need to remap to the user-provided indices - Index::idx_t index = -1; + idx_t index = -1; // We may not have at least k values to return; in this function, max // uint32 is our sentinel value @@ -104,15 +110,15 @@ uint32_t curProbe = packedIndex >> 16; uint32_t curK = packedIndex & 0xffff; - Index::idx_t listId = listIds[queryId][curProbe]; - int listOffset = indicesIn[queryId][curProbe][curK]; + idx_t listId = listIds[queryId][curProbe]; + idx_t listOffset = indicesIn[queryId][curProbe][curK]; if (opt == INDICES_32_BIT) { - index = (Index::idx_t)((int*)listIndices[listId])[listOffset]; + index = (idx_t)((int*)listIndices[listId])[listOffset]; } else if (opt == INDICES_64_BIT) { - index = ((Index::idx_t*)listIndices[listId])[listOffset]; + index = ((idx_t*)listIndices[listId])[listOffset]; } else { - index = (listId << 32 | (Index::idx_t)listOffset); + index = (listId << 32 | (idx_t)listOffset); } } @@ -122,14 +128,14 @@ void runIVFInterleavedScan2( Tensor& distanceIn, - Tensor& indicesIn, - Tensor& listIds, + Tensor& indicesIn, + Tensor& listIds, int k, DeviceVector& listIndices, IndicesOptions indicesOptions, bool dir, Tensor& distanceOut, - Tensor& indicesOut, + Tensor& indicesOut, cudaStream_t stream) { #define IVF_SCAN_2(THREADS, NUM_WARP_Q, NUM_THREAD_Q) \ ivfInterleavedScan2 \ @@ -168,11 +174,11 @@ void runIVFInterleavedScan( Tensor& queries, - Tensor& listIds, + Tensor& listIds, DeviceVector& listData, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, + DeviceVector& listLengths, int k, faiss::MetricType metric, bool useResidual, @@ -181,7 +187,7 @@ // output Tensor& outDistances, // output - Tensor& outIndices, + Tensor& outIndices, GpuResources* res) { // caught for exceptions at a higher level FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFInterleaved.cuh faiss-1.7.4/faiss/gpu/impl/IVFInterleaved.cuh --- faiss-1.7.3/faiss/gpu/impl/IVFInterleaved.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFInterleaved.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -40,171 +40,175 @@ __global__ void ivfInterleavedScan( Tensor queries, Tensor residualBase, - Tensor listIds, + Tensor listIds, void** allListData, - int* listLengths, + idx_t* listLengths, Codec codec, Metric metric, int k, // [query][probe][k] Tensor distanceOut, - Tensor indicesOut) { + Tensor indicesOut) { extern __shared__ float smem[]; constexpr int kNumWarps = 
ThreadsPerBlock / kWarpSize; - int queryId = blockIdx.y; - int probeId = blockIdx.x; - Index::idx_t listId = listIds[queryId][probeId]; - - // Safety guard in case NaNs in input cause no list ID to be generated, or - // we have more nprobe than nlist - if (listId == -1) { - return; - } + for (idx_t queryId = blockIdx.y; queryId < queries.getSize(0); + queryId += gridDim.y) { + int probeId = blockIdx.x; + idx_t listId = listIds[queryId][probeId]; + + // Safety guard in case NaNs in input cause no list ID to be generated, + // or we have more nprobe than nlist + if (listId == -1) { + return; + } - int dim = queries.getSize(1); + // Vector dimension is currently limited to 32 bit + int dim = queries.getSize(1); - // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs? - int laneId = threadIdx.x % kWarpSize; - int warpId = threadIdx.x / kWarpSize; - - using EncodeT = typename Codec::EncodeT; - - auto query = queries[queryId].data(); - auto vecsBase = (EncodeT*)allListData[listId]; - int numVecs = listLengths[listId]; - auto residualBaseSlice = residualBase[queryId][probeId].data(); - - constexpr auto kInit = Metric::kDirection ? kFloatMin : kFloatMax; - - __shared__ float smemK[kNumWarps * NumWarpQ]; - __shared__ int smemV[kNumWarps * NumWarpQ]; - - BlockSelect< - float, - int, - Metric::kDirection, - Comparator, - NumWarpQ, - NumThreadQ, - ThreadsPerBlock> - heap(kInit, -1, smemK, smemV, k); - - // The codec might be dependent upon data that we need to reference or store - // in shared memory - codec.initKernel(smem, dim); - __syncthreads(); - - // How many vector blocks of 32 are in this list? - int numBlocks = utils::divUp(numVecs, 32); - - // Number of EncodeT words per each dimension of block of 32 vecs - constexpr int bytesPerVectorBlockDim = Codec::kEncodeBits * 32 / 8; - constexpr int wordsPerVectorBlockDim = - bytesPerVectorBlockDim / sizeof(EncodeT); - int wordsPerVectorBlock = wordsPerVectorBlockDim * dim; - - int dimBlocks = utils::roundDown(dim, kWarpSize); - - for (int block = warpId; block < numBlocks; block += kNumWarps) { - // We're handling a new vector - Metric dist = metric.zero(); - - // This is the vector a given lane/thread handles - int vec = block * kWarpSize + laneId; - bool valid = vec < numVecs; - - // This is where this warp begins reading data - EncodeT* data = vecsBase + block * wordsPerVectorBlock; - - // whole blocks - for (int dBase = 0; dBase < dimBlocks; dBase += kWarpSize) { - int loadDim = dBase + laneId; - float queryReg = query[loadDim]; - float residualReg = Residual ? residualBaseSlice[loadDim] : 0; + // FIXME: some issue with getLaneId() and CUDA 10.1 and P4 GPUs? + int laneId = threadIdx.x % kWarpSize; + int warpId = threadIdx.x / kWarpSize; + + using EncodeT = typename Codec::EncodeT; + + auto query = queries[queryId].data(); + auto vecsBase = (EncodeT*)allListData[listId]; + int numVecs = listLengths[listId]; + auto residualBaseSlice = residualBase[queryId][probeId].data(); + + constexpr auto kInit = Metric::kDirection ? kFloatMin : kFloatMax; + + __shared__ float smemK[kNumWarps * NumWarpQ]; + __shared__ idx_t smemV[kNumWarps * NumWarpQ]; + + BlockSelect< + float, + idx_t, + Metric::kDirection, + Comparator, + NumWarpQ, + NumThreadQ, + ThreadsPerBlock> + heap(kInit, -1, smemK, smemV, k); + + // The codec might be dependent upon data that we need to reference or + // store in shared memory + codec.initKernel(smem, dim); + __syncthreads(); + + // How many vector blocks of 32 are in this list? 
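// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only; NOT part of the faiss patch above.
// The ivfInterleavedScan2 hunk earlier keeps its BlockSelect value type as
// uint32_t because both the probe id and the within-list rank k are bounded by
// GPU_MAX_SELECTION_K (2048 on recent CUDA toolkits), so each fits in 16 bits
// and the pair packs losslessly into one 32-bit word. The helper names below
// are the editor's own, written as a standalone host-side sketch.
#include <cassert>
#include <cstdint>

constexpr uint32_t kMaxSelectionK = 2048; // mirrors GPU_MAX_SELECTION_K

inline uint32_t packProbeAndRank(uint32_t probe, uint32_t rank) {
    assert(probe < kMaxSelectionK && rank < kMaxSelectionK);
    // Same layout as the kernel: high 16 bits = probe, low 16 bits = rank
    return (probe << 16) | (rank & 0xffffu);
}

inline void unpackProbeAndRank(uint32_t packed, uint32_t& probe, uint32_t& rank) {
    probe = packed >> 16;    // which IVF probe the candidate came from
    rank = packed & 0xffffu; // its rank within that probe's k results
}

// The kernel reserves 0xffffffff (kMaxUInt32) as its "no candidate" sentinel,
// which never collides with a valid packed (probe, rank) pair.
// ---------------------------------------------------------------------------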
+ idx_t numBlocks = utils::divUp(numVecs, (idx_t)32); + + // Number of EncodeT words per each dimension of block of 32 vecs + constexpr int bytesPerVectorBlockDim = Codec::kEncodeBits * 32 / 8; + constexpr int wordsPerVectorBlockDim = + bytesPerVectorBlockDim / sizeof(EncodeT); + int wordsPerVectorBlock = wordsPerVectorBlockDim * dim; + + int dimBlocks = utils::roundDown(dim, kWarpSize); + + for (idx_t block = warpId; block < numBlocks; block += kNumWarps) { + // We're handling a new vector + Metric dist = metric.zero(); + + // This is the vector a given lane/thread handles + idx_t vec = block * kWarpSize + laneId; + bool valid = vec < numVecs; + + // This is where this warp begins reading data + EncodeT* data = vecsBase + block * wordsPerVectorBlock; + + // whole blocks + for (int dBase = 0; dBase < dimBlocks; dBase += kWarpSize) { + int loadDim = dBase + laneId; + float queryReg = query[loadDim]; + float residualReg = Residual ? residualBaseSlice[loadDim] : 0; - constexpr int kUnroll = 4; + constexpr int kUnroll = 4; #pragma unroll - for (int i = 0; i < kWarpSize / kUnroll; - ++i, data += kUnroll * wordsPerVectorBlockDim) { - EncodeT encV[kUnroll]; + for (int i = 0; i < kWarpSize / kUnroll; + ++i, data += kUnroll * wordsPerVectorBlockDim) { + EncodeT encV[kUnroll]; #pragma unroll - for (int j = 0; j < kUnroll; ++j) { - encV[j] = WarpPackedBits::read( - laneId, data + j * wordsPerVectorBlockDim); - } + for (int j = 0; j < kUnroll; ++j) { + encV[j] = WarpPackedBits:: + read(laneId, data + j * wordsPerVectorBlockDim); + } #pragma unroll - for (int j = 0; j < kUnroll; ++j) { - encV[j] = WarpPackedBits:: - postRead(laneId, encV[j]); - } + for (int j = 0; j < kUnroll; ++j) { + encV[j] = WarpPackedBits:: + postRead(laneId, encV[j]); + } - float decV[kUnroll]; + float decV[kUnroll]; #pragma unroll - for (int j = 0; j < kUnroll; ++j) { - int d = i * kUnroll + j; - decV[j] = codec.decodeNew(dBase + d, encV[j]); - } + for (int j = 0; j < kUnroll; ++j) { + int d = i * kUnroll + j; + decV[j] = codec.decodeNew(dBase + d, encV[j]); + } + + if (Residual) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + int d = i * kUnroll + j; + decV[j] += SHFL_SYNC(residualReg, d, kWarpSize); + } + } - if (Residual) { #pragma unroll for (int j = 0; j < kUnroll; ++j) { int d = i * kUnroll + j; - decV[j] += SHFL_SYNC(residualReg, d, kWarpSize); + float q = SHFL_SYNC(queryReg, d, kWarpSize); + dist.handle(q, decV[j]); } } + } -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - int d = i * kUnroll + j; - float q = SHFL_SYNC(queryReg, d, kWarpSize); - dist.handle(q, decV[j]); + // remainder + int loadDim = dimBlocks + laneId; + bool loadDimInBounds = loadDim < dim; + + float queryReg = loadDimInBounds ? query[loadDim] : 0; + float residualReg = Residual && loadDimInBounds + ? residualBaseSlice[loadDim] + : 0; + + for (int d = 0; d < dim - dimBlocks; + ++d, data += wordsPerVectorBlockDim) { + float q = SHFL_SYNC(queryReg, d, kWarpSize); + + EncodeT enc = WarpPackedBits::read( + laneId, data); + enc = WarpPackedBits::postRead( + laneId, enc); + float dec = codec.decodeNew(dimBlocks + d, enc); + if (Residual) { + dec += SHFL_SYNC(residualReg, d, kWarpSize); } - } - } - // remainder - int loadDim = dimBlocks + laneId; - bool loadDimInBounds = loadDim < dim; - - float queryReg = loadDimInBounds ? query[loadDim] : 0; - float residualReg = - Residual && loadDimInBounds ? 
residualBaseSlice[loadDim] : 0; - - for (int d = 0; d < dim - dimBlocks; - ++d, data += wordsPerVectorBlockDim) { - float q = SHFL_SYNC(queryReg, d, kWarpSize); - - EncodeT enc = WarpPackedBits::read( - laneId, data); - enc = WarpPackedBits::postRead( - laneId, enc); - float dec = codec.decodeNew(dimBlocks + d, enc); - if (Residual) { - dec += SHFL_SYNC(residualReg, d, kWarpSize); + dist.handle(q, dec); } - dist.handle(q, dec); - } + if (valid) { + heap.addThreadQ(dist.reduce(), vec); + } - if (valid) { - heap.addThreadQ(dist.reduce(), vec); + heap.checkThreadQ(); } - heap.checkThreadQ(); - } - - heap.reduce(); + heap.reduce(); - auto distanceOutBase = distanceOut[queryId][probeId].data(); - auto indicesOutBase = indicesOut[queryId][probeId].data(); + auto distanceOutBase = distanceOut[queryId][probeId].data(); + auto indicesOutBase = indicesOut[queryId][probeId].data(); - for (int i = threadIdx.x; i < k; i += blockDim.x) { - distanceOutBase[i] = smemK[i]; - indicesOutBase[i] = smemV[i]; + for (int i = threadIdx.x; i < k; i += blockDim.x) { + distanceOutBase[i] = smemK[i]; + indicesOutBase[i] = smemV[i]; + } } } @@ -215,7 +219,7 @@ #define IVFINT_RUN(CODEC_TYPE, METRIC_TYPE, THREADS, NUM_WARP_Q, NUM_THREAD_Q) \ do { \ - dim3 grid(nprobe, nq); \ + dim3 grid(nprobe, std::min(nq, (idx_t)getMaxGridCurrentDevice().y)); \ if (useResidual) { \ ivfInterleavedScan< \ CODEC_TYPE, \ @@ -389,7 +393,7 @@ res, \ makeTempAlloc(AllocType::Other, stream), \ {queries.getSize(0), listIds.getSize(1), k}); \ - DeviceTensor indicesTemp( \ + DeviceTensor indicesTemp( \ res, \ makeTempAlloc(AllocType::Other, stream), \ {queries.getSize(0), listIds.getSize(1), k}); \ @@ -409,11 +413,11 @@ // with all implementations void runIVFInterleavedScan( Tensor& queries, - Tensor& listIds, + Tensor& listIds, DeviceVector& listData, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, + DeviceVector& listLengths, int k, faiss::MetricType metric, bool useResidual, @@ -422,21 +426,21 @@ // output Tensor& outDistances, // output - Tensor& outIndices, + Tensor& outIndices, GpuResources* res); // Second pass of IVF list scanning to perform final k-selection and look up the // user indices void runIVFInterleavedScan2( Tensor& distanceIn, - Tensor& indicesIn, - Tensor& listIds, + Tensor& indicesIn, + Tensor& listIds, int k, DeviceVector& listIndices, IndicesOptions indicesOptions, bool dir, Tensor& distanceOut, - Tensor& indicesOut, + Tensor& indicesOut, cudaStream_t stream); } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFPQ.cu faiss-1.7.4/faiss/gpu/impl/IVFPQ.cu --- faiss-1.7.3/faiss/gpu/impl/IVFPQ.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFPQ.cu 2023-04-19 13:18:30.000000000 +0000 @@ -37,7 +37,7 @@ IVFPQ::IVFPQ( GpuResources* resources, int dim, - int nlist, + idx_t nlist, faiss::MetricType metric, float metricArg, int numSubQuantizers, @@ -129,13 +129,13 @@ void IVFPQ::appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, cudaStream_t stream) { // // Determine the encodings of the vectors @@ -182,7 +182,7 @@ resources_, makeTempAlloc(AllocType::Other, stream), {numSubQuantizers_, 
ivfCentroidResiduals.getSize(0), 1}); - DeviceTensor closestSubQIndex( + DeviceTensor closestSubQIndex( resources_, makeTempAlloc(AllocType::Other, stream), {numSubQuantizers_, ivfCentroidResiduals.getSize(0), 1}); @@ -210,9 +210,9 @@ true); } - // The L2 distance function only returns int32 indices. As we are + // The L2 distance function only returns idx_t indices. As we are // restricted to <= 8 bits per code, convert to uint8 - auto closestSubQIndex8 = convertTensorTemporary( + auto closestSubQIndex8 = convertTensorTemporary( resources_, stream, closestSubQIndex); // Now, we have the nearest sub-q centroid for each slice of the @@ -256,19 +256,19 @@ } } -size_t IVFPQ::getGpuVectorsEncodingSize_(int numVecs) const { +size_t IVFPQ::getGpuVectorsEncodingSize_(idx_t numVecs) const { if (interleavedLayout_) { // bits per PQ code - int bits = bitsPerSubQuantizer_; + idx_t bits = bitsPerSubQuantizer_; // bytes to encode a block of 32 vectors (single PQ code) - int bytesPerDimBlock = bits * 32 / 8; + idx_t bytesPerDimBlock = bits * 32 / 8; // bytes to fully encode 32 vectors - int bytesPerBlock = bytesPerDimBlock * numSubQuantizers_; + idx_t bytesPerBlock = bytesPerDimBlock * numSubQuantizers_; // number of blocks of 32 vectors we have - int numBlocks = utils::divUp(numVecs, 32); + idx_t numBlocks = utils::divUp(numVecs, idx_t(32)); // total size to encode numVecs return bytesPerBlock * numBlocks; @@ -277,17 +277,17 @@ } } -size_t IVFPQ::getCpuVectorsEncodingSize_(int numVecs) const { +size_t IVFPQ::getCpuVectorsEncodingSize_(idx_t numVecs) const { size_t sizePerVector = utils::divUp(numSubQuantizers_ * bitsPerSubQuantizer_, 8); - return (size_t)numVecs * sizePerVector; + return numVecs * sizePerVector; } // Convert the CPU layout to the GPU layout std::vector IVFPQ::translateCodesToGpu_( std::vector codes, - size_t numVecs) const { + idx_t numVecs) const { if (!interleavedLayout_) { return codes; } @@ -301,7 +301,7 @@ // Conver the GPU layout to the CPU layout std::vector IVFPQ::translateCodesFromGpu_( std::vector codes, - size_t numVecs) const { + idx_t numVecs) const { if (!interleavedLayout_) { return codes; } @@ -492,13 +492,13 @@ int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { // These are caught at a higher level FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K); FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); auto stream = resources_->getDefaultStreamCurrentDevice(); - nprobe = std::min(nprobe, (int)getNumLists()); + nprobe = int(std::min(idx_t(nprobe), getNumLists())); FAISS_ASSERT(queries.getSize(1) == dim_); FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0)); @@ -509,7 +509,7 @@ resources_, makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe}); - DeviceTensor coarseIndices( + DeviceTensor coarseIndices( resources_, makeTempAlloc(AllocType::Other, stream), {queries.getSize(0), nprobe}); @@ -537,10 +537,10 @@ Index* coarseQuantizer, Tensor& vecs, Tensor& ivfDistances, - Tensor& ivfAssignments, + Tensor& ivfAssignments, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) { FAISS_ASSERT(ivfDistances.getSize(0) == vecs.getSize(0)); FAISS_ASSERT(ivfAssignments.getSize(0) == vecs.getSize(0)); @@ -565,10 +565,10 @@ void IVFPQ::searchImpl_( Tensor& queries, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) { FAISS_ASSERT(storePairs == false); @@ -599,7 +599,7 @@ // FIXME: we might 
ultimately be calling this function with inputs // from the CPU, these are unnecessary copies if (indicesOptions_ == INDICES_CPU) { - HostTensor hostOutIndices(outIndices, stream); + HostTensor hostOutIndices(outIndices, stream); ivfOffsetToUserIndex( hostOutIndices.data(), @@ -617,10 +617,10 @@ void IVFPQ::runPQPrecomputedCodes_( Tensor& queries, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { FAISS_ASSERT(metric_ == MetricType::METRIC_L2); auto stream = resources_->getDefaultStreamCurrentDevice(); @@ -705,10 +705,10 @@ void IVFPQ::runPQNoPrecomputedCodes_( Tensor& queries, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, int k, Tensor& outDistances, - Tensor& outIndices) { + Tensor& outIndices) { runPQScanMultiPassNoPrecomputed( queries, ivfCentroids_, diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFPQ.cuh faiss-1.7.4/faiss/gpu/impl/IVFPQ.cuh --- faiss-1.7.3/faiss/gpu/impl/IVFPQ.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFPQ.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -20,7 +20,7 @@ public: IVFPQ(GpuResources* resources, int dim, - int nlist, + idx_t nlist, faiss::MetricType metric, float metricArg, int numSubQuantizers, @@ -53,7 +53,7 @@ int nprobe, int k, Tensor& outDistances, - Tensor& outIndices) override; + Tensor& outIndices) override; /// Performs search when we are already given the IVF cells to look at /// (GpuIndexIVF::search_preassigned implementation) @@ -61,38 +61,38 @@ Index* coarseQuantizer, Tensor& vecs, Tensor& ivfDistances, - Tensor& ivfAssignments, + Tensor& ivfAssignments, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs) override; protected: /// Returns the encoding size for a PQ-encoded IVF list - size_t getGpuVectorsEncodingSize_(int numVecs) const override; - size_t getCpuVectorsEncodingSize_(int numVecs) const override; + size_t getGpuVectorsEncodingSize_(idx_t numVecs) const override; + size_t getCpuVectorsEncodingSize_(idx_t numVecs) const override; /// Translate to our preferred GPU encoding std::vector translateCodesToGpu_( std::vector codes, - size_t numVecs) const override; + idx_t numVecs) const override; /// Translate from our preferred GPU encoding std::vector translateCodesFromGpu_( std::vector codes, - size_t numVecs) const override; + idx_t numVecs) const override; /// Encode the vectors that we're adding and append to our IVF lists void appendVectors_( Tensor& vecs, Tensor& ivfCentroidResiduals, - Tensor& indices, - Tensor& uniqueLists, - Tensor& vectorsByUniqueList, - Tensor& uniqueListVectorStart, - Tensor& uniqueListStartOffset, - Tensor& listIds, - Tensor& listOffset, + Tensor& indices, + Tensor& uniqueLists, + Tensor& vectorsByUniqueList, + Tensor& uniqueListVectorStart, + Tensor& uniqueListStartOffset, + Tensor& listIds, + Tensor& listOffset, cudaStream_t stream) override; /// Shared IVF search implementation, used by both search and @@ -100,10 +100,10 @@ void searchImpl_( Tensor& queries, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, int k, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, bool storePairs); /// Sets the current product quantizer centroids; the data can be @@ -120,19 +120,19 @@ void runPQPrecomputedCodes_( Tensor& queries, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, int k, Tensor& outDistances, - Tensor& outIndices); + Tensor& outIndices); /// Runs kernels for 
scanning inverted lists without precomputed codes void runPQNoPrecomputedCodes_( Tensor& queries, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, int k, Tensor& outDistances, - Tensor& outIndices); + Tensor& outIndices); private: /// Number of sub-quantizers per vector diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFUtils.cu faiss-1.7.4/faiss/gpu/impl/IVFUtils.cu --- faiss-1.7.3/faiss/gpu/impl/IVFUtils.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFUtils.cu 2023-04-19 13:18:30.000000000 +0000 @@ -21,20 +21,20 @@ // Calculates the total number of intermediate distances to consider // for all queries __global__ void getResultLengths( - Tensor ivfListIds, - int* listLengths, - int totalSize, - Tensor length) { - int linearThreadId = blockIdx.x * blockDim.x + threadIdx.x; + Tensor ivfListIds, + idx_t* listLengths, + idx_t totalSize, + Tensor length) { + idx_t linearThreadId = idx_t(blockIdx.x) * blockDim.x + threadIdx.x; if (linearThreadId >= totalSize) { return; } - int nprobe = ivfListIds.getSize(1); - int queryId = linearThreadId / nprobe; - int listId = linearThreadId % nprobe; + auto nprobe = ivfListIds.getSize(1); + auto queryId = linearThreadId / nprobe; + auto listId = linearThreadId % nprobe; - Index::idx_t centroidId = ivfListIds[queryId][listId]; + idx_t centroidId = ivfListIds[queryId][listId]; // Safety guard in case NaNs in input cause no list ID to be generated length[queryId][listId] = (centroidId != -1) ? listLengths[centroidId] : 0; @@ -42,18 +42,18 @@ void runCalcListOffsets( GpuResources* res, - Tensor& ivfListIds, - DeviceVector& listLengths, - Tensor& prefixSumOffsets, + Tensor& ivfListIds, + DeviceVector& listLengths, + Tensor& prefixSumOffsets, Tensor& thrustMem, cudaStream_t stream) { FAISS_ASSERT(ivfListIds.getSize(0) == prefixSumOffsets.getSize(0)); FAISS_ASSERT(ivfListIds.getSize(1) == prefixSumOffsets.getSize(1)); - int totalSize = ivfListIds.numElements(); + idx_t totalSize = ivfListIds.numElements(); - int numThreads = std::min(totalSize, getMaxThreadsCurrentDevice()); - int numBlocks = utils::divUp(totalSize, numThreads); + idx_t numThreads = std::min(totalSize, (idx_t)getMaxThreadsCurrentDevice()); + idx_t numBlocks = utils::divUp(totalSize, numThreads); auto grid = dim3(numBlocks); auto block = dim3(numThreads); diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFUtils.cuh faiss-1.7.4/faiss/gpu/impl/IVFUtils.cuh --- faiss-1.7.3/faiss/gpu/impl/IVFUtils.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFUtils.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -23,36 +23,38 @@ /// intermediate results for all (query, probe) pair void runCalcListOffsets( GpuResources* res, - Tensor& ivfListIds, - DeviceVector& listLengths, - Tensor& prefixSumOffsets, + Tensor& ivfListIds, + DeviceVector& listLengths, + Tensor& prefixSumOffsets, Tensor& thrustMem, cudaStream_t stream); /// Performs a first pass of k-selection on the results void runPass1SelectLists( - Tensor& prefixSumOffsets, + Tensor& prefixSumOffsets, Tensor& distance, int nprobe, int k, + bool use64BitSelection, bool chooseLargest, Tensor& heapDistances, - Tensor& heapIndices, + Tensor& heapIndices, cudaStream_t stream); /// Performs a final pass of k-selection on the results, producing the /// final indices void runPass2SelectLists( Tensor& heapDistances, - Tensor& heapIndices, + Tensor& heapIndices, DeviceVector& listIndices, IndicesOptions indicesOptions, - Tensor& prefixSumOffsets, - Tensor& ivfListIds, + Tensor& prefixSumOffsets, + Tensor& ivfListIds, int k, 
+ bool use64BitSelection, bool chooseLargest, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, cudaStream_t stream); } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFUtilsSelect1.cu faiss-1.7.4/faiss/gpu/impl/IVFUtilsSelect1.cu --- faiss-1.7.3/faiss/gpu/impl/IVFUtilsSelect1.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFUtilsSelect1.cu 2023-04-19 13:18:30.000000000 +0000 @@ -21,154 +21,176 @@ namespace faiss { namespace gpu { -template +template < + typename IndexT, + int ThreadsPerBlock, + int NumWarpQ, + int NumThreadQ, + bool Dir> __global__ void pass1SelectLists( - Tensor prefixSumOffsets, + Tensor prefixSumOffsets, Tensor distance, int nprobe, int k, Tensor heapDistances, - Tensor heapIndices) { + Tensor heapIndices) { constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; __shared__ float smemK[kNumWarps * NumWarpQ]; - __shared__ int smemV[kNumWarps * NumWarpQ]; + __shared__ IndexT smemV[kNumWarps * NumWarpQ]; - constexpr auto kInit = Dir ? kFloatMin : kFloatMax; - BlockSelect< - float, - int, - Dir, - Comparator, - NumWarpQ, - NumThreadQ, - ThreadsPerBlock> - heap(kInit, -1, smemK, smemV, k); - - auto queryId = blockIdx.y; - auto sliceId = blockIdx.x; - auto numSlices = gridDim.x; - - int sliceSize = (nprobe / numSlices); - int sliceStart = sliceSize * sliceId; - int sliceEnd = sliceId == (numSlices - 1) ? nprobe : sliceStart + sliceSize; - auto offsets = prefixSumOffsets[queryId].data(); - - // We ensure that before the array (at offset -1), there is a 0 value - int start = *(&offsets[sliceStart] - 1); - int end = offsets[sliceEnd - 1]; - - int num = end - start; - int limit = utils::roundDown(num, kWarpSize); - - int i = threadIdx.x; - auto distanceStart = distance[start].data(); - - // BlockSelect add cannot be used in a warp divergent circumstance; we - // handle the remainder warp below - for (; i < limit; i += blockDim.x) { - heap.add(distanceStart[i], start + i); - } - - // Handle warp divergence separately - if (i < num) { - heap.addThreadQ(distanceStart[i], start + i); - } - - // Merge all final results - heap.reduce(); - - // Write out the final k-selected values; they should be all - // together - for (int i = threadIdx.x; i < k; i += blockDim.x) { - heapDistances[queryId][sliceId][i] = smemK[i]; - heapIndices[queryId][sliceId][i] = smemV[i]; + for (IndexT queryId = blockIdx.y; queryId < prefixSumOffsets.getSize(0); + queryId += gridDim.y) { + constexpr auto kInit = Dir ? kFloatMin : kFloatMax; + BlockSelect< + float, + IndexT, + Dir, + Comparator, + NumWarpQ, + NumThreadQ, + ThreadsPerBlock> + heap(kInit, -1, smemK, smemV, k); + + auto sliceId = blockIdx.x; + auto numSlices = gridDim.x; + + IndexT sliceSize = (nprobe / numSlices); + IndexT sliceStart = sliceSize * sliceId; + IndexT sliceEnd = + sliceId == (numSlices - 1) ? 
nprobe : sliceStart + sliceSize; + auto offsets = prefixSumOffsets[queryId].data(); + + // We ensure that before the array (at offset -1), there is a 0 value + auto start = *(&offsets[sliceStart] - 1); + auto end = offsets[sliceEnd - 1]; + + auto num = end - start; + auto limit = utils::roundDown(num, (IndexT)kWarpSize); + + IndexT i = threadIdx.x; + auto distanceStart = distance[start].data(); + + // BlockSelect add cannot be used in a warp divergent circumstance; we + // handle the remainder warp below + for (; i < limit; i += blockDim.x) { + heap.add(distanceStart[i], IndexT(start + i)); + } + + // Handle the remainder if any separately (warp is divergent) + if (i < num) { + heap.addThreadQ(distanceStart[i], IndexT(start + i)); + } + + // Merge all final results + heap.reduce(); + + // Write out the final k-selected values; they should be all + // together + for (int i = threadIdx.x; i < k; i += blockDim.x) { + heapDistances[queryId][sliceId][i] = smemK[i]; + heapIndices[queryId][sliceId][i] = idx_t(smemV[i]); + } } } void runPass1SelectLists( - Tensor& prefixSumOffsets, + Tensor& prefixSumOffsets, Tensor& distance, int nprobe, int k, + bool use64BitSelection, bool chooseLargest, Tensor& heapDistances, - Tensor& heapIndices, + Tensor& heapIndices, cudaStream_t stream) { - // This is caught at a higher level + // This is also caught at a higher level FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); - auto grid = dim3(heapDistances.getSize(1), prefixSumOffsets.getSize(0)); - -#define RUN_PASS(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR) \ - do { \ - pass1SelectLists \ - <<>>( \ - prefixSumOffsets, \ - distance, \ - nprobe, \ - k, \ - heapDistances, \ - heapIndices); \ - CUDA_TEST_ERROR(); \ - return; /* success */ \ + auto grid = + dim3(heapDistances.getSize(1), + std::min( + prefixSumOffsets.getSize(0), + (idx_t)getMaxGridCurrentDevice().y)); + +#define RUN_PASS(INDEX_T, BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR) \ + do { \ + pass1SelectLists \ + <<>>( \ + prefixSumOffsets, \ + distance, \ + nprobe, \ + k, \ + heapDistances, \ + heapIndices); \ + return; /* success */ \ } while (0) #if GPU_MAX_SELECTION_K >= 2048 // block size 128 for k <= 1024, 64 for k = 2048 -#define RUN_PASS_DIR(DIR) \ - do { \ - if (k == 1) { \ - RUN_PASS(128, 1, 1, DIR); \ - } else if (k <= 32) { \ - RUN_PASS(128, 32, 2, DIR); \ - } else if (k <= 64) { \ - RUN_PASS(128, 64, 3, DIR); \ - } else if (k <= 128) { \ - RUN_PASS(128, 128, 3, DIR); \ - } else if (k <= 256) { \ - RUN_PASS(128, 256, 4, DIR); \ - } else if (k <= 512) { \ - RUN_PASS(128, 512, 8, DIR); \ - } else if (k <= 1024) { \ - RUN_PASS(128, 1024, 8, DIR); \ - } else if (k <= 2048) { \ - RUN_PASS(64, 2048, 8, DIR); \ - } \ +#define RUN_PASS_DIR(INDEX_T, DIR) \ + do { \ + if (k == 1) { \ + RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ + } else if (k <= 64) { \ + RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ + } else if (k <= 128) { \ + RUN_PASS(INDEX_T, 128, 128, 3, DIR); \ + } else if (k <= 256) { \ + RUN_PASS(INDEX_T, 128, 256, 4, DIR); \ + } else if (k <= 512) { \ + RUN_PASS(INDEX_T, 128, 512, 8, DIR); \ + } else if (k <= 1024) { \ + RUN_PASS(INDEX_T, 128, 1024, 8, DIR); \ + } else if (k <= 2048) { \ + RUN_PASS(INDEX_T, 64, 2048, 8, DIR); \ + } \ } while (0) #else -#define RUN_PASS_DIR(DIR) \ - do { \ - if (k == 1) { \ - RUN_PASS(128, 1, 1, DIR); \ - } else if (k <= 32) { \ - RUN_PASS(128, 32, 2, DIR); \ - } else if (k <= 64) { \ - RUN_PASS(128, 64, 3, DIR); \ - } else if (k <= 128) { \ - RUN_PASS(128, 128, 3, DIR); \ - } 
else if (k <= 256) { \ - RUN_PASS(128, 256, 4, DIR); \ - } else if (k <= 512) { \ - RUN_PASS(128, 512, 8, DIR); \ - } else if (k <= 1024) { \ - RUN_PASS(128, 1024, 8, DIR); \ - } \ +#define RUN_PASS_DIR(INDEX_T, DIR) \ + do { \ + if (k == 1) { \ + RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ + } else if (k <= 64) { \ + RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ + } else if (k <= 128) { \ + RUN_PASS(INDEX_T, 128, 128, 3, DIR); \ + } else if (k <= 256) { \ + RUN_PASS(INDEX_T, 128, 256, 4, DIR); \ + } else if (k <= 512) { \ + RUN_PASS(INDEX_T, 128, 512, 8, DIR); \ + } else if (k <= 1024) { \ + RUN_PASS(INDEX_T, 128, 1024, 8, DIR); \ + } \ } while (0) #endif // GPU_MAX_SELECTION_K - if (chooseLargest) { - RUN_PASS_DIR(true); + if (use64BitSelection) { + if (chooseLargest) { + RUN_PASS_DIR(idx_t, true); + } else { + RUN_PASS_DIR(idx_t, false); + } } else { - RUN_PASS_DIR(false); + if (chooseLargest) { + RUN_PASS_DIR(int32_t, true); + } else { + RUN_PASS_DIR(int32_t, false); + } } #undef RUN_PASS_DIR #undef RUN_PASS + + CUDA_TEST_ERROR(); } } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/impl/IVFUtilsSelect2.cu faiss-1.7.4/faiss/gpu/impl/IVFUtilsSelect2.cu --- faiss-1.7.3/faiss/gpu/impl/IVFUtilsSelect2.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/IVFUtilsSelect2.cu 2023-04-19 13:18:30.000000000 +0000 @@ -23,17 +23,14 @@ // This is warp divergence central, but this is really a final step // and happening a small number of times -inline __device__ int binarySearchForBucket( - int* prefixSumOffsets, - int size, - int val) { - int start = 0; - int end = size; +template +__device__ int binarySearchForBucket(T* prefixSumOffsets, T size, T val) { + T start = 0; + T end = size; while (end - start > 0) { - int mid = start + (end - start) / 2; - - int midVal = prefixSumOffsets[mid]; + T mid = start + (end - start) / 2; + T midVal = prefixSumOffsets[mid]; // Find the first bucket that we are <= if (midVal <= val) { @@ -49,26 +46,31 @@ return start; } -template +template < + typename IndexT, + int ThreadsPerBlock, + int NumWarpQ, + int NumThreadQ, + bool Dir> __global__ void pass2SelectLists( Tensor heapDistances, - Tensor heapIndices, + Tensor heapIndices, void** listIndices, - Tensor prefixSumOffsets, - Tensor ivfListIds, + Tensor prefixSumOffsets, + Tensor ivfListIds, int k, IndicesOptions opt, Tensor outDistances, - Tensor outIndices) { + Tensor outIndices) { constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; __shared__ float smemK[kNumWarps * NumWarpQ]; - __shared__ int smemV[kNumWarps * NumWarpQ]; + __shared__ IndexT smemV[kNumWarps * NumWarpQ]; constexpr auto kInit = Dir ? 
kFloatMin : kFloatMax; BlockSelect< float, - int, + IndexT, Dir, Comparator, NumWarpQ, @@ -77,21 +79,21 @@ heap(kInit, -1, smemK, smemV, k); auto queryId = blockIdx.x; - int num = heapDistances.getSize(1); - int limit = utils::roundDown(num, kWarpSize); + idx_t num = heapDistances.getSize(1); + idx_t limit = utils::roundDown(num, kWarpSize); - int i = threadIdx.x; + idx_t i = threadIdx.x; auto heapDistanceStart = heapDistances[queryId]; // BlockSelect add cannot be used in a warp divergent circumstance; we // handle the remainder warp below for (; i < limit; i += blockDim.x) { - heap.add(heapDistanceStart[i], i); + heap.add(heapDistanceStart[i], IndexT(i)); } // Handle warp divergence separately if (i < num) { - heap.addThreadQ(heapDistanceStart[i], i); + heap.addThreadQ(heapDistanceStart[i], IndexT(i)); } // Merge all final results @@ -109,39 +111,39 @@ // This code is highly divergent, but it's probably ok, since this // is the very last step and it is happening a small number of // times (#queries x k). - int v = smemV[i]; - Index::idx_t index = -1; + idx_t v = smemV[i]; + idx_t index = -1; if (v != -1) { // `offset` is the offset of the intermediate result, as // calculated by the original scan. - int offset = heapIndices[queryId][v]; + idx_t offset = heapIndices[queryId][v]; // In order to determine the actual user index, we need to first // determine what list it was in. // We do this by binary search in the prefix sum list. - int probe = binarySearchForBucket( + idx_t probe = binarySearchForBucket( prefixSumOffsets[queryId].data(), prefixSumOffsets.getSize(1), offset); // This is then the probe for the query; we can find the actual // list ID from this - Index::idx_t listId = ivfListIds[queryId][probe]; + idx_t listId = ivfListIds[queryId][probe]; // Now, we need to know the offset within the list // We ensure that before the array (at offset -1), there is a 0 // value - int listStart = *(prefixSumOffsets[queryId][probe].data() - 1); - int listOffset = offset - listStart; + idx_t listStart = *(prefixSumOffsets[queryId][probe].data() - 1); + idx_t listOffset = offset - listStart; // This gives us our final index if (opt == INDICES_32_BIT) { - index = (Index::idx_t)((int*)listIndices[listId])[listOffset]; + index = (idx_t)((int*)listIndices[listId])[listOffset]; } else if (opt == INDICES_64_BIT) { - index = ((Index::idx_t*)listIndices[listId])[listOffset]; + index = ((idx_t*)listIndices[listId])[listOffset]; } else { - index = (listId << 32 | (Index::idx_t)listOffset); + index = (listId << 32 | (idx_t)listOffset); } } @@ -151,93 +153,102 @@ void runPass2SelectLists( Tensor& heapDistances, - Tensor& heapIndices, + Tensor& heapIndices, DeviceVector& listIndices, IndicesOptions indicesOptions, - Tensor& prefixSumOffsets, - Tensor& ivfListIds, + Tensor& prefixSumOffsets, + Tensor& ivfListIds, int k, + bool use64BitSelection, bool chooseLargest, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, cudaStream_t stream) { + // This is also caught at a higher level + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); + auto grid = dim3(ivfListIds.getSize(0)); -#define RUN_PASS(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR) \ - do { \ - pass2SelectLists \ - <<>>( \ - heapDistances, \ - heapIndices, \ - listIndices.data(), \ - prefixSumOffsets, \ - ivfListIds, \ - k, \ - indicesOptions, \ - outDistances, \ - outIndices); \ - CUDA_TEST_ERROR(); \ - return; /* success */ \ +#define RUN_PASS(INDEX_T, BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR) \ + do { \ + pass2SelectLists \ + <<>>( \ + heapDistances, \ + 
heapIndices, \ + listIndices.data(), \ + prefixSumOffsets, \ + ivfListIds, \ + k, \ + indicesOptions, \ + outDistances, \ + outIndices); \ } while (0) #if GPU_MAX_SELECTION_K >= 2048 // block size 128 for k <= 1024, 64 for k = 2048 -#define RUN_PASS_DIR(DIR) \ - do { \ - if (k == 1) { \ - RUN_PASS(128, 1, 1, DIR); \ - } else if (k <= 32) { \ - RUN_PASS(128, 32, 2, DIR); \ - } else if (k <= 64) { \ - RUN_PASS(128, 64, 3, DIR); \ - } else if (k <= 128) { \ - RUN_PASS(128, 128, 3, DIR); \ - } else if (k <= 256) { \ - RUN_PASS(128, 256, 4, DIR); \ - } else if (k <= 512) { \ - RUN_PASS(128, 512, 8, DIR); \ - } else if (k <= 1024) { \ - RUN_PASS(128, 1024, 8, DIR); \ - } else if (k <= 2048) { \ - RUN_PASS(64, 2048, 8, DIR); \ - } \ +#define RUN_PASS_DIR(INDEX_T, DIR) \ + do { \ + if (k == 1) { \ + RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ + } else if (k <= 64) { \ + RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ + } else if (k <= 128) { \ + RUN_PASS(INDEX_T, 128, 128, 3, DIR); \ + } else if (k <= 256) { \ + RUN_PASS(INDEX_T, 128, 256, 4, DIR); \ + } else if (k <= 512) { \ + RUN_PASS(INDEX_T, 128, 512, 8, DIR); \ + } else if (k <= 1024) { \ + RUN_PASS(INDEX_T, 128, 1024, 8, DIR); \ + } else if (k <= 2048) { \ + RUN_PASS(INDEX_T, 64, 2048, 8, DIR); \ + } \ } while (0) #else -#define RUN_PASS_DIR(DIR) \ - do { \ - if (k == 1) { \ - RUN_PASS(128, 1, 1, DIR); \ - } else if (k <= 32) { \ - RUN_PASS(128, 32, 2, DIR); \ - } else if (k <= 64) { \ - RUN_PASS(128, 64, 3, DIR); \ - } else if (k <= 128) { \ - RUN_PASS(128, 128, 3, DIR); \ - } else if (k <= 256) { \ - RUN_PASS(128, 256, 4, DIR); \ - } else if (k <= 512) { \ - RUN_PASS(128, 512, 8, DIR); \ - } else if (k <= 1024) { \ - RUN_PASS(128, 1024, 8, DIR); \ - } \ +#define RUN_PASS_DIR(INDEX_T, DIR) \ + do { \ + if (k == 1) { \ + RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ + } else if (k <= 64) { \ + RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ + } else if (k <= 128) { \ + RUN_PASS(INDEX_T, 128, 128, 3, DIR); \ + } else if (k <= 256) { \ + RUN_PASS(INDEX_T, 128, 256, 4, DIR); \ + } else if (k <= 512) { \ + RUN_PASS(INDEX_T, 128, 512, 8, DIR); \ + } else if (k <= 1024) { \ + RUN_PASS(INDEX_T, 128, 1024, 8, DIR); \ + } \ } while (0) #endif // GPU_MAX_SELECTION_K - if (chooseLargest) { - RUN_PASS_DIR(true); + if (use64BitSelection) { + if (chooseLargest) { + RUN_PASS_DIR(idx_t, true); + } else { + RUN_PASS_DIR(idx_t, false); + } } else { - RUN_PASS_DIR(false); + if (chooseLargest) { + RUN_PASS_DIR(int32_t, true); + } else { + RUN_PASS_DIR(int32_t, false); + } } - // unimplemented / too many resources - FAISS_ASSERT_FMT(false, "unimplemented k value (%d)", k); - #undef RUN_PASS_DIR #undef RUN_PASS + + CUDA_TEST_ERROR(); } } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/impl/L2Norm.cu faiss-1.7.4/faiss/gpu/impl/L2Norm.cu --- faiss-1.7.3/faiss/gpu/impl/L2Norm.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/L2Norm.cu 2023-04-19 13:18:30.000000000 +0000 @@ -34,32 +34,32 @@ template < typename T, typename TVec, - typename IndexType, int RowTileSize, bool NormLoop, bool NormSquared> __global__ void l2NormRowMajor( - Tensor input, - Tensor output) { + Tensor input, + Tensor output) { extern __shared__ char smemByte[]; // #warps * RowTileSize elements float* smem = (float*)smemByte; - IndexType numWarps = utils::divUp(blockDim.x, kWarpSize); - IndexType laneId = getLaneId(); - IndexType warpId = threadIdx.x / kWarpSize; + // these are 
fine to be int (just based on block dimensions) + int numWarps = utils::divUp(blockDim.x, kWarpSize); + int laneId = getLaneId(); + int warpId = threadIdx.x / kWarpSize; bool lastRowTile = (blockIdx.x == (gridDim.x - 1)); - IndexType rowStart = RowTileSize * blockIdx.x; + idx_t rowStart = idx_t(blockIdx.x) * RowTileSize; // accumulate in f32 float rowNorm[RowTileSize]; if (lastRowTile) { // We are handling the very end of the input matrix rows - for (IndexType row = 0; row < input.getSize(0) - rowStart; ++row) { + for (idx_t row = 0; row < input.getSize(0) - rowStart; ++row) { if (NormLoop) { rowNorm[0] = 0; - for (IndexType col = threadIdx.x; col < input.getSize(1); + for (idx_t col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { TVec val = input[rowStart + row][col]; val = Math::mul(val, val); @@ -90,7 +90,7 @@ rowNorm[row] = 0; } - for (IndexType col = threadIdx.x; col < input.getSize(1); + for (idx_t col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { #pragma unroll for (int row = 0; row < RowTileSize; ++row) { @@ -183,18 +183,18 @@ // Output: (batch norm) // Handles the case where `input` is column major. A single thread calculates // the norm of each vector instead of a block-wide reduction. -template +template __global__ void l2NormColMajor( - Tensor input, - Tensor output) { + Tensor input, + Tensor output) { // grid-stride loop to handle all batch elements - for (IndexType batch = blockIdx.x * blockDim.x + threadIdx.x; + for (idx_t batch = idx_t(blockIdx.x) * blockDim.x + threadIdx.x; batch < input.getSize(1); batch += gridDim.x * blockDim.x) { float sum = 0; // This is still a coalesced load from the memory - for (IndexType dim = 0; dim < input.getSize(0); ++dim) { + for (idx_t dim = 0; dim < input.getSize(0); ++dim) { // Just do the math in float32, even if the input is float16 float v = ConvertTo::to(input[dim][batch]); sum += v * v; @@ -208,55 +208,35 @@ } } -template +template void runL2Norm( - Tensor& input, + Tensor& input, bool inputRowMajor, - Tensor& output, + Tensor& output, bool normSquared, cudaStream_t stream) { - IndexType maxThreads = (IndexType)getMaxThreadsCurrentDevice(); + idx_t maxThreads = (idx_t)getMaxThreadsCurrentDevice(); constexpr int rowTileSize = 8; -#define RUN_L2_ROW_MAJOR(TYPE_T, TYPE_TVEC, INPUT) \ - do { \ - if (normLoop) { \ - if (normSquared) { \ - l2NormRowMajor< \ - TYPE_T, \ - TYPE_TVEC, \ - IndexType, \ - rowTileSize, \ - true, \ - true><<>>(INPUT, output); \ - } else { \ - l2NormRowMajor< \ - TYPE_T, \ - TYPE_TVEC, \ - IndexType, \ - rowTileSize, \ - true, \ - false><<>>(INPUT, output); \ - } \ - } else { \ - if (normSquared) { \ - l2NormRowMajor< \ - TYPE_T, \ - TYPE_TVEC, \ - IndexType, \ - rowTileSize, \ - false, \ - true><<>>(INPUT, output); \ - } else { \ - l2NormRowMajor< \ - TYPE_T, \ - TYPE_TVEC, \ - IndexType, \ - rowTileSize, \ - false, \ - false><<>>(INPUT, output); \ - } \ - } \ +#define RUN_L2_ROW_MAJOR(TYPE_T, TYPE_TVEC, INPUT) \ + do { \ + if (normLoop) { \ + if (normSquared) { \ + l2NormRowMajor \ + <<>>(INPUT, output); \ + } else { \ + l2NormRowMajor \ + <<>>(INPUT, output); \ + } \ + } else { \ + if (normSquared) { \ + l2NormRowMajor \ + <<>>(INPUT, output); \ + } else { \ + l2NormRowMajor \ + <<>>(INPUT, output); \ + } \ + } \ } while (0) if (inputRowMajor) { @@ -306,15 +286,12 @@ // Cap the grid size at 2^16 since there is a grid-stride loop to handle // processing everything auto grid = (int)std::min( - utils::divUp(input.getSize(1), (IndexType)block), - (IndexType)65536); + 
utils::divUp(input.getSize(1), (idx_t)block), (idx_t)65536); if (normSquared) { - l2NormColMajor - <<>>(input, output); + l2NormColMajor<<>>(input, output); } else { - l2NormColMajor - <<>>(input, output); + l2NormColMajor<<>>(input, output); } } @@ -329,16 +306,7 @@ Tensor& output, bool normSquared, cudaStream_t stream) { - if (input.canUseIndexType()) { - runL2Norm( - input, inputRowMajor, output, normSquared, stream); - } else { - auto inputCast = input.castIndexType(); - auto outputCast = output.castIndexType(); - - runL2Norm( - inputCast, inputRowMajor, outputCast, normSquared, stream); - } + runL2Norm(input, inputRowMajor, output, normSquared, stream); } void runL2Norm( @@ -347,16 +315,7 @@ Tensor& output, bool normSquared, cudaStream_t stream) { - if (input.canUseIndexType()) { - runL2Norm( - input, inputRowMajor, output, normSquared, stream); - } else { - auto inputCast = input.castIndexType(); - auto outputCast = output.castIndexType(); - - runL2Norm( - inputCast, inputRowMajor, outputCast, normSquared, stream); - } + runL2Norm(input, inputRowMajor, output, normSquared, stream); } } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/impl/L2Select.cu faiss-1.7.4/faiss/gpu/impl/L2Select.cu --- faiss-1.7.3/faiss/gpu/impl/L2Select.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/L2Select.cu 2023-04-19 13:18:30.000000000 +0000 @@ -26,10 +26,11 @@ Tensor productDistances, Tensor centroidDistances, Tensor outDistances, - Tensor outIndices) { + Tensor outIndices) { // Each block handles kRowsPerBlock rows of the distances (results) - Pair threadMin[kRowsPerBlock]; - __shared__ Pair blockMin[kRowsPerBlock * (kBlockSize / kWarpSize)]; + Pair threadMin[kRowsPerBlock]; + __shared__ Pair + blockMin[kRowsPerBlock * (kBlockSize / kWarpSize)]; T distance[kRowsPerBlock]; @@ -40,7 +41,7 @@ } // blockIdx.x: which chunk of rows we are responsible for updating - int rowStart = blockIdx.x * kRowsPerBlock; + idx_t rowStart = idx_t(blockIdx.x) * kRowsPerBlock; // FIXME: if we have exact multiples, don't need this bool endRow = (blockIdx.x == gridDim.x - 1); @@ -52,8 +53,8 @@ } if (endRow) { - for (int row = rowStart; row < productDistances.getSize(0); ++row) { - for (int col = threadIdx.x; col < productDistances.getSize(1); + for (idx_t row = rowStart; row < productDistances.getSize(0); ++row) { + for (idx_t col = threadIdx.x; col < productDistances.getSize(1); col += blockDim.x) { distance[0] = Math::add( centroidDistances[col], productDistances[row][col]); @@ -66,10 +67,10 @@ // Reduce within the block threadMin[0] = blockReduceAll< - Pair, - Min>, + Pair, + Min>, false, - false>(threadMin[0], Min>(), blockMin); + false>(threadMin[0], Min>(), blockMin); if (threadIdx.x == 0) { outDistances[row][0] = threadMin[0].k; @@ -83,22 +84,22 @@ threadMin[0].v = -1; } } else { - for (int col = threadIdx.x; col < productDistances.getSize(1); + for (idx_t col = threadIdx.x; col < productDistances.getSize(1); col += blockDim.x) { T centroidDistance = centroidDistances[col]; #pragma unroll - for (int row = 0; row < kRowsPerBlock; ++row) { + for (idx_t row = 0; row < kRowsPerBlock; ++row) { distance[row] = productDistances[rowStart + row][col]; } #pragma unroll - for (int row = 0; row < kRowsPerBlock; ++row) { + for (idx_t row = 0; row < kRowsPerBlock; ++row) { distance[row] = Math::add(distance[row], centroidDistance); } #pragma unroll - for (int row = 0; row < kRowsPerBlock; ++row) { + for (idx_t row = 0; row < kRowsPerBlock; ++row) { if (Math::lt(distance[row], threadMin[row].k)) { 
threadMin[row].k = distance[row]; threadMin[row].v = col; @@ -109,14 +110,14 @@ // Reduce within the block blockReduceAll< kRowsPerBlock, - Pair, - Min>, + Pair, + Min>, false, - false>(threadMin, Min>(), blockMin); + false>(threadMin, Min>(), blockMin); if (threadIdx.x == 0) { #pragma unroll - for (int row = 0; row < kRowsPerBlock; ++row) { + for (idx_t row = 0; row < kRowsPerBlock; ++row) { outDistances[rowStart + row][0] = threadMin[row].k; outIndices[rowStart + row][0] = threadMin[row].v; } @@ -125,23 +126,30 @@ } // L2 + select kernel for k > 1, no re-use of ||c||^2 -template +// IndexT is either int32_t or idx_t (int64_t) depending upon maximum sizes of +// inputs +template < + typename IndexT, + typename T, + int NumWarpQ, + int NumThreadQ, + int ThreadsPerBlock> __global__ void l2SelectMinK( Tensor productDistances, Tensor centroidDistances, Tensor outDistances, - Tensor outIndices, + Tensor outIndices, int k, T initK) { // Each block handles a single row of the distances (results) constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; __shared__ T smemK[kNumWarps * NumWarpQ]; - __shared__ int smemV[kNumWarps * NumWarpQ]; + __shared__ IndexT smemV[kNumWarps * NumWarpQ]; BlockSelect< T, - int, + IndexT, false, Comparator, NumWarpQ, @@ -149,26 +157,29 @@ ThreadsPerBlock> heap(initK, -1, smemK, smemV, k); - int row = blockIdx.x; + IndexT row = blockIdx.x; // Whole warps must participate in the selection - int limit = utils::roundDown(productDistances.getSize(1), kWarpSize); - int i = threadIdx.x; + IndexT limit = utils::roundDown(productDistances.getSize(1), kWarpSize); + IndexT i = threadIdx.x; for (; i < limit; i += blockDim.x) { T v = Math::add(centroidDistances[i], productDistances[row][i]); - heap.add(v, i); + heap.add(v, IndexT(i)); } + // Handle the remainder if any separately (warp is divergent) if (i < productDistances.getSize(1)) { T v = Math::add(centroidDistances[i], productDistances[row][i]); - heap.addThreadQ(v, i); + heap.addThreadQ(v, IndexT(i)); } + // Merge all final results heap.reduce(); + for (int i = threadIdx.x; i < k; i += blockDim.x) { outDistances[row][i] = smemK[i]; - outIndices[row][i] = smemV[i]; + outIndices[row][i] = idx_t(smemV[i]); } } @@ -177,7 +188,7 @@ Tensor& productDistances, Tensor& centroidDistances, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, int k, cudaStream_t stream) { FAISS_ASSERT(productDistances.getSize(0) == outDistances.getSize(0)); @@ -185,6 +196,8 @@ FAISS_ASSERT(centroidDistances.getSize(0) == productDistances.getSize(1)); FAISS_ASSERT(outDistances.getSize(1) == k); FAISS_ASSERT(outIndices.getSize(1) == k); + + // This is also caught at a higher level FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); if (k == 1) { @@ -203,17 +216,29 @@ } else { auto grid = dim3(outDistances.getSize(0)); -#define RUN_L2_SELECT(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \ - do { \ - l2SelectMinK \ - <<>>( \ - productDistances, \ - centroidDistances, \ - outDistances, \ - outIndices, \ - k, \ - Limits::getMax()); \ - } while (0) + constexpr int kIndexMax = std::numeric_limits::max(); + +#define L2_KERNEL(INDEX_T, BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \ + l2SelectMinK \ + <<>>( \ + productDistances, \ + centroidDistances, \ + outDistances, \ + outIndices, \ + k, \ + Limits::getMax()) + + // Choose which k-selection index (k-select values) type we should use + // if our problem fits into int32, in order to improve kernel occupancy +#define RUN_L2_SELECT(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \ + do { \ + if (productDistances.getSize(1) > kIndexMax || \ + 
centroidDistances.getSize(0) > kIndexMax) { \ + L2_KERNEL(idx_t, BLOCK, NUM_WARP_Q, NUM_THREAD_Q); \ + } else { \ + L2_KERNEL(int32_t, BLOCK, NUM_WARP_Q, NUM_THREAD_Q); \ + } \ + } while (false) // block size 128 for everything <= 1024 if (k <= 32) { @@ -234,12 +259,14 @@ // smaller block for less shared memory RUN_L2_SELECT(64, 2048, 8); #endif - } else { FAISS_ASSERT(false); } } +#undef L2_KERNEL +#undef RUN_L2_SELECT + CUDA_TEST_ERROR(); } @@ -247,7 +274,7 @@ Tensor& productDistances, Tensor& centroidDistances, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, int k, cudaStream_t stream) { runL2SelectMin( diff -Nru faiss-1.7.3/faiss/gpu/impl/L2Select.cuh faiss-1.7.4/faiss/gpu/impl/L2Select.cuh --- faiss-1.7.3/faiss/gpu/impl/L2Select.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/L2Select.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -16,7 +16,7 @@ Tensor& productDistances, Tensor& centroidDistances, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, int k, cudaStream_t stream); diff -Nru faiss-1.7.3/faiss/gpu/impl/PQCodeDistances.cuh faiss-1.7.4/faiss/gpu/impl/PQCodeDistances.cuh --- faiss-1.7.3/faiss/gpu/impl/PQCodeDistances.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/PQCodeDistances.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -28,7 +28,7 @@ Tensor& queries, Tensor& coarseCentroids, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, NoTypeTensor<4, true>& outCodeDistances, bool useMMImplementation, bool l2Distance, diff -Nru faiss-1.7.3/faiss/gpu/impl/PQCodeDistances-inl.cuh faiss-1.7.4/faiss/gpu/impl/PQCodeDistances-inl.cuh --- faiss-1.7.3/faiss/gpu/impl/PQCodeDistances-inl.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/PQCodeDistances-inl.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -32,7 +32,7 @@ int queriesPerBlock, Tensor coarseCentroids, Tensor pqCentroids, - Tensor coarseIndices, + Tensor coarseIndices, // (query id)(coarse)(subquantizer)(code) -> dist Tensor outCodeDistances) { const auto numSubQuantizers = pqCentroids.getSize(0); @@ -76,7 +76,7 @@ // performing the reductions locally // Handle multiple queries per block - auto startQueryId = blockIdx.x * queriesPerBlock; + auto startQueryId = idx_t(blockIdx.x) * queriesPerBlock; auto numQueries = queries.getSize(0) - startQueryId; if (numQueries > queriesPerBlock) { numQueries = queriesPerBlock; @@ -96,7 +96,7 @@ // Load list of coarse centroids found for (int i = threadIdx.x; i < coarseIndices.getSize(1); i += blockDim.x) { - // FIXME: coarseIndices is now Index::idx_t but the smem allocation + // FIXME: coarseIndices is now idx_t but the smem allocation // of coarseIds is still int. 
In practical limitation, everything // should still fit into int32 coarseIds[i] = (int)coarseIndices[queryId][i]; @@ -284,7 +284,7 @@ __global__ void pqResidualVector( Tensor queries, Tensor coarseCentroids, - Tensor coarseIndices, + Tensor coarseIndices, int numSubDim, // output is transposed: // (sub q)(query id)(centroid id)(sub dim) @@ -292,7 +292,7 @@ auto queryId = blockIdx.x; auto centroidId = blockIdx.y; - Index::idx_t realCentroidId = coarseIndices[queryId][centroidId]; + idx_t realCentroidId = coarseIndices[queryId][centroidId]; for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) { float q = queries[queryId][dim]; @@ -323,13 +323,14 @@ Tensor& pqCentroids, Tensor& queries, Tensor& coarseCentroids, - Tensor& coarseIndices, + Tensor& coarseIndices, Tensor& residual, bool l2Residual, cudaStream_t stream) { + // blockDim.y is limited by nprobe auto grid = dim3(coarseIndices.getSize(0), coarseIndices.getSize(1)); - auto block = - dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice())); + auto block = dim3( + std::min(queries.getSize(1), (idx_t)getMaxThreadsCurrentDevice())); if (l2Residual) { pqResidualVector<<>>( @@ -380,6 +381,7 @@ Tensor& codeDistances, Tensor& coarseDistances, cudaStream_t stream) { + // blockDim.y is limited by nprobe auto grid = dim3(coarseDistances.getSize(1), coarseDistances.getSize(0)); auto block = 512; @@ -399,7 +401,7 @@ Tensor& queries, Tensor& coarseCentroids, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, // Output is (query)(centroid)(sub q)(code) NoTypeTensor<4, true>& outCodeDistances, bool l2Distance, @@ -588,7 +590,7 @@ Tensor& queries, Tensor& coarseCentroids, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, NoTypeTensor<4, true>& outCodeDistances, bool useMMImplementation, bool l2Distance, diff -Nru faiss-1.7.3/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh faiss-1.7.4/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh --- faiss-1.7.3/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -24,7 +24,7 @@ Tensor& centroids, Tensor& pqCentroidsInnermostCode, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, bool useFloat16Lookup, bool useMMCodeDistance, bool interleavedCodeLayout, @@ -34,14 +34,14 @@ DeviceVector& listCodes, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, - int maxListLength, + DeviceVector& listLengths, + idx_t maxListLength, int k, faiss::MetricType metric, // output Tensor& outDistances, // output - Tensor& outIndices, + Tensor& outIndices, GpuResources* res); } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh faiss-1.7.4/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh --- faiss-1.7.3/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -28,17 +28,17 @@ __global__ void pqScanInterleaved( Tensor queries, Tensor pqCentroids, - Tensor ivfListIds, + Tensor ivfListIds, Tensor codeDistances, void** listCodes, - int* listLengths, - Tensor prefixSumOffsets, + idx_t* listLengths, + Tensor prefixSumOffsets, Tensor distance) { // Each block handles a single query auto queryId = blockIdx.y; auto probeId = blockIdx.x; - Index::idx_t listId = ivfListIds[queryId][probeId]; + idx_t listId 
= ivfListIds[queryId][probeId]; // Safety guard in case NaNs in input cause no list ID to be generated if (listId == -1) { return; @@ -53,16 +53,16 @@ // This is where we start writing out data // We ensure that before the array (at offset -1), there is a 0 value - int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); + auto outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); float* distanceOut = distance[outBase].data(); auto localCodeDistances = codeDistances[queryId][probeId]; // This is where the codes for our list start auto vecsBase = (EncodeT*)listCodes[listId]; - int numVecs = listLengths[listId]; + auto numVecs = listLengths[listId]; // How many vector blocks of 32 are in this list? - int numBlocks = utils::divUp(numVecs, 32); + idx_t numBlocks = utils::divUp(numVecs, (idx_t)32); // Number of EncodeT words per each dimension of block of 32 vecs constexpr int bytesPerVectorBlockDim = EncodeBits * 32 / 8; @@ -70,11 +70,11 @@ bytesPerVectorBlockDim / sizeof(EncodeT); int wordsPerVectorBlock = wordsPerVectorBlockDim * numSubQuantizers; - for (int block = warpId; block < numBlocks; block += numWarps) { + for (idx_t block = warpId; block < numBlocks; block += numWarps) { float dist = 0; // This is the vector a given lane/thread handles - int vec = block * kWarpSize + laneId; + idx_t vec = block * kWarpSize + laneId; bool valid = vec < numVecs; EncodeT* data = vecsBase + block * wordsPerVectorBlock; @@ -174,11 +174,11 @@ __global__ void pqScanNoPrecomputedMultiPass( Tensor queries, Tensor pqCentroids, - Tensor ivfListIds, + Tensor ivfListIds, Tensor codeDistances, void** listCodes, - int* listLengths, - Tensor prefixSumOffsets, + idx_t* listLengths, + Tensor prefixSumOffsets, Tensor distance) { const auto codesPerSubQuantizer = pqCentroids.getSize(2); @@ -192,17 +192,17 @@ // This is where we start writing out data // We ensure that before the array (at offset -1), there is a 0 value - int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); + auto outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); float* distanceOut = distance[outBase].data(); - Index::idx_t listId = ivfListIds[queryId][probeId]; + idx_t listId = ivfListIds[queryId][probeId]; // Safety guard in case NaNs in input cause no list ID to be generated if (listId == -1) { return; } uint8_t* codeList = (uint8_t*)listCodes[listId]; - int limit = listLengths[listId]; + auto limit = listLengths[listId]; constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 
1 : (NumSubQuantizers / 4); @@ -224,7 +224,7 @@ // Each thread handles one code element in the list, with a // block-wide stride - for (int codeIndex = threadIdx.x; codeIndex < limit; + for (idx_t codeIndex = threadIdx.x; codeIndex < limit; codeIndex += blockDim.x) { // Prefetch next codes if (codeIndex + blockDim.x < limit) { @@ -277,7 +277,7 @@ Tensor& pqCentroidsInnermostCode, NoTypeTensor<4, true>& codeDistances, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, bool useFloat16Lookup, bool useMMCodeDistance, bool interleavedCodeLayout, @@ -287,16 +287,17 @@ DeviceVector& listCodes, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, + DeviceVector& listLengths, Tensor& thrustMem, - Tensor& prefixSumOffsets, + Tensor& prefixSumOffsets, Tensor& allDistances, Tensor& heapDistances, - Tensor& heapIndices, + Tensor& heapIndices, int k, + bool use64BitSelection, faiss::MetricType metric, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, cudaStream_t stream) { // We only support two metrics at the moment FAISS_ASSERT( @@ -497,6 +498,7 @@ allDistances, coarseIndices.getSize(1), k, + use64BitSelection, !l2Distance, // L2 distance chooses smallest heapDistances, heapIndices, @@ -514,6 +516,7 @@ prefixSumOffsets, coarseIndices, k, + use64BitSelection, !l2Distance, // L2 distance chooses smallest outDistances, outIndices, @@ -526,7 +529,7 @@ Tensor& centroids, Tensor& pqCentroidsInnermostCode, Tensor& coarseDistances, - Tensor& coarseIndices, + Tensor& coarseIndices, bool useFloat16Lookup, bool useMMCodeDistance, bool interleavedCodeLayout, @@ -536,22 +539,30 @@ DeviceVector& listCodes, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, - int maxListLength, + DeviceVector& listLengths, + idx_t maxListLength, int k, faiss::MetricType metric, // output Tensor& outDistances, // output - Tensor& outIndices, + Tensor& outIndices, GpuResources* res) { - constexpr int kMinQueryTileSize = 8; - constexpr int kMaxQueryTileSize = 128; - constexpr int kThrustMemSize = 16384; + auto stream = res->getDefaultStreamCurrentDevice(); - int nprobe = coarseIndices.getSize(1); + constexpr idx_t kMinQueryTileSize = 8; + constexpr idx_t kMaxQueryTileSize = 65536; // typical max gridDim.y + constexpr idx_t kThrustMemSize = 16384; + + auto nprobe = coarseIndices.getSize(1); + + // If the maximum list length (in terms of number of vectors) times nprobe + // (number of lists) is > 2^31 - 1, then we will use 64-bit indexing in the + // selection kernels + constexpr int k32Limit = idx_t(std::numeric_limits::max()); - auto stream = res->getDefaultStreamCurrentDevice(); + bool use64BitSelection = (maxListLength * nprobe > k32Limit) || + (queries.getSize(0) > k32Limit); // Make a reservation for Thrust to do its dirty work (global memory // cross-block reduction space); hopefully this is large enough. @@ -563,25 +574,25 @@ // How much temporary storage is available? // If possible, we'd like to fit within the space available. 
- size_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice(); + idx_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice(); // We run two passes of heap selection // This is the size of the first-level heap passes - constexpr int kNProbeSplit = 8; - int pass2Chunks = std::min(nprobe, kNProbeSplit); + constexpr idx_t kNProbeSplit = 8; + idx_t pass2Chunks = std::min(nprobe, kNProbeSplit); - size_t sizeForFirstSelectPass = - pass2Chunks * k * (sizeof(float) + sizeof(int)); + idx_t sizeForFirstSelectPass = + pass2Chunks * k * (sizeof(float) + sizeof(idx_t)); // How much temporary storage we need per each query - size_t sizePerQuery = 2 * // streams - ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets - nprobe * maxListLength * sizeof(float) + // allDistances + idx_t sizePerQuery = 2 * // streams + ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets + nprobe * maxListLength * sizeof(float) + // allDistances // residual distances nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) + sizeForFirstSelectPass); - int queryTileSize = (int)(sizeAvailable / sizePerQuery); + idx_t queryTileSize = (sizeAvailable / sizePerQuery); if (queryTileSize < kMinQueryTileSize) { queryTileSize = kMinQueryTileSize; @@ -589,41 +600,36 @@ queryTileSize = kMaxQueryTileSize; } - // FIXME: we should adjust queryTileSize to deal with this, since - // indexing is in int32 - FAISS_ASSERT( - queryTileSize * nprobe * maxListLength < - std::numeric_limits::max()); - // Temporary memory buffers // Make sure there is space prior to the start which will be 0, and // will handle the boundary condition without branches - DeviceTensor prefixSumOffsetSpace1( + DeviceTensor prefixSumOffsetSpace1( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize * nprobe + 1}); - DeviceTensor prefixSumOffsetSpace2( + DeviceTensor prefixSumOffsetSpace2( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize * nprobe + 1}); - DeviceTensor prefixSumOffsets1( + DeviceTensor prefixSumOffsets1( prefixSumOffsetSpace1[1].data(), {queryTileSize, nprobe}); - DeviceTensor prefixSumOffsets2( + DeviceTensor prefixSumOffsets2( prefixSumOffsetSpace2[1].data(), {queryTileSize, nprobe}); - DeviceTensor* prefixSumOffsets[2] = { + DeviceTensor* prefixSumOffsets[2] = { &prefixSumOffsets1, &prefixSumOffsets2}; // Make sure the element before prefixSumOffsets is 0, since we // depend upon simple, boundary-less indexing to get proper results CUDA_VERIFY(cudaMemsetAsync( - prefixSumOffsetSpace1.data(), 0, sizeof(int), stream)); + prefixSumOffsetSpace1.data(), 0, sizeof(idx_t), stream)); CUDA_VERIFY(cudaMemsetAsync( - prefixSumOffsetSpace2.data(), 0, sizeof(int), stream)); + prefixSumOffsetSpace2.data(), 0, sizeof(idx_t), stream)); - int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float); + idx_t codeDistanceTypeSize = + useFloat16Lookup ? 
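The sizing logic being converted to idx_t here picks a query tile so the double-buffered scratch buffers fit in the temporary memory pool. A rough standalone sketch of that computation for the no-precomputed path (all inputs hypothetical; the real code clamps with two if statements rather than std::min/std::max):

#include <algorithm>
#include <cstdint>
#include <cstdio>

using idx_t = int64_t;

static idx_t chooseQueryTileSize(
        idx_t sizeAvailable, // temp memory available, in bytes
        idx_t nprobe,
        idx_t maxListLength,
        idx_t numSubQuantizers,
        idx_t numSubQuantizerCodes,
        idx_t k) {
    constexpr idx_t kMinQueryTileSize = 8;
    constexpr idx_t kMaxQueryTileSize = 65536; // typical max gridDim.y

    // First-level heap pass works on at most 8 chunks of probes.
    idx_t pass2Chunks = std::min<idx_t>(nprobe, 8);
    idx_t sizeForFirstSelectPass =
            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));

    idx_t sizePerQuery = 2 *                                 // two streams
            ((nprobe * sizeof(idx_t) + sizeof(idx_t)) +      // prefixSumOffsets
             nprobe * maxListLength * sizeof(float) +        // allDistances
             nprobe * numSubQuantizers * numSubQuantizerCodes *
                     sizeof(float) +                         // code distances
             sizeForFirstSelectPass);

    idx_t tile = sizeAvailable / sizePerQuery;
    return std::max(kMinQueryTileSize, std::min(tile, kMaxQueryTileSize));
}

int main() {
    // e.g. 512 MiB of scratch, nprobe 64, ~10k codes per list, PQ 32x256, k 100
    idx_t tile = chooseQueryTileSize(idx_t(512) << 20, 64, 10000, 32, 256, 100);
    printf("queryTileSize = %lld\n", (long long)tile);
    return 0;
}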
sizeof(half) : sizeof(float); - int totalCodeDistancesSize = queryTileSize * nprobe * numSubQuantizers * + idx_t totalCodeDistancesSize = queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes * codeDistanceTypeSize; DeviceTensor codeDistances1Mem( @@ -669,23 +675,24 @@ DeviceTensor* heapDistances[2] = { &heapDistances1, &heapDistances2}; - DeviceTensor heapIndices1( + DeviceTensor heapIndices1( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize, pass2Chunks, k}); - DeviceTensor heapIndices2( + DeviceTensor heapIndices2( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize, pass2Chunks, k}); - DeviceTensor* heapIndices[2] = {&heapIndices1, &heapIndices2}; + DeviceTensor* heapIndices[2] = { + &heapIndices1, &heapIndices2}; auto streams = res->getAlternateStreamsCurrentDevice(); streamWait(streams, {stream}); int curStream = 0; - for (int query = 0; query < queries.getSize(0); query += queryTileSize) { - int numQueriesInTile = + for (idx_t query = 0; query < queries.getSize(0); query += queryTileSize) { + idx_t numQueriesInTile = std::min(queryTileSize, queries.getSize(0) - query); auto prefixSumOffsetsView = @@ -734,6 +741,7 @@ heapDistancesView, heapIndicesView, k, + use64BitSelection, metric, outDistanceView, outIndicesView, diff -Nru faiss-1.7.3/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu faiss-1.7.4/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu --- faiss-1.7.3/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu 2023-04-19 13:18:30.000000000 +0000 @@ -35,16 +35,16 @@ Tensor precompTerm2, // (query id)(sub q)(code id) Tensor precompTerm3, - Tensor ivfListIds, + Tensor ivfListIds, void** listCodes, - int* listLengths, - Tensor prefixSumOffsets, + idx_t* listLengths, + Tensor prefixSumOffsets, Tensor distance) { // Each block handles a single query versus single list auto queryId = blockIdx.y; auto probeId = blockIdx.x; - Index::idx_t listId = ivfListIds[queryId][probeId]; + idx_t listId = ivfListIds[queryId][probeId]; // Safety guard in case NaNs in input cause no list ID to be generated if (listId == -1) { return; @@ -60,29 +60,29 @@ // This is where we start writing out data // We ensure that before the array (at offset -1), there is a 0 value - int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); + auto outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); float* distanceOut = distance[outBase].data(); auto vecsBase = (EncodeT*)listCodes[listId]; - int numVecs = listLengths[listId]; + idx_t numVecs = listLengths[listId]; // How many vector blocks of 32 are in this list? 
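Once the tile size is fixed, the loop converted to idx_t at the end of this hunk walks the queries tile by tile, ping-ponging between two streams and two sets of temporaries. A minimal host-side sketch of that driver loop (sizes hypothetical; the kernel launches are elided):

#include <algorithm>
#include <cstdint>
#include <cstdio>

using idx_t = int64_t;

int main() {
    const idx_t numQueries = 100000;  // hypothetical
    const idx_t queryTileSize = 4096; // from the sizing step above

    int curStream = 0; // alternates between the two temp-buffer sets

    for (idx_t query = 0; query < numQueries; query += queryTileSize) {
        idx_t numQueriesInTile = std::min(queryTileSize, numQueries - query);

        // ... enqueue the scan and two-pass k-selection for this tile on
        //     streams[curStream], writing into the curStream buffers ...
        printf("tile @%lld: %lld queries on stream %d\n",
               (long long)query, (long long)numQueriesInTile, curStream);

        curStream = (curStream + 1) % 2;
    }
    return 0;
}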
- int numBlocks = utils::divUp(numVecs, 32); + idx_t numBlocks = utils::divUp(numVecs, idx_t(32)); // Number of EncodeT words per each dimension of block of 32 vecs - constexpr int bytesPerVectorBlockDim = EncodeBits * 32 / 8; - constexpr int wordsPerVectorBlockDim = + constexpr idx_t bytesPerVectorBlockDim = EncodeBits * 32 / 8; + constexpr idx_t wordsPerVectorBlockDim = bytesPerVectorBlockDim / sizeof(EncodeT); - int wordsPerVectorBlock = wordsPerVectorBlockDim * numSubQuantizers; + idx_t wordsPerVectorBlock = wordsPerVectorBlockDim * numSubQuantizers; // This is constant for the (query, probe) float term1 = precompTerm1[queryId][probeId]; - for (int block = warpId; block < numBlocks; block += numWarps) { + for (idx_t block = warpId; block < numBlocks; block += numWarps) { float dist = term1; // This is the vector a given lane/thread handles - int vec = block * kWarpSize + laneId; + idx_t vec = block * kWarpSize + laneId; bool valid = vec < numVecs; EncodeT* data = vecsBase + block * wordsPerVectorBlock; @@ -129,14 +129,14 @@ // Load the data by float4 for efficiency, and then handle any remainder // limitVec is the number of whole vec words we can load, in terms // of whole blocks performing the load - int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); + idx_t limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); limitVec *= kUnroll * blockDim.x; LookupVecT* smemV = (LookupVecT*)smem; LookupVecT* term2StartV = (LookupVecT*)term2Start; LookupVecT* term3StartV = (LookupVecT*)term3Start; - for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { + for (idx_t i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { LookupVecT vals[kUnroll]; #pragma unroll @@ -164,16 +164,16 @@ // fit into kUnroll x blockDim.x int remainder = limitVec * kWordSize; - for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { + for (idx_t i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { smem[i] = Math::add(term2Start[i], term3Start[i]); } } else { // Potential unaligned load constexpr int kUnroll = 4; - int limit = utils::roundDown(numCodes, kUnroll * blockDim.x); + idx_t limit = utils::roundDown(numCodes, kUnroll * blockDim.x); - int i = threadIdx.x; + idx_t i = threadIdx.x; for (; i < limit; i += kUnroll * blockDim.x) { LookupT vals[kUnroll]; @@ -206,10 +206,10 @@ Tensor precompTerm1, Tensor precompTerm2, Tensor precompTerm3, - Tensor ivfListIds, + Tensor ivfListIds, void** listCodes, - int* listLengths, - Tensor prefixSumOffsets, + idx_t* listLengths, + Tensor prefixSumOffsets, Tensor distance) { // precomputed term 2 + 3 storage // (sub q)(code id) @@ -224,19 +224,19 @@ // This is where we start writing out data // We ensure that before the array (at offset -1), there is a 0 value - int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); + idx_t outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); float* distanceOut = distance[outBase].data(); - Index::idx_t listId = ivfListIds[queryId][probeId]; + idx_t listId = ivfListIds[queryId][probeId]; // Safety guard in case NaNs in input cause no list ID to be generated if (listId == -1) { return; } uint8_t* codeList = (uint8_t*)listCodes[listId]; - int limit = listLengths[listId]; + idx_t limit = listLengths[listId]; - constexpr int kNumCode32 = + constexpr idx_t kNumCode32 = NumSubQuantizers <= 4 ? 
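The shared-memory staging converted to idx_t here loads term2 + term3 with wide (float4-sized) vector loads and mops up the tail with scalar loads. A host-side sketch of just the split between the vectorized part and the remainder (parameters hypothetical):

#include <cstdint>
#include <cstdio>

using idx_t = int64_t;

int main() {
    const idx_t numCodes = 10000; // lookup entries to stage in shared memory
    const idx_t kWordSize = 4;    // floats per float4 vector load
    const idx_t kUnroll = 4;      // vector loads per thread per pass
    const idx_t blockDim = 256;   // threads per block

    // Whole block-wide vectorized passes, expressed in vector words.
    idx_t limitVec = numCodes / (kUnroll * kWordSize * blockDim);
    limitVec *= kUnroll * blockDim;

    // Everything past this element index is loaded one float at a time
    // with a plain block-stride loop.
    idx_t remainderStart = limitVec * kWordSize;

    printf("vectorized: %lld floats, scalar tail: %lld floats\n",
           (long long)remainderStart, (long long)(numCodes - remainderStart));
    return 0;
}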
1 : (NumSubQuantizers / 4); unsigned int code32[kNumCode32]; unsigned int nextCode32[kNumCode32]; @@ -259,7 +259,7 @@ // Each thread handles one code element in the list, with a // block-wide stride - for (int codeIndex = threadIdx.x; codeIndex < limit; + for (idx_t codeIndex = threadIdx.x; codeIndex < limit; codeIndex += blockDim.x) { // Prefetch next codes if (codeIndex + blockDim.x < limit) { @@ -310,7 +310,7 @@ Tensor& precompTerm1, NoTypeTensor<3, true>& precompTerm2, NoTypeTensor<3, true>& precompTerm3, - Tensor& ivfListIds, + Tensor& ivfListIds, bool useFloat16Lookup, bool interleavedCodeLayout, int bitsPerSubQuantizer, @@ -319,15 +319,16 @@ DeviceVector& listCodes, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, + DeviceVector& listLengths, Tensor& thrustMem, - Tensor& prefixSumOffsets, + Tensor& prefixSumOffsets, Tensor& allDistances, Tensor& heapDistances, - Tensor& heapIndices, + Tensor& heapIndices, int k, + bool use64BitSelection, Tensor& outDistances, - Tensor& outIndices, + Tensor& outIndices, cudaStream_t stream) { // Calculate offset lengths, so we know where to write out // intermediate results @@ -507,6 +508,7 @@ allDistances, ivfListIds.getSize(1), k, + use64BitSelection, false, // L2 distance chooses smallest heapDistances, heapIndices, @@ -524,6 +526,7 @@ prefixSumOffsets, ivfListIds, k, + use64BitSelection, false, // L2 distance chooses smallest outDistances, outIndices, @@ -540,7 +543,7 @@ NoTypeTensor<3, true>& precompTerm2, // (query id)(sub q)(code id) NoTypeTensor<3, true>& precompTerm3, - Tensor& ivfListIds, + Tensor& ivfListIds, bool useFloat16Lookup, bool interleavedCodeLayout, int bitsPerSubQuantizer, @@ -549,21 +552,29 @@ DeviceVector& listCodes, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, - int maxListLength, + DeviceVector& listLengths, + idx_t maxListLength, int k, // output Tensor& outDistances, // output - Tensor& outIndices, + Tensor& outIndices, GpuResources* res) { - constexpr int kMinQueryTileSize = 8; - constexpr int kMaxQueryTileSize = 128; - constexpr int kThrustMemSize = 16384; + auto stream = res->getDefaultStreamCurrentDevice(); - int nprobe = ivfListIds.getSize(1); + constexpr idx_t kMinQueryTileSize = 8; + constexpr idx_t kMaxQueryTileSize = 65536; // typical max gridDim.y + constexpr idx_t kThrustMemSize = 16384; + + auto nprobe = ivfListIds.getSize(1); + + // If the maximum list length (in terms of number of vectors) times nprobe + // (number of lists) is > 2^31 - 1, then we will use 64-bit indexing in the + // selection kernels + constexpr int k32Limit = idx_t(std::numeric_limits::max()); - auto stream = res->getDefaultStreamCurrentDevice(); + bool use64BitSelection = (maxListLength * nprobe > k32Limit) || + (queries.getSize(0) > k32Limit); // Make a reservation for Thrust to do its dirty work (global memory // cross-block reduction space); hopefully this is large enough. 
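The code32/nextCode32 registers sized by kNumCode32 above assume that four 8-bit PQ codes are packed per 32-bit word (a single word when there are at most four sub-quantizers). A small illustrative sketch of that packing; the byte order shown is a little-endian assumption, and the helpers are not faiss API:

#include <cstdint>
#include <cstdio>

// Pack four 8-bit sub-quantizer codes into one 32-bit word, low byte first.
static uint32_t packCodes(uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3) {
    return uint32_t(c0) | (uint32_t(c1) << 8) | (uint32_t(c2) << 16) |
            (uint32_t(c3) << 24);
}

// Extract sub-quantizer code i (0..3) back out of the packed word.
static uint8_t extractCode(uint32_t word, int i) {
    return uint8_t((word >> (8 * i)) & 0xff);
}

int main() {
    uint32_t word = packCodes(7, 130, 255, 3);
    for (int i = 0; i < 4; ++i) {
        printf("code %d = %u\n", i, unsigned(extractCode(word, i)));
    }
    return 0;
}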
@@ -579,19 +590,19 @@ // We run two passes of heap selection // This is the size of the first-level heap passes - constexpr int kNProbeSplit = 8; - int pass2Chunks = std::min(nprobe, kNProbeSplit); + constexpr idx_t kNProbeSplit = 8; + idx_t pass2Chunks = std::min(nprobe, kNProbeSplit); - size_t sizeForFirstSelectPass = - pass2Chunks * k * (sizeof(float) + sizeof(int)); + idx_t sizeForFirstSelectPass = + pass2Chunks * k * (sizeof(float) + sizeof(idx_t)); // How much temporary storage we need per each query - size_t sizePerQuery = 2 * // # streams - ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets - nprobe * maxListLength * sizeof(float) + // allDistances + idx_t sizePerQuery = 2 * // # streams + ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets + nprobe * maxListLength * sizeof(float) + // allDistances sizeForFirstSelectPass); - int queryTileSize = (int)(sizeAvailable / sizePerQuery); + idx_t queryTileSize = sizeAvailable / sizePerQuery; if (queryTileSize < kMinQueryTileSize) { queryTileSize = kMinQueryTileSize; @@ -599,37 +610,31 @@ queryTileSize = kMaxQueryTileSize; } - // FIXME: we should adjust queryTileSize to deal with this, since - // indexing is in int32 - FAISS_ASSERT( - queryTileSize * nprobe * maxListLength <= - std::numeric_limits::max()); - // Temporary memory buffers // Make sure there is space prior to the start which will be 0, and // will handle the boundary condition without branches - DeviceTensor prefixSumOffsetSpace1( + DeviceTensor prefixSumOffsetSpace1( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize * nprobe + 1}); - DeviceTensor prefixSumOffsetSpace2( + DeviceTensor prefixSumOffsetSpace2( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize * nprobe + 1}); - DeviceTensor prefixSumOffsets1( + DeviceTensor prefixSumOffsets1( prefixSumOffsetSpace1[1].data(), {queryTileSize, nprobe}); - DeviceTensor prefixSumOffsets2( + DeviceTensor prefixSumOffsets2( prefixSumOffsetSpace2[1].data(), {queryTileSize, nprobe}); - DeviceTensor* prefixSumOffsets[2] = { + DeviceTensor* prefixSumOffsets[2] = { &prefixSumOffsets1, &prefixSumOffsets2}; // Make sure the element before prefixSumOffsets is 0, since we // depend upon simple, boundary-less indexing to get proper results CUDA_VERIFY(cudaMemsetAsync( - prefixSumOffsetSpace1.data(), 0, sizeof(int), stream)); + prefixSumOffsetSpace1.data(), 0, sizeof(idx_t), stream)); CUDA_VERIFY(cudaMemsetAsync( - prefixSumOffsetSpace2.data(), 0, sizeof(int), stream)); + prefixSumOffsetSpace2.data(), 0, sizeof(idx_t), stream)); DeviceTensor allDistances1( res, @@ -653,23 +658,24 @@ DeviceTensor* heapDistances[2] = { &heapDistances1, &heapDistances2}; - DeviceTensor heapIndices1( + DeviceTensor heapIndices1( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize, pass2Chunks, k}); - DeviceTensor heapIndices2( + DeviceTensor heapIndices2( res, makeTempAlloc(AllocType::Other, stream), {queryTileSize, pass2Chunks, k}); - DeviceTensor* heapIndices[2] = {&heapIndices1, &heapIndices2}; + DeviceTensor* heapIndices[2] = { + &heapIndices1, &heapIndices2}; auto streams = res->getAlternateStreamsCurrentDevice(); streamWait(streams, {stream}); int curStream = 0; - for (int query = 0; query < queries.getSize(0); query += queryTileSize) { - int numQueriesInTile = + for (idx_t query = 0; query < queries.getSize(0); query += queryTileSize) { + idx_t numQueriesInTile = std::min(queryTileSize, queries.getSize(0) - query); auto prefixSumOffsetsView = @@ -714,6 +720,7 @@ heapDistancesView, heapIndicesView, k, + 
use64BitSelection, outDistanceView, outIndicesView, streams[curStream]); diff -Nru faiss-1.7.3/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh faiss-1.7.4/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh --- faiss-1.7.3/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -23,7 +23,7 @@ Tensor& precompTerm1, NoTypeTensor<3, true>& precompTerm2, NoTypeTensor<3, true>& precompTerm3, - Tensor& ivfListIds, + Tensor& ivfListIds, bool useFloat16Lookup, bool interleavedCodeLayout, int bitsPerSubQuantizer, @@ -32,13 +32,13 @@ DeviceVector& listCodes, DeviceVector& listIndices, IndicesOptions indicesOptions, - DeviceVector& listLengths, - int maxListLength, + DeviceVector& listLengths, + idx_t maxListLength, int k, // output Tensor& outDistances, // output - Tensor& outIndices, + Tensor& outIndices, GpuResources* res); } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/impl/RemapIndices.cpp faiss-1.7.4/faiss/gpu/impl/RemapIndices.cpp --- faiss-1.7.3/faiss/gpu/impl/RemapIndices.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/RemapIndices.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -14,21 +14,23 @@ // Utility function to translate (list id, offset) to a user index on // the CPU. In a cpp in order to use OpenMP void ivfOffsetToUserIndex( - Index::idx_t* indices, - int numLists, - int queries, + idx_t* indices, + idx_t numLists, + idx_t queries, int k, - const std::vector>& listOffsetToUserIndex) { + const std::vector>& listOffsetToUserIndex) { FAISS_ASSERT(numLists == listOffsetToUserIndex.size()); #pragma omp parallel for - for (int q = 0; q < queries; ++q) { - for (int r = 0; r < k; ++r) { + for (idx_t q = 0; q < queries; ++q) { + for (idx_t r = 0; r < k; ++r) { auto offsetIndex = indices[q * k + r]; - if (offsetIndex < 0) + if (offsetIndex < 0) { continue; + } + // FIXME: implicit limit on list and list offset length int listId = (int)(offsetIndex >> 32); int listOffset = (int)(offsetIndex & 0xffffffff); diff -Nru faiss-1.7.3/faiss/gpu/impl/RemapIndices.h faiss-1.7.4/faiss/gpu/impl/RemapIndices.h --- faiss-1.7.3/faiss/gpu/impl/RemapIndices.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/RemapIndices.h 2023-04-19 13:18:30.000000000 +0000 @@ -16,11 +16,11 @@ /// Utility function to translate (list id, offset) to a user index on /// the CPU. In a cpp in order to use OpenMP. 
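ivfOffsetToUserIndex, whose signature switches to idx_t in this hunk, assumes the GPU reports each hit as a packed 64-bit (list id, offset-in-list) pair, which it then maps back to the user-visible id (negative values mean "no result"). A minimal sketch of that packing convention, inverted from the unpacking shown above:

#include <cstdint>
#include <cstdio>

using idx_t = int64_t;

// Pack (list id, offset) into one 64-bit value: list id in the high 32 bits.
static idx_t packListOffset(int listId, int listOffset) {
    return (idx_t(listId) << 32) | uint32_t(listOffset);
}

// Inverse of the above, matching the shifts/masks in ivfOffsetToUserIndex.
static void unpackListOffset(idx_t packed, int& listId, int& listOffset) {
    listId = int(packed >> 32);
    listOffset = int(packed & 0xffffffff);
}

int main() {
    idx_t packed = packListOffset(12, 345678);
    int listId = 0, listOffset = 0;
    unpackListOffset(packed, listId, listOffset);
    printf("listId=%d offset=%d\n", listId, listOffset); // 12 345678
    // A value < 0 (e.g. -1) is skipped, as in the loop above.
    return 0;
}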
void ivfOffsetToUserIndex( - Index::idx_t* indices, - int numLists, - int queries, + idx_t* indices, + idx_t numLists, + idx_t queries, int k, - const std::vector>& listOffsetToUserIndex); + const std::vector>& listOffsetToUserIndex); } // namespace gpu } // namespace faiss diff -Nru faiss-1.7.3/faiss/gpu/impl/scan/IVFInterleavedImpl.cuh faiss-1.7.4/faiss/gpu/impl/scan/IVFInterleavedImpl.cuh --- faiss-1.7.3/faiss/gpu/impl/scan/IVFInterleavedImpl.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/scan/IVFInterleavedImpl.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -15,18 +15,18 @@ \ void ivfInterleavedScanImpl_##WARP_Q##_( \ Tensor& queries, \ - Tensor& listIds, \ + Tensor& listIds, \ DeviceVector& listData, \ DeviceVector& listIndices, \ IndicesOptions indicesOptions, \ - DeviceVector& listLengths, \ + DeviceVector& listLengths, \ int k, \ faiss::MetricType metric, \ bool useResidual, \ Tensor& residualBase, \ GpuScalarQuantizer* scalarQ, \ Tensor& outDistances, \ - Tensor& outIndices, \ + Tensor& outIndices, \ GpuResources* res) { \ FAISS_ASSERT(k <= WARP_Q); \ \ @@ -35,22 +35,22 @@ CUDA_TEST_ERROR(); \ } -#define IVF_INTERLEAVED_DECL(WARP_Q) \ - \ - void ivfInterleavedScanImpl_##WARP_Q##_( \ - Tensor& queries, \ - Tensor& listIds, \ - DeviceVector& listData, \ - DeviceVector& listIndices, \ - IndicesOptions indicesOptions, \ - DeviceVector& listLengths, \ - int k, \ - faiss::MetricType metric, \ - bool useResidual, \ - Tensor& residualBase, \ - GpuScalarQuantizer* scalarQ, \ - Tensor& outDistances, \ - Tensor& outIndices, \ +#define IVF_INTERLEAVED_DECL(WARP_Q) \ + \ + void ivfInterleavedScanImpl_##WARP_Q##_( \ + Tensor& queries, \ + Tensor& listIds, \ + DeviceVector& listData, \ + DeviceVector& listIndices, \ + IndicesOptions indicesOptions, \ + DeviceVector& listLengths, \ + int k, \ + faiss::MetricType metric, \ + bool useResidual, \ + Tensor& residualBase, \ + GpuScalarQuantizer* scalarQ, \ + Tensor& outDistances, \ + Tensor& outIndices, \ GpuResources* res) #define IVF_INTERLEAVED_CALL(WARP_Q) \ diff -Nru faiss-1.7.3/faiss/gpu/impl/VectorResidual.cu faiss-1.7.4/faiss/gpu/impl/VectorResidual.cu --- faiss-1.7.3/faiss/gpu/impl/VectorResidual.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/VectorResidual.cu 2023-04-19 13:18:30.000000000 +0000 @@ -18,21 +18,21 @@ namespace faiss { namespace gpu { -template +template __global__ void calcResidual( Tensor vecs, Tensor centroids, - Tensor vecToCentroid, + Tensor vecToCentroid, Tensor residuals) { auto vec = vecs[blockIdx.x]; auto residual = residuals[blockIdx.x]; - IndexT centroidId = vecToCentroid[blockIdx.x]; + auto centroidId = vecToCentroid[blockIdx.x]; // Vector could be invalid (containing NaNs), so -1 was the // classified centroid if (centroidId == -1) { if (LargeDim) { - for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { + for (idx_t i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { residual[i] = CUDART_NAN_F; } } else { @@ -45,7 +45,7 @@ auto centroid = centroids[centroidId]; if (LargeDim) { - for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { + for (idx_t i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { residual[i] = vec[i] - ConvertTo::to(centroid[i]); } } else { @@ -54,11 +54,11 @@ } } -template +template void calcResidual( Tensor& vecs, Tensor& centroids, - Tensor& vecToCentroid, + Tensor& vecToCentroid, Tensor& residuals, cudaStream_t stream) { FAISS_ASSERT(vecs.getSize(1) == centroids.getSize(1)); @@ -68,15 +68,15 @@ dim3 
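calcResidual, whose index types change here, subtracts each vector's assigned centroid and emits NaNs when the assignment is the -1 sentinel (e.g. the input vector itself contained NaNs). A host-side sketch of that per-vector computation, not the CUDA kernel:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

using idx_t = int64_t;

// residual = vec - centroid, or all-NaN when no centroid was assigned.
static void calcResidualHost(
        const std::vector<float>& vec,
        const std::vector<float>& centroid,
        idx_t centroidId,
        std::vector<float>& residual) {
    for (size_t i = 0; i < vec.size(); ++i) {
        residual[i] = (centroidId == -1) ? NAN : vec[i] - centroid[i];
    }
}

int main() {
    std::vector<float> vec = {1.0f, 2.0f, 3.0f};
    std::vector<float> centroid = {0.5f, 0.5f, 0.5f};
    std::vector<float> residual(3);
    calcResidualHost(vec, centroid, /*centroidId=*/0, residual);
    printf("%g %g %g\n", residual[0], residual[1], residual[2]); // 0.5 1.5 2.5
    return 0;
}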
grid(vecs.getSize(0)); - int maxThreads = getMaxThreadsCurrentDevice(); + idx_t maxThreads = getMaxThreadsCurrentDevice(); bool largeDim = vecs.getSize(1) > maxThreads; dim3 block(std::min(vecs.getSize(1), maxThreads)); if (largeDim) { - calcResidual<<>>( + calcResidual<<>>( vecs, centroids, vecToCentroid, residuals); } else { - calcResidual<<>>( + calcResidual<<>>( vecs, centroids, vecToCentroid, residuals); } @@ -86,69 +86,57 @@ void runCalcResidual( Tensor& vecs, Tensor& centroids, - Tensor& vecToCentroid, + Tensor& vecToCentroid, Tensor& residuals, cudaStream_t stream) { - calcResidual( - vecs, centroids, vecToCentroid, residuals, stream); + calcResidual(vecs, centroids, vecToCentroid, residuals, stream); } void runCalcResidual( Tensor& vecs, Tensor& centroids, - Tensor& vecToCentroid, + Tensor& vecToCentroid, Tensor& residuals, cudaStream_t stream) { - calcResidual( - vecs, centroids, vecToCentroid, residuals, stream); + calcResidual(vecs, centroids, vecToCentroid, residuals, stream); } -template +template __global__ void gatherReconstructByIds( - Tensor ids, + Tensor ids, Tensor vecs, Tensor out) { - IndexT id = ids[blockIdx.x]; - - // FIXME: will update all GPU code shortly to use int64 indexing types, but - // this is a minimal change to allow for >= 2^31 elements in a matrix - // auto vec = vecs[id]; - // auto outVec = out[blockIdx.x]; - auto vec = vecs.data() + id * vecs.getSize(1); - auto outVec = out.data() + blockIdx.x * out.getSize(1); + auto id = ids[blockIdx.x]; + auto vec = vecs[id]; + auto outVec = out[blockIdx.x]; Convert conv; - for (IndexT i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { - outVec[i] = id == IndexT(-1) ? 0.0f : conv(vec[i]); + for (idx_t i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { + outVec[i] = id == idx_t(-1) ? 0.0f : conv(vec[i]); } } -template +template __global__ void gatherReconstructByRange( - IndexT start, - IndexT num, + idx_t start, + idx_t num, Tensor vecs, Tensor out) { - IndexT id = start + blockIdx.x; - - // FIXME: will update all GPU code shortly to use int64 indexing types, but - // this is a minimal change to allow for >= 2^31 elements in a matrix - // auto vec = vecs[id]; - // auto outVec = out[blockIdx.x]; - auto vec = vecs.data() + id * vecs.getSize(1); - auto outVec = out.data() + blockIdx.x * out.getSize(1); + auto id = start + blockIdx.x; + auto vec = vecs[id]; + auto outVec = out[blockIdx.x]; Convert conv; - for (IndexT i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { - outVec[i] = id == IndexT(-1) ? 0.0f : conv(vec[i]); + for (idx_t i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) { + outVec[i] = id == idx_t(-1) ? 
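The gatherReconstructByIds kernel simplified above copies one stored row per requested id into the output and writes zeros for the -1 sentinel. A host-side sketch of the same gather (row-major layout assumed, as in the Tensor indexing above):

#include <cstdint>
#include <cstdio>
#include <vector>

using idx_t = int64_t;

// out is ids.size() x dim; vecs is numVecs x dim, both row-major.
static void gatherByIds(
        const std::vector<idx_t>& ids,
        const std::vector<float>& vecs,
        idx_t dim,
        std::vector<float>& out) {
    for (size_t q = 0; q < ids.size(); ++q) {
        idx_t id = ids[q];
        for (idx_t i = 0; i < dim; ++i) {
            out[q * dim + i] = (id == -1) ? 0.0f : vecs[id * dim + i];
        }
    }
}

int main() {
    const idx_t dim = 2;
    std::vector<float> vecs = {1, 2, 3, 4, 5, 6}; // three vectors of dim 2
    std::vector<idx_t> ids = {2, -1, 0};
    std::vector<float> out(ids.size() * dim);
    gatherByIds(ids, vecs, dim, out);
    for (float v : out) {
        printf("%g ", v); // 5 6 0 0 1 2
    }
    printf("\n");
    return 0;
}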
0.0f : conv(vec[i]); } } -template +template void gatherReconstructByIds( - Tensor& ids, + Tensor& ids, Tensor& vecs, Tensor& out, cudaStream_t stream) { @@ -157,19 +145,18 @@ dim3 grid(ids.getSize(0)); - int maxThreads = getMaxThreadsCurrentDevice(); + idx_t maxThreads = getMaxThreadsCurrentDevice(); dim3 block(std::min(vecs.getSize(1), maxThreads)); - gatherReconstructByIds - <<>>(ids, vecs, out); + gatherReconstructByIds<<>>(ids, vecs, out); CUDA_TEST_ERROR(); } -template +template void gatherReconstructByRange( - IndexT start, - IndexT num, + idx_t start, + idx_t num, Tensor& vecs, Tensor& out, cudaStream_t stream) { @@ -180,48 +167,47 @@ dim3 grid(num); - int maxThreads = getMaxThreadsCurrentDevice(); + idx_t maxThreads = getMaxThreadsCurrentDevice(); dim3 block(std::min(vecs.getSize(1), maxThreads)); - gatherReconstructByRange + gatherReconstructByRange <<>>(start, num, vecs, out); CUDA_TEST_ERROR(); } void runReconstruct( - Tensor& ids, + Tensor& ids, Tensor& vecs, Tensor& out, cudaStream_t stream) { - gatherReconstructByIds(ids, vecs, out, stream); + gatherReconstructByIds(ids, vecs, out, stream); } void runReconstruct( - Tensor& ids, + Tensor& ids, Tensor& vecs, Tensor& out, cudaStream_t stream) { - gatherReconstructByIds(ids, vecs, out, stream); + gatherReconstructByIds(ids, vecs, out, stream); } void runReconstruct( - Index::idx_t start, - Index::idx_t num, + idx_t start, + idx_t num, Tensor& vecs, Tensor& out, cudaStream_t stream) { - gatherReconstructByRange( - start, num, vecs, out, stream); + gatherReconstructByRange(start, num, vecs, out, stream); } void runReconstruct( - Index::idx_t start, - Index::idx_t num, + idx_t start, + idx_t num, Tensor& vecs, Tensor& out, cudaStream_t stream) { - gatherReconstructByRange(start, num, vecs, out, stream); + gatherReconstructByRange(start, num, vecs, out, stream); } } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/impl/VectorResidual.cuh faiss-1.7.4/faiss/gpu/impl/VectorResidual.cuh --- faiss-1.7.3/faiss/gpu/impl/VectorResidual.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/impl/VectorResidual.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -17,40 +17,40 @@ void runCalcResidual( Tensor& vecs, Tensor& centroids, - Tensor& vecToCentroid, + Tensor& vecToCentroid, Tensor& residuals, cudaStream_t stream); void runCalcResidual( Tensor& vecs, Tensor& centroids, - Tensor& vecToCentroid, + Tensor& vecToCentroid, Tensor& residuals, cudaStream_t stream); // Gather vectors void runReconstruct( - Tensor& listIds, + Tensor& listIds, Tensor& vecs, Tensor& out, cudaStream_t stream); void runReconstruct( - Tensor& listIds, + Tensor& listIds, Tensor& vecs, Tensor& out, cudaStream_t stream); void runReconstruct( - Index::idx_t start, - Index::idx_t num, + idx_t start, + idx_t num, Tensor& vecs, Tensor& out, cudaStream_t stream); void runReconstruct( - Index::idx_t start, - Index::idx_t num, + idx_t start, + idx_t num, Tensor& vecs, Tensor& out, cudaStream_t stream); diff -Nru faiss-1.7.3/faiss/gpu/perf/IndexWrapper.h faiss-1.7.4/faiss/gpu/perf/IndexWrapper.h --- faiss-1.7.3/faiss/gpu/perf/IndexWrapper.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/perf/IndexWrapper.h 2023-04-19 13:18:30.000000000 +0000 @@ -32,7 +32,7 @@ faiss::Index* getIndex(); void runOnIndices(std::function f); - void setNumProbes(int nprobe); + void setNumProbes(size_t nprobe); }; } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/perf/IndexWrapper-inl.h faiss-1.7.4/faiss/gpu/perf/IndexWrapper-inl.h --- faiss-1.7.3/faiss/gpu/perf/IndexWrapper-inl.h 
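The IndexWrapper change above is part of a broader 1.7.4 cleanup visible throughout this patch: setNumProbes()/getNumProbes() are replaced by a plain nprobe member, matching the CPU IndexIVF interface. A minimal usage sketch against the public GPU API (random data, device 0 assumed):

#include <cstdlib>
#include <vector>

#include <faiss/gpu/GpuIndexIVFFlat.h>
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    int dim = 64, nlist = 256, nb = 100000, nq = 16, k = 10;
    std::vector<float> xb(size_t(nb) * dim), xq(size_t(nq) * dim);
    for (auto& v : xb) v = float(rand()) / RAND_MAX;
    for (auto& v : xq) v = float(rand()) / RAND_MAX;

    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuIndexIVFFlatConfig config;
    config.device = 0;

    faiss::gpu::GpuIndexIVFFlat index(
            &res, dim, nlist, faiss::METRIC_L2, config);
    index.train(nb, xb.data());
    index.add(nb, xb.data());

    index.nprobe = 16; // 1.7.3 spelled this index.setNumProbes(16)

    std::vector<float> distances(size_t(nq) * k);
    std::vector<faiss::idx_t> labels(size_t(nq) * k);
    index.search(nq, xq.data(), k, distances.data(), labels.data());
    return 0;
}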
2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/perf/IndexWrapper-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -58,8 +58,8 @@ } template -void IndexWrapper::setNumProbes(int nprobe) { - runOnIndices([nprobe](GpuIndex* index) { index->setNumProbes(nprobe); }); +void IndexWrapper::setNumProbes(size_t nprobe) { + runOnIndices([nprobe](GpuIndex* index) { index->nprobe = nprobe; }); } } // namespace gpu diff -Nru faiss-1.7.3/faiss/gpu/perf/PerfBinaryFlat.cu faiss-1.7.4/faiss/gpu/perf/PerfBinaryFlat.cu --- faiss-1.7.3/faiss/gpu/perf/PerfBinaryFlat.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/perf/PerfBinaryFlat.cu 2023-04-19 13:18:30.000000000 +0000 @@ -71,8 +71,7 @@ // Time faiss CPU HostTensor cpuDistances({numQueries, FLAGS_k}); - HostTensor cpuIndices( - {numQueries, FLAGS_k}); + HostTensor cpuIndices({numQueries, FLAGS_k}); if (FLAGS_cpu) { float cpuTime = 0.0f; @@ -90,7 +89,7 @@ } HostTensor gpuDistances({numQueries, FLAGS_k}); - HostTensor gpuIndices({numQueries, FLAGS_k}); + HostTensor gpuIndices({numQueries, FLAGS_k}); CUDA_VERIFY(cudaProfilerStart()); faiss::gpu::synchronizeAllDevices(); diff -Nru faiss-1.7.3/faiss/gpu/perf/PerfFlat.cu faiss-1.7.4/faiss/gpu/perf/PerfFlat.cu --- faiss-1.7.3/faiss/gpu/perf/PerfFlat.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/perf/PerfFlat.cu 2023-04-19 13:18:30.000000000 +0000 @@ -95,7 +95,7 @@ // Time faiss CPU HostTensor cpuDistances({numQueries, FLAGS_k}); - HostTensor cpuIndices({numQueries, FLAGS_k}); + HostTensor cpuIndices({numQueries, FLAGS_k}); if (FLAGS_cpu) { float cpuTime = 0.0f; @@ -113,7 +113,7 @@ } HostTensor gpuDistances({numQueries, FLAGS_k}); - HostTensor gpuIndices({numQueries, FLAGS_k}); + HostTensor gpuIndices({numQueries, FLAGS_k}); CUDA_VERIFY(cudaProfilerStart()); faiss::gpu::synchronizeAllDevices(); diff -Nru faiss-1.7.3/faiss/gpu/perf/PerfIVFFlat.cu faiss-1.7.4/faiss/gpu/perf/PerfIVFFlat.cu --- faiss-1.7.3/faiss/gpu/perf/PerfIVFFlat.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/perf/PerfIVFFlat.cu 2023-04-19 13:18:30.000000000 +0000 @@ -93,7 +93,7 @@ // Time faiss CPU HostTensor cpuDistances({numQueries, FLAGS_k}); - HostTensor cpuIndices({numQueries, FLAGS_k}); + HostTensor cpuIndices({numQueries, FLAGS_k}); float cpuTime = 0.0f; @@ -112,7 +112,7 @@ printf("CPU time %.3f ms\n", cpuTime); HostTensor gpuDistances({numQueries, FLAGS_k}); - HostTensor gpuIndices({numQueries, FLAGS_k}); + HostTensor gpuIndices({numQueries, FLAGS_k}); CUDA_VERIFY(cudaProfilerStart()); faiss::gpu::synchronizeAllDevices(); diff -Nru faiss-1.7.3/faiss/gpu/perf/PerfIVFPQ.cu faiss-1.7.4/faiss/gpu/perf/PerfIVFPQ.cu --- faiss-1.7.3/faiss/gpu/perf/PerfIVFPQ.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/perf/PerfIVFPQ.cu 2023-04-19 13:18:30.000000000 +0000 @@ -103,7 +103,7 @@ // Time faiss CPU HostTensor cpuDistances({numQueries, FLAGS_k}); - HostTensor cpuIndices({numQueries, FLAGS_k}); + HostTensor cpuIndices({numQueries, FLAGS_k}); float cpuTime = 0.0f; @@ -122,7 +122,7 @@ printf("CPU time %.3f ms\n", cpuTime); HostTensor gpuDistances({numQueries, FLAGS_k}); - HostTensor gpuIndices({numQueries, FLAGS_k}); + HostTensor gpuIndices({numQueries, FLAGS_k}); CUDA_VERIFY(cudaProfilerStart()); faiss::gpu::synchronizeAllDevices(); diff -Nru faiss-1.7.3/faiss/gpu/perf/PerfSelect.cu faiss-1.7.4/faiss/gpu/perf/PerfSelect.cu --- faiss-1.7.3/faiss/gpu/perf/PerfSelect.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/perf/PerfSelect.cu 2023-04-19 13:18:30.000000000 
+0000 @@ -64,7 +64,7 @@ resUse.get(), makeDevAlloc(AllocType::Other, 0), {FLAGS_rows, k}); - DeviceTensor gpuOutInd( + DeviceTensor gpuOutInd( resUse.get(), makeDevAlloc(AllocType::Other, 0), {FLAGS_rows, k}); diff -Nru faiss-1.7.3/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp faiss-1.7.4/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp --- faiss-1.7.3/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/demo_ivfpq_indexing_gpu.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -130,7 +130,7 @@ k, nq); - std::vector nns(k * nq); + std::vector nns(k * nq); std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff -Nru faiss-1.7.3/faiss/gpu/test/test_contrib_gpu.py faiss-1.7.4/faiss/gpu/test/test_contrib_gpu.py --- faiss-1.7.3/faiss/gpu/test/test_contrib_gpu.py 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/test_contrib_gpu.py 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,133 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import faiss +import numpy as np + +from common_faiss_tests import get_dataset_2 + +from faiss.contrib import datasets, evaluation, ivf_tools +from faiss.contrib.exhaustive_search import knn_ground_truth, \ + range_ground_truth + + +class TestComputeGT(unittest.TestCase): + + def test_compute_GT(self): + d = 64 + xt, xb, xq = get_dataset_2(d, 0, 10000, 100) + + index = faiss.IndexFlatL2(d) + index.add(xb) + Dref, Iref = index.search(xq, 10) + + # iterator function on the matrix + + def matrix_iterator(xb, bs): + for i0 in range(0, xb.shape[0], bs): + yield xb[i0:i0 + bs] + + Dnew, Inew = knn_ground_truth(xq, matrix_iterator(xb, 1000), 10) + + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_almost_equal(Dref, Dnew, decimal=4) + + def do_test_range(self, metric): + ds = datasets.SyntheticDataset(32, 0, 1000, 10) + xq = ds.get_queries() + xb = ds.get_database() + D, I = faiss.knn(xq, xb, 10, metric=metric) + threshold = float(D[:, -1].mean()) + + index = faiss.IndexFlat(32, metric) + index.add(xb) + ref_lims, ref_D, ref_I = index.range_search(xq, threshold) + + new_lims, new_D, new_I = range_ground_truth( + xq, ds.database_iterator(bs=100), threshold, + metric_type=metric) + + evaluation.test_ref_range_results( + ref_lims, ref_D, ref_I, + new_lims, new_D, new_I + ) + + def test_range_L2(self): + self.do_test_range(faiss.METRIC_L2) + + def test_range_IP(self): + self.do_test_range(faiss.METRIC_INNER_PRODUCT) + + +class TestBigBatchSearch(unittest.TestCase): + + def do_test(self, factory_string): + ds = datasets.SyntheticDataset(32, 2000, 4000, 1000) + k = 10 + index = faiss.index_factory(ds.d, factory_string) + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 5 + Dref, Iref = index.search(ds.get_queries(), k) + res = faiss.StandardGpuResources() + + def pairwise_distances(xq, xb, metric=faiss.METRIC_L2): + return faiss.pairwise_distance_gpu( + res, xq, xb, metric=faiss.METRIC_L2) + + def knn_function(xq, xb, k, metric=faiss.METRIC_L2): + return faiss.knn_gpu(res, xq, xb, k, metric=faiss.METRIC_L2) + + for method in "pairwise_distances", "knn_function": + Dnew, Inew = ivf_tools.big_batch_search( + index, ds.get_queries(), + k, method=method, + pairwise_distances=pairwise_distances, + knn=knn_function + ) + self.assertLess((Inew != Iref).sum() / Iref.size, 1e-4) + 
np.testing.assert_almost_equal(Dnew, Dref, decimal=4) + + def test_Flat(self): + self.do_test("IVF64,Flat") + + def test_PQ(self): + self.do_test("IVF64,PQ4np") + + +class TestBigBatchSearchMultiGPU(unittest.TestCase): + + @unittest.skipIf(faiss.get_num_gpus() < 2, "multiple GPU only test") + def do_test(self, factory_string): + ds = datasets.SyntheticDataset(32, 2000, 4000, 1000) + k = 10 + index = faiss.index_factory(ds.d, factory_string) + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 5 + Dref, Iref = index.search(ds.get_queries(), k) + ngpu = faiss.get_num_gpus() + res = [faiss.StandardGpuResources() for _ in range(ngpu)] + + def knn_function(xq, xb, k, metric=faiss.METRIC_L2, thread_id=None): + return faiss.knn_gpu( + res[thread_id], xq, xb, k, + metric=faiss.METRIC_L2, device=thread_id + ) + + Dnew, Inew = ivf_tools.big_batch_search( + index, ds.get_queries(), + k, method="knn_function", + knn=knn_function, + threaded=8, + computation_threads=ngpu + ) + self.assertLess((Inew != Iref).sum() / Iref.size, 1e-4) + np.testing.assert_almost_equal(Dnew, Dref, decimal=4) + + def test_Flat(self): + self.do_test("IVF64,Flat") diff -Nru faiss-1.7.3/faiss/gpu/test/test_contrib.py faiss-1.7.4/faiss/gpu/test/test_contrib.py --- faiss-1.7.3/faiss/gpu/test/test_contrib.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/test_contrib.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,63 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import faiss -import unittest -import numpy as np - -from faiss.contrib import datasets -from faiss.contrib.exhaustive_search import knn_ground_truth, range_ground_truth -from faiss.contrib import evaluation - - -from common_faiss_tests import get_dataset_2 - - -class TestComputeGT(unittest.TestCase): - - def test_compute_GT(self): - d = 64 - xt, xb, xq = get_dataset_2(d, 0, 10000, 100) - - index = faiss.IndexFlatL2(d) - index.add(xb) - Dref, Iref = index.search(xq, 10) - - # iterator function on the matrix - - def matrix_iterator(xb, bs): - for i0 in range(0, xb.shape[0], bs): - yield xb[i0:i0 + bs] - - Dnew, Inew = knn_ground_truth(xq, matrix_iterator(xb, 1000), 10) - - np.testing.assert_array_equal(Iref, Inew) - np.testing.assert_almost_equal(Dref, Dnew, decimal=4) - - def do_test_range(self, metric): - ds = datasets.SyntheticDataset(32, 0, 1000, 10) - xq = ds.get_queries() - xb = ds.get_database() - D, I = faiss.knn(xq, xb, 10, metric=metric) - threshold = float(D[:, -1].mean()) - - index = faiss.IndexFlat(32, metric) - index.add(xb) - ref_lims, ref_D, ref_I = index.range_search(xq, threshold) - - new_lims, new_D, new_I = range_ground_truth( - xq, ds.database_iterator(bs=100), threshold, - metric_type=metric) - - evaluation.test_ref_range_results( - ref_lims, ref_D, ref_I, - new_lims, new_D, new_I - ) - - def test_range_L2(self): - self.do_test_range(faiss.METRIC_L2) - - def test_range_IP(self): - self.do_test_range(faiss.METRIC_INNER_PRODUCT) diff -Nru faiss-1.7.3/faiss/gpu/test/test_gpu_basics.py faiss-1.7.4/faiss/gpu/test/test_gpu_basics.py --- faiss-1.7.3/faiss/gpu/test/test_gpu_basics.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/test_gpu_basics.py 2023-04-19 13:18:30.000000000 +0000 @@ -8,6 +8,7 @@ import unittest import numpy as np import faiss +import random from common_faiss_tests import get_dataset_2 class ReferencedObject(unittest.TestCase): @@ 
-252,6 +253,7 @@ params.numQueries = nq params.outDistances = faiss.swig_ptr(out_d) params.outIndices = faiss.swig_ptr(out_i) + params.device = random.randrange(0, faiss.get_num_gpus()) faiss.bfKnn(res, params) @@ -279,6 +281,7 @@ params.vectorType = faiss.DistanceDataType_F16 params.queries = faiss.swig_ptr(qs_f16) params.queryType = faiss.DistanceDataType_F16 + params.device = random.randrange(0, faiss.get_num_gpus()) out_d_f16 = np.empty((nq, k), dtype=np.float32) out_i_f16 = np.empty((nq, k), dtype=np.int64) @@ -286,6 +289,7 @@ params.outDistances = faiss.swig_ptr(out_d_f16) params.outIndices = faiss.swig_ptr(out_i_f16) params.outIndicesType = faiss.IndicesDataType_I64 + params.device = random.randrange(0, faiss.get_num_gpus()) faiss.bfKnn(res, params) @@ -301,7 +305,8 @@ faiss.METRIC_Linf, faiss.METRIC_Canberra, faiss.METRIC_BrayCurtis, - faiss.METRIC_JensenShannon + faiss.METRIC_JensenShannon, + faiss.METRIC_Jaccard ] for metric in metrics: @@ -335,6 +340,7 @@ params.queries = faiss.swig_ptr(qs) params.numQueries = nq params.outDistances = faiss.swig_ptr(out_d) + params.device = random.randrange(0, faiss.get_num_gpus()) faiss.bfKnn(res, params) @@ -344,7 +350,7 @@ # INNER_PRODUCT is in descending order, make sure it is the same # order - if metric == faiss.METRIC_INNER_PRODUCT: + if faiss.is_similarity_metric(metric): ref_d = np.sort(ref_d, axis=1) print('f32', np.abs(ref_d - out_d).max()) @@ -367,6 +373,7 @@ out_d_f16 = np.empty((nq, k), dtype=np.float32) params.outDistances = faiss.swig_ptr(out_d_f16) + params.device = random.randrange(0, faiss.get_num_gpus()) faiss.bfKnn(res, params) @@ -376,7 +383,7 @@ # INNER_PRODUCT is in descending order, make sure it is the same # order - if metric == faiss.METRIC_INNER_PRODUCT: + if faiss.is_similarity_metric(metric): ref_d_f16 = np.sort(ref_d_f16, axis=1) print('f16', np.abs(ref_d_f16 - out_d_f16).max()) @@ -394,7 +401,7 @@ class TestResidualQuantizer(unittest.TestCase): def test_with_gpu(self): - """ check that we get the same resutls with a GPU quantizer and a CPU quantizer """ + """ check that we get the same results with a GPU quantizer and a CPU quantizer """ d = 32 nt = 3000 nb = 1000 @@ -419,8 +426,3 @@ self.assertTrue(0.9 * err_rq0 < err_rq1 < 1.1 * err_rq0) # np.testing.assert_array_equal(codes0, codes1) - - - -if __name__ == '__main__': - unittest.main() diff -Nru faiss-1.7.3/faiss/gpu/test/TestGpuDistance.cu faiss-1.7.4/faiss/gpu/test/TestGpuDistance.cu --- faiss-1.7.3/faiss/gpu/test/TestGpuDistance.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/TestGpuDistance.cu 2023-04-19 13:18:30.000000000 +0000 @@ -37,7 +37,8 @@ std::vector vecs = randVecs(numVecs, dim); std::vector queries = randVecs(numQuery, dim); - if (metric == faiss::MetricType::METRIC_JensenShannon) { + if ((metric == faiss::MetricType::METRIC_JensenShannon) || + (metric == faiss::MetricType::METRIC_Jaccard)) { // make values positive for (auto& v : vecs) { v = std::abs(v); @@ -60,7 +61,7 @@ cpuIndex.add(numVecs, vecs.data()); std::vector cpuDistance(numQuery * k, 0); - std::vector cpuIndices(numQuery * k, -1); + std::vector cpuIndices(numQuery * k, -1); cpuIndex.search( numQuery, queries.data(), k, cpuDistance.data(), cpuIndices.data()); @@ -97,7 +98,7 @@ runTransposeAny(gpuQueries, 0, 1, queriesT, stream); std::vector gpuDistance(numQuery * k, 0); - std::vector gpuIndices(numQuery * k, -1); + std::vector gpuIndices(numQuery * k, -1); GpuDistanceParams args; args.metric = metric; @@ -112,6 +113,7 @@ args.numQueries = numQuery; args.outDistances 
= gpuDistance.data(); args.outIndices = gpuIndices.data(); + args.device = device; bfKnn(&res, args); @@ -191,6 +193,10 @@ testTransposition(false, false, faiss::MetricType::METRIC_JensenShannon); } +TEST(TestGpuDistance, Jaccard) { + testTransposition(false, false, faiss::MetricType::METRIC_Jaccard); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff -Nru faiss-1.7.3/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp faiss-1.7.4/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp --- faiss-1.7.3/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/TestGpuIndexBinaryFlat.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -18,9 +18,9 @@ void compareBinaryDist( const std::vector& cpuDist, - const std::vector& cpuLabels, + const std::vector& cpuLabels, const std::vector& gpuDist, - const std::vector& gpuLabels, + const std::vector& gpuLabels, int numQuery, int k) { for (int i = 0; i < numQuery; ++i) { @@ -29,8 +29,8 @@ // encounters the values. The last set of equivalent distances seen in // the min-k might be truncated, so we can't check that set, but all // others we can check. - std::set cpuLabelSet; - std::set gpuLabelSet; + std::set cpuLabelSet; + std::set gpuLabelSet; int curDist = -1; @@ -89,13 +89,13 @@ auto query = faiss::gpu::randBinaryVecs(numQuery, dims); std::vector cpuDist(numQuery * k); - std::vector cpuLabels(numQuery * k); + std::vector cpuLabels(numQuery * k); cpuIndex.search( numQuery, query.data(), k, cpuDist.data(), cpuLabels.data()); std::vector gpuDist(numQuery * k); - std::vector gpuLabels(numQuery * k); + std::vector gpuLabels(numQuery * k); gpuIndex.search( numQuery, query.data(), k, gpuDist.data(), gpuLabels.data()); @@ -115,6 +115,55 @@ } } +TEST(TestGpuIndexBinaryFlat, LargeIndex) { + // Construct on a random device to test multi-device, if we have + // multiple devices + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + // Skip this device if we do not have sufficient memory + constexpr size_t kMem = size_t(8) * 1024 * 1024 * 1024; + + if (faiss::gpu::getFreeMemory(device) < kMem) { + std::cerr << "TestGpuIndexFlat.LargeIndex: skipping due " + "to insufficient device memory\n"; + return; + } + + std::cerr << "Running LargeIndex test\n"; + + faiss::gpu::GpuIndexBinaryFlatConfig config; + config.device = device; + + int dims = 1250 * 8; + faiss::gpu::GpuIndexBinaryFlat gpuIndex(&res, dims, config); + + faiss::IndexBinaryFlat cpuIndex(dims); + + int k = 10; + int nb = 4000000; + int nq = 10; + + auto xb = faiss::gpu::randBinaryVecs(nb, dims); + auto xq = faiss::gpu::randBinaryVecs(nq, dims); + gpuIndex.add(nb, xb.data()); + cpuIndex.add(nb, xb.data()); + + std::vector cpuDist(nq * k); + std::vector cpuLabels(nq * k); + + cpuIndex.search(nq, xq.data(), k, cpuDist.data(), cpuLabels.data()); + + std::vector gpuDist(nq * k); + std::vector gpuLabels(nq * k); + + gpuIndex.search(nq, xq.data(), k, gpuDist.data(), gpuLabels.data()); + + compareBinaryDist(cpuDist, cpuLabels, gpuDist, gpuLabels, nq, k); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff -Nru faiss-1.7.3/faiss/gpu/test/TestGpuIndexFlat.cpp faiss-1.7.4/faiss/gpu/test/TestGpuIndexFlat.cpp --- faiss-1.7.3/faiss/gpu/test/TestGpuIndexFlat.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/TestGpuIndexFlat.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -141,6 +141,20 @@ } } +// At least one test for the k > 1024 select 
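The TestGpuDistance changes above (idx_t labels, the Jaccard metric, args.device) exercise the brute-force bfKnn entry point. Below is a sketch of a minimal caller filling GpuDistanceParams, including the explicitly set device; field names are taken from the test above plus the GpuDistance.h struct as I recall it, so treat them as assumptions rather than a verified listing:

#include <cstdlib>
#include <vector>

#include <faiss/gpu/GpuDistance.h>
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    int dim = 32, nb = 10000, nq = 8, k = 5;
    std::vector<float> xb(size_t(nb) * dim), xq(size_t(nq) * dim);
    for (auto& v : xb) v = float(rand()) / RAND_MAX;
    for (auto& v : xq) v = float(rand()) / RAND_MAX;

    std::vector<float> distances(size_t(nq) * k);
    std::vector<faiss::idx_t> indices(size_t(nq) * k);

    faiss::gpu::StandardGpuResources res;

    faiss::gpu::GpuDistanceParams args;
    args.metric = faiss::METRIC_L2;
    args.k = k;
    args.dims = dim;
    args.vectors = xb.data();
    args.numVectors = nb;
    args.queries = xq.data();
    args.numQueries = nq;
    args.outDistances = distances.data();
    args.outIndices = indices.data();
    args.device = 0; // run on an explicit GPU rather than the current device

    faiss::gpu::bfKnn(&res, args);
    return 0;
}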
+TEST(TestGpuIndexFlat, L2_k_2048) { + if (faiss::gpu::getMaxKSelection() >= 2048) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; + opt.kOverride = 2048; + opt.dimOverride = 128; + opt.numVecsOverride = 10000; + + testFlat(opt); + } +} + // test specialized k == 1 codepath TEST(TestGpuIndexFlat, L2_Float32_K1) { for (int tries = 0; tries < 3; ++tries) { @@ -220,7 +234,7 @@ std::vector queries(numQuery * dim, 1.0f); std::vector dist(numQuery * k, 0); - std::vector ind(numQuery * k); + std::vector ind(numQuery * k); gpuIndex.search(numQuery, queries.data(), k, dist.data(), ind.data()); @@ -437,7 +451,7 @@ cpuIndex.add(numVecs, vecs.data()); gpuIndex.add(numVecs, vecs.data()); - auto indexVecs = std::vector{0, 2, 4, 6, 8}; + auto indexVecs = std::vector{0, 2, 4, 6, 8}; auto queryVecs = faiss::gpu::randVecs(indexVecs.size(), dim); auto residualsCpu = std::vector(indexVecs.size() * dim); @@ -517,7 +531,7 @@ // Test reconstruct_batch if (false) { - auto reconstructKeys = std::vector{1, 3, 5}; + auto reconstructKeys = std::vector{1, 3, 5}; auto reconstructVecs = std::vector(reconstructKeys.size() * dim); @@ -565,7 +579,7 @@ gpuIndex.add(nb, xb.data()); std::vector refDistance(nq * k, 0); - std::vector refIndices(nq * k, -1); + std::vector refIndices(nq * k, -1); std::vector refReconstruct(nq * k * dim, 0); cpuIndex.search_and_reconstruct( nq, @@ -576,7 +590,7 @@ refReconstruct.data()); std::vector testDistance(nq * k, 0); - std::vector testIndices(nq * k, -1); + std::vector testIndices(nq * k, -1); std::vector testReconstruct(nq * k * dim, 0); gpuIndex.search_and_reconstruct( nq, @@ -606,7 +620,7 @@ // above will ensure a decent number of matches), reconstruction should be // the same for the vectors that do match for (int i = 0; i < nq; ++i) { - std::unordered_map refLocation; + std::unordered_map refLocation; for (int j = 0; j < k; ++j) { refLocation.insert(std::make_pair(refIndices[i * k + j], j)); diff -Nru faiss-1.7.3/faiss/gpu/test/TestGpuIndexIVFFlat.cpp faiss-1.7.4/faiss/gpu/test/TestGpuIndexIVFFlat.cpp --- faiss-1.7.3/faiss/gpu/test/TestGpuIndexIVFFlat.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/TestGpuIndexIVFFlat.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -64,13 +64,10 @@ }; void queryTest( + Options opt, faiss::MetricType metricType, - bool useFloat16CoarseQuantizer, - int dimOverride = -1) { + bool useFloat16CoarseQuantizer) { for (int tries = 0; tries < 2; ++tries) { - Options opt; - opt.dim = dimOverride != -1 ? 
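The new L2_k_2048 test only runs when selection beyond the old k = 1024 limit is available, which it probes with getMaxKSelection(). A small guard sketch; the include path is assumed from the faiss source layout:

#include <cstdio>

#include <faiss/gpu/utils/DeviceUtils.h>

int main() {
    int maxK = faiss::gpu::getMaxKSelection();
    if (maxK >= 2048) {
        printf("large-k selection available, up to k = %d\n", maxK);
    } else {
        printf("k-selection limited to %d on this build/device\n", maxK);
    }
    return 0;
}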
dimOverride : opt.dim; - std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -98,7 +95,7 @@ faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; bool compFloat16 = useFloat16CoarseQuantizer; faiss::gpu::compareIndices( @@ -147,7 +144,7 @@ faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; cpuIndex.add(opt.numAdd, addVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); @@ -183,7 +180,7 @@ &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; // use garbage values to see if we overwrite then faiss::IndexFlatL2 cpuQuantizer(1); @@ -199,7 +196,7 @@ EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); EXPECT_EQ(cpuIndex.d, opt.dim); EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); testIVFEquality(cpuIndex, gpuIndex); @@ -239,7 +236,7 @@ config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); - gpuIndex.setNumProbes(1); + gpuIndex.nprobe = 1; gpuIndex.copyFrom(&cpuIndex); @@ -249,7 +246,7 @@ EXPECT_EQ(cpuIndex.d, gpuIndex.d); EXPECT_EQ(cpuIndex.d, opt.dim); EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); testIVFEquality(cpuIndex, gpuIndex); @@ -288,21 +285,28 @@ // TEST(TestGpuIndexIVFFlat, Float32_Query_L2) { - queryTest(faiss::METRIC_L2, false); + queryTest(Options(), faiss::METRIC_L2, false); } TEST(TestGpuIndexIVFFlat, Float32_Query_IP) { - queryTest(faiss::METRIC_INNER_PRODUCT, false); + queryTest(Options(), faiss::METRIC_INNER_PRODUCT, false); +} + +TEST(TestGpuIndexIVFFlat, LargeBatch) { + Options opt; + opt.dim = 3; + opt.numQuery = 100000; + queryTest(opt, faiss::METRIC_L2, false); } // float16 coarse quantizer TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) { - queryTest(faiss::METRIC_L2, true); + queryTest(Options(), faiss::METRIC_L2, true); } TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) { - queryTest(faiss::METRIC_INNER_PRODUCT, true); + queryTest(Options(), faiss::METRIC_INNER_PRODUCT, true); } // @@ -311,19 +315,27 @@ // TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) { - queryTest(faiss::METRIC_L2, false, 64); + Options opt; + opt.dim = 64; + queryTest(opt, faiss::METRIC_L2, false); } TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) { - queryTest(faiss::METRIC_INNER_PRODUCT, false, 64); + Options opt; + opt.dim = 64; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); } TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) { - queryTest(faiss::METRIC_L2, false, 128); + Options opt; + opt.dim = 128; + queryTest(opt, faiss::METRIC_L2, false); } TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) { - queryTest(faiss::METRIC_INNER_PRODUCT, false, 128); + Options opt; + opt.dim = 128; + queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); } // @@ -372,7 +384,7 @@ faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); 
gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; // Construct a positive test set auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); @@ -419,7 +431,7 @@ faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); @@ -429,7 +441,7 @@ numQuery * opt.dim, std::numeric_limits::quiet_NaN()); std::vector distances(numQuery * opt.k, 0); - std::vector indices(numQuery * opt.k, 0); + std::vector indices(numQuery * opt.k, 0); gpuIndex.search( numQuery, nans.data(), opt.k, distances.data(), indices.data()); @@ -457,7 +469,7 @@ faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; int numNans = 10; std::vector nans( @@ -478,7 +490,7 @@ std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); std::vector distance(opt.numQuery * opt.k, 0); - std::vector indices(opt.numQuery * opt.k, 0); + std::vector indices(opt.numQuery * opt.k, 0); // should not crash gpuIndex.search( @@ -531,7 +543,65 @@ faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(nprobe); + gpuIndex.nprobe = nprobe; + + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + numQuery, + dim, + k, + "Unified Memory", + kF32MaxRelErr, + 0.1f, + 0.015f); +} + +TEST(TestGpuIndexIVFFlat, LongIVFList) { + int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + // Skip this device if we do not have sufficient memory + constexpr size_t kMem = size_t(24) * 1024 * 1024 * 1024; + + if (faiss::gpu::getFreeMemory(device) < kMem) { + std::cout << "TestGpuIndexIVFFlat.LongIVFList: skipping due " + "to insufficient device memory\n"; + return; + } + + std::cout << "Running LongIVFList test\n"; + + // Test functionality where a single IVF list has more than 2B code values + int dim = 64; + + int numCentroids = 1; + size_t numAdd = (size_t(1024) * 1024 * 1024 * 2 + 100000) / dim; + size_t numTrain = 100; + int numQuery = 5; + int k = 10; + + std::vector trainVecs = faiss::gpu::randVecs(numTrain, dim); + std::vector addVecs = faiss::gpu::randVecs(numAdd, dim); + + faiss::IndexFlatL2 quantizer(dim); + faiss::IndexIVFFlat cpuIndex( + &quantizer, dim, numCentroids, faiss::METRIC_L2); + + cpuIndex.train(numTrain, trainVecs.data()); + cpuIndex.add(numAdd, addVecs.data()); + cpuIndex.nprobe = 1; + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = device; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, dim, numCentroids, faiss::METRIC_L2, config); + gpuIndex.train(numTrain, trainVecs.data()); + gpuIndex.add(numAdd, addVecs.data()); + gpuIndex.nprobe = 1; faiss::gpu::compareIndices( cpuIndex, diff -Nru faiss-1.7.3/faiss/gpu/test/TestGpuIndexIVFPQ.cpp faiss-1.7.4/faiss/gpu/test/TestGpuIndexIVFPQ.cpp --- faiss-1.7.3/faiss/gpu/test/TestGpuIndexIVFPQ.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/TestGpuIndexIVFPQ.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -137,7 +137,58 @@ config.useFloat16LookupTables = opt.useFloat16; faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; + + faiss::gpu::compareIndices( + cpuIndex, 
+ gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); + } +} + +// Large batch sizes (>= 65536) should also work +TEST(TestGpuIndexIVFPQ, LargeBatch) { + for (bool usePrecomputed : {false, true}) { + Options opt; + + // override for large sizes + opt.dim = 4; + opt.numQuery = 100000; + opt.codes = 2; + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 coarseQuantizer(opt.dim); + faiss::IndexIVFPQ cpuIndex( + &coarseQuantizer, + opt.dim, + opt.numCentroids, + opt.codes, + opt.bitsPerCode); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + // Use the default temporary memory management to test the memory + // manager + faiss::gpu::StandardGpuResources res; + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = false; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.nprobe = opt.nprobe; faiss::gpu::compareIndices( cpuIndex, @@ -189,7 +240,7 @@ config.flatConfig.useFloat16 = (tries % 2 == 1); faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; faiss::gpu::compareIndices( cpuIndex, @@ -238,7 +289,7 @@ config.useFloat16LookupTables = (dimPerSubQ == 7); faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; faiss::gpu::compareIndices( cpuIndex, @@ -293,7 +344,7 @@ config.useFloat16LookupTables = opt.useFloat16; faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; faiss::gpu::compareIndices( cpuIndex, @@ -335,7 +386,7 @@ config.useFloat16LookupTables = opt.useFloat16; faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; gpuIndex.add(opt.numAdd, addVecs.data()); cpuIndex.add(opt.numAdd, addVecs.data()); @@ -381,7 +432,7 @@ config.useFloat16LookupTables = opt.useFloat16; faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; gpuIndex.add(opt.numAdd, addVecs.data()); cpuIndex.add(opt.numAdd, addVecs.data()); @@ -429,7 +480,7 @@ config.useFloat16LookupTables = opt.useFloat16; faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; gpuIndex.add(opt.numAdd, addVecs.data()); cpuIndex.add(opt.numAdd, addVecs.data()); @@ -472,7 +523,7 @@ opt.bitsPerCode, faiss::METRIC_L2, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); @@ -488,7 +539,7 @@ EXPECT_EQ(cpuIndex.d, gpuIndex.d); EXPECT_EQ(cpuIndex.d, opt.dim); EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); EXPECT_EQ(cpuIndex.pq.M, gpuIndex.getNumSubQuantizers()); EXPECT_EQ(gpuIndex.getNumSubQuantizers(), opt.codes); EXPECT_EQ(cpuIndex.pq.nbits, gpuIndex.getBitsPerCode()); @@ -538,7 +589,7 @@ // Use garbage values to see if we overwrite them 
faiss::gpu::GpuIndexIVFPQ gpuIndex( &res, 1, 1, 1, 8, faiss::METRIC_L2, config); - gpuIndex.setNumProbes(1); + gpuIndex.nprobe = 1; gpuIndex.copyFrom(&cpuIndex); @@ -549,7 +600,7 @@ EXPECT_EQ(cpuIndex.d, gpuIndex.d); EXPECT_EQ(cpuIndex.d, opt.dim); EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); EXPECT_EQ(cpuIndex.pq.M, gpuIndex.getNumSubQuantizers()); EXPECT_EQ(gpuIndex.getNumSubQuantizers(), opt.codes); EXPECT_EQ(cpuIndex.pq.nbits, gpuIndex.getBitsPerCode()); @@ -594,7 +645,7 @@ faiss::METRIC_L2, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); @@ -604,7 +655,7 @@ numQuery * opt.dim, std::numeric_limits::quiet_NaN()); std::vector distances(numQuery * opt.k, 0); - std::vector indices(numQuery * opt.k, 0); + std::vector indices(numQuery * opt.k, 0); gpuIndex.search( numQuery, nans.data(), opt.k, distances.data(), indices.data()); @@ -640,7 +691,7 @@ faiss::METRIC_L2, config); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; int numNans = 10; std::vector nans( @@ -660,7 +711,7 @@ std::vector queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); std::vector distance(opt.numQuery * opt.k, 0); - std::vector indices(opt.numQuery * opt.k, 0); + std::vector indices(opt.numQuery * opt.k, 0); // should not crash gpuIndex.search( @@ -721,7 +772,7 @@ faiss::METRIC_L2, config); gpuIndex.copyFrom(&cpuIndex); - gpuIndex.setNumProbes(nprobe); + gpuIndex.nprobe = nprobe; faiss::gpu::compareIndices( cpuIndex, diff -Nru faiss-1.7.3/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp faiss-1.7.4/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp --- faiss-1.7.3/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/TestGpuIndexIVFScalarQuantizer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -79,7 +79,7 @@ &res, opt.dim, opt.numCentroids, qtype, METRIC_L2, true, config); gpuIndex.train(opt.numTrain, trainVecs.data()); gpuIndex.add(opt.numAdd, addVecs.data()); - gpuIndex.setNumProbes(opt.nprobe); + gpuIndex.nprobe = opt.nprobe; // use garbage values to see if we overwrite then IndexFlatL2 cpuQuantizer(1); @@ -100,7 +100,7 @@ EXPECT_EQ(cpuIndex.quantizer->d, gpuIndex.quantizer->d); EXPECT_EQ(cpuIndex.d, opt.dim); EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); testIVFEquality(cpuIndex, gpuIndex); @@ -172,7 +172,7 @@ METRIC_L2, false, config); - gpuIndex.setNumProbes(1); + gpuIndex.nprobe = 1; gpuIndex.copyFrom(&cpuIndex); @@ -182,7 +182,7 @@ EXPECT_EQ(cpuIndex.d, gpuIndex.d); EXPECT_EQ(cpuIndex.d, opt.dim); EXPECT_EQ(cpuIndex.nlist, gpuIndex.getNumLists()); - EXPECT_EQ(cpuIndex.nprobe, gpuIndex.getNumProbes()); + EXPECT_EQ(cpuIndex.nprobe, gpuIndex.nprobe); testIVFEquality(cpuIndex, gpuIndex); diff -Nru faiss-1.7.3/faiss/gpu/test/test_gpu_index_ivfsq.py faiss-1.7.4/faiss/gpu/test/test_gpu_index_ivfsq.py --- faiss-1.7.3/faiss/gpu/test/test_gpu_index_ivfsq.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/test_gpu_index_ivfsq.py 2023-04-19 13:18:30.000000000 +0000 @@ -215,7 +215,3 @@ def test_8bit_direct(self): do_multi_test(faiss.ScalarQuantizer.QT_8bit_direct) - - -if __name__ == '__main__': - unittest.main() diff -Nru faiss-1.7.3/faiss/gpu/test/test_gpu_index.py 
faiss-1.7.4/faiss/gpu/test/test_gpu_index.py --- faiss-1.7.3/faiss/gpu/test/test_gpu_index.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/test_gpu_index.py 2023-04-19 13:18:30.000000000 +0000 @@ -6,7 +6,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals import math -import time import unittest import numpy as np import faiss @@ -14,164 +13,6 @@ from faiss.contrib import ivf_tools from faiss.contrib.evaluation import knn_intersection_measure -class EvalIVFPQAccuracy(unittest.TestCase): - - def get_dataset(self, small_one=False): - if not small_one: - d = 128 - nb = 100000 - nt = 15000 - nq = 2000 - else: - d = 32 - nb = 10000 - nt = 1000 - nq = 200 - np.random.seed(123) - - # generate points in a low-dim subspace to make the resutls - # look better :-) - d1 = 16 - q, r = np.linalg.qr(np.random.randn(d, d)) - qc = q[:d1, :] - def make_mat(n): - return np.dot( - np.random.random(size=(nb, d1)), qc).astype('float32') - - return (make_mat(nt), make_mat(nb), make_mat(nq)) - - - def test_mm(self): - # trouble with MKL+fbmake that appears only at runtime. Check it here - x = np.random.random(size=(100, 20)).astype('float32') - mat = faiss.PCAMatrix(20, 10) - mat.train(x) - mat.apply_py(x) - - def do_cpu_to_gpu(self, index_key): - ts = [] - ts.append(time.time()) - (xt, xb, xq) = self.get_dataset(small_one=True) - nb, d = xb.shape - - index = faiss.index_factory(d, index_key) - if index.__class__ == faiss.IndexIVFPQ: - # speed up test - index.pq.cp.niter = 2 - index.do_polysemous_training = False - ts.append(time.time()) - - index.train(xt) - ts.append(time.time()) - - # adding some ids because there was a bug in this case; - # those need to be cast to idx_t(= int64_t), because - # on windows the numpy int default is int32 - ids = (np.arange(nb) * 3 + 12345).astype('int64') - index.add_with_ids(xb, ids) - ts.append(time.time()) - - index.nprobe = 4 - Dref, Iref = index.search(xq, 10) - ts.append(time.time()) - - res = faiss.StandardGpuResources() - gpu_index = faiss.index_cpu_to_gpu(res, 0, index) - ts.append(time.time()) - - # Validate the layout of the memory info - mem_info = res.getMemoryInfo() - - assert type(mem_info) == dict - assert type(mem_info[0]['FlatData']) == tuple - assert type(mem_info[0]['FlatData'][0]) == int - assert type(mem_info[0]['FlatData'][1]) == int - - gpu_index.setNumProbes(4) - - Dnew, Inew = gpu_index.search(xq, 10) - ts.append(time.time()) - print('times:', [t - ts[0] for t in ts]) - - # Give us some margin of error - self.assertGreaterEqual((Iref == Inew).sum(), Iref.size - 50) - - if faiss.get_num_gpus() == 1: - return - - for shard in False, True: - - # test on just 2 GPUs - res = [faiss.StandardGpuResources() for i in range(2)] - co = faiss.GpuMultipleClonerOptions() - co.shard = shard - - gpu_index = faiss.index_cpu_to_gpu_multiple_py(res, index, co) - - faiss.GpuParameterSpace().set_index_parameter( - gpu_index, 'nprobe', 4) - - Dnew, Inew = gpu_index.search(xq, 10) - - # 0.99: allow some tolerance in results otherwise test - # fails occasionally (not reproducible) - self.assertGreaterEqual((Iref == Inew).sum(), Iref.size * 0.99) - - def test_cpu_to_gpu_IVFPQ(self): - self.do_cpu_to_gpu('IVF128,PQ4') - - def test_cpu_to_gpu_IVFFlat(self): - self.do_cpu_to_gpu('IVF128,Flat') - - def test_set_gpu_param(self): - index = faiss.index_factory(12, "PCAR8,IVF10,PQ4") - res = faiss.StandardGpuResources() - gpu_index = faiss.index_cpu_to_gpu(res, 0, index) - 
faiss.GpuParameterSpace().set_index_parameter(gpu_index, "nprobe", 3) - - - -class TestShardedFlat(unittest.TestCase): - - @unittest.skipIf(faiss.get_num_gpus() < 2, "Relevant for multiple GPU only.") - def test_sharded(self): - d = 32 - nb = 1000 - nq = 200 - k = 10 - rs = np.random.RandomState(123) - xb = rs.rand(nb, d).astype('float32') - xq = rs.rand(nq, d).astype('float32') - - index_cpu = faiss.IndexFlatL2(d) - - assert faiss.get_num_gpus() > 1 - - co = faiss.GpuMultipleClonerOptions() - co.shard = True - index = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2) - - index.add(xb) - D, I = index.search(xq, k) - - index_cpu.add(xb) - D_ref, I_ref = index_cpu.search(xq, k) - - assert np.all(I == I_ref) - - del index - index2 = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2) - D2, I2 = index2.search(xq, k) - - assert np.all(I2 == I_ref) - - try: - index2.add(xb) - except RuntimeError: - pass - else: - assert False, "this call should fail!" - class TestIVFSearchPreassigned(unittest.TestCase): def test_ivfflat_search_preassigned(self): @@ -678,20 +519,17 @@ # invalid k (should be > 0) k = -5 - idx.setNumProbes(3) + idx.nprobe = 3 self.assertRaises(AssertionError, idx.search, xb[10:20], k) - # invalid nprobe (should be > 0) - self.assertRaises(RuntimeError, idx.setNumProbes, 0) - self.assertRaises(RuntimeError, idx.setNumProbes, -3) - - k = 5 - idx.nprobe = -3 - self.assertRaises(RuntimeError, idx.search, xb[10:20], k) + # nprobe is unsigned now, so this is caught before reaching C++ + # k = 5 + # idx.nprobe = -3 + # self.assertRaises(RuntimeError, idx.search, xb[10:20], k) # valid params k = 5 - idx.setNumProbes(3) + idx.nprobe = 3 _, I = idx.search(xb[10:20], k) self.assertTrue(np.array_equal(xb_indices[10:20], I[:, 0])) @@ -735,5 +573,19 @@ self.subtest_gpu_encoding(ngpu) -if __name__ == '__main__': - unittest.main() +class TestGpuAutoTune(unittest.TestCase): + + def test_params(self): + index = faiss.index_factory(32, "IVF65536_HNSW,PQ16") + index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index) + ps = faiss.GpuParameterSpace() + ps.initialize(index) + for i in range(ps.parameter_ranges.size()): + pr = ps.parameter_ranges.at(i) + if pr.name == "quantizer_efSearch": + break + else: + self.fail("should include efSearch") + ps.set_index_parameter(index, "quantizer_efSearch", 123) + quantizer = faiss.downcast_index(index.quantizer) + self.assertEqual(quantizer.hnsw.efSearch, 123) diff -Nru faiss-1.7.3/faiss/gpu/test/test_gpu_index_serialize.py faiss-1.7.4/faiss/gpu/test/test_gpu_index_serialize.py --- faiss-1.7.3/faiss/gpu/test/test_gpu_index_serialize.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/test_gpu_index_serialize.py 2023-04-19 13:18:30.000000000 +0000 @@ -3,13 +3,9 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import math -import time import unittest import numpy as np import faiss -import tempfile -import os def make_t(num, d): rs = np.random.RandomState(123) diff -Nru faiss-1.7.3/faiss/gpu/test/TestGpuSelect.cu faiss-1.7.4/faiss/gpu/test/TestGpuSelect.cu --- faiss-1.7.3/faiss/gpu/test/TestGpuSelect.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/TestGpuSelect.cu 2023-04-19 13:18:30.000000000 +0000 @@ -5,6 +5,7 @@ * LICENSE file in the root directory of this source tree. 
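The new TestGpuAutoTune case above verifies that GpuParameterSpace on a GPU-cloned index now lists and sets the coarse quantizer's efSearch. A hedged C++ counterpart of that flow; the index key and parameter value are illustrative, and the entry points are assumed to match faiss/gpu/GpuAutoTune.h and faiss/gpu/GpuCloner.h:

    // Sketch: enumerate and set auto-tune parameters on a GPU clone.
    // "quantizer_efSearch" applies when the coarse quantizer is HNSW.
    #include <faiss/AutoTune.h>
    #include <faiss/index_factory.h>
    #include <faiss/gpu/GpuAutoTune.h>
    #include <faiss/gpu/GpuCloner.h>
    #include <faiss/gpu/StandardGpuResources.h>
    #include <memory>

    int main() {
        std::unique_ptr<faiss::Index> cpu_index(
                faiss::index_factory(32, "IVF1024_HNSW,PQ16"));

        faiss::gpu::StandardGpuResources res;
        std::unique_ptr<faiss::Index> gpu_index(
                faiss::gpu::index_cpu_to_gpu(&res, 0, cpu_index.get()));

        faiss::gpu::GpuParameterSpace ps;
        ps.initialize(gpu_index.get()); // lists nprobe, quantizer_efSearch, ...
        ps.set_index_parameter(gpu_index.get(), "quantizer_efSearch", 64);
        return 0;
    }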
*/ +#include #include #include #include @@ -20,6 +21,7 @@ #include void testForSize(int rows, int cols, int k, bool dir, bool warp) { + using namespace faiss; using namespace faiss::gpu; StandardGpuResources res; @@ -65,7 +67,7 @@ res.getResources().get(), makeDevAlloc(AllocType::Other, 0), {rows, k}); - DeviceTensor gpuOutInd( + DeviceTensor gpuOutInd( res.getResources().get(), makeDevAlloc(AllocType::Other, 0), {rows, k}); @@ -78,10 +80,10 @@ // Copy back to CPU HostTensor outVal(gpuOutVal, 0); - HostTensor outInd(gpuOutInd, 0); + HostTensor outInd(gpuOutInd, 0); for (int r = 0; r < rows; ++r) { - std::unordered_map seenIndices; + std::unordered_map seenIndices; for (int i = 0; i < k; ++i) { float gpuV = outVal[r][i]; @@ -97,8 +99,8 @@ // equivalent values is different than the CPU (and will remain // unspecified, since this is affected by the choice of // k-selection algorithm that we use) - int gpuInd = outInd[r][i]; - int cpuInd = hostOutValAndInd[r][i].first; + idx_t gpuInd = outInd[r][i]; + idx_t cpuInd = hostOutValAndInd[r][i].first; // We should never see duplicate indices, however auto itSeenIndex = seenIndices.find(gpuInd); diff -Nru faiss-1.7.3/faiss/gpu/test/test_multi_gpu.py faiss-1.7.4/faiss/gpu/test/test_multi_gpu.py --- faiss-1.7.3/faiss/gpu/test/test_multi_gpu.py 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/test_multi_gpu.py 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,215 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import time +import unittest +import numpy as np +import faiss + +from faiss.contrib.datasets import SyntheticDataset + + +class TestShardedFlat(unittest.TestCase): + + @unittest.skipIf(faiss.get_num_gpus() < 2, "multiple GPU only test") + def test_sharded(self): + d = 32 + nb = 1000 + nq = 200 + k = 10 + rs = np.random.RandomState(123) + xb = rs.rand(nb, d).astype('float32') + xq = rs.rand(nq, d).astype('float32') + + index_cpu = faiss.IndexFlatL2(d) + + assert faiss.get_num_gpus() > 1 + + co = faiss.GpuMultipleClonerOptions() + co.shard = True + index = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2) + + index.add(xb) + D, I = index.search(xq, k) + + index_cpu.add(xb) + D_ref, I_ref = index_cpu.search(xq, k) + + assert np.all(I == I_ref) + + del index + index2 = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2) + D2, I2 = index2.search(xq, k) + + assert np.all(I2 == I_ref) + + try: + index2.add(xb) + except RuntimeError: + pass + else: + raise AssertionError("errpr: call should fail but isn't failing") + + @unittest.skipIf(faiss.get_num_gpus() < 2, "multiple GPU only test") + def do_test_sharded_ivf(self, index_key): + ds = SyntheticDataset(32, 8000, 10000, 100) + index = faiss.index_factory(ds.d, index_key) + if 'HNSW' in index_key: + # make a bit more reproducible... 
+ faiss.ParameterSpace().set_index_parameter( + index, 'quantizer_efSearch', 40) + index.train(ds.get_train()) + index.add(ds.get_database()) + Dref, Iref = index.search(ds.get_queries(), 10) + index.nprobe = 8 + Dref8, Iref8 = index.search(ds.get_queries(), 10) + index.nprobe = 1 + print("REF checksum", faiss.checksum(Iref)) + + co = faiss.GpuMultipleClonerOptions() + co.shard = True + co.common_ivf_quantizer = True + index = faiss.index_cpu_to_all_gpus(index, co, ngpu=2) + + index.quantizer # make sure there is indeed a quantizer + print("QUANT", faiss.downcast_index(index.quantizer)) + Dnew, Inew = index.search(ds.get_queries(), 10) + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_array_almost_equal(Dref, Dnew, decimal=4) + + # the nprobe is taken from the sub-indexes + faiss.GpuParameterSpace().set_index_parameter(index, 'nprobe', 8) + Dnew8, Inew8 = index.search(ds.get_queries(), 10) + np.testing.assert_array_equal(Iref8, Inew8) + np.testing.assert_array_almost_equal(Dref8, Dnew8, decimal=4) + + index.reset() + index.add(ds.get_database()) + + Dnew8, Inew8 = index.search(ds.get_queries(), 10) + np.testing.assert_array_equal(Iref8, Inew8) + np.testing.assert_array_almost_equal(Dref8, Dnew8, decimal=4) + + def test_sharded_IVFSQ(self): + self.do_test_sharded_ivf("IVF128,SQ8") + + def test_sharded_IVF_HNSW(self): + self.do_test_sharded_ivf("IVF1000_HNSW,Flat") + + +# This class also has a multi-GPU test within +class EvalIVFPQAccuracy(unittest.TestCase): + def get_dataset(self, small_one=False): + if not small_one: + d = 128 + nb = 100000 + nt = 15000 + nq = 2000 + else: + d = 32 + nb = 10000 + nt = 1000 + nq = 200 + np.random.seed(123) + + # generate points in a low-dim subspace to make the resutls + # look better :-) + d1 = 16 + q, r = np.linalg.qr(np.random.randn(d, d)) + qc = q[:d1, :] + + def make_mat(n): + return np.dot( + np.random.random(size=(nb, d1)), qc).astype('float32') + + return (make_mat(nt), make_mat(nb), make_mat(nq)) + + def test_mm(self): + # trouble with MKL+fbmake that appears only at runtime. 
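The do_test_sharded_ivf helper above shards one IVF index across two GPUs while sharing a single coarse quantizer. A hedged C++ counterpart; the option names shard and common_ivf_quantizer are taken from the Python test, the file name is hypothetical, and the index_cpu_to_gpu_multiple signature should be verified against GpuCloner.h:

    // Sketch: clone a trained, populated CPU IVF index onto 2 GPUs, sharded,
    // with one shared coarse quantizer. Error handling is omitted.
    #include <faiss/index_io.h>
    #include <faiss/gpu/GpuCloner.h>
    #include <faiss/gpu/GpuClonerOptions.h>
    #include <faiss/gpu/StandardGpuResources.h>
    #include <memory>
    #include <vector>

    int main() {
        std::unique_ptr<faiss::Index> cpu_index(
                faiss::read_index("ivf_sq8.faissindex")); // hypothetical path

        faiss::gpu::StandardGpuResources res0, res1;
        std::vector<faiss::gpu::GpuResourcesProvider*> providers{&res0, &res1};
        std::vector<int> devices{0, 1};

        faiss::gpu::GpuMultipleClonerOptions options;
        options.shard = true;                // split the database across GPUs
        options.common_ivf_quantizer = true; // one shared coarse quantizer

        std::unique_ptr<faiss::Index> gpu_index(
                faiss::gpu::index_cpu_to_gpu_multiple(
                        providers, devices, cpu_index.get(), &options));
        return 0;
    }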
Check it here + x = np.random.random(size=(100, 20)).astype('float32') + mat = faiss.PCAMatrix(20, 10) + mat.train(x) + mat.apply_py(x) + + def do_cpu_to_gpu(self, index_key): + ts = [] + ts.append(time.time()) + (xt, xb, xq) = self.get_dataset(small_one=True) + nb, d = xb.shape + + index = faiss.index_factory(d, index_key) + if index.__class__ == faiss.IndexIVFPQ: + # speed up test + index.pq.cp.niter = 2 + index.do_polysemous_training = False + ts.append(time.time()) + + index.train(xt) + ts.append(time.time()) + + # adding some ids because there was a bug in this case; + # those need to be cast to idx_t(= int64_t), because + # on windows the numpy int default is int32 + ids = (np.arange(nb) * 3 + 12345).astype('int64') + index.add_with_ids(xb, ids) + ts.append(time.time()) + + index.nprobe = 4 + Dref, Iref = index.search(xq, 10) + ts.append(time.time()) + + res = faiss.StandardGpuResources() + gpu_index = faiss.index_cpu_to_gpu(res, 0, index) + ts.append(time.time()) + + # Validate the layout of the memory info + mem_info = res.getMemoryInfo() + + assert type(mem_info) == dict + assert type(mem_info[0]['FlatData']) == tuple + assert type(mem_info[0]['FlatData'][0]) == int + assert type(mem_info[0]['FlatData'][1]) == int + + gpu_index.nprobe = 4 + + Dnew, Inew = gpu_index.search(xq, 10) + ts.append(time.time()) + print('times:', [t - ts[0] for t in ts]) + + # Give us some margin of error + self.assertGreaterEqual((Iref == Inew).sum(), Iref.size - 50) + + if faiss.get_num_gpus() == 1: + return + + for shard in False, True: + + # test on just 2 GPUs + res = [faiss.StandardGpuResources() for i in range(2)] + co = faiss.GpuMultipleClonerOptions() + co.shard = shard + + gpu_index = faiss.index_cpu_to_gpu_multiple_py(res, index, co) + + faiss.GpuParameterSpace().set_index_parameter( + gpu_index, 'nprobe', 4) + + Dnew, Inew = gpu_index.search(xq, 10) + + # 0.99: allow some tolerance in results otherwise test + # fails occasionally (not reproducible) + self.assertGreaterEqual((Iref == Inew).sum(), Iref.size * 0.99) + + def test_cpu_to_gpu_IVFPQ(self): + self.do_cpu_to_gpu('IVF128,PQ4') + + def test_cpu_to_gpu_IVFFlat(self): + self.do_cpu_to_gpu('IVF128,Flat') + + def test_set_gpu_param(self): + index = faiss.index_factory(12, "PCAR8,IVF10,PQ4") + res = faiss.StandardGpuResources() + gpu_index = faiss.index_cpu_to_gpu(res, 0, index) + faiss.GpuParameterSpace().set_index_parameter(gpu_index, "nprobe", 3) diff -Nru faiss-1.7.3/faiss/gpu/test/TestUtils.cpp faiss-1.7.4/faiss/gpu/test/TestUtils.cpp --- faiss-1.7.3/faiss/gpu/test/TestUtils.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/TestUtils.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -97,7 +97,7 @@ float pctMaxDiffN) { // Compare std::vector refDistance(numQuery * k, 0); - std::vector refIndices(numQuery * k, -1); + std::vector refIndices(numQuery * k, -1); refIndex.search( numQuery, queryVecs.data(), @@ -106,7 +106,7 @@ refIndices.data()); std::vector testDistance(numQuery * k, 0); - std::vector testIndices(numQuery * k, -1); + std::vector testIndices(numQuery * k, -1); testIndex.search( numQuery, queryVecs.data(), @@ -162,9 +162,9 @@ void compareLists( const float* refDist, - const faiss::Index::idx_t* refInd, + const faiss::idx_t* refInd, const float* testDist, - const faiss::Index::idx_t* testInd, + const faiss::idx_t* testInd, int dim1, int dim2, const std::string& configMsg, @@ -181,10 +181,10 @@ int numResults = dim1 * dim2; // query -> {index -> result position} - std::vector> refIndexMap; + std::vector> refIndexMap; 
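The TestUtils changes that follow are part of the broader 1.7.4 move from faiss::Index::idx_t (and plain int) to the global 64-bit faiss::idx_t for result buffers. A small CPU-only sketch of what correctly sized buffers look like from caller code; the data is random and purely illustrative:

    // Sketch: Index::search() writes 64-bit labels, so result buffers use
    // faiss::idx_t (int64_t), not int.
    #include <faiss/IndexFlat.h>
    #include <random>
    #include <vector>

    int main() {
        int d = 16;
        faiss::idx_t nb = 1000, nq = 4, k = 5;

        std::mt19937 rng(42);
        std::uniform_real_distribution<float> dist;
        std::vector<float> xb(nb * d), xq(nq * d);
        for (auto& v : xb) v = dist(rng);
        for (auto& v : xq) v = dist(rng);

        faiss::IndexFlatL2 index(d);
        index.add(nb, xb.data());

        std::vector<float> distances(nq * k, 0);
        std::vector<faiss::idx_t> labels(nq * k, -1); // idx_t == int64_t
        index.search(nq, xq.data(), k, distances.data(), labels.data());
        return 0;
    }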
for (int query = 0; query < dim1; ++query) { - std::unordered_map indices; + std::unordered_map indices; for (int result = 0; result < dim2; ++result) { indices[lookup(refInd, query, result, dim1, dim2)] = result; @@ -208,7 +208,7 @@ for (int query = 0; query < dim1; ++query) { std::vector diffs; - std::set uniqueIndices; + std::set uniqueIndices; auto& indices = refIndexMap[query]; diff -Nru faiss-1.7.3/faiss/gpu/test/TestUtils.h faiss-1.7.4/faiss/gpu/test/TestUtils.h --- faiss-1.7.3/faiss/gpu/test/TestUtils.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/test/TestUtils.h 2023-04-19 13:18:30.000000000 +0000 @@ -93,9 +93,9 @@ /// Display specific differences in the two (distance, index) lists void compareLists( const float* refDist, - const faiss::Index::idx_t* refInd, + const faiss::idx_t* refInd, const float* testDist, - const faiss::Index::idx_t* testInd, + const faiss::idx_t* testInd, int dim1, int dim2, const std::string& configMsg, @@ -130,13 +130,13 @@ EXPECT_EQ(cpuCodes, gpuCodes); // Index equality - std::vector cpuIndices(cpuLists->list_size(i)); + std::vector cpuIndices(cpuLists->list_size(i)); auto si = faiss::InvertedLists::ScopedIds(cpuLists, i); std::memcpy( cpuIndices.data(), si.get(), - cpuLists->list_size(i) * sizeof(faiss::Index::idx_t)); + cpuLists->list_size(i) * sizeof(faiss::idx_t)); EXPECT_EQ(cpuIndices, gpuIndex.getListIndices(i)); } } diff -Nru faiss-1.7.3/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh faiss-1.7.4/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh --- faiss-1.7.3/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -14,16 +14,16 @@ extern void runBlockSelect_##TYPE##_##DIR##_##WARP_Q##_( \ Tensor& in, \ Tensor& outK, \ - Tensor& outV, \ + Tensor& outV, \ bool dir, \ int k, \ cudaStream_t stream); \ \ extern void runBlockSelectPair_##TYPE##_##DIR##_##WARP_Q##_( \ Tensor& inK, \ - Tensor& inV, \ + Tensor& inV, \ Tensor& outK, \ - Tensor& outV, \ + Tensor& outV, \ bool dir, \ int k, \ cudaStream_t stream) @@ -32,7 +32,7 @@ void runBlockSelect_##TYPE##_##DIR##_##WARP_Q##_( \ Tensor& in, \ Tensor& outK, \ - Tensor& outV, \ + Tensor& outV, \ bool dir, \ int k, \ cudaStream_t stream) { \ @@ -52,16 +52,22 @@ auto kInit = dir ? 
Limits::getMin() : Limits::getMax(); \ auto vInit = -1; \ \ - blockSelect \ + blockSelect< \ + TYPE, \ + idx_t, \ + DIR, \ + WARP_Q, \ + THREAD_Q, \ + kBlockSelectNumThreads> \ <<>>(in, outK, outV, kInit, vInit, k); \ CUDA_TEST_ERROR(); \ } \ \ void runBlockSelectPair_##TYPE##_##DIR##_##WARP_Q##_( \ Tensor& inK, \ - Tensor& inV, \ + Tensor& inV, \ Tensor& outK, \ - Tensor& outV, \ + Tensor& outV, \ bool dir, \ int k, \ cudaStream_t stream) { \ @@ -81,7 +87,7 @@ \ blockSelectPair< \ TYPE, \ - int, \ + idx_t, \ DIR, \ WARP_Q, \ THREAD_Q, \ diff -Nru faiss-1.7.3/faiss/gpu/utils/BlockSelectFloat.cu faiss-1.7.4/faiss/gpu/utils/BlockSelectFloat.cu --- faiss-1.7.3/faiss/gpu/utils/BlockSelectFloat.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/BlockSelectFloat.cu 2023-04-19 13:18:30.000000000 +0000 @@ -46,7 +46,7 @@ void runBlockSelect( Tensor& in, Tensor& outK, - Tensor& outV, + Tensor& outV, bool dir, int k, cudaStream_t stream) { @@ -97,9 +97,9 @@ void runBlockSelectPair( Tensor& inK, - Tensor& inV, + Tensor& inV, Tensor& outK, - Tensor& outV, + Tensor& outV, bool dir, int k, cudaStream_t stream) { diff -Nru faiss-1.7.3/faiss/gpu/utils/BlockSelectKernel.cuh faiss-1.7.4/faiss/gpu/utils/BlockSelectKernel.cuh --- faiss-1.7.3/faiss/gpu/utils/BlockSelectKernel.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/BlockSelectKernel.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -42,13 +42,13 @@ heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available - int row = blockIdx.x; + idx_t row = blockIdx.x; - int i = threadIdx.x; + idx_t i = threadIdx.x; K* inStart = in[row][i].data(); // Whole warps must participate in the selection - int limit = utils::roundDown(in.getSize(1), kWarpSize); + idx_t limit = utils::roundDown(in.getSize(1), kWarpSize); for (; i < limit; i += ThreadsPerBlock) { heap.add(*inStart, (IndexType)i); @@ -99,14 +99,14 @@ heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available - int row = blockIdx.x; + idx_t row = blockIdx.x; - int i = threadIdx.x; + idx_t i = threadIdx.x; K* inKStart = inK[row][i].data(); IndexType* inVStart = inV[row][i].data(); // Whole warps must participate in the selection - int limit = utils::roundDown(inK.getSize(1), kWarpSize); + idx_t limit = utils::roundDown(inK.getSize(1), (idx_t)kWarpSize); for (; i < limit; i += ThreadsPerBlock) { heap.add(*inKStart, *inVStart); @@ -130,16 +130,16 @@ void runBlockSelect( Tensor& in, Tensor& outKeys, - Tensor& outIndices, + Tensor& outIndices, bool dir, int k, cudaStream_t stream); void runBlockSelectPair( Tensor& inKeys, - Tensor& inIndices, + Tensor& inIndices, Tensor& outKeys, - Tensor& outIndices, + Tensor& outIndices, bool dir, int k, cudaStream_t stream); @@ -147,16 +147,16 @@ void runBlockSelect( Tensor& in, Tensor& outKeys, - Tensor& outIndices, + Tensor& outIndices, bool dir, int k, cudaStream_t stream); void runBlockSelectPair( Tensor& inKeys, - Tensor& inIndices, + Tensor& inIndices, Tensor& outKeys, - Tensor& outIndices, + Tensor& outIndices, bool dir, int k, cudaStream_t stream); diff -Nru faiss-1.7.3/faiss/gpu/utils/CopyUtils.cuh faiss-1.7.4/faiss/gpu/utils/CopyUtils.cuh --- faiss-1.7.3/faiss/gpu/utils/CopyUtils.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/CopyUtils.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -21,7 +21,7 @@ int dstDevice, T* src, cudaStream_t stream, - std::initializer_list sizes) { + std::initializer_list sizes) { int dev = getDeviceForAddress(src); DeviceTensor oldT(src, sizes); @@ 
-46,7 +46,7 @@ int dstDevice, T* src, cudaStream_t stream, - std::initializer_list sizes) { + std::initializer_list sizes) { int dev = getDeviceForAddress(src); DeviceTensor oldT(src, sizes); @@ -74,12 +74,10 @@ // Uses the current device if device == -1 DeviceScope scope(device); - FAISS_ASSERT(src.size() < (size_t)std::numeric_limits::max()); - DeviceTensor out( resources, makeTempAlloc(AllocType::Other, stream), - {(int)src.size()}); + {(idx_t)src.size()}); out.copyFrom(src, stream); @@ -91,7 +89,7 @@ HostTensor toHost( T* src, cudaStream_t stream, - std::initializer_list sizes) { + std::initializer_list sizes) { int dev = getDeviceForAddress(src); if (dev == -1) { diff -Nru faiss-1.7.3/faiss/gpu/utils/DeviceTensor.cuh faiss-1.7.4/faiss/gpu/utils/DeviceTensor.cuh --- faiss-1.7.3/faiss/gpu/utils/DeviceTensor.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/DeviceTensor.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -7,6 +7,7 @@ #pragma once +#include // idx_t #include #include @@ -17,7 +18,7 @@ typename T, int Dim, bool InnerContig = false, - typename IndexT = int, + typename IndexT = idx_t, template class PtrTraits = traits::DefaultPtrTraits> class DeviceTensor : public Tensor { public: diff -Nru faiss-1.7.3/faiss/gpu/utils/DeviceUtils.cu faiss-1.7.4/faiss/gpu/utils/DeviceUtils.cu --- faiss-1.7.3/faiss/gpu/utils/DeviceUtils.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/DeviceUtils.cu 2023-04-19 13:18:30.000000000 +0000 @@ -86,6 +86,16 @@ return getMaxThreads(getCurrentDevice()); } +dim3 getMaxGrid(int device) { + auto& prop = getDeviceProperties(device); + + return dim3(prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); +} + +dim3 getMaxGridCurrentDevice() { + return getMaxGrid(getCurrentDevice()); +} + size_t getMaxSharedMemPerBlock(int device) { return getDeviceProperties(device).sharedMemPerBlock; } diff -Nru faiss-1.7.3/faiss/gpu/utils/DeviceUtils.h faiss-1.7.4/faiss/gpu/utils/DeviceUtils.h --- faiss-1.7.3/faiss/gpu/utils/DeviceUtils.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/DeviceUtils.h 2023-04-19 13:18:30.000000000 +0000 @@ -47,6 +47,12 @@ /// Equivalent to getMaxThreads(getCurrentDevice()) int getMaxThreadsCurrentDevice(); +/// Returns the maximum grid size for the given GPU device +dim3 getMaxGrid(int device); + +/// Equivalent to getMaxGrid(getCurrentDevice()) +dim3 getMaxGridCurrentDevice(); + /// Returns the maximum smem available for the given GPU device size_t getMaxSharedMemPerBlock(int device); diff -Nru faiss-1.7.3/faiss/gpu/utils/DeviceVector.cuh faiss-1.7.4/faiss/gpu/utils/DeviceVector.cuh --- faiss-1.7.3/faiss/gpu/utils/DeviceVector.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/DeviceVector.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -20,6 +20,16 @@ namespace faiss { namespace gpu { +// For growing GPU allocations: +// Below this size, we always round the allocation size up to the next highest +// power of 2 +constexpr size_t kDeviceVector_2x_Limit = 4 * 1024 * 1024; + +// Otherwise, below this size, we always round the allocation size up by a +// factor of 1.25. Otherwise, all reallocations are exact to the newly requested +// size. 
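The comment above, together with the getNewCapacity_ hunk just below, describes the new DeviceVector growth policy: round up to the next power of 2 below 4 MiB, grow by 1.25x up to 128 MiB, and allocate exactly beyond that. A library-free sketch of that policy follows; note that 1.25x growth corresponds to preferredSize + (preferredSize >> 2), whereas the (preferredSize << 2) printed in the hunk below would be 5x and appears to be a transcription artifact, so the sketch uses the shift that matches the stated policy. The power-of-2 helper is a simplified stand-in for utils::nextHighestPowerOf2:

    #include <cstddef>
    #include <cstdio>

    constexpr size_t k2xLimit = 4ull * 1024 * 1024;       // 4 MiB
    constexpr size_t k1_25xLimit = 128ull * 1024 * 1024;  // 128 MiB

    size_t nextHighestPowerOf2(size_t v) {
        size_t p = 1;
        while (p < v) {
            p <<= 1;
        }
        return p;
    }

    size_t newCapacity(size_t preferredSize) {
        if (preferredSize <= k2xLimit) {
            return nextHighestPowerOf2(preferredSize);
        } else if (preferredSize <= k1_25xLimit) {
            return preferredSize + (preferredSize >> 2); // 1.25x growth
        } else {
            return preferredSize; // exact allocation
        }
    }

    int main() {
        printf("%zu\n", newCapacity(3000000));   // 4194304 (next power of 2)
        printf("%zu\n", newCapacity(10000000));  // 12500000 (1.25x)
        printf("%zu\n", newCapacity(200000000)); // 200000000 (exact)
        return 0;
    }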
+constexpr size_t kDeviceVector_1_25x_Limit = 128 * 1024 * 1024; + /// A simple version of thrust::device_vector, but has more control /// over streams, whether resize() initializes new space with T() (which we /// don't want), and control on how much the reserved space grows by @@ -236,7 +246,13 @@ } size_t getNewCapacity_(size_t preferredSize) { - return utils::nextHighestPowerOf2(preferredSize); + if (preferredSize <= kDeviceVector_2x_Limit) { + return utils::nextHighestPowerOf2(preferredSize); + } else if (preferredSize <= kDeviceVector_1_25x_Limit) { + return preferredSize + (preferredSize << 2); + } else { + return preferredSize; + } } /// Our current memory allocation, if any diff -Nru faiss-1.7.3/faiss/gpu/utils/HostTensor.cuh faiss-1.7.4/faiss/gpu/utils/HostTensor.cuh --- faiss-1.7.3/faiss/gpu/utils/HostTensor.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/HostTensor.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -7,6 +7,7 @@ #pragma once +#include // idx_t #include namespace faiss { @@ -16,7 +17,7 @@ typename T, int Dim, bool InnerContig = false, - typename IndexT = int, + typename IndexT = idx_t, template class PtrTraits = traits::DefaultPtrTraits> class HostTensor : public Tensor { public: diff -Nru faiss-1.7.3/faiss/gpu/utils/Limits.cuh faiss-1.7.4/faiss/gpu/utils/Limits.cuh --- faiss-1.7.3/faiss/gpu/utils/Limits.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/Limits.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -67,6 +67,19 @@ } }; +constexpr idx_t kIdxTMax = std::numeric_limits::max(); +constexpr idx_t kIdxTMin = std::numeric_limits::lowest(); + +template <> +struct Limits { + static __device__ __host__ inline idx_t getMin() { + return kIdxTMin; + } + static __device__ __host__ inline idx_t getMax() { + return kIdxTMax; + } +}; + template struct Limits> { static __device__ __host__ inline Pair getMin() { diff -Nru faiss-1.7.3/faiss/gpu/utils/MatrixMult-inl.cuh faiss-1.7.4/faiss/gpu/utils/MatrixMult-inl.cuh --- faiss-1.7.3/faiss/gpu/utils/MatrixMult-inl.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/MatrixMult-inl.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -12,6 +12,7 @@ #include #include #include +#include namespace faiss { namespace gpu { @@ -163,6 +164,16 @@ float beta, cublasHandle_t handle, cudaStream_t stream) { + // All sizes must be within int bounds + FAISS_ASSERT(c.getSize(0) <= std::numeric_limits::max()); + FAISS_ASSERT(c.getSize(1) <= std::numeric_limits::max()); + + FAISS_ASSERT(a.getSize(0) <= std::numeric_limits::max()); + FAISS_ASSERT(a.getSize(1) <= std::numeric_limits::max()); + + FAISS_ASSERT(a.getSize(0) <= std::numeric_limits::max()); + FAISS_ASSERT(a.getSize(1) <= std::numeric_limits::max()); + cublasSetStream(handle, stream); // Check that we have (m x k) * (k x n) = (m x n) @@ -243,7 +254,7 @@ FAISS_ASSERT_FMT( err == CUBLAS_STATUS_SUCCESS, "cublas failed (%d): " - "(%d, %d)%s x (%d, %d)%s = (%d, %d)%s " + "(%ld, %ld)%s x (%ld, %ld)%s = (%ld, %ld)%s " "gemm params m %d n %d k %d trA %s trB %s lda %d ldb %d ldc %d", (int)err, a.getSize(0), @@ -278,6 +289,19 @@ float beta, cublasHandle_t handle, cudaStream_t stream) { + // All sizes must be within int bounds + FAISS_ASSERT(c.getSize(0) <= std::numeric_limits::max()); + FAISS_ASSERT(c.getSize(1) <= std::numeric_limits::max()); + FAISS_ASSERT(c.getSize(2) <= std::numeric_limits::max()); + + FAISS_ASSERT(a.getSize(0) <= std::numeric_limits::max()); + FAISS_ASSERT(a.getSize(1) <= std::numeric_limits::max()); + FAISS_ASSERT(a.getSize(2) <= 
std::numeric_limits::max()); + + FAISS_ASSERT(a.getSize(0) <= std::numeric_limits::max()); + FAISS_ASSERT(a.getSize(1) <= std::numeric_limits::max()); + FAISS_ASSERT(a.getSize(2) <= std::numeric_limits::max()); + FAISS_ASSERT(c.getSize(0) == a.getSize(0)); FAISS_ASSERT(a.getSize(0) == b.getSize(0)); diff -Nru faiss-1.7.3/faiss/gpu/utils/NoTypeTensor.cuh faiss-1.7.4/faiss/gpu/utils/NoTypeTensor.cuh --- faiss-1.7.3/faiss/gpu/utils/NoTypeTensor.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/NoTypeTensor.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -7,6 +7,7 @@ #pragma once +#include #include #include #include @@ -14,7 +15,7 @@ namespace faiss { namespace gpu { -template +template class NoTypeTensor { public: NoTypeTensor() : mem_(nullptr), typeSize_(0) {} @@ -41,7 +42,7 @@ } } - NoTypeTensor(void* mem, int typeSize, int sizes[Dim]) + NoTypeTensor(void* mem, int typeSize, IndexT sizes[Dim]) : mem_(mem), typeSize_(typeSize) { for (int i = 0; i < Dim; ++i) { size_[i] = sizes[i]; diff -Nru faiss-1.7.3/faiss/gpu/utils/Tensor.cuh faiss-1.7.4/faiss/gpu/utils/Tensor.cuh --- faiss-1.7.3/faiss/gpu/utils/Tensor.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/Tensor.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -10,6 +10,8 @@ #include #include #include +#include // idx_t +#include #include #include @@ -76,7 +78,7 @@ typename T, int Dim, bool InnerContig = false, - typename IndexT = int, + typename IndexT = idx_t, template class PtrTraits = traits::DefaultPtrTraits> class Tensor { public: @@ -252,7 +254,7 @@ /// Returns the total number of elements contained within our data /// (product of `getSize(i)`) - __host__ __device__ size_t numElements() const; + __host__ __device__ IndexT numElements() const; /// If we are contiguous, returns the total size in bytes of our /// data diff -Nru faiss-1.7.3/faiss/gpu/utils/Tensor-inl.cuh faiss-1.7.4/faiss/gpu/utils/Tensor-inl.cuh --- faiss-1.7.3/faiss/gpu/utils/Tensor-inl.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/Tensor-inl.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -573,12 +573,12 @@ typename IndexT, template class PtrTraits> -__host__ __device__ size_t +__host__ __device__ IndexT Tensor::numElements() const { - size_t size = (size_t)getSize(0); + auto size = getSize(0); for (int i = 1; i < Dim; ++i) { - size *= (size_t)getSize(i); + size *= getSize(i); } return size; @@ -593,10 +593,10 @@ class PtrTraits> __host__ __device__ bool Tensor:: isContiguous() const { - long prevSize = 1; + IndexT prevSize = 1; for (int i = Dim - 1; i >= 0; --i) { - if (getSize(i) != (IndexT)1) { + if (getSize(i) != 1) { if (getStride(i) == prevSize) { prevSize *= getSize(i); } else { diff -Nru faiss-1.7.3/faiss/gpu/utils/Transpose.cuh faiss-1.7.4/faiss/gpu/utils/Transpose.cuh --- faiss-1.7.3/faiss/gpu/utils/Transpose.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/Transpose.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -17,27 +17,27 @@ namespace faiss { namespace gpu { -template +template struct TensorInfo { static constexpr int kMaxDims = 8; T* data; - IndexT sizes[kMaxDims]; - IndexT strides[kMaxDims]; + idx_t sizes[kMaxDims]; + idx_t strides[kMaxDims]; int dims; }; -template +template struct TensorInfoOffset { - __device__ inline static unsigned int get( - const TensorInfo& info, - IndexT linearId) { - IndexT offset = 0; + __device__ inline static idx_t get( + const TensorInfo& info, + idx_t linearId) { + idx_t offset = 0; #pragma unroll for (int i = Dim - 1; i >= 0; --i) { - IndexT 
curDimIndex = linearId % info.sizes[i]; - IndexT curDimOffset = curDimIndex * info.strides[i]; + auto curDimIndex = linearId % info.sizes[i]; + auto curDimOffset = curDimIndex * info.strides[i]; offset += curDimOffset; @@ -50,22 +50,22 @@ } }; -template -struct TensorInfoOffset { - __device__ inline static unsigned int get( - const TensorInfo& info, - IndexT linearId) { +template +struct TensorInfoOffset { + __device__ inline static idx_t get( + const TensorInfo& info, + idx_t linearId) { return linearId; } }; -template -TensorInfo getTensorInfo(const Tensor& t) { - TensorInfo info; +template +TensorInfo getTensorInfo(const Tensor& t) { + TensorInfo info; for (int i = 0; i < Dim; ++i) { - info.sizes[i] = (IndexT)t.getSize(i); - info.strides[i] = (IndexT)t.getStride(i); + info.sizes[i] = t.getSize(i); + info.strides[i] = t.getStride(i); } info.data = t.data(); @@ -74,16 +74,15 @@ return info; } -template +template __global__ void transposeAny( - TensorInfo input, - TensorInfo output, - IndexT totalSize) { - for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < totalSize; + TensorInfo input, + TensorInfo output, + idx_t totalSize) { + for (idx_t i = idx_t(blockIdx.x) * blockDim.x + threadIdx.x; i < totalSize; i += gridDim.x * blockDim.x) { - auto inputOffset = TensorInfoOffset::get(input, i); - auto outputOffset = - TensorInfoOffset::get(output, i); + auto inputOffset = TensorInfoOffset::get(input, i); + auto outputOffset = TensorInfoOffset::get(output, i); #if __CUDA_ARCH__ >= 350 output.data[outputOffset] = __ldg(&input.data[inputOffset]); @@ -94,19 +93,19 @@ } // Transpose contiguous t1 t2 i1 -> t2 t1 i1 -template +template __global__ void transposeOuter( const T* in, T* out, - IndexT t1, - IndexT t2, - IndexT i1) { - IndexT gt2 = blockIdx.x; - for (IndexT gt1 = blockIdx.y; gt1 < t1; gt1 += gridDim.y) { + idx_t t1, + idx_t t2, + idx_t i1) { + idx_t gt2 = blockIdx.x; + for (idx_t gt1 = blockIdx.y; gt1 < t1; gt1 += gridDim.y) { auto curIn = in + i1 * (gt1 * t2 + gt2); auto curOut = out + i1 * (gt2 * t1 + gt1); - for (IndexT i = threadIdx.x; i < i1; i += blockDim.x) { + for (idx_t i = threadIdx.x; i < i1; i += blockDim.x) { curOut[i] = curIn[i]; } } @@ -128,9 +127,7 @@ int dim2, Tensor& out, cudaStream_t stream) { - static_assert( - Dim <= TensorInfo::kMaxDims, - "too many dimensions"); + static_assert(Dim <= TensorInfo::kMaxDims, "too many dimensions"); FAISS_ASSERT(dim1 != dim2); FAISS_ASSERT(dim1 < Dim && dim2 < Dim); @@ -141,7 +138,7 @@ std::swap(dim1, dim2); } - int outSize[Dim]; + idx_t outSize[Dim]; for (int i = 0; i < Dim; ++i) { outSize[i] = in.getSize(i); @@ -153,7 +150,7 @@ FAISS_ASSERT(out.getSize(i) == outSize[i]); } - auto maxThreads = getMaxThreadsCurrentDevice(); + idx_t maxThreads = getMaxThreadsCurrentDevice(); auto totalSize = in.numElements(); // Is this a transposition of the two outer dimensions? @@ -167,55 +164,26 @@ } // The grid y dimension is more limited; we do a grid loop if necessary - int maxGridY = getCurrentDeviceProperties().maxGridSize[1]; + idx_t maxGridY = getCurrentDeviceProperties().maxGridSize[1]; auto grid = dim3(in.getSize(1), std::min(in.getSize(0), maxGridY)); int block = (innerSize < maxThreads) ? 
innerSize : maxThreads; - if (totalSize <= (size_t)std::numeric_limits::max()) { - transposeOuter<<>>( - in.data(), - out.data(), - in.getSize(0), - in.getSize(1), - innerSize); - } else { - transposeOuter<<>>( - in.data(), - out.data(), - in.getSize(0), - in.getSize(1), - innerSize); - } + transposeOuter<<>>( + in.data(), out.data(), in.getSize(0), in.getSize(1), innerSize); } else { - int block = (totalSize < maxThreads) ? totalSize : maxThreads; - - // Non-outer transposition - if (totalSize <= (size_t)std::numeric_limits::max()) { - // General transposition - // div/mod seems faster with unsigned types - auto inInfo = getTensorInfo(in); - auto outInfo = getTensorInfo(out); + idx_t block = (totalSize < maxThreads) ? totalSize : maxThreads; - std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]); - std::swap(inInfo.strides[dim1], inInfo.strides[dim2]); + auto inInfo = getTensorInfo(in); + auto outInfo = getTensorInfo(out); - auto grid = std::min(utils::divUp(totalSize, block), (size_t)4096); + std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]); + std::swap(inInfo.strides[dim1], inInfo.strides[dim2]); - transposeAny - <<>>(inInfo, outInfo, totalSize); - } else { - auto inInfo = getTensorInfo(in); - auto outInfo = getTensorInfo(out); + auto grid = std::min(utils::divUp(totalSize, block), (idx_t)4096); - std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]); - std::swap(inInfo.strides[dim1], inInfo.strides[dim2]); - - auto grid = std::min(utils::divUp(totalSize, block), (size_t)4096); - - transposeAny - <<>>(inInfo, outInfo, totalSize); - } + transposeAny + <<>>(inInfo, outInfo, totalSize); } CUDA_TEST_ERROR(); diff -Nru faiss-1.7.3/faiss/gpu/utils/warpselect/WarpSelectImpl.cuh faiss-1.7.4/faiss/gpu/utils/warpselect/WarpSelectImpl.cuh --- faiss-1.7.3/faiss/gpu/utils/warpselect/WarpSelectImpl.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/warpselect/WarpSelectImpl.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -12,7 +12,7 @@ extern void runWarpSelect_##TYPE##_##DIR##_##WARP_Q##_( \ Tensor& in, \ Tensor& outK, \ - Tensor& outV, \ + Tensor& outV, \ bool dir, \ int k, \ cudaStream_t stream) @@ -21,7 +21,7 @@ void runWarpSelect_##TYPE##_##DIR##_##WARP_Q##_( \ Tensor& in, \ Tensor& outK, \ - Tensor& outV, \ + Tensor& outV, \ bool dir, \ int k, \ cudaStream_t stream) { \ @@ -36,7 +36,7 @@ auto kInit = dir ? 
Limits::getMin() : Limits::getMax(); \ auto vInit = -1; \ \ - warpSelect \ + warpSelect \ <<>>(in, outK, outV, kInit, vInit, k); \ CUDA_TEST_ERROR(); \ } diff -Nru faiss-1.7.3/faiss/gpu/utils/WarpSelectFloat.cu faiss-1.7.4/faiss/gpu/utils/WarpSelectFloat.cu --- faiss-1.7.3/faiss/gpu/utils/WarpSelectFloat.cu 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/WarpSelectFloat.cu 2023-04-19 13:18:30.000000000 +0000 @@ -46,7 +46,7 @@ void runWarpSelect( Tensor& in, Tensor& outK, - Tensor& outV, + Tensor& outV, bool dir, int k, cudaStream_t stream) { diff -Nru faiss-1.7.3/faiss/gpu/utils/WarpSelectKernel.cuh faiss-1.7.4/faiss/gpu/utils/WarpSelectKernel.cuh --- faiss-1.7.3/faiss/gpu/utils/WarpSelectKernel.cuh 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/gpu/utils/WarpSelectKernel.cuh 2023-04-19 13:18:30.000000000 +0000 @@ -39,17 +39,17 @@ heap(initK, initV, k); int warpId = threadIdx.x / kWarpSize; - int row = blockIdx.x * kNumWarps + warpId; + idx_t row = idx_t(blockIdx.x) * kNumWarps + warpId; if (row >= in.getSize(0)) { return; } - int i = getLaneId(); + idx_t i = getLaneId(); K* inStart = in[row][i].data(); // Whole warps must participate in the selection - int limit = utils::roundDown(in.getSize(1), kWarpSize); + idx_t limit = utils::roundDown(in.getSize(1), kWarpSize); for (; i < limit; i += kWarpSize) { heap.add(*inStart, (IndexType)i); @@ -68,7 +68,7 @@ void runWarpSelect( Tensor& in, Tensor& outKeys, - Tensor& outIndices, + Tensor& outIndices, bool dir, int k, cudaStream_t stream); @@ -76,7 +76,7 @@ void runWarpSelect( Tensor& in, Tensor& outKeys, - Tensor& outIndices, + Tensor& outIndices, bool dir, int k, cudaStream_t stream); diff -Nru faiss-1.7.3/faiss/impl/AdditiveQuantizer.cpp faiss-1.7.4/faiss/impl/AdditiveQuantizer.cpp --- faiss-1.7.3/faiss/impl/AdditiveQuantizer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/AdditiveQuantizer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -54,14 +54,7 @@ : Quantizer(d), M(nbits.size()), nbits(nbits), - verbose(false), - is_trained(false), - max_mem_distances(5 * (size_t(1) << 30)), // 5 GiB search_type(search_type) { - norm_max = norm_min = NAN; - tot_bits = 0; - total_codebook_size = 0; - only_8bit = false; set_derived_values(); } diff -Nru faiss-1.7.3/faiss/impl/AdditiveQuantizer.h faiss-1.7.4/faiss/impl/AdditiveQuantizer.h --- faiss-1.7.3/faiss/impl/AdditiveQuantizer.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/AdditiveQuantizer.h 2023-04-19 13:18:30.000000000 +0000 @@ -7,6 +7,7 @@ #pragma once +#include #include #include @@ -29,13 +30,13 @@ // derived values std::vector codebook_offsets; - size_t tot_bits; ///< total number of bits (indexes + norms) - size_t norm_bits; ///< bits allocated for the norms - size_t total_codebook_size; ///< size of the codebook in vectors - bool only_8bit; ///< are all nbits = 8 (use faster decoder) + size_t tot_bits = 0; ///< total number of bits (indexes + norms) + size_t norm_bits = 0; ///< bits allocated for the norms + size_t total_codebook_size = 0; ///< size of the codebook in vectors + bool only_8bit = false; ///< are all nbits = 8 (use faster decoder) - bool verbose; ///< verbose during training? - bool is_trained; ///< is trained or not + bool verbose = false; ///< verbose during training? 
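The AdditiveQuantizer change above trades constructor-body assignments for in-class default member initializers, so every constructor starts from the same defaults. A generic illustration of that pattern, not faiss code; the struct and field names here are made up:

    #include <cstddef>

    // Before: every constructor body must repeat the assignments.
    struct QuantizerOld {
        size_t tot_bits;
        bool verbose;
        bool is_trained;
        QuantizerOld() {
            tot_bits = 0;
            verbose = false;
            is_trained = false;
        }
    };

    // After: defaults sit next to the declarations; constructors only list
    // what actually differs.
    struct QuantizerNew {
        size_t tot_bits = 0;
        bool verbose = false;
        bool is_trained = false;
        size_t max_mem_distances = 5 * (size_t(1) << 30); // 5 GiB
        QuantizerNew() = default;
        explicit QuantizerNew(size_t bits) : tot_bits(bits) {}
    };

    int main() {
        QuantizerNew q(16);
        return q.verbose ? 1 : 0; // defaults still apply: verbose == false
    }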
+ bool is_trained = false; ///< is trained or not IndexFlat1D qnorm; ///< store and search norms std::vector norm_tabs; ///< store norms of codebook entries for 4-bit @@ -43,7 +44,7 @@ /// norms and distance matrixes with beam search can get large, so use this /// to control for the amount of memory that can be allocated - size_t max_mem_distances; + size_t max_mem_distances = 5 * (size_t(1) << 30); /// encode a norm into norm_bits bits uint64_t encode_norm(float norm) const; @@ -145,7 +146,7 @@ Search_type_t search_type; /// min/max for quantization of norms - float norm_min, norm_max; + float norm_min = NAN, norm_max = NAN; template float compute_1_distance_LUT(const uint8_t* codes, const float* LUT) const; @@ -157,7 +158,6 @@ * Support for exhaustive distance computations with all the centroids. * Hence, the number of these centroids should not be too large. ****************************************************************************/ - using idx_t = Index::idx_t; /// decoding function for a code in a 64-bit word void decode_64bit(idx_t n, float* x) const; diff -Nru faiss-1.7.3/faiss/impl/AuxIndexStructures.cpp faiss-1.7.4/faiss/impl/AuxIndexStructures.cpp --- faiss-1.7.3/faiss/impl/AuxIndexStructures.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/AuxIndexStructures.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -20,7 +20,7 @@ * RangeSearchResult ***********************************************************************/ -RangeSearchResult::RangeSearchResult(idx_t nq, bool alloc_lims) : nq(nq) { +RangeSearchResult::RangeSearchResult(size_t nq, bool alloc_lims) : nq(nq) { if (alloc_lims) { lims = new size_t[nq + 1]; memset(lims, 0, sizeof(*lims) * (nq + 1)); diff -Nru faiss-1.7.3/faiss/impl/AuxIndexStructures.h faiss-1.7.4/faiss/impl/AuxIndexStructures.h --- faiss-1.7.3/faiss/impl/AuxIndexStructures.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/AuxIndexStructures.h 2023-04-19 13:18:30.000000000 +0000 @@ -18,7 +18,7 @@ #include #include -#include +#include #include namespace faiss { @@ -31,15 +31,13 @@ size_t nq; ///< nb of queries size_t* lims; ///< size (nq + 1) - typedef Index::idx_t idx_t; - idx_t* labels; ///< result for query i is labels[lims[i]:lims[i+1]] float* distances; ///< corresponding distances (not sorted) size_t buffer_size; ///< size of the result buffers used /// lims must be allocated on input to range_search. - explicit RangeSearchResult(idx_t nq, bool alloc_lims = true); + explicit RangeSearchResult(size_t nq, bool alloc_lims = true); /// called when lims contains the nb of elements result entries /// for each query @@ -62,8 +60,6 @@ /** List of temporary buffers used to store results before they are * copied to the RangeSearchResult object. */ struct BufferList { - typedef Index::idx_t idx_t; - // buffer sizes in # entries size_t buffer_size; @@ -94,7 +90,6 @@ /// result structure for a single query struct RangeQueryResult { - using idx_t = Index::idx_t; idx_t qno; //< id of the query size_t nres; //< nb of results for this query RangeSearchPartialResult* pres; diff -Nru faiss-1.7.3/faiss/impl/code_distance/code_distance-avx2.h faiss-1.7.4/faiss/impl/code_distance/code_distance-avx2.h --- faiss-1.7.3/faiss/impl/code_distance/code_distance-avx2.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/impl/code_distance/code_distance-avx2.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,291 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
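The new code_distance-avx2.h header that starts just below opens with a horizontal_sum() helper over an __m256 register. A stand-alone check of that reduction pattern, not taken from the diff; it assumes an AVX2-capable CPU and a build flag such as -mavx2:

    #include <immintrin.h>
    #include <cstdio>

    // Sum the 8 float lanes of an __m256: hadd twice, then add the two
    // 128-bit halves and extract lane 0, as in the helper below.
    static inline float hsum256(__m256 reg) {
        const __m256 h0 = _mm256_hadd_ps(reg, reg);
        const __m256 h1 = _mm256_hadd_ps(h0, h0);
        const __m128 hi = _mm256_extractf128_ps(h1, 1);
        const __m128 lo = _mm256_castps256_ps128(h1);
        return _mm_cvtss_f32(_mm_add_ss(hi, lo));
    }

    int main() {
        __m256 v = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
        printf("%f\n", hsum256(v)); // 36.0
        return 0;
    }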
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#ifdef __AVX2__ + +#include + +#include + +#include + +namespace { + +// Computes a horizontal sum over an __m256 register +inline float horizontal_sum(const __m256 reg) { + const __m256 h0 = _mm256_hadd_ps(reg, reg); + const __m256 h1 = _mm256_hadd_ps(h0, h0); + + // extract high and low __m128 regs from __m256 + const __m128 h2 = _mm256_extractf128_ps(h1, 1); + const __m128 h3 = _mm256_castps256_ps128(h1); + + // get a final hsum into all 4 regs + const __m128 h4 = _mm_add_ss(h2, h3); + + // extract f[0] from __m128 + const float hsum = _mm_cvtss_f32(h4); + return hsum; +} + +} // namespace + +namespace faiss { + +template +typename std::enable_if::value, float>:: + type inline distance_single_code_avx2( + // the product quantizer + const ProductQuantizer& pq, + // precomputed distances, layout (M, ksub) + const float* sim_table, + const uint8_t* code) { + // default implementation + return distance_single_code_generic(pq, sim_table, code); +} + +template +typename std::enable_if::value, float>:: + type inline distance_single_code_avx2( + // the product quantizer + const ProductQuantizer& pq, + // precomputed distances, layout (M, ksub) + const float* sim_table, + const uint8_t* code) { + float result = 0; + + size_t m = 0; + const size_t pqM16 = pq.M / 16; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + + const __m256i ksub = _mm256_set1_epi32(pq.ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, ksub); + + // accumulators of partial sums + __m256 partialSum = _mm256_setzero_ps(); + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + const __m128i mm1 = _mm_loadu_si128((const __m128i_u*)(code + m)); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + tab += pq.ksub * 8; + + // collect partial sums + partialSum = _mm256_add_ps(partialSum, collected); + } + + // move high 8 uint8 to low ones + const __m128i mm2 = _mm_unpackhi_epi64(mm1, _mm_setzero_si128()); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + tab += pq.ksub * 8; + + // collect partial sums + partialSum = _mm256_add_ps(partialSum, collected); + } + } + + // horizontal sum for partialSum + result += horizontal_sum(partialSum); + } + + // + if (m < pq.M) { + // process leftovers + PQDecoder8 decoder(code + m, pq.nbits); + + for (; m < pq.M; m++) { + result += tab[decoder.decode()]; + tab += pq.ksub; + } + } + + return result; +} + +template +typename std::enable_if::value, void>:: + type + distance_four_codes_avx2( + // the product quantizer + const ProductQuantizer& pq, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + 
const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + distance_four_codes_generic( + pq, + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); +} + +// Combines 4 operations of distance_single_code() +template +typename std::enable_if::value, void>::type +distance_four_codes_avx2( + // the product quantizer + const ProductQuantizer& pq, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + result0 = 0; + result1 = 0; + result2 = 0; + result3 = 0; + + size_t m = 0; + const size_t pqM16 = pq.M / 16; + + constexpr intptr_t N = 4; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + const __m256i ksub = _mm256_set1_epi32(pq.ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, ksub); + + // accumulators of partial sums + __m256 partialSums[N]; + for (intptr_t j = 0; j < N; j++) { + partialSums[j] = _mm256_setzero_ps(); + } + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); + mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); + mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); + mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); + + // process first 8 codes + for (intptr_t j = 0; j < N; j++) { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = _mm256_add_ps(partialSums[j], collected); + } + tab += pq.ksub * 8; + + // process next 8 codes + for (intptr_t j = 0; j < N; j++) { + // move high 8 uint8 to low ones + const __m128i mm2 = + _mm_unpackhi_epi64(mm1[j], _mm_setzero_si128()); + + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = _mm256_add_ps(partialSums[j], collected); + } + + tab += pq.ksub * 8; + } + + // horizontal sum for partialSum + result0 += horizontal_sum(partialSums[0]); + result1 += horizontal_sum(partialSums[1]); + result2 += horizontal_sum(partialSums[2]); + result3 += horizontal_sum(partialSums[3]); + } + + // + if (m < pq.M) { + // process leftovers + PQDecoder8 decoder0(code0 + m, pq.nbits); + PQDecoder8 decoder1(code1 + m, pq.nbits); + PQDecoder8 decoder2(code2 + m, pq.nbits); + PQDecoder8 decoder3(code3 + m, pq.nbits); + for (; m < pq.M; m++) { + result0 += tab[decoder0.decode()]; + result1 += tab[decoder1.decode()]; + result2 += tab[decoder2.decode()]; + result3 += tab[decoder3.decode()]; + tab += pq.ksub; + } + } +} + +} // namespace 
faiss + +#endif diff -Nru faiss-1.7.3/faiss/impl/code_distance/code_distance_avx512.h faiss-1.7.4/faiss/impl/code_distance/code_distance_avx512.h --- faiss-1.7.3/faiss/impl/code_distance/code_distance_avx512.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/impl/code_distance/code_distance_avx512.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,102 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// // // AVX-512 version. It is not used, but let it be for the future +// // // needs. +// // template +// // typename std::enable_if<(std::is_same::value), void>:: +// // type distance_four_codes( +// // const uint8_t* __restrict code0, +// // const uint8_t* __restrict code1, +// // const uint8_t* __restrict code2, +// // const uint8_t* __restrict code3, +// // float& result0, +// // float& result1, +// // float& result2, +// // float& result3 +// // ) const { +// // result0 = 0; +// // result1 = 0; +// // result2 = 0; +// // result3 = 0; + +// // size_t m = 0; +// // const size_t pqM16 = pq.M / 16; + +// // constexpr intptr_t N = 4; + +// // const float* tab = sim_table; + +// // if (pqM16 > 0) { +// // // process 16 values per loop +// // const __m512i ksub = _mm512_set1_epi32(pq.ksub); +// // __m512i offsets_0 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, +// // 8, 9, 10, 11, 12, 13, 14, 15); +// // offsets_0 = _mm512_mullo_epi32(offsets_0, ksub); + +// // // accumulators of partial sums +// // __m512 partialSums[N]; +// // for (intptr_t j = 0; j < N; j++) { +// // partialSums[j] = _mm512_setzero_ps(); +// // } + +// // // loop +// // for (m = 0; m < pqM16 * 16; m += 16) { +// // // load 16 uint8 values +// // __m128i mm1[N]; +// // mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); +// // mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); +// // mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); +// // mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); + +// // // process first 8 codes +// // for (intptr_t j = 0; j < N; j++) { +// // // convert uint8 values (low part of __m128i) to int32 +// // // values +// // const __m512i idx1 = _mm512_cvtepu8_epi32(mm1[j]); + +// // // add offsets +// // const __m512i indices_to_read_from = +// // _mm512_add_epi32(idx1, offsets_0); + +// // // gather 8 values, similar to 8 operations of +// // // tab[idx] +// // __m512 collected = +// // _mm512_i32gather_ps( +// // indices_to_read_from, tab, sizeof(float)); + +// // // collect partial sums +// // partialSums[j] = _mm512_add_ps(partialSums[j], +// // collected); +// // } +// // tab += pq.ksub * 16; + +// // } + +// // // horizontal sum for partialSum +// // result0 += _mm512_reduce_add_ps(partialSums[0]); +// // result1 += _mm512_reduce_add_ps(partialSums[1]); +// // result2 += _mm512_reduce_add_ps(partialSums[2]); +// // result3 += _mm512_reduce_add_ps(partialSums[3]); +// // } + +// // // +// // if (m < pq.M) { +// // // process leftovers +// // PQDecoder decoder0(code0 + m, pq.nbits); +// // PQDecoder decoder1(code1 + m, pq.nbits); +// // PQDecoder decoder2(code2 + m, pq.nbits); +// // PQDecoder decoder3(code3 + m, pq.nbits); +// // for (; m < pq.M; m++) { +// // result0 += tab[decoder0.decode()]; +// // result1 += tab[decoder1.decode()]; +// // result2 += tab[decoder2.decode()]; +// // result3 += tab[decoder3.decode()]; +// // tab += pq.ksub; +// // } +// // } +// // } diff -Nru 
faiss-1.7.3/faiss/impl/code_distance/code_distance-generic.h faiss-1.7.4/faiss/impl/code_distance/code_distance-generic.h --- faiss-1.7.3/faiss/impl/code_distance/code_distance-generic.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/impl/code_distance/code_distance-generic.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,74 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace faiss { + +/// Returns the distance to a single code. +template +inline float distance_single_code_generic( + // the product quantizer + const ProductQuantizer& pq, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // the code + const uint8_t* code) { + PQDecoderT decoder(code, pq.nbits); + + const float* tab = sim_table; + float result = 0; + + for (size_t m = 0; m < pq.M; m++) { + result += tab[decoder.decode()]; + tab += pq.ksub; + } + + return result; +} + +/// Combines 4 operations of distance_single_code() +/// General-purpose version. +template +inline void distance_four_codes_generic( + // the product quantizer + const ProductQuantizer& pq, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + PQDecoderT decoder0(code0, pq.nbits); + PQDecoderT decoder1(code1, pq.nbits); + PQDecoderT decoder2(code2, pq.nbits); + PQDecoderT decoder3(code3, pq.nbits); + + const float* tab = sim_table; + result0 = 0; + result1 = 0; + result2 = 0; + result3 = 0; + + for (size_t m = 0; m < pq.M; m++) { + result0 += tab[decoder0.decode()]; + result1 += tab[decoder1.decode()]; + result2 += tab[decoder2.decode()]; + result3 += tab[decoder3.decode()]; + tab += pq.ksub; + } +} + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/impl/code_distance/code_distance.h faiss-1.7.4/faiss/impl/code_distance/code_distance.h --- faiss-1.7.3/faiss/impl/code_distance/code_distance.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/impl/code_distance/code_distance.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,123 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +// This directory contains functions to compute a distance +// from a given PQ code to a query vector, given that the +// distances to a query vector for pq.M codebooks are precomputed. +// +// The code was originally the part of IndexIVFPQ.cpp. +// The baseline implementation can be found in +// code_distance-generic.h, distance_single_code_generic(). + +// The reason for this somewhat unusual structure is that +// custom implementations may need to fall off to generic +// implementation in certain cases. So, say, avx2 header file +// needs to reference the generic header file. This is +// why the names of the functions for custom implementations +// have this _generic or _avx2 suffix. 
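// Editorial illustration (not part of the patch): a minimal sketch of how the
// dispatching wrappers in this new header are meant to be consumed. The call
// to ProductQuantizer::compute_distance_table() and the PQDecoder8 decoder
// type are taken from elsewhere in this diff; the call site itself is an
// assumed example, not faiss code.

#include <cstdint>
#include <vector>

#include <faiss/impl/ProductQuantizer.h>
#include <faiss/impl/code_distance/code_distance.h>

float adc_sketch(
        const faiss::ProductQuantizer& pq,
        const float* query,     // size pq.d
        const uint8_t* code) {  // one encoded vector, size pq.code_size
    // precomputed query-to-centroid distances, layout (M, ksub)
    std::vector<float> sim_table(pq.M * pq.ksub);
    pq.compute_distance_table(query, sim_table.data());

    // resolves to the AVX2 kernel when compiled with __AVX2__,
    // otherwise to distance_single_code_generic()
    return faiss::distance_single_code<faiss::PQDecoder8>(
            pq, sim_table.data(), code);
}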
+ +#ifdef __AVX2__ + +#include + +namespace faiss { + +template +inline float distance_single_code( + // the product quantizer + const ProductQuantizer& pq, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // the code + const uint8_t* code) { + return distance_single_code_avx2(pq, sim_table, code); +} + +template +inline void distance_four_codes( + // the product quantizer + const ProductQuantizer& pq, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + distance_four_codes_avx2( + pq, + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); +} + +} // namespace faiss + +#else + +#include + +namespace faiss { + +template +inline float distance_single_code( + // the product quantizer + const ProductQuantizer& pq, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // the code + const uint8_t* code) { + return distance_single_code_generic(pq, sim_table, code); +} + +template +inline void distance_four_codes( + // the product quantizer + const ProductQuantizer& pq, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + distance_four_codes_generic( + pq, + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); +} + +} // namespace faiss + +#endif diff -Nru faiss-1.7.3/faiss/impl/CodePacker.cpp faiss-1.7.4/faiss/impl/CodePacker.cpp --- faiss-1.7.3/faiss/impl/CodePacker.cpp 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/impl/CodePacker.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,67 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace faiss { + +/********************************************* + * CodePacker + * default of pack_all / unpack_all loops over the _1 versions + */ + +void CodePacker::pack_all(const uint8_t* flat_codes, uint8_t* block) const { + for (size_t i = 0; i < nvec; i++) { + pack_1(flat_codes + code_size * i, i, block); + } +} + +void CodePacker::unpack_all(const uint8_t* block, uint8_t* flat_codes) const { + for (size_t i = 0; i < nvec; i++) { + unpack_1(block, i, flat_codes + code_size * i); + } +} + +/********************************************* + * CodePackerFlat + */ + +CodePackerFlat::CodePackerFlat(size_t code_size) { + this->code_size = code_size; + nvec = 1; + block_size = code_size; +} + +void CodePackerFlat::pack_all(const uint8_t* flat_codes, uint8_t* block) const { + memcpy(block, flat_codes, code_size); +} + +void CodePackerFlat::unpack_all(const uint8_t* block, uint8_t* flat_codes) + const { + memcpy(flat_codes, block, code_size); +} + +void CodePackerFlat::pack_1( + const uint8_t* flat_code, + size_t offset, + uint8_t* block) const { + assert(offset == 0); + pack_all(flat_code, block); +} + +void CodePackerFlat::unpack_1( + const uint8_t* block, + size_t offset, + uint8_t* flat_code) const { + assert(offset == 0); + unpack_all(block, flat_code); +} + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/impl/CodePacker.h faiss-1.7.4/faiss/impl/CodePacker.h --- faiss-1.7.3/faiss/impl/CodePacker.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/impl/CodePacker.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,71 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace faiss { + +/** + * Packing consists in combining a fixed number of codes of constant size + * (code_size) into a block of data where they may (or may not) be interleaved + * for efficient consumption by distance computation kernels. This exists for + * the "fast_scan" indexes on CPU and for some GPU kernels. 
+ */ +struct CodePacker { + size_t code_size; // input code size in bytes + size_t nvec; // number of vectors per block + size_t block_size; // size of one block in bytes (>= code_size * nvec) + + // pack a single code to a block + virtual void pack_1( + const uint8_t* + flat_code, // code to write to the block, size code_size + size_t offset, // offset in the block (0 <= offset < nvec) + uint8_t* block // block to write to (size block_size) + ) const = 0; + + // unpack a single code from a block + virtual void unpack_1( + const uint8_t* block, // block to read from (size block_size) + size_t offset, // offset in the block (0 <= offset < nvec) + uint8_t* flat_code // where to write the resulting code, size + // code_size + ) const = 0; + + // pack all code in a block + virtual void pack_all( + const uint8_t* flat_codes, // codes to write to the block, size + // (nvec * code_size) + uint8_t* block // block to write to (size block_size) + ) const; + + // unpack all code in a block + virtual void unpack_all( + const uint8_t* block, // block to read from (size block_size) + uint8_t* flat_codes // where to write the resulting codes size (nvec + // * code_size) + ) const; + + virtual ~CodePacker() {} +}; + +/** Trivial code packer where codes are stored one by one */ +struct CodePackerFlat : CodePacker { + explicit CodePackerFlat(size_t code_size); + + void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) + const final; + void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) + const final; + + void pack_all(const uint8_t* flat_codes, uint8_t* block) const final; + void unpack_all(const uint8_t* block, uint8_t* flat_codes) const final; +}; + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/impl/DistanceComputer.h faiss-1.7.4/faiss/impl/DistanceComputer.h --- faiss-1.7.3/faiss/impl/DistanceComputer.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/DistanceComputer.h 2023-04-19 13:18:30.000000000 +0000 @@ -23,8 +23,6 @@ * that has additional methods to handle the inverted list context. ***********************************************************/ struct DistanceComputer { - using idx_t = Index::idx_t; - /// called before computing distances. 
Pointer x should remain valid /// while operator () is called virtual void set_query(const float* x) = 0; diff -Nru faiss-1.7.3/faiss/impl/HNSW.cpp faiss-1.7.4/faiss/impl/HNSW.cpp --- faiss-1.7.3/faiss/impl/HNSW.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/HNSW.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -47,11 +47,6 @@ HNSW::HNSW(int M) : rng(12345) { set_default_probas(M, 1.0 / log(M)); - max_level = -1; - entry_point = -1; - efSearch = 16; - efConstruction = 40; - upper_beam = 1; offsets.push_back(0); } @@ -509,7 +504,6 @@ namespace { -using idx_t = HNSW::idx_t; using MinimaxHeap = HNSW::MinimaxHeap; using Node = HNSW::Node; /** Do a BFS on the candidates list */ @@ -837,8 +831,10 @@ if (k == n) { if (v >= dis[0]) return; + if (ids[0] != -1) { + --nvalid; + } faiss::heap_pop(k--, dis.data(), ids.data()); - --nvalid; } faiss::heap_push(++k, dis.data(), ids.data(), v, i); ++nvalid; diff -Nru faiss-1.7.3/faiss/impl/HNSW.h faiss-1.7.4/faiss/impl/HNSW.h --- faiss-1.7.3/faiss/impl/HNSW.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/HNSW.h 2023-04-19 13:18:30.000000000 +0000 @@ -52,10 +52,7 @@ struct HNSW { /// internal storage of vectors (32 bits: this is expensive) - typedef int storage_idx_t; - - /// Faiss results are 64-bit - typedef Index::idx_t idx_t; + using storage_idx_t = int32_t; typedef std::pair Node; @@ -124,25 +121,25 @@ /// entry point in the search structure (one of the points with maximum /// level - storage_idx_t entry_point; + storage_idx_t entry_point = -1; faiss::RandomGenerator rng; /// maximum level - int max_level; + int max_level = -1; /// expansion factor at construction time - int efConstruction; + int efConstruction = 40; /// expansion factor at search time - int efSearch; + int efSearch = 16; /// during search: do we check whether the next best distance is good /// enough? bool check_relative_distance = true; /// number of entry points in levels > 0. - int upper_beam; + int upper_beam = 1; /// use bounded queue during exploration bool search_bounded_queue = true; diff -Nru faiss-1.7.3/faiss/impl/IDSelector.cpp faiss-1.7.4/faiss/impl/IDSelector.cpp --- faiss-1.7.3/faiss/impl/IDSelector.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/IDSelector.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -92,7 +92,7 @@ mask = ((idx_t)1 << nbits) - 1; bloom.resize((idx_t)1 << (nbits - 3), 0); for (idx_t i = 0; i < n; i++) { - Index::idx_t id = indices[i]; + idx_t id = indices[i]; set.insert(id); id &= mask; bloom[id >> 3] |= 1 << (id & 7); diff -Nru faiss-1.7.3/faiss/impl/IDSelector.h faiss-1.7.4/faiss/impl/IDSelector.h --- faiss-1.7.3/faiss/impl/IDSelector.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/IDSelector.h 2023-04-19 13:18:30.000000000 +0000 @@ -19,7 +19,6 @@ /** Encapsulates a set of ids to handle. */ struct IDSelector { - using idx_t = Index::idx_t; virtual bool is_member(idx_t id) const = 0; virtual ~IDSelector() {} }; @@ -132,4 +131,43 @@ virtual ~IDSelectorAll() {} }; +/// does an AND operation on the the two given IDSelector's is_membership +/// results. +struct IDSelectorAnd : IDSelector { + const IDSelector* lhs; + const IDSelector* rhs; + IDSelectorAnd(const IDSelector* lhs, const IDSelector* rhs) + : lhs(lhs), rhs(rhs) {} + bool is_member(idx_t id) const final { + return lhs->is_member(id) && rhs->is_member(id); + }; + virtual ~IDSelectorAnd() {} +}; + +/// does an OR operation on the the two given IDSelector's is_membership +/// results. 
+struct IDSelectorOr : IDSelector { + const IDSelector* lhs; + const IDSelector* rhs; + IDSelectorOr(const IDSelector* lhs, const IDSelector* rhs) + : lhs(lhs), rhs(rhs) {} + bool is_member(idx_t id) const final { + return lhs->is_member(id) || rhs->is_member(id); + }; + virtual ~IDSelectorOr() {} +}; + +/// does an XOR operation on the the two given IDSelector's is_membership +/// results. +struct IDSelectorXOr : IDSelector { + const IDSelector* lhs; + const IDSelector* rhs; + IDSelectorXOr(const IDSelector* lhs, const IDSelector* rhs) + : lhs(lhs), rhs(rhs) {} + bool is_member(idx_t id) const final { + return lhs->is_member(id) ^ rhs->is_member(id); + }; + virtual ~IDSelectorXOr() {} +}; + } // namespace faiss diff -Nru faiss-1.7.3/faiss/impl/index_read.cpp faiss-1.7.4/faiss/impl/index_read.cpp --- faiss-1.7.3/faiss/impl/index_read.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/index_read.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -65,7 +65,7 @@ static void read_index_header(Index* idx, IOReader* f) { READ1(idx->d); READ1(idx->ntotal); - Index::idx_t dummy; + idx_t dummy; READ1(dummy); READ1(dummy); READ1(idx->is_trained); @@ -279,6 +279,8 @@ aq->search_type == AdditiveQuantizer::ST_norm_lsq2x4 || aq->search_type == AdditiveQuantizer::ST_norm_rq2x4) { READXBVECTOR(aq->qnorm.codes); + aq->qnorm.ntotal = aq->qnorm.codes.size() / 4; + aq->qnorm.update_permutation(); } if (aq->search_type == AdditiveQuantizer::ST_norm_lsq2x4 || @@ -439,7 +441,6 @@ dm->type = (DirectMap::Type)maintain_direct_map; READVECTOR(dm->array); if (dm->type == DirectMap::Hashtable) { - using idx_t = Index::idx_t; std::vector> v; READVECTOR(v); std::unordered_map& map = dm->hashtable; @@ -453,7 +454,7 @@ static void read_ivf_header( IndexIVF* ivf, IOReader* f, - std::vector>* ids = nullptr) { + std::vector>* ids = nullptr) { read_index_header(ivf, f); READ1(ivf->nlist); READ1(ivf->nprobe); @@ -470,7 +471,7 @@ // used for legacy formats static ArrayInvertedLists* set_array_invlist( IndexIVF* ivf, - std::vector>& ids) { + std::vector>& ids) { ArrayInvertedLists* ail = new ArrayInvertedLists(ivf->nlist, ivf->code_size); std::swap(ail->ids, ids); @@ -487,7 +488,7 @@ : nullptr; IndexIVFPQ* ivpq = ivfpqr ? ivfpqr : new IndexIVFPQ(); - std::vector> ids; + std::vector> ids; read_ivf_header(ivpq, f, legacy ? 
&ids : nullptr); READ1(ivpq->by_residual); READ1(ivpq->code_size); @@ -728,10 +729,11 @@ READ1(ivaqfs->max_train_points); read_InvertedLists(ivaqfs, f, io_flags); + ivaqfs->init_code_packer(); idx = ivaqfs; } else if (h == fourcc("IvFl") || h == fourcc("IvFL")) { // legacy IndexIVFFlat* ivfl = new IndexIVFFlat(); - std::vector> ids; + std::vector> ids; read_ivf_header(ivfl, f, &ids); ivfl->code_size = ivfl->d * sizeof(float); ArrayInvertedLists* ail = set_array_invlist(ivfl, ids); @@ -754,10 +756,10 @@ read_ivf_header(ivfl, f); ivfl->code_size = ivfl->d * sizeof(float); { - std::vector tab; + std::vector tab; READVECTOR(tab); for (long i = 0; i < tab.size(); i += 2) { - std::pair pair(tab[i], tab[i + 1]); + std::pair pair(tab[i], tab[i + 1]); ivfl->instances.insert(pair); } } @@ -788,7 +790,7 @@ idx = idxl; } else if (h == fourcc("IvSQ")) { // legacy IndexIVFScalarQuantizer* ivsc = new IndexIVFScalarQuantizer(); - std::vector> ids; + std::vector> ids; read_ivf_header(ivsc, f, &ids); read_ScalarQuantizer(&ivsc->sq, f); READ1(ivsc->code_size); @@ -1002,6 +1004,7 @@ ivpq->nbits = pq.nbits; ivpq->ksub = (1 << pq.nbits); ivpq->code_size = pq.code_size; + ivpq->init_code_packer(); idx = ivpq; } else if (h == fourcc("IRMf")) { @@ -1072,7 +1075,7 @@ static void read_binary_ivf_header( IndexBinaryIVF* ivf, IOReader* f, - std::vector>* ids = nullptr) { + std::vector>* ids = nullptr) { read_index_binary_header(ivf, f); READ1(ivf->nlist); READ1(ivf->nprobe); diff -Nru faiss-1.7.3/faiss/impl/index_write.cpp faiss-1.7.4/faiss/impl/index_write.cpp --- faiss-1.7.3/faiss/impl/index_write.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/index_write.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -84,7 +84,7 @@ static void write_index_header(const Index* idx, IOWriter* f) { WRITE1(idx->d); WRITE1(idx->ntotal); - Index::idx_t dummy = 1 << 20; + idx_t dummy = 1 << 20; WRITE1(dummy); WRITE1(dummy); WRITE1(idx->is_trained); @@ -373,7 +373,6 @@ WRITE1(maintain_direct_map); WRITEVECTOR(dm->array); if (dm->type == DirectMap::Hashtable) { - using idx_t = Index::idx_t; std::vector> v; const std::unordered_map& map = dm->hashtable; v.resize(map.size()); @@ -615,7 +614,7 @@ WRITE1(h); write_ivf_header(ivfl, f); { - std::vector tab(2 * ivfl->instances.size()); + std::vector tab(2 * ivfl->instances.size()); long i = 0; for (auto it = ivfl->instances.begin(); it != ivfl->instances.end(); ++it) { @@ -900,7 +899,7 @@ size_t ntotal, IOWriter* f) { int id_bits = 0; - while ((ntotal > ((Index::idx_t)1 << id_bits))) { + while ((ntotal > ((idx_t)1 << id_bits))) { id_bits++; } WRITE1(id_bits); diff -Nru faiss-1.7.3/faiss/impl/kmeans1d.cpp faiss-1.7.4/faiss/impl/kmeans1d.cpp --- faiss-1.7.3/faiss/impl/kmeans1d.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/kmeans1d.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -20,7 +20,6 @@ namespace faiss { -using idx_t = Index::idx_t; using LookUpFunc = std::function; void reduce( diff -Nru faiss-1.7.3/faiss/impl/kmeans1d.h faiss-1.7.4/faiss/impl/kmeans1d.h --- faiss-1.7.3/faiss/impl/kmeans1d.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/kmeans1d.h 2023-04-19 13:18:30.000000000 +0000 @@ -22,10 +22,10 @@ * @param argmins argmin of each row */ void smawk( - const Index::idx_t nrows, - const Index::idx_t ncols, + const idx_t nrows, + const idx_t ncols, const float* x, - Index::idx_t* argmins); + idx_t* argmins); /** Exact 1D K-Means by dynamic programming * diff -Nru faiss-1.7.3/faiss/impl/lattice_Zn.cpp faiss-1.7.4/faiss/impl/lattice_Zn.cpp --- 
faiss-1.7.3/faiss/impl/lattice_Zn.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/lattice_Zn.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -636,7 +636,7 @@ } } -// if not use_rec, instanciate an arbitrary harmless znc_rec +// if not use_rec, instantiate an arbitrary harmless znc_rec ZnSphereCodecAlt::ZnSphereCodecAlt(int dim, int r2) : ZnSphereCodec(dim, r2), use_rec((dim & (dim - 1)) == 0), diff -Nru faiss-1.7.3/faiss/impl/LocalSearchQuantizer.cpp faiss-1.7.4/faiss/impl/LocalSearchQuantizer.cpp --- faiss-1.7.3/faiss/impl/LocalSearchQuantizer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/LocalSearchQuantizer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -21,6 +21,15 @@ #include // BitstringWriter #include +#include + +// this is needed for prefetching +#include + +#ifdef __AVX2__ +#include +#endif + extern "C" { // LU decomoposition of a general matrix void sgetrf_( @@ -151,23 +160,7 @@ Search_type_t search_type) : AdditiveQuantizer(d, std::vector(M, nbits), search_type) { K = (1 << nbits); - - train_iters = 25; - train_ils_iters = 8; - icm_iters = 4; - - encode_ils_iters = 16; - - p = 0.5f; - lambd = 1e-2f; - - chunk_size = 10000; - nperts = 4; - - random_seed = 0x12345; std::srand(random_seed); - - icm_encoder_factory = nullptr; } LocalSearchQuantizer::~LocalSearchQuantizer() { @@ -192,7 +185,7 @@ // allocate memory for codebooks, size [M, K, d] codebooks.resize(M * K * d); - // randomly intialize codes + // randomly initialize codes std::mt19937 gen(random_seed); std::vector codes(n * M); // [n, M] random_int32(codes, 0, K - 1, gen); @@ -604,54 +597,72 @@ FAISS_THROW_IF_NOT(M != 0 && K != 0); FAISS_THROW_IF_NOT(binaries != nullptr); - for (size_t iter = 0; iter < n_iters; iter++) { - // condition on the m-th subcode - for (size_t m = 0; m < M; m++) { - std::vector objs(n * K); -#pragma omp parallel for - for (int64_t i = 0; i < n; i++) { - auto u = unaries + m * n * K + i * K; - memcpy(objs.data() + i * K, u, sizeof(float) * K); - } +#pragma omp parallel for schedule(dynamic) + for (int64_t i = 0; i < n; i++) { + std::vector objs(K); - // compute objective function by adding unary - // and binary terms together - for (size_t other_m = 0; other_m < M; other_m++) { - if (other_m == m) { - continue; + for (size_t iter = 0; iter < n_iters; iter++) { + // condition on the m-th subcode + for (size_t m = 0; m < M; m++) { + // copy + auto u = unaries + m * n * K + i * K; + for (size_t code = 0; code < K; code++) { + objs[code] = u[code]; } -#pragma omp parallel for - for (int64_t i = 0; i < n; i++) { + // compute objective function by adding unary + // and binary terms together + for (size_t other_m = 0; other_m < M; other_m++) { + if (other_m == m) { + continue; + } + +#ifdef __AVX2__ + // TODO: add platform-independent compiler-independent + // prefetch utilities. + if (other_m + 1 < M) { + // do a single prefetch + int32_t code2 = codes[i * M + other_m + 1]; + // for (int32_t code = 0; code < K; code += 64) { + int32_t code = 0; + { + size_t binary_idx = (other_m + 1) * M * K * K + + m * K * K + code2 * K + code; + _mm_prefetch(binaries + binary_idx, _MM_HINT_T0); + } + } +#endif + for (int32_t code = 0; code < K; code++) { int32_t code2 = codes[i * M + other_m]; - size_t binary_idx = m * M * K * K + other_m * K * K + - code * K + code2; - // binaries[m, other_m, code, code2] - objs[i * K + code] += binaries[binary_idx]; + size_t binary_idx = other_m * M * K * K + m * K * K + + code2 * K + code; + // binaries[m, other_m, code, code2]. 
+ // It is symmetric over (m <-> other_m) + // and (code <-> code2). + // So, replace the op with + // binaries[other_m, m, code2, code]. + objs[code] += binaries[binary_idx]; } } - } - // find the optimal value of the m-th subcode -#pragma omp parallel for - for (int64_t i = 0; i < n; i++) { + // find the optimal value of the m-th subcode float best_obj = HUGE_VALF; int32_t best_code = 0; - for (size_t code = 0; code < K; code++) { - float obj = objs[i * K + code]; - if (obj < best_obj) { - best_obj = obj; - best_code = code; - } - } + + // find one using SIMD. The following operation is similar + // to the search of the smallest element in objs + using C = CMax; + HeapWithBuckets::addn( + K, objs.data(), 1, &best_obj, &best_code); + + // done codes[i * M + m] = best_code; - } - } // loop M + } // loop M + } } } - void LocalSearchQuantizer::perturb_codes( int32_t* codes, size_t n, diff -Nru faiss-1.7.3/faiss/impl/LocalSearchQuantizer.h faiss-1.7.4/faiss/impl/LocalSearchQuantizer.h --- faiss-1.7.3/faiss/impl/LocalSearchQuantizer.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/LocalSearchQuantizer.h 2023-04-19 13:18:30.000000000 +0000 @@ -45,22 +45,21 @@ struct LocalSearchQuantizer : AdditiveQuantizer { size_t K; ///< number of codes per codebook - size_t train_iters; ///< number of iterations in training + size_t train_iters = 25; ///< number of iterations in training + size_t encode_ils_iters = 16; ///< iterations of local search in encoding + size_t train_ils_iters = 8; ///< iterations of local search in training + size_t icm_iters = 4; ///< number of iterations in icm - size_t encode_ils_iters; ///< iterations of local search in encoding - size_t train_ils_iters; ///< iterations of local search in training - size_t icm_iters; ///< number of iterations in icm + float p = 0.5f; ///< temperature factor + float lambd = 1e-2f; ///< regularization factor - float p; ///< temperature factor - float lambd; ///< regularization factor + size_t chunk_size = 10000; ///< nb of vectors to encode at a time - size_t chunk_size; ///< nb of vectors to encode at a time + int random_seed = 0x12345; ///< seed for random generator + size_t nperts = 4; ///< number of perturbation in each code - int random_seed; ///< seed for random generator - size_t nperts; ///< number of perturbation in each code - - ///< if non-NULL, use this encoder to encode - lsq::IcmEncoderFactory* icm_encoder_factory; + ///< if non-NULL, use this encoder to encode (owned by the object) + lsq::IcmEncoderFactory* icm_encoder_factory = nullptr; bool update_codebooks_with_double = true; diff -Nru faiss-1.7.3/faiss/impl/NNDescent.cpp faiss-1.7.4/faiss/impl/NNDescent.cpp --- faiss-1.7.3/faiss/impl/NNDescent.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/NNDescent.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -147,14 +147,8 @@ constexpr int NUM_EVAL_POINTS = 100; -NNDescent::NNDescent(const int d, const int K) : K(K), random_seed(2021), d(d) { - ntotal = 0; - has_built = false; - S = 10; - R = 100; +NNDescent::NNDescent(const int d, const int K) : K(K), d(d) { L = K + 50; - iter = 10; - search_L = 0; } NNDescent::~NNDescent() {} @@ -311,7 +305,7 @@ for (int i = 0; i < c.size(); i++) { std::vector tmp; for (int j = 0; j < N; j++) { - if (i == j) + if (c[i] == j) continue; // skip itself float dist = qdis.symmetric_dis(c[i], j); tmp.push_back(Neighbor(j, dist, true)); @@ -425,7 +419,7 @@ // candidate pool, the K best items is the result. 
std::vector retset(L + 1); - // Randomly choose L points to intialize the candidate pool + // Randomly choose L points to initialize the candidate pool std::vector init_ids(L); std::mt19937 rng(random_seed); diff -Nru faiss-1.7.3/faiss/impl/NNDescent.h faiss-1.7.4/faiss/impl/NNDescent.h --- faiss-1.7.3/faiss/impl/NNDescent.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/NNDescent.h 2023-04-19 13:18:30.000000000 +0000 @@ -90,7 +90,6 @@ struct NNDescent { using storage_idx_t = int; - using idx_t = Index::idx_t; using KNNGraph = std::vector; @@ -133,19 +132,20 @@ std::vector& ctrl_points, std::vector>& acc_eval_set); - bool has_built; + bool has_built = false; - int K; // K in KNN graph - int S; // number of sample neighbors to be updated for each node - int R; // size of reverse links, 0 means the reverse links will not be used - int L; // size of the candidate pool in building - int iter; // number of iterations to iterate over - int search_L; // size of candidate pool in searching - int random_seed; // random seed for generators + int S = 10; // number of sample neighbors to be updated for each node + int R = 100; // size of reverse links, 0 means the reverse links will not be + // used + int iter = 10; // number of iterations to iterate over + int search_L = 0; // size of candidate pool in searching + int random_seed = 2021; // random seed for generators + int K; // K in KNN graph int d; // dimensions + int L; // size of the candidate pool in building - int ntotal; + int ntotal = 0; KNNGraph graph; std::vector final_graph; diff -Nru faiss-1.7.3/faiss/impl/NSG.cpp faiss-1.7.4/faiss/impl/NSG.cpp --- faiss-1.7.3/faiss/impl/NSG.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/NSG.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -29,8 +29,6 @@ distances. 
This makes supporting INNER_PRODUCE search easier */ struct NegativeDistanceComputer : DistanceComputer { - using idx_t = Index::idx_t; - /// owned by this DistanceComputer* basedis; @@ -59,7 +57,7 @@ } // namespace DistanceComputer* storage_distance_computer(const Index* storage) { - if (storage->metric_type == METRIC_INNER_PRODUCT) { + if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); } else { return storage->get_distance_computer(); @@ -140,9 +138,6 @@ NSG::NSG(int R) : R(R), rng(0x0903) { L = R + 32; C = R + 100; - search_L = 16; - ntotal = 0; - is_built = false; srand(0x1998); } diff -Nru faiss-1.7.3/faiss/impl/NSG.h faiss-1.7.4/faiss/impl/NSG.h --- faiss-1.7.3/faiss/impl/NSG.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/NSG.h 2023-04-19 13:18:30.000000000 +0000 @@ -98,12 +98,9 @@ struct NSG { /// internal storage of vectors (32 bits: this is expensive) - using storage_idx_t = int; + using storage_idx_t = int32_t; - /// Faiss results are 64-bit - using idx_t = Index::idx_t; - - int ntotal; ///< nb of nodes + int ntotal = 0; ///< nb of nodes // construction-time parameters int R; ///< nb of neighbors per node @@ -111,13 +108,13 @@ int C; ///< candidate pool size at construction time // search-time parameters - int search_L; ///< length of the search path + int search_L = 16; ///< length of the search path int enterpoint; ///< enterpoint std::shared_ptr> final_graph; ///< NSG graph structure - bool is_built; ///< NSG is built or not + bool is_built = false; ///< NSG is built or not RandomGenerator rng; ///< random generator diff -Nru faiss-1.7.3/faiss/impl/platform_macros.h faiss-1.7.4/faiss/impl/platform_macros.h --- faiss-1.7.3/faiss/impl/platform_macros.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/platform_macros.h 2023-04-19 13:18:30.000000000 +0000 @@ -7,6 +7,10 @@ #pragma once +// basic int types and size_t +#include +#include + #ifdef _MSC_VER /******************************************************* @@ -19,6 +23,10 @@ #define FAISS_API __declspec(dllimport) #endif // FAISS_MAIN_LIB +#ifdef _MSC_VER +#define strtok_r strtok_s +#endif // _MSC_VER + #define __PRETTY_FUNCTION__ __FUNCSIG__ #define posix_memalign(p, a, s) \ @@ -87,3 +95,56 @@ #define ALIGNED(x) __attribute__((aligned(x))) #endif // _MSC_VER + +#if defined(__GNUC__) || defined(__clang__) +#define FAISS_DEPRECATED(msg) __attribute__((deprecated(msg))) +#else +#define FAISS_DEPRECATED(msg) +#endif // GCC or Clang + +// Localized enablement of imprecise floating point operations +// You need to use all 3 macros to cover all compilers. +#if defined(_MSC_VER) +#define FAISS_PRAGMA_IMPRECISE_LOOP +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + __pragma(float_control(precise, off, push)) +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END __pragma(float_control(pop)) +#elif defined(__clang__) +#define FAISS_PRAGMA_IMPRECISE_LOOP \ + _Pragma("clang loop vectorize(enable) interleave(enable)") + +// clang-format off + +// the following ifdef is needed, because old versions of clang (prior to 14) +// do not generate FMAs on x86 unless this pragma is used. On the other hand, +// ARM does not support the following pragma flag. +// TODO: find out how to enable FMAs on clang 10 and earlier. 
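// Editorial illustration (not part of the patch): how the three
// FAISS_PRAGMA_IMPRECISE_* macros defined in this hunk are intended to be
// combined around a floating-point reduction, so the compiler may reassociate
// the sum and contract FMAs inside the marked region only. The kernel below
// is an assumed example, not code from this diff.

#include <cstddef>

#include <faiss/impl/platform_macros.h>

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float imprecise_inner_product_sketch(
        const float* x,
        const float* y,
        size_t d) {
    float sum = 0.0f;
    FAISS_PRAGMA_IMPRECISE_LOOP
    for (size_t i = 0; i < d; i++) {
        // reassociation / FMA contraction is permitted inside this region
        sum += x[i] * y[i];
    }
    return sum;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END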
+#if defined(__x86_64__) && (defined(__clang_major__) && (__clang_major__ > 10)) +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + _Pragma("float_control(precise, off, push)") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("float_control(pop)") +#else +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END +#endif +#elif defined(__GNUC__) +// Unfortunately, GCC does not provide a pragma for detecting it. +// So, we have to stick to GNUC, which is defined by MANY compilers. +// This is why clang/icc needs to be checked first. + +// todo: add __INTEL_COMPILER check for the classic ICC +// todo: add __INTEL_LLVM_COMPILER for ICX + +#define FAISS_PRAGMA_IMPRECISE_LOOP +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + _Pragma("GCC push_options") \ + _Pragma("GCC optimize (\"unroll-loops,associative-math,no-signed-zeros\")") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END \ + _Pragma("GCC pop_options") +#else +#define FAISS_PRAGMA_IMPRECISE_LOOP +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END +#endif + +// clang-format on diff -Nru faiss-1.7.3/faiss/impl/PolysemousTraining.cpp faiss-1.7.4/faiss/impl/PolysemousTraining.cpp --- faiss-1.7.3/faiss/impl/PolysemousTraining.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/PolysemousTraining.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -8,7 +8,6 @@ // -*- c++ -*- #include -#include "faiss/impl/FaissAssert.h" #include #include @@ -36,19 +35,6 @@ * Optimization code ****************************************************/ -SimulatedAnnealingParameters::SimulatedAnnealingParameters() { - // set some reasonable defaults for the optimization - init_temperature = 0.7; - temperature_decay = pow(0.9, 1 / 500.); - // reduce by a factor 0.9 every 500 it - n_iter = 500000; - n_redo = 2; - seed = 123; - verbose = 0; - only_bit_flips = false; - init_random = false; -} - // what would the cost update be if iw and jw were swapped? 
// default implementation just computes both and computes the difference double PermutationObjective::cost_update(const int* perm, int iw, int jw) @@ -906,7 +892,7 @@ ScopeDeleter1 del(obj); if (verbose > 0) { - printf(" m=%d, nq=%zd, nb=%zd, intialize RankingScore " + printf(" m=%d, nq=%zd, nb=%zd, initialize RankingScore " "in %.3f ms\n", m, nq, diff -Nru faiss-1.7.3/faiss/impl/PolysemousTraining.h faiss-1.7.4/faiss/impl/PolysemousTraining.h --- faiss-1.7.3/faiss/impl/PolysemousTraining.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/PolysemousTraining.h 2023-04-19 13:18:30.000000000 +0000 @@ -17,18 +17,19 @@ /// parameters used for the simulated annealing method struct SimulatedAnnealingParameters { // optimization parameters - double init_temperature; // init probability of accepting a bad swap - double temperature_decay; // at each iteration the temp is multiplied by - // this - int n_iter; // nb of iterations - int n_redo; // nb of runs of the simulation - int seed; // random seed - int verbose; - bool only_bit_flips; // restrict permutation changes to bit flips - bool init_random; // initialize with a random permutation (not identity) + double init_temperature = 0.7; // init probability of accepting a bad swap + // at each iteration the temp is multiplied by this + double temperature_decay = 0.9997893011688015; // = 0.9^(1/500) + int n_iter = 500000; // nb of iterations + int n_redo = 2; // nb of runs of the simulation + int seed = 123; // random seed + int verbose = 0; + bool only_bit_flips = false; // restrict permutation changes to bit flips + bool init_random = + false; // initialize with a random permutation (not identity) // set reasonable defaults - SimulatedAnnealingParameters(); + SimulatedAnnealingParameters() {} }; /// abstract class for the loss function diff -Nru faiss-1.7.3/faiss/impl/pq4_fast_scan.cpp faiss-1.7.4/faiss/impl/pq4_fast_scan.cpp --- faiss-1.7.3/faiss/impl/pq4_fast_scan.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/pq4_fast_scan.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -88,7 +88,7 @@ size_t i0, size_t i1, size_t bbs, - size_t M2, + size_t nsq, uint8_t* blocks) { const uint8_t perm0[16] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; @@ -98,9 +98,9 @@ size_t block1 = ((i1 - 1) / bbs) + 1; for (size_t b = block0; b < block1; b++) { - uint8_t* codes2 = blocks + b * bbs * M2 / 2; + uint8_t* codes2 = blocks + b * bbs * nsq / 2; int64_t i_base = b * bbs - i0; - for (int sq = 0; sq < M2; sq += 2) { + for (int sq = 0; sq < nsq; sq += 2) { for (size_t i = 0; i < bbs; i += 32) { std::array c, c0, c1; get_matrix_column( @@ -127,7 +127,7 @@ // get the specific address of the vector inside a block // shift is used for determine the if the saved in bits 0..3 (false) or // bits 4..7 (true) -uint8_t get_vector_specific_address( +size_t get_vector_specific_address( size_t bbs, size_t vector_id, size_t sq, @@ -189,6 +189,50 @@ } } +/*************************************************************** + * CodePackerPQ4 implementation + ***************************************************************/ + +CodePackerPQ4::CodePackerPQ4(size_t nsq, size_t bbs) { + this->nsq = nsq; + nvec = bbs; + code_size = (nsq * 4 + 7) / 8; + block_size = ((nsq + 1) / 2) * bbs; +} + +void CodePackerPQ4::pack_1( + const uint8_t* flat_code, + size_t offset, + uint8_t* block) const { + size_t bbs = nvec; + if (offset >= nvec) { + block += (offset / nvec) * block_size; + offset = offset % nvec; + } + for (size_t i = 0; i < code_size; i++) { + uint8_t 
code = flat_code[i]; + pq4_set_packed_element(block, code & 15, bbs, nsq, offset, 2 * i); + pq4_set_packed_element(block, code >> 4, bbs, nsq, offset, 2 * i + 1); + } +} + +void CodePackerPQ4::unpack_1( + const uint8_t* block, + size_t offset, + uint8_t* flat_code) const { + size_t bbs = nvec; + if (offset >= nvec) { + block += (offset / nvec) * block_size; + offset = offset % nvec; + } + for (size_t i = 0; i < code_size; i++) { + uint8_t code0, code1; + code0 = pq4_get_packed_element(block, bbs, nsq, offset, 2 * i); + code1 = pq4_get_packed_element(block, bbs, nsq, offset, 2 * i + 1); + flat_code[i] = code0 | (code1 << 4); + } +} + /*************************************************************** * Packing functions for Look-Up Tables (LUT) ***************************************************************/ diff -Nru faiss-1.7.3/faiss/impl/pq4_fast_scan.h faiss-1.7.4/faiss/impl/pq4_fast_scan.h --- faiss-1.7.3/faiss/impl/pq4_fast_scan.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/pq4_fast_scan.h 2023-04-19 13:18:30.000000000 +0000 @@ -10,6 +10,8 @@ #include #include +#include + /** PQ4 SIMD packing and accumulation functions * * The basic kernel accumulates nq query vectors with bbs = nb * 2 * 16 vectors @@ -17,7 +19,7 @@ * otherwise register spilling becomes too large. * * The implementation of these functions is spread over 3 cpp files to reduce - * parallel compile times. Templates are instanciated explicitly. + * parallel compile times. Templates are instantiated explicitly. */ namespace faiss { @@ -29,7 +31,7 @@ * @param ntotal number of input codes * @param nb output number of codes (ntotal rounded up to a multiple of * bbs) - * @param M2 number of sub-quantizers (=M rounded up to a muliple of 2) + * @param nsq number of sub-quantizers (=M rounded up to a muliple of 2) * @param bbs size of database blocks (multiple of 32) * @param blocks output array, size nb * nsq / 2. */ @@ -39,7 +41,7 @@ size_t M, size_t nb, size_t bbs, - size_t M2, + size_t nsq, uint8_t* blocks); /** Same as pack_codes but write in a given range of the output, @@ -56,7 +58,7 @@ size_t i0, size_t i1, size_t bbs, - size_t M2, + size_t nsq, uint8_t* blocks); /** get a single element from a packed codes table @@ -84,6 +86,18 @@ size_t vector_id, size_t sq); +/** CodePacker API for the PQ4 fast-scan */ +struct CodePackerPQ4 : CodePacker { + size_t nsq; + + CodePackerPQ4(size_t nsq, size_t bbs); + + void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) + const final; + void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) + const final; +}; + /** Pack Look-up table for consumption by the kernel. 
* * @param nq number of queries diff -Nru faiss-1.7.3/faiss/impl/pq4_fast_scan_search_qbs.cpp faiss-1.7.4/faiss/impl/pq4_fast_scan_search_qbs.cpp --- faiss-1.7.3/faiss/impl/pq4_fast_scan_search_qbs.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/pq4_fast_scan_search_qbs.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -189,7 +189,7 @@ DISPATCH(3); DISPATCH(4); } - FAISS_THROW_FMT("accumulate nq=%d not instanciated", nq); + FAISS_THROW_FMT("accumulate nq=%d not instantiated", nq); #undef DISPATCH } @@ -263,7 +263,7 @@ DISPATCH(4); #undef DISPATCH default: - FAISS_THROW_FMT("accumulate nq=%d not instanciated", nq); + FAISS_THROW_FMT("accumulate nq=%d not instantiated", nq); } i0 += nq; LUT += nq * nsq * 16; diff -Nru faiss-1.7.3/faiss/impl/ProductAdditiveQuantizer.cpp faiss-1.7.4/faiss/impl/ProductAdditiveQuantizer.cpp --- faiss-1.7.3/faiss/impl/ProductAdditiveQuantizer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/ProductAdditiveQuantizer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -65,13 +65,6 @@ M += q->M; nbits.insert(nbits.end(), q->nbits.begin(), q->nbits.end()); } - verbose = false; - is_trained = false; - norm_max = norm_min = NAN; - code_size = 0; - tot_bits = 0; - total_codebook_size = 0; - only_8bit = false; set_derived_values(); // ProductAdditiveQuantizer diff -Nru faiss-1.7.3/faiss/impl/ProductQuantizer.cpp faiss-1.7.4/faiss/impl/ProductQuantizer.cpp --- faiss-1.7.3/faiss/impl/ProductQuantizer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/ProductQuantizer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -421,15 +421,28 @@ void ProductQuantizer::compute_distance_table(const float* x, float* dis_table) const { - size_t m; - - for (m = 0; m < M; m++) { - fvec_L2sqr_ny( - dis_table + m * ksub, - x + m * dsub, - get_centroids(m, 0), - dsub, - ksub); + if (transposed_centroids.empty()) { + // use regular version + for (size_t m = 0; m < M; m++) { + fvec_L2sqr_ny( + dis_table + m * ksub, + x + m * dsub, + get_centroids(m, 0), + dsub, + ksub); + } + } else { + // transposed centroids are available, use'em + for (size_t m = 0; m < M; m++) { + fvec_L2sqr_ny_transposed( + dis_table + m * ksub, + x + m * dsub, + transposed_centroids.data() + m * ksub, + centroids_sq_lengths.data() + m * ksub, + dsub, + M * ksub, + ksub); + } } } @@ -460,7 +473,7 @@ #endif if (dsub < 16) { -#pragma omp parallel for +#pragma omp parallel for if (nx > 1) for (int64_t i = 0; i < nx; i++) { compute_distance_table(x + i * d, dis_tables + i * ksub * M); } @@ -494,7 +507,7 @@ #endif if (dsub < 16) { -#pragma omp parallel for +#pragma omp parallel for if (nx > 1) for (int64_t i = 0; i < nx; i++) { compute_inner_prod_table(x + i * d, dis_tables + i * ksub * M); } @@ -668,7 +681,7 @@ size_t k = res->k, nx = res->nh; size_t ksub = pq.ksub, M = pq.M; -#pragma omp parallel for +#pragma omp parallel for if (nx > 1) for (int64_t i = 0; i < nx; i++) { /* query preparation for asymmetric search: compute look-up tables */ const float* dis_table = dis_tables + i * ksub * M; diff -Nru faiss-1.7.3/faiss/impl/ProductQuantizer.h faiss-1.7.4/faiss/impl/ProductQuantizer.h --- faiss-1.7.3/faiss/impl/ProductQuantizer.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/ProductQuantizer.h 2023-04-19 13:18:30.000000000 +0000 @@ -23,8 +23,6 @@ /** Product Quantizer. 
Implemented only for METRIC_L2 */ struct ProductQuantizer : Quantizer { - using idx_t = Index::idx_t; - size_t M; ///< number of subquantizers size_t nbits; ///< number of bits per quantization index @@ -38,8 +36,8 @@ Train_default, Train_hot_start, ///< the centroids are already initialized Train_shared, ///< share dictionary accross PQ segments - Train_hypercube, ///< intialize centroids with nbits-D hypercube - Train_hypercube_pca, ///< intialize centroids with nbits-D hypercube + Train_hypercube, ///< initialize centroids with nbits-D hypercube + Train_hypercube_pca, ///< initialize centroids with nbits-D hypercube }; train_type_t train_type; diff -Nru faiss-1.7.3/faiss/impl/Quantizer.h faiss-1.7.4/faiss/impl/Quantizer.h --- faiss-1.7.3/faiss/impl/Quantizer.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/Quantizer.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,4 +1,9 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ #pragma once @@ -8,8 +13,6 @@ /** Product Quantizer. Implemented only for METRIC_L2 */ struct Quantizer { - using idx_t = Index::idx_t; - size_t d; ///< size of the input vectors size_t code_size; ///< bytes per indexed vector diff -Nru faiss-1.7.3/faiss/impl/ResidualQuantizer.cpp faiss-1.7.4/faiss/impl/ResidualQuantizer.cpp --- faiss-1.7.3/faiss/impl/ResidualQuantizer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/ResidualQuantizer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -23,6 +23,10 @@ #include #include +#include + +#include + extern "C" { // general matrix multiplication @@ -63,12 +67,7 @@ namespace faiss { -ResidualQuantizer::ResidualQuantizer() - : train_type(Train_progressive_dim), - niter_codebook_refine(5), - max_beam_size(5), - use_beam_LUT(0), - assign_index_factory(nullptr) { +ResidualQuantizer::ResidualQuantizer() { d = 0; M = 0; verbose = false; @@ -139,12 +138,11 @@ int32_t* new_codes, /// size (n, new_beam_size, m + 1) float* new_residuals, /// size (n, new_beam_size, d) float* new_distances, /// size (n, new_beam_size) - Index* assign_index) { + Index* assign_index, + ApproxTopK_mode_t approx_topk_mode) { // we have to fill in the whole output matrix FAISS_THROW_IF_NOT(new_beam_size <= beam_size * K); - using idx_t = Index::idx_t; - std::vector cent_distances; std::vector cent_ids; @@ -230,15 +228,36 @@ new_distances_i[i] = C::neutral(); } std::vector perm(new_beam_size, -1); - heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * K); + +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + HeapWithBuckets::bs_addn( \ + beam_size, \ + K, \ + cent_distances_i, \ + new_beam_size, \ + new_distances_i, \ + perm.data()); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: + heap_addn( + new_beam_size, + new_distances_i, + perm.data(), + cent_distances_i, + nullptr, + beam_size * K); + } heap_reorder(new_beam_size, new_distances_i, perm.data()); +#undef HANDLE_APPROX + for (int j = 0; j < new_beam_size; j++) { int js = perm[j] / K; int ls = perm[j] % K; @@ -364,7 +383,8 @@ new_codes.data() + i0 * new_beam_size * (m + 1), new_residuals.data() + i0 * new_beam_size * d, new_distances.data() + i0 * new_beam_size, - 
assign_index.get()); + assign_index.get(), + approx_topk_mode); } codes.swap(new_codes); residuals.swap(new_residuals); @@ -544,196 +564,398 @@ size_t mem; mem = beam_size * d * 2 * sizeof(float); // size for 2 beams at a time mem += beam_size * beam_size * - (sizeof(float) + - sizeof(Index::idx_t)); // size for 1 beam search result + (sizeof(float) + sizeof(idx_t)); // size for 1 beam search result return mem; } -void ResidualQuantizer::compute_codes_add_centroids( +// a namespace full of preallocated buffers +namespace { + +// Preallocated memory chunk for refine_beam_mp() call +struct RefineBeamMemoryPool { + std::vector new_codes; + std::vector new_residuals; + + std::vector residuals; + std::vector codes; + std::vector distances; +}; + +// Preallocated memory chunk for refine_beam_LUT_mp() call +struct RefineBeamLUTMemoryPool { + std::vector new_codes; + std::vector new_distances; + + std::vector codes; + std::vector distances; +}; + +// this is for use_beam_LUT == 0 in compute_codes_add_centroids_mp_lut0() call +struct ComputeCodesAddCentroidsLUT0MemoryPool { + std::vector codes; + std::vector norms; + std::vector distances; + std::vector residuals; + RefineBeamMemoryPool refine_beam_pool; +}; + +// this is for use_beam_LUT == 1 in compute_codes_add_centroids_mp_lut1() call +struct ComputeCodesAddCentroidsLUT1MemoryPool { + std::vector codes; + std::vector distances; + std::vector query_norms; + std::vector query_cp; + std::vector residuals; + RefineBeamLUTMemoryPool refine_beam_lut_pool; +}; + +} // namespace + +// forward declaration +void refine_beam_mp( + const ResidualQuantizer& rq, + size_t n, + size_t beam_size, + const float* x, + int out_beam_size, + int32_t* out_codes, + float* out_residuals, + float* out_distances, + RefineBeamMemoryPool& pool); + +// forward declaration +void refine_beam_LUT_mp( + const ResidualQuantizer& rq, + size_t n, + const float* query_norms, // size n + const float* query_cp, // + int out_beam_size, + int32_t* out_codes, + float* out_distances, + RefineBeamLUTMemoryPool& pool); + +// this is for use_beam_LUT == 0 +void compute_codes_add_centroids_mp_lut0( + const ResidualQuantizer& rq, const float* x, uint8_t* codes_out, size_t n, - const float* centroids) const { - FAISS_THROW_IF_NOT_MSG(is_trained, "RQ is not trained yet."); + const float* centroids, + ComputeCodesAddCentroidsLUT0MemoryPool& pool) { + pool.codes.resize(rq.max_beam_size * rq.M * n); + pool.distances.resize(rq.max_beam_size * n); - size_t mem = memory_per_point(); - if (n > 1 && mem * n > max_mem_distances) { - // then split queries to reduce temp memory - size_t bs = max_mem_distances / mem; - if (bs == 0) { - bs = 1; // otherwise we can't do much - } - for (size_t i0 = 0; i0 < n; i0 += bs) { - size_t i1 = std::min(n, i0 + bs); - const float* cent = nullptr; - if (centroids != nullptr) { - cent = centroids + i0 * d; - } - compute_codes_add_centroids( - x + i0 * d, codes_out + i0 * code_size, i1 - i0, cent); + pool.residuals.resize(rq.max_beam_size * n * rq.d); + + refine_beam_mp( + rq, + n, + 1, + x, + rq.max_beam_size, + pool.codes.data(), + pool.residuals.data(), + pool.distances.data(), + pool.refine_beam_pool); + + if (rq.search_type == ResidualQuantizer::ST_norm_float || + rq.search_type == ResidualQuantizer::ST_norm_qint8 || + rq.search_type == ResidualQuantizer::ST_norm_qint4) { + pool.norms.resize(n); + // recover the norms of reconstruction as + // || original_vector - residual ||^2 + for (size_t i = 0; i < n; i++) { + pool.norms[i] = fvec_L2sqr( + x + i * rq.d, + 
pool.residuals.data() + i * rq.max_beam_size * rq.d, + rq.d); } - return; } - std::vector codes(max_beam_size * M * n); - std::vector norms; - std::vector distances(max_beam_size * n); + // pack only the first code of the beam + // (hence the ld_codes=M * max_beam_size) + rq.pack_codes( + n, + pool.codes.data(), + codes_out, + rq.M * rq.max_beam_size, + (pool.norms.size() > 0) ? pool.norms.data() : nullptr, + centroids); +} - if (use_beam_LUT == 0) { - std::vector residuals(max_beam_size * n * d); +// use_beam_LUT == 1 +void compute_codes_add_centroids_mp_lut1( + const ResidualQuantizer& rq, + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids, + ComputeCodesAddCentroidsLUT1MemoryPool& pool) { + // + pool.codes.resize(rq.max_beam_size * rq.M * n); + pool.distances.resize(rq.max_beam_size * n); + + FAISS_THROW_IF_NOT_MSG( + rq.codebook_cross_products.size() == + rq.total_codebook_size * rq.total_codebook_size, + "call compute_codebook_tables first"); - refine_beam( - n, - 1, - x, - max_beam_size, - codes.data(), - residuals.data(), - distances.data()); - - if (search_type == ST_norm_float || search_type == ST_norm_qint8 || - search_type == ST_norm_qint4) { - norms.resize(n); - // recover the norms of reconstruction as - // || original_vector - residual ||^2 - for (size_t i = 0; i < n; i++) { - norms[i] = fvec_L2sqr( - x + i * d, residuals.data() + i * max_beam_size * d, d); - } - } - } else if (use_beam_LUT == 1) { - FAISS_THROW_IF_NOT_MSG( - codebook_cross_products.size() == - total_codebook_size * total_codebook_size, - "call compute_codebook_tables first"); - - std::vector query_norms(n); - fvec_norms_L2sqr(query_norms.data(), x, d, n); - - std::vector query_cp(n * total_codebook_size); - { - FINTEGER ti = total_codebook_size, di = d, ni = n; - float zero = 0, one = 1; - sgemm_("Transposed", - "Not transposed", - &ti, - &ni, - &di, - &one, - codebooks.data(), - &di, - x, - &di, - &zero, - query_cp.data(), - &ti); - } + pool.query_norms.resize(n); + fvec_norms_L2sqr(pool.query_norms.data(), x, rq.d, n); - refine_beam_LUT( - n, - query_norms.data(), - query_cp.data(), - max_beam_size, - codes.data(), - distances.data()); - } - // pack only the first code of the beam (hence the ld_codes=M * - // max_beam_size) - pack_codes( + pool.query_cp.resize(n * rq.total_codebook_size); + { + FINTEGER ti = rq.total_codebook_size, di = rq.d, ni = n; + float zero = 0, one = 1; + sgemm_("Transposed", + "Not transposed", + &ti, + &ni, + &di, + &one, + rq.codebooks.data(), + &di, + x, + &di, + &zero, + pool.query_cp.data(), + &ti); + } + + refine_beam_LUT_mp( + rq, + n, + pool.query_norms.data(), + pool.query_cp.data(), + rq.max_beam_size, + pool.codes.data(), + pool.distances.data(), + pool.refine_beam_lut_pool); + + // pack only the first code of the beam + // (hence the ld_codes=M * max_beam_size) + rq.pack_codes( n, - codes.data(), + pool.codes.data(), codes_out, - M * max_beam_size, - norms.size() > 0 ? 
norms.data() : nullptr, + rq.M * rq.max_beam_size, + nullptr, centroids); } -void ResidualQuantizer::refine_beam( +void ResidualQuantizer::compute_codes_add_centroids( + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids) const { + FAISS_THROW_IF_NOT_MSG(is_trained, "RQ is not trained yet."); + + // + size_t mem = memory_per_point(); + + size_t bs = max_mem_distances / mem; + if (bs == 0) { + bs = 1; // otherwise we can't do much + } + + // prepare memory pools + ComputeCodesAddCentroidsLUT0MemoryPool pool0; + ComputeCodesAddCentroidsLUT1MemoryPool pool1; + + for (size_t i0 = 0; i0 < n; i0 += bs) { + size_t i1 = std::min(n, i0 + bs); + const float* cent = nullptr; + if (centroids != nullptr) { + cent = centroids + i0 * d; + } + + // compute_codes_add_centroids( + // x + i0 * d, + // codes_out + i0 * code_size, + // i1 - i0, + // cent); + if (use_beam_LUT == 0) { + compute_codes_add_centroids_mp_lut0( + *this, + x + i0 * d, + codes_out + i0 * code_size, + i1 - i0, + cent, + pool0); + } else if (use_beam_LUT == 1) { + compute_codes_add_centroids_mp_lut1( + *this, + x + i0 * d, + codes_out + i0 * code_size, + i1 - i0, + cent, + pool1); + } + } +} + +void refine_beam_mp( + const ResidualQuantizer& rq, size_t n, size_t beam_size, const float* x, int out_beam_size, int32_t* out_codes, float* out_residuals, - float* out_distances) const { + float* out_distances, + RefineBeamMemoryPool& pool) { int cur_beam_size = beam_size; - std::vector residuals(x, x + n * d * beam_size); - std::vector codes; - std::vector distances; double t0 = getmillisecs(); + // find the max_beam_size + int max_beam_size = 0; + { + int tmp_beam_size = cur_beam_size; + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + int new_beam_size = std::min(tmp_beam_size * K, out_beam_size); + tmp_beam_size = new_beam_size; + + if (max_beam_size < new_beam_size) { + max_beam_size = new_beam_size; + } + } + } + + // preallocate buffers + pool.new_codes.resize(n * max_beam_size * (rq.M + 1)); + pool.new_residuals.resize(n * max_beam_size * rq.d); + + pool.codes.resize(n * max_beam_size * (rq.M + 1)); + pool.distances.resize(n * max_beam_size); + pool.residuals.resize(n * rq.d * max_beam_size); + + for (size_t i = 0; i < n * rq.d * beam_size; i++) { + pool.residuals[i] = x[i]; + } + + // set up pointers to buffers + int32_t* __restrict codes_ptr = pool.codes.data(); + float* __restrict residuals_ptr = pool.residuals.data(); + + int32_t* __restrict new_codes_ptr = pool.new_codes.data(); + float* __restrict new_residuals_ptr = pool.new_residuals.data(); + + // index std::unique_ptr assign_index; - if (assign_index_factory) { - assign_index.reset((*assign_index_factory)(d)); + if (rq.assign_index_factory) { + assign_index.reset((*rq.assign_index_factory)(rq.d)); } else { - assign_index.reset(new IndexFlatL2(d)); + assign_index.reset(new IndexFlatL2(rq.d)); } - for (int m = 0; m < M; m++) { - int K = 1 << nbits[m]; + // main loop + size_t codes_size = 0; + size_t distances_size = 0; + size_t residuals_size = 0; - const float* codebooks_m = - this->codebooks.data() + codebook_offsets[m] * d; + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; - int new_beam_size = std::min(cur_beam_size * K, out_beam_size); + const float* __restrict codebooks_m = + rq.codebooks.data() + rq.codebook_offsets[m] * rq.d; - std::vector new_codes(n * new_beam_size * (m + 1)); - std::vector new_residuals(n * new_beam_size * d); - distances.resize(n * new_beam_size); + const int new_beam_size = std::min(cur_beam_size * 
K, out_beam_size); + + codes_size = n * new_beam_size * (m + 1); + residuals_size = n * new_beam_size * rq.d; + distances_size = n * new_beam_size; beam_search_encode_step( - d, + rq.d, K, codebooks_m, n, cur_beam_size, - residuals.data(), + // residuals.data(), + residuals_ptr, m, - codes.data(), + // codes.data(), + codes_ptr, new_beam_size, - new_codes.data(), - new_residuals.data(), - distances.data(), - assign_index.get()); + // new_codes.data(), + new_codes_ptr, + // new_residuals.data(), + new_residuals_ptr, + pool.distances.data(), + assign_index.get(), + rq.approx_topk_mode); assign_index->reset(); - codes.swap(new_codes); - residuals.swap(new_residuals); + std::swap(codes_ptr, new_codes_ptr); + std::swap(residuals_ptr, new_residuals_ptr); cur_beam_size = new_beam_size; - if (verbose) { + if (rq.verbose) { float sum_distances = 0; - for (int j = 0; j < distances.size(); j++) { - sum_distances += distances[j]; + // for (int j = 0; j < distances.size(); j++) { + // sum_distances += distances[j]; + // } + for (int j = 0; j < distances_size; j++) { + sum_distances += pool.distances[j]; } + printf("[%.3f s] encode stage %d, %d bits, " "total error %g, beam_size %d\n", (getmillisecs() - t0) / 1000, m, - int(nbits[m]), + int(rq.nbits[m]), sum_distances, cur_beam_size); } } if (out_codes) { - memcpy(out_codes, codes.data(), codes.size() * sizeof(codes[0])); + // memcpy(out_codes, codes.data(), codes.size() * sizeof(codes[0])); + memcpy(out_codes, codes_ptr, codes_size * sizeof(*codes_ptr)); } if (out_residuals) { + // memcpy(out_residuals, + // residuals.data(), + // residuals.size() * sizeof(residuals[0])); memcpy(out_residuals, - residuals.data(), - residuals.size() * sizeof(residuals[0])); + residuals_ptr, + residuals_size * sizeof(*residuals_ptr)); } if (out_distances) { + // memcpy(out_distances, + // distances.data(), + // distances.size() * sizeof(distances[0])); memcpy(out_distances, - distances.data(), - distances.size() * sizeof(distances[0])); + pool.distances.data(), + distances_size * sizeof(pool.distances[0])); } } +void ResidualQuantizer::refine_beam( + size_t n, + size_t beam_size, + const float* x, + int out_beam_size, + int32_t* out_codes, + float* out_residuals, + float* out_distances) const { + RefineBeamMemoryPool pool; + refine_beam_mp( + *this, + n, + beam_size, + x, + out_beam_size, + out_codes, + out_residuals, + out_distances, + pool); +} + /******************************************************************* * Functions using the dot products between codebook entries *******************************************************************/ @@ -765,6 +987,186 @@ } } +namespace { + +template +void accum_and_store_tab( + const size_t m_offset, + const float* const __restrict codebook_cross_norms, + const uint64_t* const __restrict codebook_offsets, + const int32_t* const __restrict codes_i, + const size_t b, + const size_t ldc, + const size_t K, + float* const __restrict output) { + // load pointers into registers + const float* cbs[M]; + for (size_t ij = 0; ij < M; ij++) { + const size_t code = static_cast(codes_i[b * m_offset + ij]); + cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; + } + + // do accumulation in registers using SIMD. + // It is possible that compiler may be smart enough so that + // this manual SIMD unrolling might be unneeded. 
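// Editorial note, not part of the patch: the #if block that follows is a
// register-blocked version of the scalar "process leftovers" tail at the end
// of this function. For each output element kk it evaluates, in effect,
//
//     float acc = cbs[0][kk];
//     for (size_t ij = 1; ij < M; ij++) {
//         acc += cbs[ij][kk];
//     }
//
// i.e. it sums, element by element, the M rows of codebook_cross_norms
// selected by the codes already assigned to this beam entry; the SIMD path
// merely processes 8 * NK floats per iteration in simd8float32 registers
// instead of one float at a time.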
+#if defined(__AVX2__) || defined(__aarch64__) + const size_t K8 = (K / (8 * NK)) * (8 * NK); + + // process in chunks of size (8 * NK) floats + for (size_t kk = 0; kk < K8; kk += 8 * NK) { + simd8float32 regs[NK]; + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].loadu(cbs[0] + kk + ik * 8); + } + + for (size_t ij = 1; ij < M; ij++) { + for (size_t ik = 0; ik < NK; ik++) { + regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); + } + } + + // write the result + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].storeu(output + kk + ik * 8); + } + } +#else + const size_t K8 = 0; +#endif + + // process leftovers + for (size_t kk = K8; kk < K; kk++) { + float reg = cbs[0][kk]; + for (size_t ij = 1; ij < M; ij++) { + reg += cbs[ij][kk]; + } + output[b * K + kk] = reg; + } +} + +template +void accum_and_add_tab( + const size_t m_offset, + const float* const __restrict codebook_cross_norms, + const uint64_t* const __restrict codebook_offsets, + const int32_t* const __restrict codes_i, + const size_t b, + const size_t ldc, + const size_t K, + float* const __restrict output) { + // load pointers into registers + const float* cbs[M]; + for (size_t ij = 0; ij < M; ij++) { + const size_t code = static_cast(codes_i[b * m_offset + ij]); + cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; + } + + // do accumulation in registers using SIMD. + // It is possible that compiler may be smart enough so that + // this manual SIMD unrolling might be unneeded. +#if defined(__AVX2__) || defined(__aarch64__) + const size_t K8 = (K / (8 * NK)) * (8 * NK); + + // process in chunks of size (8 * NK) floats + for (size_t kk = 0; kk < K8; kk += 8 * NK) { + simd8float32 regs[NK]; + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].loadu(cbs[0] + kk + ik * 8); + } + + for (size_t ij = 1; ij < M; ij++) { + for (size_t ik = 0; ik < NK; ik++) { + regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); + } + } + + // write the result + for (size_t ik = 0; ik < NK; ik++) { + simd8float32 existing(output + kk + ik * 8); + existing += regs[ik]; + existing.storeu(output + kk + ik * 8); + } + } +#else + const size_t K8 = 0; +#endif + + // process leftovers + for (size_t kk = K8; kk < K; kk++) { + float reg = cbs[0][kk]; + for (size_t ij = 1; ij < M; ij++) { + reg += cbs[ij][kk]; + } + output[b * K + kk] += reg; + } +} + +template +void accum_and_finalize_tab( + const float* const __restrict codebook_cross_norms, + const uint64_t* const __restrict codebook_offsets, + const int32_t* const __restrict codes_i, + const size_t b, + const size_t ldc, + const size_t K, + const float* const __restrict distances_i, + const float* const __restrict cd_common, + float* const __restrict output) { + // load pointers into registers + const float* cbs[M]; + for (size_t ij = 0; ij < M; ij++) { + const size_t code = static_cast(codes_i[b * M + ij]); + cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; + } + + // do accumulation in registers using SIMD. + // It is possible that compiler may be smart enough so that + // this manual SIMD unrolling might be unneeded. 
+#if defined(__AVX2__) || defined(__aarch64__) + const size_t K8 = (K / (8 * NK)) * (8 * NK); + + // process in chunks of size (8 * NK) floats + for (size_t kk = 0; kk < K8; kk += 8 * NK) { + simd8float32 regs[NK]; + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].loadu(cbs[0] + kk + ik * 8); + } + + for (size_t ij = 1; ij < M; ij++) { + for (size_t ik = 0; ik < NK; ik++) { + regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); + } + } + + simd8float32 two(2.0f); + for (size_t ik = 0; ik < NK; ik++) { + // cent_distances[b * K + k] = distances_i[b] + cd_common[k] + // + 2 * dp[k]; + + simd8float32 common_v(cd_common + kk + ik * 8); + common_v = fmadd(two, regs[ik], common_v); + + common_v += simd8float32(distances_i[b]); + common_v.storeu(output + b * K + kk + ik * 8); + } + } +#else + const size_t K8 = 0; +#endif + + // process leftovers + for (size_t kk = K8; kk < K; kk++) { + float reg = cbs[0][kk]; + for (size_t ij = 1; ij < M; ij++) { + reg += cbs[ij][kk]; + } + + output[b * K + kk] = distances_i[b] + cd_common[kk] + 2 * reg; + } +} + +} // namespace + void beam_search_encode_step_tab( size_t K, size_t n, @@ -779,12 +1181,13 @@ const int32_t* codes, // n * beam_size * m const float* distances, // n * beam_size size_t new_beam_size, - int32_t* new_codes, // n * new_beam_size * (m + 1) - float* new_distances) // n * new_beam_size + int32_t* new_codes, // n * new_beam_size * (m + 1) + float* new_distances, // n * new_beam_size + ApproxTopK_mode_t approx_topk_mode) // { FAISS_THROW_IF_NOT(ldc >= K); -#pragma omp parallel for if (n > 100) +#pragma omp parallel for if (n > 100) schedule(dynamic) for (int64_t i = 0; i < n; i++) { std::vector cent_distances(beam_size * K); std::vector cd_common(K); @@ -797,6 +1200,14 @@ cd_common[k] = cent_norms_i[k] - 2 * query_cp_i[k]; } + /* + // This is the baseline implementation. Its primary flaw + // that it writes way too many info to the temporary buffer + // called dp. + // + // This baseline code is kept intentionally because it is easy to + // understand what an optimized version optimizes exactly. + // for (size_t b = 0; b < beam_size; b++) { std::vector dp(K); @@ -812,6 +1223,117 @@ distances_i[b] + cd_common[k] + 2 * dp[k]; } } + */ + + // An optimized implementation that avoids using a temporary buffer + // and does the accumulation in registers. + + // Compute a sum of NK AQ codes. +#define ACCUM_AND_FINALIZE_TAB(NK) \ + case NK: \ + for (size_t b = 0; b < beam_size; b++) { \ + accum_and_finalize_tab( \ + codebook_cross_norms, \ + codebook_offsets, \ + codes_i, \ + b, \ + ldc, \ + K, \ + distances_i, \ + cd_common.data(), \ + cent_distances.data()); \ + } \ + break; + + // this version contains many switch-case scenarios, but + // they won't affect branch predictor. + switch (m) { + case 0: + // trivial case + for (size_t b = 0; b < beam_size; b++) { + for (size_t k = 0; k < K; k++) { + cent_distances[b * K + k] = + distances_i[b] + cd_common[k]; + } + } + break; + + ACCUM_AND_FINALIZE_TAB(1) + ACCUM_AND_FINALIZE_TAB(2) + ACCUM_AND_FINALIZE_TAB(3) + ACCUM_AND_FINALIZE_TAB(4) + ACCUM_AND_FINALIZE_TAB(5) + ACCUM_AND_FINALIZE_TAB(6) + ACCUM_AND_FINALIZE_TAB(7) + + default: { + // m >= 8 case. + + // A temporary buffer has to be used due to the lack of + // registers. But we'll try to accumulate up to 8 AQ codes in + // registers and issue a single write operation to the buffer, + // while the baseline does no accumulation. So, the number of + // write operations to the temporary buffer is reduced 8x. 
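Concretely, for one beam entry b this grouped accumulation produces the same quantity as the baseline kept in comments above: dp[k] is the sum of the cross-norm rows selected by the entry's codes, finalized as distances_i[b] + cd_common[k] + 2 * dp[k]. A standalone scalar sketch of that reference computation (illustrative parameter names, no SIMD, no grouping by 8):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Scalar reference for one beam entry: accumulate the cross-norm rows
    // selected by the entry's m codes, then finalize the centroid distances.
    void reference_accum_one_entry(
            size_t m,                        // codebooks already encoded
            size_t K,                        // entries of the current codebook
            size_t ldc,                      // row stride of the cross-norm table
            const float* codebook_cross_norms,
            const uint64_t* codebook_offsets,
            const int32_t* codes_b,          // the m codes of this beam entry
            float distance_b,                // distances_i[b]
            const float* cd_common,          // size K
            float* cent_distances_b) {       // size K, output row for this entry
        std::vector<float> dp(K, 0.0f);
        for (size_t j = 0; j < m; j++) {
            const float* row = codebook_cross_norms +
                    (codebook_offsets[j] + size_t(codes_b[j])) * ldc;
            for (size_t k = 0; k < K; k++) {
                dp[k] += row[k];
            }
        }
        for (size_t k = 0; k < K; k++) {
            cent_distances_b[k] = distance_b + cd_common[k] + 2 * dp[k];
        }
    }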
+ + // allocate a temporary buffer + std::vector dp(K); + + for (size_t b = 0; b < beam_size; b++) { + // Initialize it. Compute a sum of first 8 AQ codes + // because m >= 8 . + accum_and_store_tab<8, 4>( + m, + codebook_cross_norms, + codebook_offsets, + codes_i, + b, + ldc, + K, + dp.data()); + +#define ACCUM_AND_ADD_TAB(NK) \ + case NK: \ + accum_and_add_tab( \ + m, \ + codebook_cross_norms, \ + codebook_offsets + im, \ + codes_i + im, \ + b, \ + ldc, \ + K, \ + dp.data()); \ + break; + + // accumulate up to 8 additional AQ codes into + // a temporary buffer + for (size_t im = 8; im < ((m + 7) / 8) * 8; im += 8) { + size_t m_left = m - im; + if (m_left > 8) { + m_left = 8; + } + + switch (m_left) { + ACCUM_AND_ADD_TAB(1) + ACCUM_AND_ADD_TAB(2) + ACCUM_AND_ADD_TAB(3) + ACCUM_AND_ADD_TAB(4) + ACCUM_AND_ADD_TAB(5) + ACCUM_AND_ADD_TAB(6) + ACCUM_AND_ADD_TAB(7) + ACCUM_AND_ADD_TAB(8) + } + } + + // done. finalize the result + for (size_t k = 0; k < K; k++) { + cent_distances[b * K + k] = + distances_i[b] + cd_common[k] + 2 * dp[k]; + } + } + } + } + + // the optimized implementation ends here using C = CMax; int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; @@ -824,15 +1346,38 @@ new_distances_i[i] = C::neutral(); } std::vector perm(new_beam_size, -1); - heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * K); + +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + HeapWithBuckets::bs_addn( \ + beam_size, \ + K, \ + cent_distances_i, \ + new_beam_size, \ + new_distances_i, \ + perm.data()); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: + heap_addn( + new_beam_size, + new_distances_i, + perm.data(), + cent_distances_i, + nullptr, + beam_size * K); + break; + } + heap_reorder(new_beam_size, new_distances_i, perm.data()); +#undef HANDLE_APPROX + for (int j = 0; j < new_beam_size; j++) { int js = perm[j] / K; int ls = perm[j] % K; @@ -845,70 +1390,147 @@ } } -void ResidualQuantizer::refine_beam_LUT( +// +void refine_beam_LUT_mp( + const ResidualQuantizer& rq, size_t n, const float* query_norms, // size n const float* query_cp, // int out_beam_size, int32_t* out_codes, - float* out_distances) const { + float* out_distances, + RefineBeamLUTMemoryPool& pool) { int beam_size = 1; - std::vector codes; - std::vector distances(query_norms, query_norms + n); double t0 = getmillisecs(); - for (int m = 0; m < M; m++) { - int K = 1 << nbits[m]; + // find the max_beam_size + int max_beam_size = 0; + { + int tmp_beam_size = beam_size; + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + int new_beam_size = std::min(tmp_beam_size * K, out_beam_size); + tmp_beam_size = new_beam_size; + + if (max_beam_size < new_beam_size) { + max_beam_size = new_beam_size; + } + } + } + + // preallocate buffers + pool.new_codes.resize(n * max_beam_size * (rq.M + 1)); + pool.new_distances.resize(n * max_beam_size); + pool.codes.resize(n * max_beam_size * (rq.M + 1)); + pool.distances.resize(n * max_beam_size); + + for (size_t i = 0; i < n; i++) { + pool.distances[i] = query_norms[i]; + } + + // set up pointers to buffers + int32_t* __restrict new_codes_ptr = pool.new_codes.data(); + float* __restrict new_distances_ptr = pool.new_distances.data(); + + int32_t* __restrict codes_ptr = pool.codes.data(); + float* __restrict distances_ptr = pool.distances.data(); + + // main loop + size_t codes_size = 0; + size_t 
distances_size = 0; + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + + // it is guaranteed that (new_beam_size <= than max_beam_size) == + // true int new_beam_size = std::min(beam_size * K, out_beam_size); - std::vector new_codes(n * new_beam_size * (m + 1)); - std::vector new_distances(n * new_beam_size); + + // std::vector new_codes(n * new_beam_size * (m + 1)); + // std::vector new_distances(n * new_beam_size); + + codes_size = n * new_beam_size * (m + 1); + distances_size = n * new_beam_size; beam_search_encode_step_tab( K, n, beam_size, - codebook_cross_products.data() + codebook_offsets[m], - total_codebook_size, - codebook_offsets.data(), - query_cp + codebook_offsets[m], - total_codebook_size, - cent_norms.data() + codebook_offsets[m], + rq.codebook_cross_products.data() + rq.codebook_offsets[m], + rq.total_codebook_size, + rq.codebook_offsets.data(), + query_cp + rq.codebook_offsets[m], + rq.total_codebook_size, + rq.cent_norms.data() + rq.codebook_offsets[m], m, - codes.data(), - distances.data(), + // codes.data(), + codes_ptr, + // distances.data(), + distances_ptr, new_beam_size, - new_codes.data(), - new_distances.data()); + // new_codes.data(), + new_codes_ptr, + // new_distances.data() + new_distances_ptr, + rq.approx_topk_mode); + + // codes.swap(new_codes); + std::swap(codes_ptr, new_codes_ptr); + // distances.swap(new_distances); + std::swap(distances_ptr, new_distances_ptr); - codes.swap(new_codes); - distances.swap(new_distances); beam_size = new_beam_size; - if (verbose) { + if (rq.verbose) { float sum_distances = 0; - for (int j = 0; j < distances.size(); j++) { - sum_distances += distances[j]; + // for (int j = 0; j < distances.size(); j++) { + // sum_distances += distances[j]; + // } + for (int j = 0; j < distances_size; j++) { + sum_distances += distances_ptr[j]; } printf("[%.3f s] encode stage %d, %d bits, " "total error %g, beam_size %d\n", (getmillisecs() - t0) / 1000, m, - int(nbits[m]), + int(rq.nbits[m]), sum_distances, beam_size); } } if (out_codes) { - memcpy(out_codes, codes.data(), codes.size() * sizeof(codes[0])); + // memcpy(out_codes, codes.data(), codes.size() * sizeof(codes[0])); + memcpy(out_codes, codes_ptr, codes_size * sizeof(*codes_ptr)); } if (out_distances) { + // memcpy(out_distances, + // distances.data(), + // distances.size() * sizeof(distances[0])); memcpy(out_distances, - distances.data(), - distances.size() * sizeof(distances[0])); + distances_ptr, + distances_size * sizeof(*distances_ptr)); } } +void ResidualQuantizer::refine_beam_LUT( + size_t n, + const float* query_norms, // size n + const float* query_cp, // + int out_beam_size, + int32_t* out_codes, + float* out_distances) const { + RefineBeamLUTMemoryPool pool; + refine_beam_LUT_mp( + *this, + n, + query_norms, + query_cp, + out_beam_size, + out_codes, + out_distances, + pool); +} + } // namespace faiss diff -Nru faiss-1.7.3/faiss/impl/ResidualQuantizer.h faiss-1.7.4/faiss/impl/ResidualQuantizer.h --- faiss-1.7.3/faiss/impl/ResidualQuantizer.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/ResidualQuantizer.h 2023-04-19 13:18:30.000000000 +0000 @@ -13,6 +13,8 @@ #include #include +#include + namespace faiss { /** Residual quantizer with variable number of bits per sub-quantizer @@ -29,7 +31,7 @@ using train_type_t = int; /// Binary or of the Train_* flags below - train_type_t train_type; + train_type_t train_type = Train_progressive_dim; /// regular k-means (minimal amount of computation) static const int Train_default = 0; @@ -41,7 +43,7 @@ 
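The defaulted fields introduced in this header (train_type, niter_codebook_refine, and further below max_beam_size, use_beam_LUT and approx_topk_mode) are the encoder's main tuning knobs. A minimal usage sketch, assuming the (d, M, nbits) constructor and arbitrary sizes; only the field names and the APPROX_TOPK_BUCKETS_B16_D2 enumerator are taken from this header, and setting use_beam_LUT = 1 assumes train() fills the codebook cross-product tables.

    #include <cstddef>
    #include <vector>
    #include <faiss/impl/ResidualQuantizer.h>

    using namespace faiss;

    // Encode vectors with the bucketed approximate top-k enabled during
    // beam search. d, M, nbits and the data sizes are illustrative.
    void encode_with_approx_topk(
            const float* xt, size_t nt, const float* x, size_t n) {
        ResidualQuantizer rq(/*d=*/64, /*M=*/8, /*nbits=*/8);
        rq.max_beam_size = 16;
        rq.use_beam_LUT = 1;  // use the cross-product LUT encoding path
        rq.approx_topk_mode = ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B16_D2;
        rq.train(nt, xt);
        std::vector<uint8_t> codes(n * rq.code_size);
        rq.compute_codes(x, codes.data(), n);
    }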
static const int Train_refine_codebook = 2; /// number of iterations for codebook refinement. - int niter_codebook_refine; + int niter_codebook_refine = 5; /** set this bit on train_type if beam is to be trained only on the * first element of the beam (faster but less accurate) */ @@ -52,16 +54,20 @@ static const int Skip_codebook_tables = 2048; /// beam size used for training and for encoding - int max_beam_size; + int max_beam_size = 5; /// use LUT for beam search - int use_beam_LUT; + int use_beam_LUT = 0; + + /// Currently used mode of approximate min-k computations. + /// Default value is EXACT_TOPK. + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK; /// clustering parameters ProgressiveDimClusteringParameters cp; /// if non-NULL, use this index for assignment - ProgressiveDimIndexFactory* assign_index_factory; + ProgressiveDimIndexFactory* assign_index_factory = nullptr; ResidualQuantizer( size_t d, @@ -183,7 +189,8 @@ int32_t* new_codes, float* new_residuals, float* new_distances, - Index* assign_index = nullptr); + Index* assign_index = nullptr, + ApproxTopK_mode_t approx_topk = ApproxTopK_mode_t::EXACT_TOPK); /** Encode a set of vectors using their dot products with the codebooks * @@ -202,7 +209,8 @@ const int32_t* codes, // n * beam_size * m const float* distances, // n * beam_size size_t new_beam_size, - int32_t* new_codes, // n * new_beam_size * (m + 1) - float* new_distances); // n * new_beam_size + int32_t* new_codes, // n * new_beam_size * (m + 1) + float* new_distances, // n * new_beam_size + ApproxTopK_mode_t approx_topk = ApproxTopK_mode_t::EXACT_TOPK); }; // namespace faiss diff -Nru faiss-1.7.3/faiss/impl/ScalarQuantizer.cpp faiss-1.7.4/faiss/impl/ScalarQuantizer.cpp --- faiss-1.7.3/faiss/impl/ScalarQuantizer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/ScalarQuantizer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -54,7 +54,6 @@ namespace { -typedef Index::idx_t idx_t; typedef ScalarQuantizer::QuantizerType QuantizerType; typedef ScalarQuantizer::RangeStat RangeStat; using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; @@ -1048,12 +1047,11 @@ ********************************************************************/ ScalarQuantizer::ScalarQuantizer(size_t d, QuantizerType qtype) - : Quantizer(d), qtype(qtype), rangestat(RS_minmax), rangestat_arg(0) { + : Quantizer(d), qtype(qtype) { set_derived_sizes(); } -ScalarQuantizer::ScalarQuantizer() - : qtype(QT_8bit), rangestat(RS_minmax), rangestat_arg(0), bits(0) {} +ScalarQuantizer::ScalarQuantizer() {} void ScalarQuantizer::set_derived_sizes() { switch (qtype) { @@ -1131,7 +1129,7 @@ ScopeDeleter del_x(x_in == x ? nullptr : x); if (by_residual) { - std::vector idx(n); + std::vector idx(n); quantizer->assign(n, x, idx.data()); std::vector residuals(n * d); diff -Nru faiss-1.7.3/faiss/impl/ScalarQuantizer.h faiss-1.7.4/faiss/impl/ScalarQuantizer.h --- faiss-1.7.3/faiss/impl/ScalarQuantizer.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/ScalarQuantizer.h 2023-04-19 13:18:30.000000000 +0000 @@ -34,7 +34,7 @@ QT_6bit, ///< 6 bits per component }; - QuantizerType qtype; + QuantizerType qtype = QT_8bit; /** The uniform encoder can estimate the range of representable * values of the unform encoder using different statistics. 
Here @@ -48,11 +48,11 @@ RS_optim, ///< alternate optimization of reconstruction error }; - RangeStat rangestat; - float rangestat_arg; + RangeStat rangestat = RS_minmax; + float rangestat_arg = 0; /// bits per scalar code - size_t bits; + size_t bits = 0; /// trained values (including the range) std::vector trained; diff -Nru faiss-1.7.3/faiss/impl/ThreadedIndex.h faiss-1.7.4/faiss/impl/ThreadedIndex.h --- faiss-1.7.3/faiss/impl/ThreadedIndex.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/ThreadedIndex.h 2023-04-19 13:18:30.000000000 +0000 @@ -29,7 +29,7 @@ /// WARNING: once an index is added, it becomes unsafe to touch it from any /// other thread than that on which is managing it, until we are shut /// down. Use runOnIndex to perform work on it instead. - void addIndex(IndexT* index); + virtual void addIndex(IndexT* index); /// Remove an index that is managed by ourselves. /// This will flush all pending work on that index, and then shut @@ -52,17 +52,17 @@ } /// Returns the i-th sub-index - IndexT* at(int i) { + IndexT* at(size_t i) { return indices_[i].first; } /// Returns the i-th sub-index (const version) - const IndexT* at(int i) const { + const IndexT* at(size_t i) const { return indices_[i].first; } /// Whether or not we are responsible for deleting our contained indices - bool own_fields; + bool own_indices = false; protected: /// Called just after an index is added diff -Nru faiss-1.7.3/faiss/impl/ThreadedIndex-inl.h faiss-1.7.4/faiss/impl/ThreadedIndex-inl.h --- faiss-1.7.3/faiss/impl/ThreadedIndex-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/impl/ThreadedIndex-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -18,7 +18,7 @@ template ThreadedIndex::ThreadedIndex(int d, bool threaded) - : IndexT(d), own_fields(false), isThreaded_(threaded) {} + : IndexT(d), isThreaded_(threaded) {} template ThreadedIndex::~ThreadedIndex() { @@ -35,7 +35,7 @@ FAISS_ASSERT(!(bool)p.second); } - if (own_fields) { + if (own_indices) { delete p.first; } } @@ -102,7 +102,7 @@ indices_.erase(it); onAfterRemoveIndex(index); - if (own_fields) { + if (own_indices) { delete index; } diff -Nru faiss-1.7.3/faiss/IndexAdditiveQuantizer.cpp faiss-1.7.4/faiss/IndexAdditiveQuantizer.cpp --- faiss-1.7.3/faiss/IndexAdditiveQuantizer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexAdditiveQuantizer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -484,7 +484,7 @@ int d, ///< dimensionality of the input vectors const std::vector& nbits, MetricType metric) - : AdditiveCoarseQuantizer(d, &rq, metric), rq(d, nbits), beam_factor(4.0) { + : AdditiveCoarseQuantizer(d, &rq, metric), rq(d, nbits) { FAISS_THROW_IF_NOT(rq.tot_bits <= 63); is_trained = false; } @@ -520,10 +520,15 @@ idx_t k, float* distances, idx_t* labels, - const SearchParameters * params + const SearchParameters * params_in ) const { - FAISS_THROW_IF_NOT_MSG(!params, "search params not supported for this index"); + float beam_factor = this->beam_factor; + if (params_in) { + auto params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "need SearchParametersResidualCoarseQuantizer parameters"); + beam_factor = params->beam_factor; + } if (beam_factor < 0) { AdditiveCoarseQuantizer::search(n, x, k, distances, labels); @@ -585,6 +590,15 @@ } } +void ResidualCoarseQuantizer::initialize_from(const ResidualCoarseQuantizer &other) { + FAISS_THROW_IF_NOT(rq.M <= other.rq.M); + rq.initialize_from(other.rq); + set_beam_factor(other.beam_factor); + is_trained = other.is_trained; + ntotal = (idx_t)1 << 
aq->tot_bits; +} + + /************************************************************************************** * LocalSearchCoarseQuantizer **************************************************************************************/ diff -Nru faiss-1.7.3/faiss/IndexAdditiveQuantizer.h faiss-1.7.4/faiss/IndexAdditiveQuantizer.h --- faiss-1.7.3/faiss/IndexAdditiveQuantizer.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexAdditiveQuantizer.h 2023-04-19 13:18:30.000000000 +0000 @@ -187,6 +187,11 @@ void reset() override; }; +struct SearchParametersResidualCoarseQuantizer : SearchParameters { + float beam_factor = 4.0f; + ~SearchParametersResidualCoarseQuantizer() {} +}; + /** The ResidualCoarseQuantizer is a bit specialized compared to the * default AdditiveCoarseQuantizer because it can use a beam search * at search time (slow but may be useful for very large vocabularies) */ @@ -196,7 +201,7 @@ /// factor between the beam size and the search k /// if negative, use exact search-to-centroid - float beam_factor; + float beam_factor = 4.0f; /// computes centroid norms if required void set_beam_factor(float new_beam_factor); @@ -226,6 +231,10 @@ idx_t* labels, const SearchParameters* params = nullptr) const override; + /** Copy the M first codebook levels from other. Useful to crop a + * ResidualQuantizer to its first M quantizers. */ + void initialize_from(const ResidualCoarseQuantizer& other); + ResidualCoarseQuantizer(); }; diff -Nru faiss-1.7.3/faiss/IndexBinaryFlat.cpp faiss-1.7.4/faiss/IndexBinaryFlat.cpp --- faiss-1.7.3/faiss/IndexBinaryFlat.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexBinaryFlat.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -60,7 +60,8 @@ xb.data(), ntotal, code_size, - /* ordered = */ true); + /* ordered = */ true, + approx_topk_mode); } else { hammings_knn_mc( x + s * code_size, diff -Nru faiss-1.7.3/faiss/IndexBinaryFlat.h faiss-1.7.4/faiss/IndexBinaryFlat.h --- faiss-1.7.3/faiss/IndexBinaryFlat.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexBinaryFlat.h 2023-04-19 13:18:30.000000000 +0000 @@ -14,6 +14,8 @@ #include +#include + namespace faiss { /** Index that stores the full vectors and performs exhaustive search. */ @@ -28,6 +30,8 @@ size_t query_batch_size = 32; + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK; + explicit IndexBinaryFlat(idx_t d); void add(idx_t n, const uint8_t* x) override; diff -Nru faiss-1.7.3/faiss/IndexBinary.h faiss-1.7.4/faiss/IndexBinary.h --- faiss-1.7.3/faiss/IndexBinary.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexBinary.h 2023-04-19 13:18:30.000000000 +0000 @@ -32,7 +32,6 @@ * vectors. 
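The SearchParametersResidualCoarseQuantizer struct added in IndexAdditiveQuantizer.h above lets a caller override beam_factor per search call instead of mutating the index. A minimal sketch, assuming an already trained ResidualCoarseQuantizer and placeholder query buffers and sizes:

    #include <vector>
    #include <faiss/IndexAdditiveQuantizer.h>

    // Override beam_factor for one search call only.
    // rcq is assumed trained; nq, k and the buffers are placeholders.
    void search_with_beam_factor(
            const faiss::ResidualCoarseQuantizer& rcq,
            const float* xq,
            faiss::idx_t nq,
            faiss::idx_t k) {
        faiss::SearchParametersResidualCoarseQuantizer params;
        params.beam_factor = 8.0f;  // wider beam than the index default of 4.0

        std::vector<float> distances(nq * k);
        std::vector<faiss::idx_t> labels(nq * k);
        rcq.search(nq, xq, k, distances.data(), labels.data(), &params);
    }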
*/ struct IndexBinary { - using idx_t = Index::idx_t; ///< all indices are this type using component_t = uint8_t; using distance_t = int32_t; diff -Nru faiss-1.7.3/faiss/IndexBinaryHash.cpp faiss-1.7.4/faiss/IndexBinaryHash.cpp --- faiss-1.7.3/faiss/IndexBinaryHash.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexBinaryHash.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -108,8 +108,6 @@ } }; -using idx_t = Index::idx_t; - struct RangeSearchResults { int radius; RangeQueryResult& qres; @@ -353,7 +351,7 @@ static void verify_shortlist( const IndexBinaryFlat& index, const uint8_t* q, - const std::unordered_set& shortlist, + const std::unordered_set& shortlist, SearchResults& res) { size_t code_size = index.code_size; size_t nlist = 0, ndis = 0, n0 = 0; diff -Nru faiss-1.7.3/faiss/IndexBinaryIVF.cpp faiss-1.7.4/faiss/IndexBinaryIVF.cpp --- faiss-1.7.3/faiss/IndexBinaryIVF.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexBinaryIVF.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -21,6 +21,7 @@ #include #include #include +#include #include namespace faiss { @@ -28,28 +29,14 @@ IndexBinaryIVF::IndexBinaryIVF(IndexBinary* quantizer, size_t d, size_t nlist) : IndexBinary(d), invlists(new ArrayInvertedLists(nlist, code_size)), - own_invlists(true), - nprobe(1), - max_codes(0), quantizer(quantizer), - nlist(nlist), - own_fields(false), - clustering_index(nullptr) { + nlist(nlist) { FAISS_THROW_IF_NOT(d == quantizer->d); is_trained = quantizer->is_trained && (quantizer->ntotal == nlist); - cp.niter = 10; } -IndexBinaryIVF::IndexBinaryIVF() - : invlists(nullptr), - own_invlists(false), - nprobe(1), - max_codes(0), - quantizer(nullptr), - nlist(0), - own_fields(false), - clustering_index(nullptr) {} +IndexBinaryIVF::IndexBinaryIVF() {} void IndexBinaryIVF::add(idx_t n, const uint8_t* x) { add_with_ids(n, x, nullptr); @@ -158,7 +145,7 @@ for (idx_t list_no = 0; list_no < nlist; list_no++) { size_t list_size = invlists->list_size(list_no); - const Index::idx_t* idlist = invlists->get_ids(list_no); + const idx_t* idlist = invlists->get_ids(list_no); for (idx_t offset = 0; offset < list_size; offset++) { idx_t id = idlist[offset]; @@ -174,11 +161,11 @@ void IndexBinaryIVF::search_and_reconstruct( idx_t n, - const uint8_t* x, + const uint8_t* __restrict x, idx_t k, - int32_t* distances, - idx_t* labels, - uint8_t* recons, + int32_t* __restrict distances, + idx_t* __restrict labels, + uint8_t* __restrict recons, const SearchParameters* params) const { FAISS_THROW_IF_NOT_MSG( !params, "search params not supported for this index"); @@ -320,8 +307,6 @@ namespace { -using idx_t = Index::idx_t; - template struct IVFBinaryScannerL2 : BinaryInvertedListScanner { HammingComputer hc; @@ -346,10 +331,10 @@ size_t scan_codes( size_t n, - const uint8_t* codes, - const idx_t* ids, - int32_t* simi, - idx_t* idxi, + const uint8_t* __restrict codes, + const idx_t* __restrict ids, + int32_t* __restrict simi, + idx_t* __restrict idxi, size_t k) const override { using C = CMax; @@ -368,8 +353,8 @@ void scan_codes_range( size_t n, - const uint8_t* codes, - const idx_t* ids, + const uint8_t* __restrict codes, + const idx_t* __restrict ids, int radius, RangeQueryResult& result) const override { size_t nup = 0; @@ -387,12 +372,12 @@ void search_knn_hamming_heap( const IndexBinaryIVF& ivf, size_t n, - const uint8_t* x, + const uint8_t* __restrict x, idx_t k, - const idx_t* keys, - const int32_t* coarse_dis, - int32_t* distances, - idx_t* labels, + const idx_t* __restrict keys, + const int32_t* __restrict 
coarse_dis, + int32_t* __restrict distances, + idx_t* __restrict labels, bool store_pairs, const IVFSearchParameters* params) { idx_t nprobe = params ? params->nprobe : ivf.nprobe; @@ -448,7 +433,7 @@ size_t list_size = ivf.invlists->list_size(key); InvertedLists::ScopedCodes scodes(ivf.invlists, key); std::unique_ptr sids; - const Index::idx_t* ids = nullptr; + const idx_t* ids = nullptr; if (!store_pairs) { sids.reset(new InvertedLists::ScopedIds(ivf.invlists, key)); @@ -483,11 +468,11 @@ void search_knn_hamming_count( const IndexBinaryIVF& ivf, size_t nx, - const uint8_t* x, - const idx_t* keys, + const uint8_t* __restrict x, + const idx_t* __restrict keys, int k, - int32_t* distances, - idx_t* labels, + int32_t* __restrict distances, + idx_t* __restrict labels, const IVFSearchParameters* params) { const int nBuckets = ivf.d + 1; std::vector all_counters(nx * nBuckets, 0); @@ -533,7 +518,7 @@ size_t list_size = ivf.invlists->list_size(key); InvertedLists::ScopedCodes scodes(ivf.invlists, key); const uint8_t* list_vecs = scodes.get(); - const Index::idx_t* ids = + const idx_t* ids = store_pairs ? nullptr : ivf.invlists->get_ids(key); for (size_t j = 0; j < list_size; j++) { @@ -571,6 +556,185 @@ indexIVF_stats.ndis += ndis; } +/* Manages NQ queries at a time, stores results */ +template +struct BlockSearch { + HammingComputer hcs[NQ]; + // heaps to update for each query + int32_t* distances[NQ]; + idx_t* labels[NQ]; + // curent top of heap + int32_t heap_tops[NQ]; + + BlockSearch( + size_t code_size, + const uint8_t* __restrict x, + const int32_t* __restrict keys, + int32_t* __restrict all_distances, + idx_t* __restrict all_labels) { + for (idx_t q = 0; q < NQ; q++) { + idx_t qno = keys[q]; + hcs[q] = HammingComputer(x + qno * code_size, code_size); + distances[q] = all_distances + qno * K; + labels[q] = all_labels + qno * K; + heap_tops[q] = distances[q][0]; + } + } + + void add_bcode(const uint8_t* bcode, idx_t id) { + using C = CMax; + for (int q = 0; q < NQ; q++) { + int dis = hcs[q].hamming(bcode); + if (dis < heap_tops[q]) { + heap_replace_top(K, distances[q], labels[q], dis, id); + heap_tops[q] = distances[q][0]; + } + } + } +}; + +template +struct BlockSearchVariableK { + int k; + HammingComputer hcs[NQ]; + // heaps to update for each query + int32_t* distances[NQ]; + idx_t* labels[NQ]; + // curent top of heap + int32_t heap_tops[NQ]; + + BlockSearchVariableK( + size_t code_size, + int k, + const uint8_t* __restrict x, + const int32_t* __restrict keys, + int32_t* __restrict all_distances, + idx_t* __restrict all_labels) + : k(k) { + for (idx_t q = 0; q < NQ; q++) { + idx_t qno = keys[q]; + hcs[q] = HammingComputer(x + qno * code_size, code_size); + distances[q] = all_distances + qno * k; + labels[q] = all_labels + qno * k; + heap_tops[q] = distances[q][0]; + } + } + + void add_bcode(const uint8_t* bcode, idx_t id) { + using C = CMax; + for (int q = 0; q < NQ; q++) { + int dis = hcs[q].hamming(bcode); + if (dis < heap_tops[q]) { + heap_replace_top(k, distances[q], labels[q], dis, id); + heap_tops[q] = distances[q][0]; + } + } + } +}; + +template +void search_knn_hamming_per_invlist( + const IndexBinaryIVF& ivf, + size_t n, + const uint8_t* __restrict x, + idx_t k, + const idx_t* __restrict keys_in, + const int32_t* __restrict coarse_dis, + int32_t* __restrict distances, + idx_t* __restrict labels, + bool store_pairs, + const IVFSearchParameters* params) { + idx_t nprobe = params ? 
params->nprobe : ivf.nprobe; + nprobe = std::min((idx_t)ivf.nlist, nprobe); + idx_t max_codes = params ? params->max_codes : ivf.max_codes; + FAISS_THROW_IF_NOT(max_codes == 0); + FAISS_THROW_IF_NOT(!store_pairs); + MetricType metric_type = ivf.metric_type; + + // reorder buckets + std::vector lims(n + 1); + int32_t* keys = new int32_t[n * nprobe]; + std::unique_ptr delete_keys(keys); + for (idx_t i = 0; i < n * nprobe; i++) { + keys[i] = keys_in[i]; + } + matrix_bucket_sort_inplace(n, nprobe, keys, ivf.nlist, lims.data(), 0); + + using C = CMax; + heap_heapify(n * k, distances, labels); + const size_t code_size = ivf.code_size; + + for (idx_t l = 0; l < ivf.nlist; l++) { + idx_t l0 = lims[l], nq = lims[l + 1] - l0; + + InvertedLists::ScopedCodes scodes(ivf.invlists, l); + InvertedLists::ScopedIds sidx(ivf.invlists, l); + idx_t nb = ivf.invlists->list_size(l); + const uint8_t* bcodes = scodes.get(); + const idx_t* ids = sidx.get(); + + idx_t i = 0; + + // process as much as possible by blocks + constexpr int BS = 4; + + if (k == 1) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else if (k == 2) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else if (k == 4) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else { + for (; i + BS <= nq; i += BS) { + BlockSearchVariableK bc( + code_size, k, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } + + // leftovers + for (; i < nq; i++) { + idx_t qno = keys[l0 + i]; + HammingComputer hc(x + qno * code_size, code_size); + idx_t* __restrict idxi = labels + qno * k; + int32_t* __restrict simi = distances + qno * k; + int32_t simi0 = simi[0]; + for (idx_t j = 0; j < nb; j++) { + int dis = hc.hamming(bcodes + j * code_size); + + if (dis < simi0) { + idx_t id = store_pairs ? 
lo_build(l, j) : ids[j]; + heap_replace_top(k, simi, idxi, dis, id); + simi0 = simi[0]; + } + } + } + } + for (idx_t i = 0; i < n; i++) { + heap_reorder(k, distances + i * k, labels + i * k); + } +} + template void search_knn_hamming_count_1( const IndexBinaryIVF& ivf, @@ -601,7 +765,56 @@ } } -} // namespace +void search_knn_hamming_per_invlist_1( + const IndexBinaryIVF& ivf, + size_t n, + const uint8_t* x, + idx_t k, + const idx_t* keys, + const int32_t* coarse_dis, + int32_t* distances, + idx_t* labels, + bool store_pairs, + const IVFSearchParameters* params) { + switch (ivf.code_size) { +#define HANDLE_CS(cs) \ + case cs: \ + search_knn_hamming_per_invlist( \ + ivf, \ + n, \ + x, \ + k, \ + keys, \ + coarse_dis, \ + distances, \ + labels, \ + store_pairs, \ + params); \ + break; + HANDLE_CS(4); + HANDLE_CS(8); + HANDLE_CS(16); + HANDLE_CS(20); + HANDLE_CS(32); + HANDLE_CS(64); +#undef HANDLE_CS + default: + search_knn_hamming_per_invlist( + ivf, + n, + x, + k, + keys, + coarse_dis, + distances, + labels, + store_pairs, + params); + break; + } +} + +} // anonymous namespace BinaryInvertedListScanner* IndexBinaryIVF::get_InvertedListScanner( bool store_pairs) const { @@ -635,7 +848,19 @@ idx_t* labels, bool store_pairs, const IVFSearchParameters* params) const { - if (use_heap) { + if (per_invlist_search) { + search_knn_hamming_per_invlist_1( + *this, + n, + x, + k, + idx, + coarse_dis, + distances, + labels, + store_pairs, + params); + } else if (use_heap) { search_knn_hamming_heap( *this, n, @@ -660,9 +885,9 @@ void IndexBinaryIVF::range_search( idx_t n, - const uint8_t* x, + const uint8_t* __restrict x, int radius, - RangeSearchResult* res, + RangeSearchResult* __restrict res, const SearchParameters* params) const { FAISS_THROW_IF_NOT_MSG( !params, "search params not supported for this index"); @@ -684,11 +909,11 @@ void IndexBinaryIVF::range_search_preassigned( idx_t n, - const uint8_t* x, + const uint8_t* __restrict x, int radius, - const idx_t* assign, - const int32_t* centroid_dis, - RangeSearchResult* res) const { + const idx_t* __restrict assign, + const int32_t* __restrict centroid_dis, + RangeSearchResult* __restrict res) const { const size_t nprobe = std::min(nlist, this->nprobe); bool store_pairs = false; size_t nlistv = 0, ndis = 0; diff -Nru faiss-1.7.3/faiss/IndexBinaryIVF.h faiss-1.7.4/faiss/IndexBinaryIVF.h --- faiss-1.7.3/faiss/IndexBinaryIVF.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexBinaryIVF.h 2023-04-19 13:18:30.000000000 +0000 @@ -32,27 +32,36 @@ */ struct IndexBinaryIVF : IndexBinary { /// Access to the actual data - InvertedLists* invlists; - bool own_invlists; + InvertedLists* invlists = nullptr; + bool own_invlists = true; - size_t nprobe; ///< number of probes at query time - size_t max_codes; ///< max nb of codes to visit to do a query + size_t nprobe = 1; ///< number of probes at query time + size_t max_codes = 0; ///< max nb of codes to visit to do a query /** Select between using a heap or counting to select the k smallest values * when scanning inverted lists. */ bool use_heap = true; + /** collect computations per batch */ + bool per_invlist_search = false; + /// map for direct access to the elements. Enables reconstruct(). 
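A usage sketch for the per_invlist_search flag declared just above: it groups queries that probe the same inverted list (the BlockSearch path added in IndexBinaryIVF.cpp). Dimensions, nlist, nprobe and the data pointers are placeholders, and the IndexBinaryFlat coarse quantizer is an assumption.

    #include <cstdint>
    #include <vector>
    #include <faiss/IndexBinaryFlat.h>
    #include <faiss/IndexBinaryIVF.h>

    // Batch Hamming search that scans each probed inverted list once for a
    // whole block of queries. d is in bits; all sizes are placeholders.
    void binary_ivf_per_invlist(
            const uint8_t* xb, faiss::idx_t nb,
            const uint8_t* xq, faiss::idx_t nq) {
        int d = 256;  // 32-byte codes
        faiss::IndexBinaryFlat coarse(d);
        faiss::IndexBinaryIVF index(&coarse, d, /*nlist=*/1024);
        index.train(nb, xb);
        index.add(nb, xb);

        index.nprobe = 8;
        index.per_invlist_search = true;  // collect computations per list

        faiss::idx_t k = 10;
        std::vector<int32_t> distances(nq * k);
        std::vector<faiss::idx_t> labels(nq * k);
        index.search(nq, xq, k, distances.data(), labels.data());
    }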
DirectMap direct_map; - IndexBinary* quantizer; ///< quantizer that maps vectors to inverted lists - size_t nlist; ///< number of possible key values + /// quantizer that maps vectors to inverted lists + IndexBinary* quantizer = nullptr; + + /// number of possible key values + size_t nlist = 0; - bool own_fields; ///< whether object owns the quantizer + /// whether object owns the quantizer + bool own_fields = false; ClusteringParameters cp; ///< to override default clustering params - Index* clustering_index; ///< to override index used during clustering + + /// to override index used during clustering + Index* clustering_index = nullptr; /** The Inverted file takes a quantizer (an IndexBinary) on input, * which implements the function mapping a vector to a list @@ -196,7 +205,7 @@ return invlists->list_size(list_no); } - /** intialize a direct map + /** initialize a direct map * * @param new_maintain_direct_map if true, create a direct map, * else clear it @@ -209,8 +218,6 @@ }; struct BinaryInvertedListScanner { - using idx_t = Index::idx_t; - /// from now on we handle this query. virtual void set_query(const uint8_t* query_vector) = 0; diff -Nru faiss-1.7.3/faiss/index_factory.cpp faiss-1.7.4/faiss/index_factory.cpp --- faiss-1.7.3/faiss/index_factory.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/index_factory.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -10,8 +10,6 @@ */ #include -#include "faiss/MetricType.h" -#include "faiss/impl/FaissAssert.h" #include #include @@ -665,19 +663,19 @@ re_match(description, "(.+),Refine\\((.+)\\)", sm)) { std::unique_ptr filter_index = index_factory_sub(d, sm[1].str(), metric); - std::unique_ptr refine_index; + IndexRefine* index_rf = nullptr; if (sm.size() == 3) { // Refine - refine_index = index_factory_sub(d, sm[2].str(), metric); + std::unique_ptr refine_index = + index_factory_sub(d, sm[2].str(), metric); + index_rf = new IndexRefine( + filter_index.release(), refine_index.release()); + index_rf->own_refine_index = true; } else { // RFlat - refine_index.reset(new IndexFlat(d, metric)); + index_rf = new IndexRefineFlat(filter_index.release(), nullptr); } - IndexRefine* index_rf = - new IndexRefine(filter_index.get(), refine_index.get()); + FAISS_ASSERT(index_rf != nullptr); index_rf->own_fields = true; - filter_index.release(); - refine_index.release(); - index_rf->own_refine_index = true; return std::unique_ptr(index_rf); } diff -Nru faiss-1.7.3/faiss/IndexFastScan.cpp faiss-1.7.4/faiss/IndexFastScan.cpp --- faiss-1.7.3/faiss/IndexFastScan.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexFastScan.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -98,18 +98,21 @@ ntotal += n; } +CodePacker* IndexFastScan::get_CodePacker() const { + return new CodePackerPQ4(M, bbs); +} + size_t IndexFastScan::remove_ids(const IDSelector& sel) { idx_t j = 0; + std::vector buffer(code_size); + CodePackerPQ4 packer(M, bbs); for (idx_t i = 0; i < ntotal; i++) { if (sel.is_member(i)) { // should be removed } else { if (i > j) { - for (int sq = 0; sq < M; sq++) { - uint8_t code = - pq4_get_packed_element(codes.data(), bbs, M, i, sq); - pq4_set_packed_element(codes.data(), code, bbs, M, j, sq); - } + packer.unpack_1(codes.data(), i, buffer.data()); + packer.pack_1(buffer.data(), j, codes.data()); } j++; } @@ -142,12 +145,12 @@ IndexFastScan* other = static_cast(&otherIndex); ntotal2 = roundup(ntotal + other->ntotal, bbs); codes.resize(ntotal2 * M2 / 2); + std::vector buffer(code_size); + CodePackerPQ4 packer(M, bbs); + for (int i = 0; i < 
other->ntotal; i++) { - for (int sq = 0; sq < M; sq++) { - uint8_t code = - pq4_get_packed_element(other->codes.data(), bbs, M, i, sq); - pq4_set_packed_element(codes.data(), code, bbs, M, ntotal + i, sq); - } + packer.unpack_1(other->codes.data(), i, buffer.data()); + packer.pack_1(buffer.data(), ntotal + i, codes.data()); } ntotal += other->ntotal; other->reset(); diff -Nru faiss-1.7.3/faiss/IndexFastScan.h faiss-1.7.4/faiss/IndexFastScan.h --- faiss-1.7.3/faiss/IndexFastScan.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexFastScan.h 2023-04-19 13:18:30.000000000 +0000 @@ -12,6 +12,8 @@ namespace faiss { +struct CodePacker; + /** Fast scan version of IndexPQ and IndexAQ. Works for 4-bit PQ and AQ for now. * * The codes are not stored sequentially but grouped in blocks of size bbs. @@ -25,7 +27,6 @@ * 14: no qbs with heap accumulator * 15: no qbs with reservoir accumulator */ - struct IndexFastScan : Index { // implementation to select int implem = 0; @@ -126,6 +127,9 @@ void reconstruct(idx_t key, float* recons) const override; size_t remove_ids(const IDSelector& sel) override; + + CodePacker* get_CodePacker() const; + void merge_from(Index& otherIndex, idx_t add_id = 0) override; void check_compatible_for_merge(const Index& otherIndex) const override; }; diff -Nru faiss-1.7.3/faiss/IndexFlatCodes.cpp faiss-1.7.4/faiss/IndexFlatCodes.cpp --- faiss-1.7.3/faiss/IndexFlatCodes.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexFlatCodes.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -98,4 +99,8 @@ other->reset(); } +CodePacker* IndexFlatCodes::get_CodePacker() const { + return new CodePackerFlat(code_size); +} + } // namespace faiss diff -Nru faiss-1.7.3/faiss/IndexFlatCodes.h faiss-1.7.4/faiss/IndexFlatCodes.h --- faiss-1.7.3/faiss/IndexFlatCodes.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexFlatCodes.h 2023-04-19 13:18:30.000000000 +0000 @@ -15,6 +15,8 @@ namespace faiss { +struct CodePacker; + /** Index that encodes all vectors as fixed-size codes (size code_size). Storage * is in the codes vector */ struct IndexFlatCodes : Index { @@ -39,8 +41,8 @@ size_t sa_code_size() const override; - /** remove some ids. NB that Because of the structure of the - * indexing structure, the semantics of this operation are + /** remove some ids. 
NB that because of the structure of the + * index, the semantics of this operation are * different from the usual ones: the new ids are shifted */ size_t remove_ids(const IDSelector& sel) override; @@ -51,6 +53,9 @@ return get_FlatCodesDistanceComputer(); } + // returns a new instance of a CodePacker + CodePacker* get_CodePacker() const; + void check_compatible_for_merge(const Index& otherIndex) const override; virtual void merge_from(Index& otherIndex, idx_t add_id = 0) override; diff -Nru faiss-1.7.3/faiss/IndexFlat.cpp faiss-1.7.4/faiss/IndexFlat.cpp --- faiss-1.7.3/faiss/IndexFlat.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexFlat.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,10 @@ } else if (metric_type == METRIC_L2) { float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; knn_L2sqr(x, get_xb(), d, n, ntotal, &res, nullptr, sel); + } else if (is_similarity_metric(metric_type)) { + float_minheap_array_t res = {size_t(n), size_t(k), labels, distances}; + knn_extra_metrics( + x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res); } else { FAISS_THROW_IF_NOT(!sel); float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; @@ -90,7 +95,7 @@ struct FlatL2Dis : FlatCodesDistanceComputer { size_t d; - Index::idx_t nb; + idx_t nb; const float* q; const float* b; size_t ndis; @@ -121,7 +126,7 @@ struct FlatIPDis : FlatCodesDistanceComputer { size_t d; - Index::idx_t nb; + idx_t nb; const float* q; const float* b; size_t ndis; @@ -222,7 +227,7 @@ perm.size() == ntotal, "Call update_permutation before search"); const float* xb = get_xb(); -#pragma omp parallel for +#pragma omp parallel for if (n > 10000) for (idx_t i = 0; i < n; i++) { float q = x[i]; // query float* D = distances + i * k; @@ -232,6 +237,14 @@ idx_t i0 = 0, i1 = ntotal; idx_t wp = 0; + if (ntotal == 0) { + for (idx_t j = 0; j < k; j++) { + I[j] = -1; + D[j] = HUGE_VAL; + } + goto done; + } + if (xb[perm[i0]] > q) { i1 = 0; goto finish_right; diff -Nru faiss-1.7.3/faiss/IndexFlat.h faiss-1.7.4/faiss/IndexFlat.h --- faiss-1.7.3/faiss/IndexFlat.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexFlat.h 2023-04-19 13:18:30.000000000 +0000 @@ -82,7 +82,7 @@ /// optimized version for 1D "vectors". struct IndexFlat1D : IndexFlatL2 { - bool continuous_update; ///< is the permutation updated continuously? + bool continuous_update = true; ///< is the permutation updated continuously? std::vector perm; ///< sorted database indices diff -Nru faiss-1.7.3/faiss/Index.h faiss-1.7.4/faiss/Index.h --- faiss-1.7.3/faiss/Index.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/Index.h 2023-04-19 13:18:30.000000000 +0000 @@ -18,7 +18,7 @@ #define FAISS_VERSION_MAJOR 1 #define FAISS_VERSION_MINOR 7 -#define FAISS_VERSION_PATCH 3 +#define FAISS_VERSION_PATCH 4 /** * @namespace faiss @@ -62,7 +62,6 @@ * although the internal representation may vary. 
*/ struct Index { - using idx_t = int64_t; ///< all indices are this type using component_t = float; using distance_t = float; diff -Nru faiss-1.7.3/faiss/IndexHNSW.cpp faiss-1.7.4/faiss/IndexHNSW.cpp --- faiss-1.7.3/faiss/IndexHNSW.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexHNSW.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -24,9 +24,6 @@ #include #include -#ifdef __SSE__ -#endif - #include #include #include @@ -35,6 +32,7 @@ #include #include #include +#include extern "C" { @@ -58,7 +56,6 @@ namespace faiss { -using idx_t = Index::idx_t; using MinimaxHeap = HNSW::MinimaxHeap; using storage_idx_t = HNSW::storage_idx_t; using NodeDistFarther = HNSW::NodeDistFarther; @@ -101,7 +98,7 @@ }; DistanceComputer* storage_distance_computer(const Index* storage) { - if (storage->metric_type == METRIC_INNER_PRODUCT) { + if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); } else { return storage->get_distance_computer(); @@ -349,7 +346,7 @@ InterruptCallback::check(); } - if (metric_type == METRIC_INNER_PRODUCT) { + if (is_similarity_metric(metric_type)) { // we need to revert the negated distances for (size_t i = 0; i < k * n; i++) { distances[i] = -distances[i]; diff -Nru faiss-1.7.3/faiss/IndexHNSW.h faiss-1.7.4/faiss/IndexHNSW.h --- faiss-1.7.3/faiss/IndexHNSW.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexHNSW.h 2023-04-19 13:18:30.000000000 +0000 @@ -22,7 +22,6 @@ struct IndexHNSW; struct ReconstructFromNeighbors { - typedef Index::idx_t idx_t; typedef HNSW::storage_idx_t storage_idx_t; const IndexHNSW& index; diff -Nru faiss-1.7.3/faiss/IndexIDMap.cpp faiss-1.7.4/faiss/IndexIDMap.cpp --- faiss-1.7.3/faiss/IndexIDMap.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIDMap.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -64,7 +64,7 @@ void IndexIDMapTemplate::add_with_ids( idx_t n, const typename IndexT::component_t* x, - const typename IndexT::idx_t* xids) { + const idx_t* xids) { index->add(n, x); for (idx_t i = 0; i < n; i++) id_map.push_back(xids[i]); @@ -77,7 +77,7 @@ const typename IndexT::component_t* x, idx_t k, typename IndexT::distance_t* distances, - typename IndexT::idx_t* labels, + idx_t* labels, const SearchParameters* params) const { FAISS_THROW_IF_NOT_MSG( !params, "search params not supported for this index"); @@ -91,7 +91,7 @@ template void IndexIDMapTemplate::range_search( - typename IndexT::idx_t n, + idx_t n, const typename IndexT::component_t* x, typename IndexT::distance_t radius, RangeSearchResult* result, @@ -182,7 +182,7 @@ void IndexIDMap2Template::add_with_ids( idx_t n, const typename IndexT::component_t* x, - const typename IndexT::idx_t* xids) { + const idx_t* xids) { size_t prev_ntotal = this->ntotal; IndexIDMapTemplate::add_with_ids(n, x, xids); for (size_t i = prev_ntotal; i < this->ntotal; i++) { diff -Nru faiss-1.7.3/faiss/IndexIDMap.h faiss-1.7.4/faiss/IndexIDMap.h --- faiss-1.7.3/faiss/IndexIDMap.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIDMap.h 2023-04-19 13:18:30.000000000 +0000 @@ -18,7 +18,6 @@ /** Index that translates search results to ids */ template struct IndexIDMapTemplate : IndexT { - using idx_t = typename IndexT::idx_t; using component_t = typename IndexT::component_t; using distance_t = typename IndexT::distance_t; @@ -74,7 +73,6 @@ * implementation via a 2-way index */ template struct IndexIDMap2Template : IndexIDMapTemplate { - using idx_t = typename IndexT::idx_t; using component_t = typename 
IndexT::component_t; using distance_t = typename IndexT::distance_t; diff -Nru faiss-1.7.3/faiss/IndexIVFAdditiveQuantizer.cpp faiss-1.7.4/faiss/IndexIVFAdditiveQuantizer.cpp --- faiss-1.7.3/faiss/IndexIVFAdditiveQuantizer.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIVFAdditiveQuantizer.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -51,7 +51,7 @@ ScopeDeleter del_x(x_in == x ? nullptr : x); if (by_residual) { - std::vector idx(n); + std::vector idx(n); quantizer->assign(n, x, idx.data()); std::vector residuals(n * d); @@ -145,7 +145,7 @@ : ia(ia), aq(*ia.aq) { this->store_pairs = store_pairs; this->code_size = ia.code_size; - keep_max = ia.metric_type == METRIC_INNER_PRODUCT; + keep_max = is_similarity_metric(ia.metric_type); tmp.resize(ia.d); } diff -Nru faiss-1.7.3/faiss/IndexIVF.cpp faiss-1.7.4/faiss/IndexIVF.cpp --- faiss-1.7.3/faiss/IndexIVF.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIVF.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -10,11 +10,13 @@ #include #include +#include #include #include #include #include +#include #include #include @@ -22,6 +24,7 @@ #include #include +#include #include #include @@ -35,27 +38,19 @@ ******************************************/ Level1Quantizer::Level1Quantizer(Index* quantizer, size_t nlist) - : quantizer(quantizer), - nlist(nlist), - quantizer_trains_alone(0), - own_fields(false), - clustering_index(nullptr) { + : quantizer(quantizer), nlist(nlist) { // here we set a low # iterations because this is typically used // for large clusterings (nb this is not used for the MultiIndex, // for which quantizer_trains_alone = true) cp.niter = 10; } -Level1Quantizer::Level1Quantizer() - : quantizer(nullptr), - nlist(0), - quantizer_trains_alone(0), - own_fields(false), - clustering_index(nullptr) {} +Level1Quantizer::Level1Quantizer() {} Level1Quantizer::~Level1Quantizer() { - if (own_fields) + if (own_fields) { delete quantizer; + } } void Level1Quantizer::train_q1( @@ -131,7 +126,7 @@ return nbyte; } -void Level1Quantizer::encode_listno(Index::idx_t list_no, uint8_t* code) const { +void Level1Quantizer::encode_listno(idx_t list_no, uint8_t* code) const { // little endian size_t nl = nlist - 1; while (nl > 0) { @@ -141,7 +136,7 @@ } } -Index::idx_t Level1Quantizer::decode_listno(const uint8_t* code) const { +idx_t Level1Quantizer::decode_listno(const uint8_t* code) const { size_t nl = nlist - 1; int64_t list_no = 0; int nbit = 0; @@ -165,13 +160,10 @@ size_t code_size, MetricType metric) : Index(d, metric), - Level1Quantizer(quantizer, nlist), + IndexIVFInterface(quantizer, nlist), invlists(new ArrayInvertedLists(nlist, code_size)), own_invlists(true), - code_size(code_size), - nprobe(1), - max_codes(0), - parallel_mode(0) { + code_size(code_size) { FAISS_THROW_IF_NOT(d == quantizer->d); is_trained = quantizer->is_trained && (quantizer->ntotal == nlist); // Spherical by default if the metric is inner_product @@ -180,13 +172,7 @@ } } -IndexIVF::IndexIVF() - : invlists(nullptr), - own_invlists(false), - code_size(0), - nprobe(1), - max_codes(0), - parallel_mode(0) {} +IndexIVF::IndexIVF() {} void IndexIVF::add(idx_t n, const float* x) { add_with_ids(n, x, nullptr); @@ -412,6 +398,7 @@ nprobe = std::min((idx_t)nlist, nprobe); FAISS_THROW_IF_NOT(nprobe > 0); + const idx_t unlimited_list_size = std::numeric_limits::max(); idx_t max_codes = params ? params->max_codes : this->max_codes; IDSelector* sel = params ? 
params->sel : nullptr; const IDSelectorRange* selr = dynamic_cast(sel); @@ -427,6 +414,10 @@ !(sel && store_pairs), "selector and store_pairs cannot be combined"); + FAISS_THROW_IF_NOT_MSG( + !invlists->use_iterator || (max_codes == 0 && store_pairs == false), + "iterable inverted lists don't support max_codes and store_pairs"); + size_t nlistv = 0, ndis = 0, nheap = 0; using HeapForIP = CMin; @@ -439,6 +430,14 @@ int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; bool do_heap_init = !(this->parallel_mode & PARALLEL_MODE_NO_HEAP_INIT); + FAISS_THROW_IF_NOT_MSG( + max_codes == 0 || pmode == 0 || pmode == 3, + "max_codes supported only for parallel_mode = 0 or 3"); + + if (max_codes == 0) { + max_codes = unlimited_list_size; + } + bool do_parallel = omp_get_max_threads() >= 2 && (pmode == 0 ? false : pmode == 3 ? n > 1 @@ -457,7 +456,7 @@ * that are in common between the two ******************************************************/ - // intialize + reorder a result heap + // initialize + reorder a result heap auto init_result = [&](float* simi, idx_t* idxi) { if (!do_heap_init) @@ -495,7 +494,8 @@ auto scan_one_list = [&](idx_t key, float coarse_dis_i, float* simi, - idx_t* idxi) { + idx_t* idxi, + idx_t list_size_max) { if (key < 0) { // not enough centroids for multiprobe return (size_t)0; @@ -506,10 +506,8 @@ key, nlist); - size_t list_size = invlists->list_size(key); - // don't waste time on empty lists - if (list_size == 0) { + if (invlists->is_empty(key)) { return (size_t)0; } @@ -518,32 +516,51 @@ nlistv++; try { - InvertedLists::ScopedCodes scodes(invlists, key); - const uint8_t* codes = scodes.get(); + if (invlists->use_iterator) { + size_t list_size = 0; - std::unique_ptr sids; - const Index::idx_t* ids = nullptr; + std::unique_ptr it( + invlists->get_iterator(key)); - if (!store_pairs) { - sids.reset(new InvertedLists::ScopedIds(invlists, key)); - ids = sids->get(); - } + nheap += scanner->iterate_codes( + it.get(), simi, idxi, k, list_size); - if (selr) { // IDSelectorRange - // restrict search to a section of the inverted list - size_t jmin, jmax; - selr->find_sorted_ids_bounds(list_size, ids, &jmin, &jmax); - list_size = jmax - jmin; - if (list_size == 0) { - return (size_t)0; + return list_size; + } else { + size_t list_size = invlists->list_size(key); + if (list_size > list_size_max) { + list_size = list_size_max; + } + + InvertedLists::ScopedCodes scodes(invlists, key); + const uint8_t* codes = scodes.get(); + + std::unique_ptr sids; + const idx_t* ids = nullptr; + + if (!store_pairs) { + sids.reset(new InvertedLists::ScopedIds(invlists, key)); + ids = sids->get(); + } + + if (selr) { // IDSelectorRange + // restrict search to a section of the inverted list + size_t jmin, jmax; + selr->find_sorted_ids_bounds( + list_size, ids, &jmin, &jmax); + list_size = jmax - jmin; + if (list_size == 0) { + return (size_t)0; + } + codes += jmin * code_size; + ids += jmin; } - codes += jmin * code_size; - ids += jmin; - } - nheap += scanner->scan_codes( - list_size, codes, ids, simi, idxi, k); + nheap += scanner->scan_codes( + list_size, codes, ids, simi, idxi, k); + return list_size; + } } catch (const std::exception& e) { std::lock_guard lock(exception_mutex); exception_string = @@ -551,8 +568,6 @@ interrupt = true; return size_t(0); } - - return list_size; }; /**************************************************** @@ -581,9 +596,9 @@ keys[i * nprobe + ik], coarse_dis[i * nprobe + ik], simi, - idxi); - - if (max_codes && nscan >= max_codes) { + idxi, + max_codes - nscan); + if 
(nscan >= max_codes) { break; } } @@ -610,7 +625,8 @@ keys[i * nprobe + ik], coarse_dis[i * nprobe + ik], local_dis.data(), - local_idx.data()); + local_idx.data(), + unlimited_list_size); // can't do the test on max_codes } @@ -651,7 +667,8 @@ keys[ij], coarse_dis[ij], local_dis.data(), - local_idx.data()); + local_idx.data(), + unlimited_list_size); #pragma omp critical { add_local_results( @@ -744,6 +761,10 @@ idx_t max_codes = params ? params->max_codes : this->max_codes; IDSelector* sel = params ? params->sel : nullptr; + FAISS_THROW_IF_NOT_MSG( + !invlists->use_iterator || (max_codes == 0 && store_pairs == false), + "iterable inverted lists don't support max_codes and store_pairs"); + size_t nlistv = 0, ndis = 0; bool interrupt = false; @@ -780,21 +801,30 @@ key, ik, nlist); - const size_t list_size = invlists->list_size(key); - if (list_size == 0) + if (invlists->is_empty(key)) { return; + } try { - InvertedLists::ScopedCodes scodes(invlists, key); - InvertedLists::ScopedIds ids(invlists, key); - + size_t list_size = 0; scanner->set_list(key, coarse_dis[i * nprobe + ik]); + if (invlists->use_iterator) { + std::unique_ptr it( + invlists->get_iterator(key)); + + scanner->iterate_codes_range( + it.get(), radius, qres, list_size); + } else { + InvertedLists::ScopedCodes scodes(invlists, key); + InvertedLists::ScopedIds ids(invlists, key); + list_size = invlists->list_size(key); + + scanner->scan_codes_range( + list_size, scodes.get(), ids.get(), radius, qres); + } nlistv++; ndis += list_size; - scanner->scan_codes_range( - list_size, scodes.get(), ids.get(), radius, qres); - } catch (const std::exception& e) { std::lock_guard lock(exception_mutex); exception_string = @@ -1086,6 +1116,10 @@ other->ntotal = 0; } +CodePacker* IndexIVF::get_CodePacker() const { + return new CodePackerFlat(code_size); +} + void IndexIVF::replace_invlists(InvertedLists* il, bool own) { if (own_invlists) { delete invlists; @@ -1104,71 +1138,11 @@ void IndexIVF::copy_subset_to( IndexIVF& other, - int subset_type, + InvertedLists::subset_type_t subset_type, idx_t a1, idx_t a2) const { - FAISS_THROW_IF_NOT(nlist == other.nlist); - FAISS_THROW_IF_NOT(code_size == other.code_size); - FAISS_THROW_IF_NOT(other.direct_map.no()); - FAISS_THROW_IF_NOT_FMT( - subset_type == 0 || subset_type == 1 || subset_type == 2, - "subset type %d not implemented", - subset_type); - - size_t accu_n = 0; - size_t accu_a1 = 0; - size_t accu_a2 = 0; - - InvertedLists* oivf = other.invlists; - - for (idx_t list_no = 0; list_no < nlist; list_no++) { - size_t n = invlists->list_size(list_no); - ScopedIds ids_in(invlists, list_no); - - if (subset_type == 0) { - for (idx_t i = 0; i < n; i++) { - idx_t id = ids_in[i]; - if (a1 <= id && id < a2) { - oivf->add_entry( - list_no, - invlists->get_single_id(list_no, i), - ScopedCodes(invlists, list_no, i).get()); - other.ntotal++; - } - } - } else if (subset_type == 1) { - for (idx_t i = 0; i < n; i++) { - idx_t id = ids_in[i]; - if (id % a1 == a2) { - oivf->add_entry( - list_no, - invlists->get_single_id(list_no, i), - ScopedCodes(invlists, list_no, i).get()); - other.ntotal++; - } - } - } else if (subset_type == 2) { - // see what is allocated to a1 and to a2 - size_t next_accu_n = accu_n + n; - size_t next_accu_a1 = next_accu_n * a1 / ntotal; - size_t i1 = next_accu_a1 - accu_a1; - size_t next_accu_a2 = next_accu_n * a2 / ntotal; - size_t i2 = next_accu_a2 - accu_a2; - - for (idx_t i = i1; i < i2; i++) { - oivf->add_entry( - list_no, - invlists->get_single_id(list_no, i), - ScopedCodes(invlists, 
list_no, i).get()); - } - - other.ntotal += i2 - i1; - accu_a1 = next_accu_a1; - accu_a2 = next_accu_a2; - } - accu_n += n; - } - FAISS_ASSERT(accu_n == ntotal); + other.ntotal += + invlists->copy_subset_to(*other.invlists, subset_type, a1, a2); } IndexIVF::~IndexIVF() { @@ -1233,6 +1207,39 @@ return nup; } +size_t InvertedListScanner::iterate_codes( + InvertedListsIterator* it, + float* simi, + idx_t* idxi, + size_t k, + size_t& list_size) const { + size_t nup = 0; + list_size = 0; + + if (!keep_max) { + for (; it->is_available(); it->next()) { + auto id_and_codes = it->get_id_and_codes(); + float dis = distance_to_code(id_and_codes.second); + if (dis < simi[0]) { + maxheap_replace_top(k, simi, idxi, dis, id_and_codes.first); + nup++; + } + list_size++; + } + } else { + for (; it->is_available(); it->next()) { + auto id_and_codes = it->get_id_and_codes(); + float dis = distance_to_code(id_and_codes.second); + if (dis > simi[0]) { + minheap_replace_top(k, simi, idxi, dis, id_and_codes.first); + nup++; + } + list_size++; + } + } + return nup; +} + void InvertedListScanner::scan_codes_range( size_t list_size, const uint8_t* codes, @@ -1252,4 +1259,23 @@ } } +void InvertedListScanner::iterate_codes_range( + InvertedListsIterator* it, + float radius, + RangeQueryResult& res, + size_t& list_size) const { + list_size = 0; + for (; it->is_available(); it->next()) { + auto id_and_codes = it->get_id_and_codes(); + float dis = distance_to_code(id_and_codes.second); + bool keep = !keep_max + ? dis < radius + : dis > radius; // TODO templatize to remove this test + if (keep) { + res.add(dis, id_and_codes.first); + } + list_size++; + } +} + } // namespace faiss diff -Nru faiss-1.7.3/faiss/IndexIVFFastScan.cpp faiss-1.7.4/faiss/IndexIVFFastScan.cpp --- faiss-1.7.3/faiss/IndexIVFFastScan.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIVFFastScan.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -69,7 +69,14 @@ code_size = M2 / 2; is_trained = false; - replace_invlists(new BlockInvertedLists(nlist, bbs, bbs * M2 / 2), true); + replace_invlists(new BlockInvertedLists(nlist, get_CodePacker()), true); +} + +void IndexIVFFastScan::init_code_packer() { + auto bil = dynamic_cast(invlists); + FAISS_THROW_IF_NOT(bil); + delete bil->packer; // in case there was one before + bil->packer = get_CodePacker(); } IndexIVFFastScan::~IndexIVFFastScan() {} @@ -112,17 +119,9 @@ } InterruptCallback::check(); - AlignedTable codes(n * code_size); direct_map.check_can_add(xids); std::unique_ptr idx(new idx_t[n]); quantizer->assign(n, x, idx.get()); - size_t nadd = 0, nminus1 = 0; - - for (size_t i = 0; i < n; i++) { - if (idx[i] < 0) { - nminus1++; - } - } AlignedTable flat_codes(n * code_size); encode_vectors(n, x, idx.get(), flat_codes.get()); @@ -170,7 +169,6 @@ memcpy(list_codes.data() + (i - i0) * code_size, flat_codes.data() + order[i] * code_size, code_size); - nadd++; } pq4_pack_codes_range( list_codes.data(), @@ -187,6 +185,10 @@ ntotal += n; } +CodePacker* IndexIVFFastScan::get_CodePacker() const { + return new CodePackerPQ4(M, bbs); +} + /********************************************************* * search *********************************************************/ @@ -229,7 +231,6 @@ } } -using idx_t = Index::idx_t; using namespace quantize_lut; } // anonymous namespace diff -Nru faiss-1.7.3/faiss/IndexIVFFastScan.h faiss-1.7.4/faiss/IndexIVFFastScan.h --- faiss-1.7.3/faiss/IndexIVFFastScan.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIVFFastScan.h 2023-04-19 13:18:30.000000000 +0000 @@ 
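With the changes above, IndexIVFFastScan builds its BlockInvertedLists from the CodePacker returned by get_CodePacker(), and init_code_packer() re-installs a packer when the inverted lists are swapped in from the outside. A rough sketch of that pattern, assuming an already-constructed IndexIVFPQFastScan (names are illustrative):

#include <faiss/IndexIVFPQFastScan.h>
#include <faiss/invlists/BlockInvertedLists.h>

void swap_in_block_lists(faiss::IndexIVFPQFastScan& index) {
    // fresh, empty block lists that use the index's own packing scheme
    auto* bil = new faiss::BlockInvertedLists(index.nlist, index.get_CodePacker());
    index.replace_invlists(bil, /*own=*/true);
    // if the lists had been installed without a packer (e.g. after custom I/O),
    // this recreates one from the index parameters (CodePackerPQ4(M, bbs) here)
    index.init_code_packer();
}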
-67,6 +67,9 @@ MetricType metric, int bbs); + // initialize the CodePacker in the InvertedLists + void init_code_packer(); + ~IndexIVFFastScan() override; /// orig's inverted lists (for debugging) @@ -166,7 +169,7 @@ size_t* nlist_out, const Scaler& scaler) const; - // implem 14 is mukltithreaded internally across nprobes and queries + // implem 14 is multithreaded internally across nprobes and queries template void search_implem_14( idx_t n, @@ -181,6 +184,8 @@ void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) const override; + CodePacker* get_CodePacker() const override; + // reconstruct orig invlists (for debugging) void reconstruct_orig_invlists(); }; diff -Nru faiss-1.7.3/faiss/IndexIVF.h faiss-1.7.4/faiss/IndexIVF.h --- faiss-1.7.3/faiss/IndexIVF.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIVF.h 2023-04-19 13:18:30.000000000 +0000 @@ -31,19 +31,23 @@ * of the lists (especially training) */ struct Level1Quantizer { - Index* quantizer; ///< quantizer that maps vectors to inverted lists - size_t nlist; ///< number of possible key values + /// quantizer that maps vectors to inverted lists + Index* quantizer = nullptr; + + /// number of inverted lists + size_t nlist = 0; /** * = 0: use the quantizer as index in a kmeans training * = 1: just pass on the training set to the train() of the quantizer * = 2: kmeans training on a flat index + add the centroids to the quantizer */ - char quantizer_trains_alone; - bool own_fields; ///< whether object owns the quantizer (false by default) + char quantizer_trains_alone = 0; + bool own_fields = false; ///< whether object owns the quantizer ClusteringParameters cp; ///< to override default clustering params - Index* clustering_index; ///< to override index used during clustering + /// to override index used during clustering + Index* clustering_index = nullptr; /// Trains the quantizer and calls train_residual to train sub-quantizers void train_q1( @@ -54,8 +58,8 @@ /// compute the number of bytes required to store list ids size_t coarse_code_size() const; - void encode_listno(Index::idx_t list_no, uint8_t* code) const; - Index::idx_t decode_listno(const uint8_t* code) const; + void encode_listno(idx_t list_no, uint8_t* code) const; + idx_t decode_listno(const uint8_t* code) const; Level1Quantizer(Index* quantizer, size_t nlist); @@ -65,11 +69,10 @@ }; struct SearchParametersIVF : SearchParameters { - size_t nprobe; ///< number of probes at query time - size_t max_codes; ///< max nb of codes to visit to do a query + size_t nprobe = 1; ///< number of probes at query time + size_t max_codes = 0; ///< max nb of codes to visit to do a query SearchParameters* quantizer_params = nullptr; - SearchParametersIVF() : nprobe(1), max_codes(0) {} virtual ~SearchParametersIVF() {} }; @@ -78,6 +81,75 @@ struct InvertedListScanner; struct IndexIVFStats; +struct CodePacker; + +struct IndexIVFInterface : Level1Quantizer { + size_t nprobe = 1; ///< number of probes at query time + size_t max_codes = 0; ///< max nb of codes to visit to do a query + + explicit IndexIVFInterface(Index* quantizer = nullptr, size_t nlist = 0) + : Level1Quantizer(quantizer, nlist) {} + + /** search a set of vectors, that are pre-quantized by the IVF + * quantizer. Fill in the corresponding heaps with the query + * results. The default implementation uses InvertedListScanners + * to do the search. 
+ * + * @param n nb of vectors to query + * @param x query vectors, size nx * d + * @param assign coarse quantization indices, size nx * nprobe + * @param centroid_dis + * distances to coarse centroids, size nx * nprobe + * @param distance + * output distances, size n * k + * @param labels output labels, size n * k + * @param store_pairs store inv list index + inv list offset + * instead in upper/lower 32 bit of result, + * instead of ids (used for reranking). + * @param params used to override the object's search parameters + * @param stats search stats to be updated (can be null) + */ + virtual void search_preassigned( + idx_t n, + const float* x, + idx_t k, + const idx_t* assign, + const float* centroid_dis, + float* distances, + idx_t* labels, + bool store_pairs, + const IVFSearchParameters* params = nullptr, + IndexIVFStats* stats = nullptr) const = 0; + + /** Range search a set of vectors, that are pre-quantized by the IVF + * quantizer. Fill in the RangeSearchResults results. The default + * implementation uses InvertedListScanners to do the search. + * + * @param n nb of vectors to query + * @param x query vectors, size nx * d + * @param assign coarse quantization indices, size nx * nprobe + * @param centroid_dis + * distances to coarse centroids, size nx * nprobe + * @param result Output results + * @param store_pairs store inv list index + inv list offset + * instead in upper/lower 32 bit of result, + * instead of ids (used for reranking). + * @param params used to override the object's search parameters + * @param stats search stats to be updated (can be null) + */ + virtual void range_search_preassigned( + idx_t nx, + const float* x, + float radius, + const idx_t* keys, + const float* coarse_dis, + RangeSearchResult* result, + bool store_pairs = false, + const IVFSearchParameters* params = nullptr, + IndexIVFStats* stats = nullptr) const = 0; + + virtual ~IndexIVFInterface() {} +}; /** Index based on a inverted file (IVF) * @@ -99,16 +171,12 @@ * Sub-classes implement a post-filtering of the index that refines * the distance estimation from the query to databse vectors. */ -struct IndexIVF : Index, Level1Quantizer { +struct IndexIVF : Index, IndexIVFInterface { /// Access to the actual data - InvertedLists* invlists; - bool own_invlists; - - size_t code_size; ///< code size per vector in bytes - - size_t nprobe; ///< number of probes at query time - size_t max_codes; ///< max nb of codes to visit to do a query + InvertedLists* invlists = nullptr; + bool own_invlists = false; + size_t code_size = 0; ///< code size per vector in bytes /** Parallel mode determines how queries are parallelized with OpenMP * * 0 (default): split over queries @@ -119,7 +187,7 @@ * PARALLEL_MODE_NO_HEAP_INIT: binary or with the previous to * prevent the heap to be initialized and finalized */ - int parallel_mode; + int parallel_mode = 0; const int PARALLEL_MODE_NO_HEAP_INIT = 1024; /** optional map that maps back ids to invlist entries. This @@ -188,26 +256,7 @@ /// does nothing by default virtual void train_residual(idx_t n, const float* x); - /** search a set of vectors, that are pre-quantized by the IVF - * quantizer. Fill in the corresponding heaps with the query - * results. The default implementation uses InvertedListScanners - * to do the search. 
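The new IndexIVFInterface above factors nprobe/max_codes and the two *_preassigned entry points out of IndexIVF, so callers can be written against any IVF-like container (IndexIVF, IndexShardsIVF, ...). A sketch of such generic code; the helper name and buffer handling are illustrative only:

#include <vector>
#include <faiss/IndexIVF.h>

// run the shared coarse quantizer once, then let the IVF-like object do the fine search
void search_via_interface(
        const faiss::IndexIVFInterface& ivf,
        faiss::idx_t nq,
        const float* xq,
        faiss::idx_t k,
        float* distances,
        faiss::idx_t* labels) {
    size_t nprobe = ivf.nprobe;
    std::vector<faiss::idx_t> assign(nq * nprobe);
    std::vector<float> centroid_dis(nq * nprobe);
    ivf.quantizer->search(nq, xq, nprobe, centroid_dis.data(), assign.data());
    ivf.search_preassigned(
            nq, xq, k,
            assign.data(), centroid_dis.data(),
            distances, labels,
            /*store_pairs=*/false);
}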
- * - * @param n nb of vectors to query - * @param x query vectors, size nx * d - * @param assign coarse quantization indices, size nx * nprobe - * @param centroid_dis - * distances to coarse centroids, size nx * nprobe - * @param distance - * output distances, size n * k - * @param labels output labels, size n * k - * @param store_pairs store inv list index + inv list offset - * instead in upper/lower 32 bit of result, - * instead of ids (used for reranking). - * @param params used to override the object's search parameters - * @param stats search stats to be updated (can be null) - */ - virtual void search_preassigned( + void search_preassigned( idx_t n, const float* x, idx_t k, @@ -217,7 +266,18 @@ idx_t* labels, bool store_pairs, const IVFSearchParameters* params = nullptr, - IndexIVFStats* stats = nullptr) const; + IndexIVFStats* stats = nullptr) const override; + + void range_search_preassigned( + idx_t nx, + const float* x, + float radius, + const idx_t* keys, + const float* coarse_dis, + RangeSearchResult* result, + bool store_pairs = false, + const IVFSearchParameters* params = nullptr, + IndexIVFStats* stats = nullptr) const override; /** assign the vectors, then call search_preassign */ void search( @@ -235,17 +295,6 @@ RangeSearchResult* result, const SearchParameters* params = nullptr) const override; - void range_search_preassigned( - idx_t nx, - const float* x, - float radius, - const idx_t* keys, - const float* coarse_dis, - RangeSearchResult* result, - bool store_pairs = false, - const IVFSearchParameters* params = nullptr, - IndexIVFStats* stats = nullptr) const; - /** Get a scanner for this index (store_pairs means ignore labels) * * The default search implementation uses this to compute the distances @@ -317,16 +366,15 @@ virtual void merge_from(Index& otherIndex, idx_t add_id) override; + // returns a new instance of a CodePacker + virtual CodePacker* get_CodePacker() const; + /** copy a subset of the entries index to the other index - * - * if subset_type == 0: copies ids in [a1, a2) - * if subset_type == 1: copies ids if id % a1 == a2 - * if subset_type == 2: copies inverted lists such that a1 - * elements are left before and a2 elements are after + * see Invlists::copy_subset_to for the meaning of subset_type */ virtual void copy_subset_to( IndexIVF& other, - int subset_type, + InvertedLists::subset_type_t subset_type, idx_t a1, idx_t a2) const; @@ -339,7 +387,7 @@ /// are the ids sorted? 
bool check_ids_sorted() const; - /** intialize a direct map + /** initialize a direct map * * @param new_maintain_direct_map if true, create a direct map, * else clear it @@ -353,7 +401,6 @@ /* The standalone codec interface (except sa_decode that is specific) */ size_t sa_code_size() const override; - void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; IndexIVF(); @@ -366,8 +413,6 @@ * distance_to_code and scan_codes can be called in multiple * threads */ struct InvertedListScanner { - using idx_t = Index::idx_t; - idx_t list_no = -1; ///< remember current list bool keep_max = false; ///< keep maximum instead of minimum /// store positions in invlists rather than labels @@ -413,6 +458,14 @@ idx_t* labels, size_t k) const; + // same as scan_codes, using an iterator + virtual size_t iterate_codes( + InvertedListsIterator* iterator, + float* distances, + idx_t* labels, + size_t k, + size_t& list_size) const; + /** scan a set of codes, compute distances to current query and * update results if distances are below radius * @@ -424,6 +477,13 @@ float radius, RangeQueryResult& result) const; + // same as scan_codes_range, using an iterator + virtual void iterate_codes_range( + InvertedListsIterator* iterator, + float radius, + RangeQueryResult& result, + size_t& list_size) const; + virtual ~InvertedListScanner() {} }; diff -Nru faiss-1.7.3/faiss/IndexIVFPQ.cpp faiss-1.7.4/faiss/IndexIVFPQ.cpp --- faiss-1.7.3/faiss/IndexIVFPQ.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIVFPQ.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -33,9 +33,7 @@ #include -#ifdef __AVX2__ -#include -#endif +#include namespace faiss { @@ -51,7 +49,6 @@ size_t nbits_per_idx, MetricType metric) : IndexIVF(quantizer, d, nlist, 0, metric), pq(d, M, nbits_per_idx) { - FAISS_THROW_IF_NOT(nbits_per_idx <= 8); code_size = pq.code_size; invlists->code_size = code_size; is_trained = false; @@ -198,9 +195,9 @@ static float* compute_residuals( const Index* quantizer, - Index::idx_t n, + idx_t n, const float* x, - const Index::idx_t* list_nos) { + const idx_t* list_nos) { size_t d = quantizer->d; float* residuals = new float[n * d]; // TODO: parallelize? @@ -423,6 +420,7 @@ const Index* quantizer, const ProductQuantizer& pq, AlignedTable& precomputed_table, + bool by_residual, bool verbose) { size_t nlist = quantizer->ntotal; size_t d = quantizer->d; @@ -434,10 +432,10 @@ } if (use_precomputed_table == 0) { // then choose the type of table - if (quantizer->metric_type == METRIC_INNER_PRODUCT) { + if (!(quantizer->metric_type == METRIC_L2 && by_residual)) { if (verbose) { printf("IndexIVFPQ::precompute_table: precomputed " - "tables not needed for inner product quantizers\n"); + "tables needed only for L2 metric and by_residual is enabled\n"); } precomputed_table.resize(0); return; @@ -516,13 +514,16 @@ void IndexIVFPQ::precompute_table() { initialize_IVFPQ_precomputed_table( - use_precomputed_table, quantizer, pq, precomputed_table, verbose); + use_precomputed_table, + quantizer, + pq, + precomputed_table, + by_residual, + verbose); } namespace { -using idx_t = Index::idx_t; - #define TIC t0 = get_cycles() #define TOC get_cycles() - t0 @@ -623,7 +624,7 @@ *****************************************************/ // fields specific to list - Index::idx_t key; + idx_t key; float coarse_dis; std::vector q_code; @@ -886,140 +887,29 @@ * Scaning the codes: simple PQ scan. *****************************************************/ -#ifdef __AVX2__ - /// Returns the distance to a single code. 
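Since initialize_IVFPQ_precomputed_table() now receives the by_residual flag, the automatic table selection only builds tables for the L2-with-residual configuration. A small sketch of the observable behaviour, assuming a trained METRIC_L2 IndexIVFPQ (the function name is illustrative):

#include <faiss/IndexIVFPQ.h>

void toggle_precomputed_tables(faiss::IndexIVFPQ& index /* trained, METRIC_L2 */) {
    index.by_residual = true;         // default for IndexIVFPQ
    index.use_precomputed_table = 0;  // 0 = let precompute_table() pick the table type
    index.precompute_table();         // builds index.precomputed_table

    index.by_residual = false;
    index.use_precomputed_table = 0;  // ask again with residual encoding disabled
    index.precompute_table();         // now clears the table and returns early
}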
- /// General-purpose version. - template - typename std::enable_if::value), float>:: - type inline distance_single_code(const uint8_t* code) const { - PQDecoder decoder(code, pq.nbits); - - const float* tab = sim_table; - float result = 0; - - for (size_t m = 0; m < pq.M; m++) { - result += tab[decoder.decode()]; - tab += pq.ksub; - } - - return result; - } - - /// Returns the distance to a single code. - /// Specialized AVX2 PQDecoder8 version. - template - typename std::enable_if<(std::is_same::value), float>:: - type inline distance_single_code(const uint8_t* code) const { - float result = 0; - - size_t m = 0; - const size_t pqM16 = pq.M / 16; - - const float* tab = sim_table; - - if (pqM16 > 0) { - // process 16 values per loop - - const __m256i ksub = _mm256_set1_epi32(pq.ksub); - __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - offsets_0 = _mm256_mullo_epi32(offsets_0, ksub); - - // accumulators of partial sums - __m256 partialSum = _mm256_setzero_ps(); - - // loop - for (m = 0; m < pqM16 * 16; m += 16) { - // load 16 uint8 values - const __m128i mm1 = - _mm_loadu_si128((const __m128i_u*)(code + m)); - { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); - - // add offsets - const __m256i indices_to_read_from = - _mm256_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = _mm256_i32gather_ps( - tab, indices_to_read_from, sizeof(float)); - tab += pq.ksub * 8; - - // collect partial sums - partialSum = _mm256_add_ps(partialSum, collected); - } - - // move high 8 uint8 to low ones - const __m128i mm2 = - _mm_unpackhi_epi64(mm1, _mm_setzero_si128()); - { - // convert uint8 values (low part of __m128i) to int32 - // values - const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); - - // add offsets - const __m256i indices_to_read_from = - _mm256_add_epi32(idx1, offsets_0); - - // gather 8 values, similar to 8 operations of tab[idx] - __m256 collected = _mm256_i32gather_ps( - tab, indices_to_read_from, sizeof(float)); - tab += pq.ksub * 8; - - // collect partial sums - partialSum = _mm256_add_ps(partialSum, collected); - } - } - - // horizontal sum for partialSum - const __m256 h0 = _mm256_hadd_ps(partialSum, partialSum); - const __m256 h1 = _mm256_hadd_ps(h0, h0); - - // extract high and low __m128 regs from __m256 - const __m128 h2 = _mm256_extractf128_ps(h1, 1); - const __m128 h3 = _mm256_castps256_ps128(h1); - - // get a final hsum into all 4 regs - const __m128 h4 = _mm_add_ss(h2, h3); - - // extract f[0] from __m128 - const float hsum = _mm_cvtss_f32(h4); - result += hsum; - } - - // - if (m < pq.M) { - // process leftovers - PQDecoder decoder(code + m, pq.nbits); - - for (; m < pq.M; m++) { - result += tab[decoder.decode()]; - tab += pq.ksub; - } - } - - return result; - } - -#else - /// Returns the distance to a single code. - /// General-purpose version. - template - inline float distance_single_code(const uint8_t* code) const { - PQDecoder decoder(code, pq.nbits); - - const float* tab = sim_table; - float result = 0; - - for (size_t m = 0; m < pq.M; m++) { - result += tab[decoder.decode()]; - tab += pq.ksub; - } - - return result; - } -#endif + // This is the baseline version of scan_list_with_tables(). + // It demonstrates what this function actually does. + // + // /// version of the scan where we use precomputed tables. 
+ // template + // void scan_list_with_table( + // size_t ncode, + // const uint8_t* codes, + // SearchResultType& res) const { + // + // for (size_t j = 0; j < ncode; j++, codes += pq.code_size) { + // if (res.skip_entry(j)) { + // continue; + // } + // float dis = dis0 + distance_single_code( + // pq, sim_table, codes); + // res.add(j, dis); + // } + // } + + // This is the modified version of scan_list_with_tables(). + // It was observed that doing manual unrolling of the loop that + // utilizes distance_single_code() speeds up the computations. /// version of the scan where we use precomputed tables. template @@ -1027,12 +917,65 @@ size_t ncode, const uint8_t* codes, SearchResultType& res) const { - for (size_t j = 0; j < ncode; j++, codes += pq.code_size) { + int counter = 0; + + size_t saved_j[4] = {0, 0, 0, 0}; + for (size_t j = 0; j < ncode; j++) { if (res.skip_entry(j)) { continue; } - float dis = dis0 + distance_single_code(codes); - res.add(j, dis); + + saved_j[0] = (counter == 0) ? j : saved_j[0]; + saved_j[1] = (counter == 1) ? j : saved_j[1]; + saved_j[2] = (counter == 2) ? j : saved_j[2]; + saved_j[3] = (counter == 3) ? j : saved_j[3]; + + counter += 1; + if (counter == 4) { + float distance_0 = 0; + float distance_1 = 0; + float distance_2 = 0; + float distance_3 = 0; + distance_four_codes( + pq, + sim_table, + codes + saved_j[0] * pq.code_size, + codes + saved_j[1] * pq.code_size, + codes + saved_j[2] * pq.code_size, + codes + saved_j[3] * pq.code_size, + distance_0, + distance_1, + distance_2, + distance_3); + + res.add(saved_j[0], dis0 + distance_0); + res.add(saved_j[1], dis0 + distance_1); + res.add(saved_j[2], dis0 + distance_2); + res.add(saved_j[3], dis0 + distance_3); + counter = 0; + } + } + + if (counter >= 1) { + float dis = + dis0 + + distance_single_code( + pq, sim_table, codes + saved_j[0] * pq.code_size); + res.add(saved_j[0], dis); + } + if (counter >= 2) { + float dis = + dis0 + + distance_single_code( + pq, sim_table, codes + saved_j[1] * pq.code_size); + res.add(saved_j[1], dis); + } + if (counter >= 3) { + float dis = + dis0 + + distance_single_code( + pq, sim_table, codes + saved_j[2] * pq.code_size); + res.add(saved_j[2], dis); } } @@ -1101,6 +1044,46 @@ * Scanning codes with polysemous filtering *****************************************************/ + // This is the baseline version of scan_list_polysemous_hc(). + // It demonstrates what this function actually does. + + // template + // void scan_list_polysemous_hc( + // size_t ncode, + // const uint8_t* codes, + // SearchResultType& res) const { + // int ht = ivfpq.polysemous_ht; + // size_t n_hamming_pass = 0, nup = 0; + // + // int code_size = pq.code_size; + // + // HammingComputer hc(q_code.data(), code_size); + // + // for (size_t j = 0; j < ncode; j++, codes += code_size) { + // if (res.skip_entry(j)) { + // continue; + // } + // const uint8_t* b_code = codes; + // int hd = hc.hamming(b_code); + // if (hd < ht) { + // n_hamming_pass++; + // + // float dis = + // dis0 + + // distance_single_code( + // pq, sim_table, codes); + // + // res.add(j, dis); + // } + // } + // #pragma omp critical + // { indexIVFPQ_stats.n_hamming_pass += n_hamming_pass; } + // } + + // This is the modified version of scan_list_with_tables(). + // It was observed that doing manual unrolling of the loop that + // utilizes distance_single_code() speeds up the computations. 
+ template void scan_list_polysemous_hc( size_t ncode, @@ -1111,23 +1094,103 @@ int code_size = pq.code_size; + size_t saved_j[8]; + int counter = 0; + HammingComputer hc(q_code.data(), code_size); - for (size_t j = 0; j < ncode; j++, codes += code_size) { + for (size_t j = 0; j < (ncode / 4) * 4; j += 4) { + const uint8_t* b_code = codes + j * code_size; + + // Unrolling is a key. Basically, doing multiple popcount + // operations one after another speeds things up. + + // 9999999 is just an arbitrary large number + int hd0 = (res.skip_entry(j + 0)) + ? 99999999 + : hc.hamming(b_code + 0 * code_size); + int hd1 = (res.skip_entry(j + 1)) + ? 99999999 + : hc.hamming(b_code + 1 * code_size); + int hd2 = (res.skip_entry(j + 2)) + ? 99999999 + : hc.hamming(b_code + 2 * code_size); + int hd3 = (res.skip_entry(j + 3)) + ? 99999999 + : hc.hamming(b_code + 3 * code_size); + + saved_j[counter] = j + 0; + counter = (hd0 < ht) ? (counter + 1) : counter; + saved_j[counter] = j + 1; + counter = (hd1 < ht) ? (counter + 1) : counter; + saved_j[counter] = j + 2; + counter = (hd2 < ht) ? (counter + 1) : counter; + saved_j[counter] = j + 3; + counter = (hd3 < ht) ? (counter + 1) : counter; + + if (counter >= 4) { + // process four codes at the same time + n_hamming_pass += 4; + + float distance_0 = dis0; + float distance_1 = dis0; + float distance_2 = dis0; + float distance_3 = dis0; + distance_four_codes( + pq, + sim_table, + codes + saved_j[0] * pq.code_size, + codes + saved_j[1] * pq.code_size, + codes + saved_j[2] * pq.code_size, + codes + saved_j[3] * pq.code_size, + distance_0, + distance_1, + distance_2, + distance_3); + + res.add(saved_j[0], dis0 + distance_0); + res.add(saved_j[1], dis0 + distance_1); + res.add(saved_j[2], dis0 + distance_2); + res.add(saved_j[3], dis0 + distance_3); + + // + counter -= 4; + saved_j[0] = saved_j[4]; + saved_j[1] = saved_j[5]; + saved_j[2] = saved_j[6]; + saved_j[3] = saved_j[7]; + } + } + + for (size_t kk = 0; kk < counter; kk++) { + n_hamming_pass++; + + float dis = + dis0 + + distance_single_code( + pq, sim_table, codes + saved_j[kk] * pq.code_size); + + res.add(saved_j[kk], dis); + } + + // process leftovers + for (size_t j = (ncode / 4) * 4; j < ncode; j++) { if (res.skip_entry(j)) { continue; } - const uint8_t* b_code = codes; + const uint8_t* b_code = codes + j * code_size; int hd = hc.hamming(b_code); if (hd < ht) { n_hamming_pass++; - float dis = - dis0 + distance_single_code(codes); + float dis = dis0 + + distance_single_code( + pq, sim_table, codes + j * code_size); res.add(j, dis); } } + #pragma omp critical { indexIVFPQ_stats.n_hamming_pass += n_hamming_pass; } } @@ -1171,7 +1234,7 @@ * use_sel: store or ignore the IDSelector */ template -struct IVFPQScanner : IVFPQScannerT, +struct IVFPQScanner : IVFPQScannerT, InvertedListScanner { int precompute_mode; const IDSelector* sel; @@ -1181,9 +1244,7 @@ bool store_pairs, int precompute_mode, const IDSelector* sel) - : IVFPQScannerT( - ivfpq, - nullptr), + : IVFPQScannerT(ivfpq, nullptr), precompute_mode(precompute_mode), sel(sel) { this->store_pairs = store_pairs; @@ -1200,14 +1261,9 @@ float distance_to_code(const uint8_t* code) const override { assert(precompute_mode == 2); - float dis = this->dis0; - const float* tab = this->sim_table; - PQDecoder decoder(code, this->pq.nbits); - - for (size_t m = 0; m < this->pq.M; m++) { - dis += tab[decoder.decode()]; - tab += this->pq.ksub; - } + float dis = this->dis0 + + distance_single_code( + this->pq, this->sim_table, code); return dis; } diff -Nru 
faiss-1.7.3/faiss/IndexIVFPQFastScan.cpp faiss-1.7.4/faiss/IndexIVFPQFastScan.cpp --- faiss-1.7.3/faiss/IndexIVFPQFastScan.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIVFPQFastScan.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -156,7 +156,12 @@ void IndexIVFPQFastScan::precompute_table() { initialize_IVFPQ_precomputed_table( - use_precomputed_table, quantizer, pq, precomputed_table, verbose); + use_precomputed_table, + quantizer, + pq, + precomputed_table, + by_residual, + verbose); } /********************************************************* diff -Nru faiss-1.7.3/faiss/IndexIVFPQ.h faiss-1.7.4/faiss/IndexIVFPQ.h --- faiss-1.7.3/faiss/IndexIVFPQ.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIVFPQ.h 2023-04-19 13:18:30.000000000 +0000 @@ -162,6 +162,7 @@ const Index* quantizer, const ProductQuantizer& pq, AlignedTable& precomputed_table, + bool by_residual, bool verbose); /// statistics are robust to internal threading, but not if diff -Nru faiss-1.7.3/faiss/IndexIVFSpectralHash.cpp faiss-1.7.4/faiss/IndexIVFSpectralHash.cpp --- faiss-1.7.3/faiss/IndexIVFSpectralHash.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexIVFSpectralHash.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -213,8 +213,6 @@ std::vector qcode; HammingComputer hc; - using idx_t = Index::idx_t; - IVFScanner(const IndexIVFSpectralHash* index, bool store_pairs) : index(index), nbit(index->nbit), diff -Nru faiss-1.7.3/faiss/IndexNNDescent.cpp faiss-1.7.4/faiss/IndexNNDescent.cpp --- faiss-1.7.3/faiss/IndexNNDescent.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexNNDescent.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -50,7 +50,6 @@ namespace faiss { -using idx_t = Index::idx_t; using storage_idx_t = NNDescent::storage_idx_t; /************************************************************** @@ -89,7 +88,7 @@ }; DistanceComputer* storage_distance_computer(const Index* storage) { - if (storage->metric_type == METRIC_INNER_PRODUCT) { + if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); } else { return storage->get_distance_computer(); diff -Nru faiss-1.7.3/faiss/IndexNNDescent.h faiss-1.7.4/faiss/IndexNNDescent.h --- faiss-1.7.3/faiss/IndexNNDescent.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexNNDescent.h 2023-04-19 13:18:30.000000000 +0000 @@ -25,7 +25,6 @@ using storage_idx_t = NNDescent::storage_idx_t; /// Faiss results are 64-bit - using idx_t = Index::idx_t; // the link strcuture NNDescent nndescent; diff -Nru faiss-1.7.3/faiss/IndexNSG.cpp faiss-1.7.4/faiss/IndexNSG.cpp --- faiss-1.7.3/faiss/IndexNSG.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexNSG.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -23,7 +23,6 @@ namespace faiss { -using idx_t = Index::idx_t; using namespace nsg; /************************************************************** @@ -113,7 +112,7 @@ InterruptCallback::check(); } - if (metric_type == METRIC_INNER_PRODUCT) { + if (is_similarity_metric(metric_type)) { // we need to revert the negated distances for (size_t i = 0; i < k * n; i++) { distances[i] = -distances[i]; diff -Nru faiss-1.7.3/faiss/IndexPQ.cpp faiss-1.7.4/faiss/IndexPQ.cpp --- faiss-1.7.3/faiss/IndexPQ.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexPQ.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -19,6 +19,8 @@ #include #include +#include + namespace faiss { /********************************************************* @@ -74,22 +76,18 @@ struct 
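IndexNNDescent and IndexNSG above now decide whether to negate distances with is_similarity_metric() instead of comparing against METRIC_INNER_PRODUCT only. A one-line illustration of the helper (from faiss/MetricType.h):

#include <faiss/MetricType.h>

// true for metrics where larger values mean "closer", e.g. inner product
bool needs_negation = faiss::is_similarity_metric(faiss::METRIC_INNER_PRODUCT);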
PQDistanceComputer : FlatCodesDistanceComputer { size_t d; MetricType metric; - Index::idx_t nb; + idx_t nb; const ProductQuantizer& pq; const float* sdc; std::vector precomputed_table; size_t ndis; float distance_to_code(const uint8_t* code) final { - const float* dt = precomputed_table.data(); - PQDecoder decoder(code, pq.nbits); - float accu = 0; - for (int j = 0; j < pq.M; j++) { - accu += dt[decoder.decode()]; - dt += 1 << decoder.nbits; - } ndis++; - return accu; + + float dis = distance_single_code( + pq, precomputed_table.data(), code); + return dis; } float symmetric_dis(idx_t i, idx_t j) override { diff -Nru faiss-1.7.3/faiss/IndexRefine.cpp faiss-1.7.4/faiss/IndexRefine.cpp --- faiss-1.7.3/faiss/IndexRefine.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexRefine.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -62,7 +62,7 @@ namespace { -typedef faiss::Index::idx_t idx_t; +typedef faiss::idx_t idx_t; template static void reorder_2_heaps( diff -Nru faiss-1.7.3/faiss/IndexReplicas.cpp faiss-1.7.4/faiss/IndexReplicas.cpp --- faiss-1.7.3/faiss/IndexReplicas.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexReplicas.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -123,14 +123,13 @@ size_t componentsPerVec = sizeof(component_t) == 1 ? (dim + 7) / 8 : dim; // Partition the query by the number of indices we have - faiss::Index::idx_t queriesPerIndex = - (faiss::Index::idx_t)(n + this->count() - 1) / - (faiss::Index::idx_t)this->count(); + faiss::idx_t queriesPerIndex = + (faiss::idx_t)(n + this->count() - 1) / (faiss::idx_t)this->count(); FAISS_ASSERT(n / queriesPerIndex <= this->count()); auto fn = [queriesPerIndex, componentsPerVec, n, x, k, distances, labels]( int i, const IndexT* index) { - faiss::Index::idx_t base = (faiss::Index::idx_t)i * queriesPerIndex; + faiss::idx_t base = (faiss::idx_t)i * queriesPerIndex; if (base < n) { auto numForIndex = std::min(queriesPerIndex, n - base); diff -Nru faiss-1.7.3/faiss/IndexReplicas.h faiss-1.7.4/faiss/IndexReplicas.h --- faiss-1.7.3/faiss/IndexReplicas.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexReplicas.h 2023-04-19 13:18:30.000000000 +0000 @@ -20,7 +20,6 @@ template class IndexReplicasTemplate : public ThreadedIndex { public: - using idx_t = typename IndexT::idx_t; using component_t = typename IndexT::component_t; using distance_t = typename IndexT::distance_t; diff -Nru faiss-1.7.3/faiss/IndexRowwiseMinMax.cpp faiss-1.7.4/faiss/IndexRowwiseMinMax.cpp --- faiss-1.7.3/faiss/IndexRowwiseMinMax.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexRowwiseMinMax.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #include #include @@ -11,7 +18,7 @@ namespace { -using idx_t = faiss::Index::idx_t; +using idx_t = faiss::idx_t; struct StorageMinMaxFP16 { uint16_t scaler; diff -Nru faiss-1.7.3/faiss/IndexRowwiseMinMax.h faiss-1.7.4/faiss/IndexRowwiseMinMax.h --- faiss-1.7.3/faiss/IndexRowwiseMinMax.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexRowwiseMinMax.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + #pragma once #include diff -Nru faiss-1.7.3/faiss/IndexShards.cpp faiss-1.7.4/faiss/IndexShards.cpp --- faiss-1.7.3/faiss/IndexShards.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexShards.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -22,98 +22,17 @@ // subroutines namespace { -typedef Index::idx_t idx_t; - // add translation to all valid labels -void translate_labels(long n, idx_t* labels, long translation) { +void translate_labels(int64_t n, idx_t* labels, int64_t translation) { if (translation == 0) return; - for (long i = 0; i < n; i++) { + for (int64_t i = 0; i < n; i++) { if (labels[i] < 0) continue; labels[i] += translation; } } -/** merge result tables from several shards. - * @param all_distances size nshard * n * k - * @param all_labels idem - * @param translartions label translations to apply, size nshard - */ - -template -void merge_tables( - long n, - long k, - long nshard, - typename IndexClass::distance_t* distances, - idx_t* labels, - const std::vector& all_distances, - const std::vector& all_labels, - const std::vector& translations) { - if (k == 0) { - return; - } - using distance_t = typename IndexClass::distance_t; - - long stride = n * k; -#pragma omp parallel - { - std::vector buf(2 * nshard); - int* pointer = buf.data(); - int* shard_ids = pointer + nshard; - std::vector buf2(nshard); - distance_t* heap_vals = buf2.data(); -#pragma omp for - for (long i = 0; i < n; i++) { - // the heap maps values to the shard where they are - // produced. - const distance_t* D_in = all_distances.data() + i * k; - const idx_t* I_in = all_labels.data() + i * k; - int heap_size = 0; - - for (long s = 0; s < nshard; s++) { - pointer[s] = 0; - if (I_in[stride * s] >= 0) { - heap_push( - ++heap_size, - heap_vals, - shard_ids, - D_in[stride * s], - s); - } - } - - distance_t* D = distances + i * k; - idx_t* I = labels + i * k; - - for (int j = 0; j < k; j++) { - if (heap_size == 0) { - I[j] = -1; - D[j] = C::neutral(); - } else { - // pop best element - int s = shard_ids[0]; - int& p = pointer[s]; - D[j] = heap_vals[0]; - I[j] = I_in[stride * s + p] + translations[s]; - - heap_pop(heap_size--, heap_vals, shard_ids); - p++; - if (p < k && I_in[stride * s + p] >= 0) { - heap_push( - ++heap_size, - heap_vals, - shard_ids, - D_in[stride * s + p], - s); - } - } - } - } - } -} - } // anonymous namespace template @@ -247,11 +166,9 @@ if (!ids && !successive_ids) { aids.resize(n); - for (idx_t i = 0; i < n; i++) { aids[i] = this->ntotal + i; } - ids = aids.data(); } @@ -294,12 +211,23 @@ !params, "search params not supported for this index"); FAISS_THROW_IF_NOT(k > 0); - long nshard = this->count(); + int64_t nshard = this->count(); std::vector all_distances(nshard * k * n); std::vector all_labels(nshard * k * n); + std::vector translations(nshard, 0); + + // Because we just called runOnIndex above, it is safe to access the + // sub-index ntotal here + if (successive_ids) { + translations[0] = 0; - auto fn = [n, k, x, &all_distances, &all_labels]( + for (int s = 0; s + 1 < nshard; s++) { + translations[s + 1] = translations[s] + this->at(s)->ntotal; + } + } + + auto fn = [n, k, x, &all_distances, &all_labels, &translations]( int no, const IndexT* index) { if (index->verbose) { printf("begin query shard %d on %" PRId64 " points\n", no, n); @@ -312,6 +240,9 @@ all_distances.data() + no * k * n, all_labels.data() + no * k * n); + translate_labels( + n * k, all_labels.data() + no * k * n, translations[no]); + if (index->verbose) { printf("end query shard %d\n", no); } @@ 
-319,38 +250,24 @@ this->runOnIndex(fn); - std::vector translations(nshard, 0); - - // Because we just called runOnIndex above, it is safe to access the - // sub-index ntotal here - if (successive_ids) { - translations[0] = 0; - - for (int s = 0; s + 1 < nshard; s++) { - translations[s + 1] = translations[s] + this->at(s)->ntotal; - } - } - if (this->metric_type == METRIC_L2) { - merge_tables>( + merge_knn_results>( n, k, nshard, + all_distances.data(), + all_labels.data(), distances, - labels, - all_distances, - all_labels, - translations); + labels); } else { - merge_tables>( + merge_knn_results>( n, k, nshard, + all_distances.data(), + all_labels.data(), distances, - labels, - all_distances, - all_labels, - translations); + labels); } } diff -Nru faiss-1.7.3/faiss/IndexShards.h faiss-1.7.4/faiss/IndexShards.h --- faiss-1.7.3/faiss/IndexShards.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IndexShards.h 2023-04-19 13:18:30.000000000 +0000 @@ -18,7 +18,6 @@ */ template struct IndexShardsTemplate : public ThreadedIndex { - using idx_t = typename IndexT::idx_t; using component_t = typename IndexT::component_t; using distance_t = typename IndexT::distance_t; @@ -72,7 +71,7 @@ * Cases (successive_ids, xids): * - true, non-NULL ERROR: it makes no sense to pass in ids and * request them to be shifted - * - true, NULL OK, but should be called only once (calls add() + * - true, NULL OK: but should be called only once (calls add() * on sub-indexes). * - false, non-NULL OK: will call add_with_ids with passed in xids * distributed evenly over shards @@ -96,7 +95,7 @@ /// Synchronize the top-level index (IndexShards) with data in the /// sub-indices - void syncWithSubIndexes(); + virtual void syncWithSubIndexes(); protected: /// Called just after an index is added diff -Nru faiss-1.7.3/faiss/IndexShardsIVF.cpp faiss-1.7.4/faiss/IndexShardsIVF.cpp --- faiss-1.7.3/faiss/IndexShardsIVF.cpp 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/IndexShardsIVF.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,246 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
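IndexShards::search now translates labels per shard inside the query lambda and merges the shard-major result tables with merge_knn_results() instead of the removed local merge_tables() helper. A sketch of the merge call, assuming the faiss::merge_knn_results declaration from faiss/utils/utils.h and the CMin comparator used for L2 (all layouts are nshard * n * k, shard-major):

#include <faiss/utils/Heap.h>
#include <faiss/utils/utils.h>

void merge_shard_results(
        size_t n, size_t k, size_t nshard,
        const float* all_distances,     // size nshard * n * k
        const faiss::idx_t* all_labels, // size nshard * n * k
        float* distances,               // size n * k
        faiss::idx_t* labels) {         // size n * k
    // CMin for L2 (smaller is better); CMax would be used for similarity metrics
    faiss::merge_knn_results<faiss::idx_t, faiss::CMin<float, int>>(
            n, k, nshard, all_distances, all_labels, distances, labels);
}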
+ */ + +// -*- c++ -*- + +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace faiss { + +// subroutines +namespace { + +// add translation to all valid labels +void translate_labels(int64_t n, idx_t* labels, int64_t translation) { + if (translation == 0) { + return; + } + for (int64_t i = 0; i < n; i++) { + if (labels[i] < 0) { + continue; + } + labels[i] += translation; + } +} + +} // anonymous namespace + +/************************************************************ + * IndexShardsIVF + ************************************************************/ + +IndexShardsIVF::IndexShardsIVF( + Index* quantizer, + size_t nlist, + bool threaded, + bool successive_ids) + : IndexShardsTemplate(quantizer->d, threaded, successive_ids), + Level1Quantizer(quantizer, nlist) { + is_trained = quantizer->is_trained && quantizer->ntotal == nlist; +} + +void IndexShardsIVF::addIndex(Index* index) { + auto index_ivf = dynamic_cast(index); + FAISS_THROW_IF_NOT_MSG(index_ivf, "can only add IndexIVFs"); + FAISS_THROW_IF_NOT(index_ivf->nlist == nlist); + IndexShardsTemplate::addIndex(index); +} + +void IndexShardsIVF::train(idx_t n, const component_t* x) { + if (verbose) { + printf("Training level-1 quantizer\n"); + } + train_q1(n, x, verbose, metric_type); + + // set the sub-quantizer codebooks + std::vector centroids(nlist * d); + quantizer->reconstruct_n(0, nlist, centroids.data()); + + // probably not worth running in parallel + for (size_t i = 0; i < indices_.size(); i++) { + Index* index = indices_[i].first; + auto index_ivf = dynamic_cast(index); + Index* quantizer = index_ivf->quantizer; + if (!quantizer->is_trained) { + quantizer->train(nlist, centroids.data()); + } + quantizer->add(nlist, centroids.data()); + // finish training + index->train(n, x); + } + + is_trained = true; +} + +void IndexShardsIVF::add_with_ids( + idx_t n, + const component_t* x, + const idx_t* xids) { + // IndexIVF exposes add_core that we can use to factorize the + bool all_index_ivf = true; + for (size_t i = 0; i < indices_.size(); i++) { + Index* index = indices_[i].first; + all_index_ivf = all_index_ivf && dynamic_cast(index); + } + if (!all_index_ivf) { + IndexShardsTemplate::add_with_ids(n, x, xids); + return; + } + FAISS_THROW_IF_NOT_MSG( + !(successive_ids && xids), + "It makes no sense to pass in ids and " + "request them to be shifted"); + + if (successive_ids) { + FAISS_THROW_IF_NOT_MSG( + !xids, + "It makes no sense to pass in ids and " + "request them to be shifted"); + FAISS_THROW_IF_NOT_MSG( + this->ntotal == 0, + "when adding to IndexShards with sucessive_ids, " + "only add() in a single pass is supported"); + } + + // perform coarse quantization + std::vector Iq(n); + std::vector Dq(n); + quantizer->search(n, x, 1, Dq.data(), Iq.data()); + + // possibly shift ids + idx_t nshard = this->count(); + const idx_t* ids = xids; + std::vector aids; + if (!ids && !successive_ids) { + aids.resize(n); + + for (idx_t i = 0; i < n; i++) { + aids[i] = this->ntotal + i; + } + ids = aids.data(); + } + idx_t d = this->d; + + auto fn = [n, ids, x, nshard, d, Iq](int no, Index* index) { + idx_t i0 = (idx_t)no * n / nshard; + idx_t i1 = ((idx_t)no + 1) * n / nshard; + const float* x0 = x + i0 * d; + auto index_ivf = dynamic_cast(index); + + if (index->verbose) { + printf("begin add shard %d on %" PRId64 " points\n", no, n); + } + + index_ivf->add_core( + i1 - i0, x + i0 * d, ids ? 
ids + i0 : nullptr, Iq.data() + i0); + + if (index->verbose) { + printf("end add shard %d on %" PRId64 " points\n", no, i1 - i0); + } + }; + + this->runOnIndex(fn); + syncWithSubIndexes(); +} + +void IndexShardsIVF::search( + idx_t n, + const component_t* x, + idx_t k, + distance_t* distances, + idx_t* labels, + const SearchParameters* params_in) const { + FAISS_THROW_IF_NOT(k > 0); + FAISS_THROW_IF_NOT(count() > 0); + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type"); + } + + auto index0 = dynamic_cast(at(0)); + idx_t nprobe = params ? params->nprobe : index0->nprobe; + + // coarse quantization (TODO: support tiling with search_precomputed) + std::vector Dq(n * nprobe); + std::vector Iq(n * nprobe); + + quantizer->search(n, x, nprobe, Dq.data(), Iq.data()); + + int64_t nshard = this->count(); + + std::vector all_distances(nshard * k * n); + std::vector all_labels(nshard * k * n); + std::vector translations(nshard, 0); + + if (successive_ids) { + translations[0] = 0; + for (int s = 0; s + 1 < nshard; s++) { + translations[s + 1] = translations[s] + this->at(s)->ntotal; + } + } + + auto fn = [&](int no, const Index* indexIn) { + if (indexIn->verbose) { + printf("begin query shard %d on %" PRId64 " points\n", no, n); + } + + auto index = dynamic_cast(indexIn); + + FAISS_THROW_IF_NOT_MSG(index->nprobe == nprobe, "inconsistent nprobe"); + + index->search_preassigned( + n, + x, + k, + Iq.data(), + Dq.data(), + all_distances.data() + no * k * n, + all_labels.data() + no * k * n, + false); + + translate_labels( + n * k, all_labels.data() + no * k * n, translations[no]); + + if (indexIn->verbose) { + printf("end query shard %d\n", no); + } + }; + + this->runOnIndex(fn); + + if (this->metric_type == METRIC_L2) { + merge_knn_results>( + n, + k, + nshard, + all_distances.data(), + all_labels.data(), + distances, + labels); + } else { + merge_knn_results>( + n, + k, + nshard, + all_distances.data(), + all_labels.data(), + distances, + labels); + } +} + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/IndexShardsIVF.h faiss-1.7.4/faiss/IndexShardsIVF.h --- faiss-1.7.3/faiss/IndexShardsIVF.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/IndexShardsIVF.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,42 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace faiss { + +/** + * IndexShards with a common coarse quantizer. All the indexes added should be + * IndexIVFInterface indexes so that the search_precomputed can be called. 
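The new IndexShardsIVF above ties several IVF shards to one shared coarse quantizer: train() copies the common centroids into every shard's quantizer, add_with_ids() runs the coarse assignment once and calls add_core() per shard, and search() runs search_preassigned() per shard before merging. A construction sketch under those assumptions (dimensions, shard count and ownership handling are illustrative):

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexShardsIVF.h>

faiss::IndexShardsIVF* make_sharded_ivf(
        int d, size_t nlist, int nshard,
        size_t nt, const float* xt,   // training vectors
        size_t nb, const float* xb) { // database vectors
    auto* coarse = new faiss::IndexFlatL2(d);
    auto* shards = new faiss::IndexShardsIVF(
            coarse, nlist, /*threaded=*/false, /*successive_ids=*/false);
    for (int s = 0; s < nshard; s++) {
        auto* q = new faiss::IndexFlatL2(d); // receives the shared centroids in train()
        auto* ivf = new faiss::IndexIVFFlat(q, d, nlist);
        ivf->own_fields = true;              // the shard owns its quantizer
        shards->addIndex(ivf);
    }
    shards->train(nt, xt); // trains the common quantizer, then each shard
    shards->add(nb, xb);   // one coarse assignment, then add_core() on every shard
    return shards;         // ownership of `coarse` and the shards is not handled here
}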
+ */ +struct IndexShardsIVF : public IndexShards, Level1Quantizer { + explicit IndexShardsIVF( + Index* quantizer, + size_t nlist, + bool threaded = false, + bool successive_ids = true); + + void addIndex(Index* index) override; + + void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) + override; + + void train(idx_t n, const component_t* x) override; + + void search( + idx_t n, + const component_t* x, + idx_t k, + distance_t* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; +}; + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/invlists/BlockInvertedLists.cpp faiss-1.7.4/faiss/invlists/BlockInvertedLists.cpp --- faiss-1.7.3/faiss/invlists/BlockInvertedLists.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/invlists/BlockInvertedLists.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -7,6 +7,7 @@ #include +#include #include #include @@ -25,29 +26,43 @@ codes.resize(nlist); } +BlockInvertedLists::BlockInvertedLists(size_t nlist, const CodePacker* packer) + : InvertedLists(nlist, InvertedLists::INVALID_CODE_SIZE), + n_per_block(packer->nvec), + block_size(packer->block_size), + packer(packer) { + ids.resize(nlist); + codes.resize(nlist); +} + BlockInvertedLists::BlockInvertedLists() - : InvertedLists(0, InvertedLists::INVALID_CODE_SIZE), - n_per_block(0), - block_size(0) {} + : InvertedLists(0, InvertedLists::INVALID_CODE_SIZE) {} size_t BlockInvertedLists::add_entries( size_t list_no, size_t n_entry, const idx_t* ids_in, const uint8_t* code) { - if (n_entry == 0) + if (n_entry == 0) { return 0; + } FAISS_THROW_IF_NOT(list_no < nlist); size_t o = ids[list_no].size(); - FAISS_THROW_IF_NOT( - o == 0); // not clear how we should handle subsequent adds ids[list_no].resize(o + n_entry); memcpy(&ids[list_no][o], ids_in, sizeof(ids_in[0]) * n_entry); - - // copy whole blocks - size_t n_block = (n_entry + n_per_block - 1) / n_per_block; + size_t n_block = (o + n_entry + n_per_block - 1) / n_per_block; codes[list_no].resize(n_block * block_size); - memcpy(&codes[list_no][o * code_size], code, n_block * block_size); + if (o % block_size == 0) { + // copy whole blocks + memcpy(&codes[list_no][o * code_size], code, n_block * block_size); + } else { + FAISS_THROW_IF_NOT_MSG(packer, "missing code packer"); + std::vector buffer(packer->code_size); + for (size_t i = 0; i < n_entry; i++) { + packer->unpack_1(code, i, buffer.data()); + packer->pack_1(buffer.data(), i + o, codes[list_no].data()); + } + } return o; } @@ -61,7 +76,7 @@ return codes[list_no].get(); } -const InvertedLists::idx_t* BlockInvertedLists::get_ids(size_t list_no) const { +const idx_t* BlockInvertedLists::get_ids(size_t list_no) const { assert(list_no < nlist); return ids[list_no].data(); } @@ -95,7 +110,9 @@ */ } -BlockInvertedLists::~BlockInvertedLists() {} +BlockInvertedLists::~BlockInvertedLists() { + delete packer; +} /************************************************** * IO hook implementation diff -Nru faiss-1.7.3/faiss/invlists/BlockInvertedLists.h faiss-1.7.4/faiss/invlists/BlockInvertedLists.h --- faiss-1.7.3/faiss/invlists/BlockInvertedLists.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/invlists/BlockInvertedLists.h 2023-04-19 13:18:30.000000000 +0000 @@ -14,6 +14,8 @@ namespace faiss { +struct CodePacker; + /** Inverted Lists that are organized by blocks. * * Different from the regular inverted lists, the codes are organized by blocks @@ -28,13 +30,17 @@ * data. 
*/ struct BlockInvertedLists : InvertedLists { - size_t n_per_block; // nb of vectors stored per block - size_t block_size; // nb bytes per block + size_t n_per_block = 0; // nb of vectors stored per block + size_t block_size = 0; // nb bytes per block + + // required to interpret the content of the blocks (owned by this) + const CodePacker* packer = nullptr; std::vector> codes; std::vector> ids; BlockInvertedLists(size_t nlist, size_t vec_per_block, size_t block_size); + BlockInvertedLists(size_t nlist, const CodePacker* packer); BlockInvertedLists(); diff -Nru faiss-1.7.3/faiss/invlists/DirectMap.cpp faiss-1.7.4/faiss/invlists/DirectMap.cpp --- faiss-1.7.3/faiss/invlists/DirectMap.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/invlists/DirectMap.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -68,7 +68,7 @@ hashtable.clear(); } -DirectMap::idx_t DirectMap::get(idx_t key) const { +idx_t DirectMap::get(idx_t key) const { if (type == Array) { FAISS_THROW_IF_NOT_MSG(key >= 0 && key < array.size(), "invalid key"); idx_t lo = array[key]; diff -Nru faiss-1.7.3/faiss/invlists/DirectMap.h faiss-1.7.4/faiss/invlists/DirectMap.h --- faiss-1.7.3/faiss/invlists/DirectMap.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/invlists/DirectMap.h 2023-04-19 13:18:30.000000000 +0000 @@ -15,6 +15,8 @@ namespace faiss { +struct IDSelector; + // When offsets list id + offset are encoded in an uint64 // we call this LO = list-offset @@ -34,8 +36,6 @@ * Direct map: a way to map back from ids to inverted lists */ struct DirectMap { - typedef Index::idx_t idx_t; - enum Type { NoMap = 0, // default Array = 1, // sequential ids (only for add, no add_with_ids) @@ -91,8 +91,6 @@ /// Thread-safe way of updating the direct_map struct DirectMapAdd { - typedef Index::idx_t idx_t; - using Type = DirectMap::Type; DirectMap& direct_map; diff -Nru faiss-1.7.3/faiss/invlists/InvertedLists.cpp faiss-1.7.4/faiss/invlists/InvertedLists.cpp --- faiss-1.7.3/faiss/invlists/InvertedLists.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/invlists/InvertedLists.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -10,23 +10,32 @@ #include #include +#include #include #include namespace faiss { +InvertedListsIterator::~InvertedListsIterator() {} + /***************************************** * InvertedLists implementation ******************************************/ InvertedLists::InvertedLists(size_t nlist, size_t code_size) - : nlist(nlist), code_size(code_size) {} + : nlist(nlist), code_size(code_size), use_iterator(false) {} InvertedLists::~InvertedLists() {} -InvertedLists::idx_t InvertedLists::get_single_id(size_t list_no, size_t offset) - const { +bool InvertedLists::is_empty(size_t list_no) const { + return use_iterator + ? 
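BlockInvertedLists::add_entries() above can now append to a non-empty list by unpacking each incoming code and re-packing it at its final slot through the CodePacker. A minimal round-trip sketch of that pack_1()/unpack_1() pattern; the faiss/impl/CodePacker.h path is an assumption (the headers shown here only forward-declare CodePacker):

#include <vector>
#include <faiss/impl/CodePacker.h>

// write one flat code into slot 0 of a packed block, then read it back,
// mirroring the per-entry repacking done in add_entries()
void repack_one_code(const faiss::CodePacker& packer, const uint8_t* flat_code) {
    std::vector<uint8_t> block(packer.block_size); // one packed block
    size_t offset = 0;                             // slot inside the block (< packer.nvec)
    packer.pack_1(flat_code, offset, block.data());

    std::vector<uint8_t> check(packer.code_size);
    packer.unpack_1(block.data(), offset, check.data()); // recovers flat_code
}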
!std::unique_ptr(get_iterator(list_no)) + ->is_available() + : list_size(list_no) == 0; +} + +idx_t InvertedLists::get_single_id(size_t list_no, size_t offset) const { assert(offset < list_size(list_no)); const idx_t* ids = get_ids(list_no); idx_t id = ids[offset]; @@ -67,6 +76,10 @@ } } +InvertedListsIterator* InvertedLists::get_iterator(size_t /*list_no*/) const { + FAISS_THROW_MSG("get_iterator is not supported"); +} + void InvertedLists::merge_from(InvertedLists* oivf, size_t add_id) { #pragma omp parallel for for (idx_t i = 0; i < nlist; i++) { @@ -87,6 +100,98 @@ } } +size_t InvertedLists::copy_subset_to( + InvertedLists& oivf, + subset_type_t subset_type, + idx_t a1, + idx_t a2) const { + FAISS_THROW_IF_NOT(nlist == oivf.nlist); + FAISS_THROW_IF_NOT(code_size == oivf.code_size); + FAISS_THROW_IF_NOT_FMT( + subset_type >= 0 && subset_type <= 4, + "subset type %d not implemented", + subset_type); + size_t accu_n = 0; + size_t accu_a1 = 0; + size_t accu_a2 = 0; + size_t n_added = 0; + + size_t ntotal = 0; + if (subset_type == 2) { + ntotal = compute_ntotal(); + } + + for (idx_t list_no = 0; list_no < nlist; list_no++) { + size_t n = list_size(list_no); + ScopedIds ids_in(this, list_no); + + if (subset_type == SUBSET_TYPE_ID_RANGE) { + for (idx_t i = 0; i < n; i++) { + idx_t id = ids_in[i]; + if (a1 <= id && id < a2) { + oivf.add_entry( + list_no, + get_single_id(list_no, i), + ScopedCodes(this, list_no, i).get()); + n_added++; + } + } + } else if (subset_type == SUBSET_TYPE_ID_MOD) { + for (idx_t i = 0; i < n; i++) { + idx_t id = ids_in[i]; + if (id % a1 == a2) { + oivf.add_entry( + list_no, + get_single_id(list_no, i), + ScopedCodes(this, list_no, i).get()); + n_added++; + } + } + } else if (subset_type == SUBSET_TYPE_ELEMENT_RANGE) { + // see what is allocated to a1 and to a2 + size_t next_accu_n = accu_n + n; + size_t next_accu_a1 = next_accu_n * a1 / ntotal; + size_t i1 = next_accu_a1 - accu_a1; + size_t next_accu_a2 = next_accu_n * a2 / ntotal; + size_t i2 = next_accu_a2 - accu_a2; + + for (idx_t i = i1; i < i2; i++) { + oivf.add_entry( + list_no, + get_single_id(list_no, i), + ScopedCodes(this, list_no, i).get()); + } + + n_added += i2 - i1; + accu_a1 = next_accu_a1; + accu_a2 = next_accu_a2; + } else if (subset_type == SUBSET_TYPE_INVLIST_FRACTION) { + size_t i1 = n * a2 / a1; + size_t i2 = n * (a2 + 1) / a1; + + for (idx_t i = i1; i < i2; i++) { + oivf.add_entry( + list_no, + get_single_id(list_no, i), + ScopedCodes(this, list_no, i).get()); + } + + n_added += i2 - i1; + } else if (subset_type == SUBSET_TYPE_INVLIST) { + if (list_no >= a1 && list_no < a2) { + oivf.add_entries( + list_no, + n, + ScopedIds(this, list_no).get(), + ScopedCodes(this, list_no).get()); + n_added += n; + } + } + accu_n += n; + } + return n_added; +} + double InvertedLists::imbalance_factor() const { std::vector hist(nlist); @@ -109,7 +214,9 @@ } for (size_t i = 0; i < sizes.size(); i++) { if (sizes[i]) { - printf("list size in < %d: %d instances\n", 1 << i, sizes[i]); + printf("list size in < %zu: %d instances\n", + static_cast(1) << i, + sizes[i]); } } } @@ -158,7 +265,7 @@ return codes[list_no].data(); } -const InvertedLists::idx_t* ArrayInvertedLists::get_ids(size_t list_no) const { +const idx_t* ArrayInvertedLists::get_ids(size_t list_no) const { assert(list_no < nlist); return ids[list_no].data(); } @@ -267,7 +374,7 @@ delete[] codes; } -const Index::idx_t* HStackInvertedLists::get_ids(size_t list_no) const { +const idx_t* HStackInvertedLists::get_ids(size_t list_no) const { idx_t *ids = new 
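The InvertedLists::copy_subset_to() implementation above centralizes the old IndexIVF logic and adds two more subset types. A usage sketch at the IndexIVF level, splitting a populated index into two empty indexes with matching nlist and code_size (the index names are illustrative):

#include <faiss/IndexIVF.h>
#include <faiss/invlists/InvertedLists.h>

void split_by_id_parity(
        const faiss::IndexIVF& src, faiss::IndexIVF& even, faiss::IndexIVF& odd) {
    // SUBSET_TYPE_ID_MOD copies entries whose id satisfies id % a1 == a2
    src.copy_subset_to(even, faiss::InvertedLists::SUBSET_TYPE_ID_MOD, 2, 0);
    src.copy_subset_to(odd, faiss::InvertedLists::SUBSET_TYPE_ID_MOD, 2, 1);
}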
idx_t[list_size(list_no)], *c = ids; for (int i = 0; i < ils.size(); i++) { @@ -281,8 +388,7 @@ return ids; } -Index::idx_t HStackInvertedLists::get_single_id(size_t list_no, size_t offset) - const { +idx_t HStackInvertedLists::get_single_id(size_t list_no, size_t offset) const { for (int i = 0; i < ils.size(); i++) { const InvertedLists* il = ils[i]; size_t sz = il->list_size(list_no); @@ -312,8 +418,6 @@ namespace { -using idx_t = InvertedLists::idx_t; - idx_t translate_list_no(const SliceInvertedLists* sil, idx_t list_no) { FAISS_THROW_IF_NOT(list_no >= 0 && list_no < sil->nlist); return list_no + sil->i0; @@ -349,12 +453,11 @@ return il->release_codes(translate_list_no(this, list_no), codes); } -const Index::idx_t* SliceInvertedLists::get_ids(size_t list_no) const { +const idx_t* SliceInvertedLists::get_ids(size_t list_no) const { return il->get_ids(translate_list_no(this, list_no)); } -Index::idx_t SliceInvertedLists::get_single_id(size_t list_no, size_t offset) - const { +idx_t SliceInvertedLists::get_single_id(size_t list_no, size_t offset) const { return il->get_single_id(translate_list_no(this, list_no), offset); } @@ -380,8 +483,6 @@ namespace { -using idx_t = InvertedLists::idx_t; - // find the invlist this number belongs to int translate_list_no(const VStackInvertedLists* vil, idx_t list_no) { FAISS_THROW_IF_NOT(list_no >= 0 && list_no < vil->nlist); @@ -449,14 +550,13 @@ return ils[i]->release_codes(list_no, codes); } -const Index::idx_t* VStackInvertedLists::get_ids(size_t list_no) const { +const idx_t* VStackInvertedLists::get_ids(size_t list_no) const { int i = translate_list_no(this, list_no); list_no -= cumsz[i]; return ils[i]->get_ids(list_no); } -Index::idx_t VStackInvertedLists::get_single_id(size_t list_no, size_t offset) - const { +idx_t VStackInvertedLists::get_single_id(size_t list_no, size_t offset) const { int i = translate_list_no(this, list_no); list_no -= cumsz[i]; return ils[i]->get_single_id(list_no, offset); diff -Nru faiss-1.7.3/faiss/invlists/InvertedLists.h faiss-1.7.4/faiss/invlists/InvertedLists.h --- faiss-1.7.3/faiss/invlists/InvertedLists.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/invlists/InvertedLists.h 2023-04-19 13:18:30.000000000 +0000 @@ -15,11 +15,18 @@ * the interface. 
*/ -#include +#include #include namespace faiss { +struct InvertedListsIterator { + virtual ~InvertedListsIterator(); + virtual bool is_available() const = 0; + virtual void next() = 0; + virtual std::pair get_id_and_codes() = 0; +}; + /** Table of inverted lists * multithreading rules: * - concurrent read accesses are allowed @@ -28,13 +35,14 @@ * are allowed */ struct InvertedLists { - typedef Index::idx_t idx_t; - size_t nlist; ///< number of possible key values size_t code_size; ///< code size per vector in bytes + bool use_iterator; InvertedLists(size_t nlist, size_t code_size); + virtual ~InvertedLists(); + /// used for BlockInvertedLists, where the codes are packed into groups /// and the individual code size is meaningless static const size_t INVALID_CODE_SIZE = static_cast(-1); @@ -42,9 +50,15 @@ /************************* * Read only functions */ + // check if the list is empty + bool is_empty(size_t list_no) const; + /// get the size of a list virtual size_t list_size(size_t list_no) const = 0; + /// get iterable for lists that use_iterator + virtual InvertedListsIterator* get_iterator(size_t list_no) const; + /** get the codes for an inverted list * must be released by release_codes * @@ -105,10 +119,36 @@ virtual void reset(); + /************************* + * high level functions */ + /// move all entries from oivf (empty on output) void merge_from(InvertedLists* oivf, size_t add_id); - virtual ~InvertedLists(); + // how to copy a subset of elements from the inverted lists + // This depends on two integers, a1 and a2. + enum subset_type_t : int { + // depends on IDs + SUBSET_TYPE_ID_RANGE = 0, // copies ids in [a1, a2) + SUBSET_TYPE_ID_MOD = 1, // copies ids if id % a1 == a2 + // depends on order within invlists + SUBSET_TYPE_ELEMENT_RANGE = + 2, // copies fractions of invlists so that a1 elements are left + // before and a2 after + SUBSET_TYPE_INVLIST_FRACTION = + 3, // take fraction a2 out of a1 from each invlist, 0 <= a2 < a1 + // copy only inverted lists a1:a2 + SUBSET_TYPE_INVLIST = 4 + }; + + /** copy a subset of the entries index to the other index + * @return number of entries copied + */ + size_t copy_subset_to( + InvertedLists& other, + subset_type_t subset_type, + idx_t a1, + idx_t a2) const; /************************* * statistics */ diff -Nru faiss-1.7.3/faiss/invlists/OnDiskInvertedLists.cpp faiss-1.7.4/faiss/invlists/OnDiskInvertedLists.cpp --- faiss-1.7.3/faiss/invlists/OnDiskInvertedLists.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/invlists/OnDiskInvertedLists.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -154,7 +154,7 @@ const OnDiskInvertedLists* od = pf->od; od->locks->lock_1(list_no); size_t n = od->list_size(list_no); - const Index::idx_t* idx = od->get_ids(list_no); + const idx_t* idx = od->get_ids(list_no); const uint8_t* codes = od->get_codes(list_no); int cs = 0; for (size_t i = 0; i < n; i++) { @@ -389,7 +389,7 @@ return ptr + lists[list_no].offset; } -const Index::idx_t* OnDiskInvertedLists::get_ids(size_t list_no) const { +const idx_t* OnDiskInvertedLists::get_ids(size_t list_no) const { if (lists[list_no].offset == INVALID_OFFSET) { return nullptr; } @@ -781,7 +781,7 @@ OnDiskInvertedLists::List& l = ails->lists[i]; l.size = l.capacity = sizes[i]; l.offset = o; - o += l.size * (sizeof(OnDiskInvertedLists::idx_t) + ails->code_size); + o += l.size * (sizeof(idx_t) + ails->code_size); } // resume normal reading of file fseek(fdesc, o, SEEK_SET); diff -Nru faiss-1.7.3/faiss/invlists/OnDiskInvertedLists.h 
faiss-1.7.4/faiss/invlists/OnDiskInvertedLists.h --- faiss-1.7.3/faiss/invlists/OnDiskInvertedLists.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/invlists/OnDiskInvertedLists.h 2023-04-19 13:18:30.000000000 +0000 @@ -31,7 +31,7 @@ /** On-disk storage of inverted lists. * - * The data is stored in a mmapped chunk of memory (base ptointer ptr, + * The data is stored in a mmapped chunk of memory (base pointer ptr, * size totsize). Each list is a range of memory that contains (object * List) that contains: * diff -Nru faiss-1.7.3/faiss/IVFlib.h faiss-1.7.4/faiss/IVFlib.h --- faiss-1.7.3/faiss/IVFlib.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/IVFlib.h 2023-04-19 13:18:30.000000000 +0000 @@ -48,8 +48,6 @@ */ void merge_into(Index* index0, Index* index1, bool shift_ids); -typedef Index::idx_t idx_t; - /* Returns the cluster the embeddings belong to. * * @param index Index, which should be an IVF index diff -Nru faiss-1.7.3/faiss/MetaIndexes.cpp faiss-1.7.4/faiss/MetaIndexes.cpp --- faiss-1.7.3/faiss/MetaIndexes.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/MetaIndexes.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -19,6 +19,8 @@ #include #include #include +#include +#include namespace faiss { @@ -154,4 +156,88 @@ } } +/******************************************************** + * IndexRandom implementation + */ + +IndexRandom::IndexRandom( + idx_t d, + idx_t ntotal, + int64_t seed, + MetricType metric_type) + : Index(d, metric_type), seed(seed) { + this->ntotal = ntotal; + is_trained = true; +} + +void IndexRandom::add(idx_t n, const float*) { + ntotal += n; +} + +void IndexRandom::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + FAISS_THROW_IF_NOT(k <= ntotal); +#pragma omp parallel for if (n > 1000) + for (idx_t i = 0; i < n; i++) { + RandomGenerator rng( + seed + ivec_checksum(d, (const int32_t*)(x + i * d))); + idx_t* I = labels + i * k; + float* D = distances + i * k; + // assumes k << ntotal + if (k < 100 * ntotal) { + std::unordered_set map; + for (int j = 0; j < k; j++) { + idx_t ii; + for (;;) { + // yes I know it's not strictly uniform... + ii = rng.rand_int64() % ntotal; + if (map.count(ii) == 0) { + break; + } + } + I[j] = ii; + map.insert(ii); + } + } else { + std::vector perm(ntotal); + for (idx_t j = 0; j < ntotal; j++) { + perm[j] = j; + } + for (int j = 0; j < k; j++) { + std::swap(perm[j], perm[rng.rand_int(ntotal)]); + I[j] = perm[j]; + } + } + float dprev = 0; + for (int j = 0; j < k; j++) { + float step = rng.rand_float(); + if (is_similarity_metric(metric_type)) { + step = -step; + } + dprev += step; + D[j] = dprev; + } + } +} + +void IndexRandom::reconstruct(idx_t key, float* recons) const { + RandomGenerator rng(seed + 123332 + key); + for (size_t i = 0; i < d; i++) { + recons[i] = rng.rand_float(); + } +} + +void IndexRandom::reset() { + ntotal = 0; +} + +IndexRandom::~IndexRandom() {} + } // namespace faiss diff -Nru faiss-1.7.3/faiss/MetaIndexes.h faiss-1.7.4/faiss/MetaIndexes.h --- faiss-1.7.3/faiss/MetaIndexes.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/MetaIndexes.h 2023-04-19 13:18:30.000000000 +0000 @@ -49,6 +49,35 @@ ~IndexSplitVectors() override; }; +/** index that returns random results. 
+ * used mainly for time benchmarks + */ +struct IndexRandom : Index { + int64_t seed; + + explicit IndexRandom( + idx_t d, + idx_t ntotal = 0, + int64_t seed = 1234, + MetricType mt = METRIC_L2); + + void add(idx_t n, const float* x) override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; + + void reconstruct(idx_t key, float* recons) const override; + + void reset() override; + + ~IndexRandom() override; +}; + } // namespace faiss #endif diff -Nru faiss-1.7.3/faiss/MetricType.h faiss-1.7.4/faiss/MetricType.h --- faiss-1.7.3/faiss/MetricType.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/MetricType.h 2023-04-19 13:18:30.000000000 +0000 @@ -10,6 +10,8 @@ #ifndef FAISS_METRIC_TYPE_H #define FAISS_METRIC_TYPE_H +#include + namespace faiss { /// The metric space for vector comparison for Faiss indices and algorithms. @@ -29,8 +31,20 @@ METRIC_Canberra = 20, METRIC_BrayCurtis, METRIC_JensenShannon, + METRIC_Jaccard, ///< defined as: sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i)) + ///< where a_i, b_i > 0 }; +/// all vector indices are this type +using idx_t = int64_t; + +/// this function is used to distinguish between min and max indexes since +/// we need to support similarity and dis-similarity metrics in a flexible way +constexpr bool is_similarity_metric(MetricType metric_type) { + return ((metric_type == METRIC_INNER_PRODUCT) || + (metric_type == METRIC_Jaccard)); +} + } // namespace faiss #endif diff -Nru faiss-1.7.3/faiss/python/array_conversions.py faiss-1.7.4/faiss/python/array_conversions.py --- faiss-1.7.3/faiss/python/array_conversions.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/python/array_conversions.py 2023-04-19 13:18:30.000000000 +0000 @@ -10,6 +10,7 @@ import numpy as np import array +import warnings from faiss.loader import * @@ -103,6 +104,8 @@ def vector_to_array(v): """ convert a C++ vector to a numpy array """ classname = v.__class__.__name__ + if classname.startswith('AlignedTable'): + return AlignedTable_to_array(v) assert classname.endswith('Vector') dtype = np.dtype(vector_name_map[classname[:-6]]) a = np.empty(v.size(), dtype=dtype) diff -Nru faiss-1.7.3/faiss/python/class_wrappers.py faiss-1.7.4/faiss/python/class_wrappers.py --- faiss-1.7.3/faiss/python/class_wrappers.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/python/class_wrappers.py 2023-04-19 13:18:30.000000000 +0000 @@ -476,16 +476,16 @@ self.reconstruct_batch_c(n, swig_ptr(key), swig_ptr(x)) return x - def replacement_reconstruct_n(self, n0, ni, x=None): + def replacement_reconstruct_n(self, n0=0, ni=-1, x=None): """Approximate reconstruction of vectors `n0` ... `n0 + ni - 1` from the index. Missing vectors trigger an exception. Parameters ---------- n0 : int - Id of the first vector to reconstruct + Id of the first vector to reconstruct (default 0) ni : int - Number of vectors to reconstruct + Number of vectors to reconstruct (-1 = default = ntotal) x : array_like, optional pre-allocated array to store the results @@ -494,6 +494,8 @@ x : array_like Reconstructed vectors, size (`ni`, `self.d`), `dtype`=float32 """ + if ni == -1: + ni = self.ntotal if x is None: x = np.empty((ni, self.d), dtype=np.float32) else: @@ -530,7 +532,7 @@ Returns ------- lims: array_like - Startring index of the results for each query vector, size n+1. + Starting index of the results for each query vector, size n+1. 
D : array_like Distances of the nearest neighbors, shape `lims[n]`. The distances for query i are in `D[lims[i]:lims[i+1]]`. @@ -552,6 +554,70 @@ I = rev_swig_ptr(res.labels, nd).copy() return lims, D, I + def replacement_search_preassigned(self, x, k, Iq, Dq, *, params=None, D=None, I=None): + """Find the k nearest neighbors of the set of vectors x in an IVF index, + with precalculated coarse quantization assignment. + + Parameters + ---------- + x : array_like + Query vectors, shape (n, d) where d is appropriate for the index. + `dtype` must be float32. + k : int + Number of nearest neighbors. + Dq : array_like, optional + Distance array to the centroids, size (n, nprobe) + Iq : array_like, optional + Nearest centroids, size (n, nprobe) + + params : SearchParameters + Search parameters of the current search (overrides the class-level params) + D : array_like, optional + Distance array to store the result. + I : array_like, optional + Labels array to store the results. + + Returns + ------- + D : array_like + Distances of the nearest neighbors, shape (n, k). When not enough results are found + the label is set to +Inf or -Inf. + I : array_like + Labels of the nearest neighbors, shape (n, k). + When not enough results are found, the label is set to -1 + """ + n, d = x.shape + x = np.ascontiguousarray(x, dtype='float32') + assert d == self.d + assert k > 0 + + if D is None: + D = np.empty((n, k), dtype=np.float32) + else: + assert D.shape == (n, k) + + if I is None: + I = np.empty((n, k), dtype=np.int64) + else: + assert I.shape == (n, k) + + Iq = np.ascontiguousarray(Iq, dtype='int64') + assert params is None, "params not supported" + assert Iq.shape == (n, self.nprobe) + + if Dq is not None: + Dq = np.ascontiguousarray(Dq, dtype='float32') + assert Dq.shape == Iq.shape + + self.search_preassigned_c( + n, swig_ptr(x), + k, + swig_ptr(Iq), swig_ptr(Dq), + swig_ptr(D), swig_ptr(I), + False + ) + return D, I + def replacement_sa_encode(self, x, codes=None): n, d = x.shape assert d == self.d @@ -603,6 +669,8 @@ ignore_missing=True) replace_method(the_class, 'search_and_reconstruct', replacement_search_and_reconstruct, ignore_missing=True) + replace_method(the_class, 'search_preassigned', + replacement_search_preassigned, ignore_missing=True) replace_method(the_class, 'sa_encode', replacement_sa_encode) replace_method(the_class, 'sa_decode', replacement_sa_decode) replace_method(the_class, 'add_sa_codes', replacement_add_sa_codes, @@ -662,6 +730,31 @@ swig_ptr(labels)) return distances, labels + def replacement_search_preassigned(self, x, k, Iq, Dq): + n, d = x.shape + x = _check_dtype_uint8(x) + assert d * 8 == self.d + assert k > 0 + + D = np.empty((n, k), dtype=np.int32) + I = np.empty((n, k), dtype=np.int64) + + Iq = np.ascontiguousarray(Iq, dtype='int64') + assert Iq.shape == (n, self.nprobe) + + if Dq is not None: + Dq = np.ascontiguousarray(Dq, dtype='int32') + assert Dq.shape == Iq.shape + + self.search_preassigned_c( + n, swig_ptr(x), + k, + swig_ptr(Iq), swig_ptr(Dq), + swig_ptr(D), swig_ptr(I), + False + ) + return D, I + def replacement_range_search(self, x, thresh): n, d = x.shape x = _check_dtype_uint8(x) @@ -691,6 +784,8 @@ replace_method(the_class, 'range_search', replacement_range_search) replace_method(the_class, 'reconstruct', replacement_reconstruct) replace_method(the_class, 'remove_ids', replacement_remove_ids) + replace_method(the_class, 'search_preassigned', + replacement_search_preassigned, ignore_missing=True) def handle_VectorTransform(the_class): @@ -805,6 +900,26 @@ 
replace_method(the_class, 'train_inplace', replacement_train_inplace) + +def handle_CodePacker(the_class): + + def replacement_pack_1(self, x, offset, block): + assert x.shape == (self.code_size,) + nblock, block_size = block.shape + assert block_size == self.block_size + assert 0 <= offset < block_size * self.nvec + self.pack_1_c(swig_ptr(x), offset, faiss.swig_ptr(block)) + + def replacement_unpack_1(self, block, offset): + nblock, block_size = block.shape + assert block_size == self.block_size + assert 0 <= offset < block_size * self.nvec + x = np.zeros(self.code_size, dtype='uint8') + self.unpack_1_c(faiss.swig_ptr(block), offset, swig_ptr(x)) + return x + + replace_method(the_class, 'pack_1', replacement_pack_1) + replace_method(the_class, 'unpack_1', replacement_unpack_1) + ###################################################### # MapLong2Long interface ###################################################### @@ -825,7 +940,7 @@ replace_method(the_class, 'add', replacement_map_add) replace_method(the_class, 'search_multiple', - replacement_map_search_multiple) + replacement_map_search_multiple) ###################################################### @@ -839,6 +954,7 @@ else: self.referenced_objects.append(ref) + def handle_SearchParameters(the_class): """ this wrapper is to enable initializations of the form SearchParametersXX(a=3, b=SearchParamsYY) diff -Nru faiss-1.7.3/faiss/python/extra_wrappers.py faiss-1.7.4/faiss/python/extra_wrappers.py --- faiss-1.7.3/faiss/python/extra_wrappers.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/python/extra_wrappers.py 2023-04-19 13:18:30.000000000 +0000 @@ -55,7 +55,7 @@ return D, I -def pairwise_distances(xq, xb, mt=METRIC_L2, metric_arg=0): +def pairwise_distances(xq, xb, metric=METRIC_L2, metric_arg=0): """compute the whole pairwise distance matrix between two sets of vectors""" xq = np.ascontiguousarray(xq, dtype='float32') @@ -64,16 +64,18 @@ nb, d2 = xb.shape assert d == d2 dis = np.empty((nq, nb), dtype='float32') - if mt == METRIC_L2: + if metric == METRIC_L2: pairwise_L2sqr( d, nq, swig_ptr(xq), nb, swig_ptr(xb), swig_ptr(dis)) + elif metric == METRIC_INNER_PRODUCT: + dis[:] = xq @ xb.T else: pairwise_extra_distances( d, nq, swig_ptr(xq), nb, swig_ptr(xb), - mt, metric_arg, + metric, metric_arg, swig_ptr(dis)) return dis @@ -102,6 +104,17 @@ return res + +def checksum(a): + """ compute a checksum for quick-and-dirty comparisons of arrays """ + a = a.view('uint8') + n = a.size + n4 = n & ~3 + cs = ivec_checksum(int(n4 / 4), swig_ptr(a[:n4].view('int32'))) + for i in range(n4, n): + cs += a[i] * 33657 + return cs + + rand_smooth_vectors_c = rand_smooth_vectors @@ -128,6 +141,75 @@ def normalize_L2(x): fvec_renorm_L2(x.shape[1], x.shape[0], swig_ptr(x)) +bucket_sort_c = bucket_sort + +def bucket_sort(tab, nbucket=None, nt=0): + """Perform a bucket sort on a table of integers.
+ + Parameters + ---------- + tab : array_like + elements to sort, max value nbucket - 1 + nbucket : integer + number of buckets, None if unknown + nt : integer + number of threads to use (0 = use unthreaded codepath) + + Returns + ------- + lims : array_like + cumulative sum of bucket sizes (size nbucket + 1) + perm : array_like + perm[lims[i] : lims[i + 1]] contains the indices of bucket #i (size tab.size) + """ + tab = np.ascontiguousarray(tab, dtype="int64") + if nbucket is None: + nbucket = int(tab.max() + 1) + lims = np.empty(nbucket + 1, dtype='int64') + perm = np.empty(tab.size, dtype='int64') + bucket_sort_c( + tab.size, faiss.swig_ptr(tab.view('uint64')), + nbucket, faiss.swig_ptr(lims), faiss.swig_ptr(perm), + nt + ) + return lims, perm + +matrix_bucket_sort_inplace_c = matrix_bucket_sort_inplace + +def matrix_bucket_sort_inplace(tab, nbucket=None, nt=0): + """Perform a bucket sort on a matrix, recording the original + row of each element. + + Parameters + ---------- + tab : array_like + array of size (N, ncol) that contains the bucket ids, maximum + value nbucket - 1. + On output, the elements are shuffled such that the flat array + tab.ravel()[lims[i] : lims[i + 1]] contains the row numbers + of each bucket entry. + nbucket : integer + number of buckets (the maximum value in tab should be nbucket - 1) + nt : integer + number of threads to use (0 = use unthreaded codepath) + + Returns + ------- + lims : array_like + cumulative sum of bucket sizes (size nbucket + 1) + """ + assert tab.dtype == 'int32' or tab.dtype == 'int64' + nrow, ncol = tab.shape + if nbucket is None: + nbucket = int(tab.max() + 1) + lims = np.empty(nbucket + 1, dtype='int64') + matrix_bucket_sort_inplace_c( + nrow, ncol, faiss.swig_ptr(tab), + nbucket, faiss.swig_ptr(lims), + nt + ) + return lims + ########################################### # ResultHeap @@ -138,7 +220,11 @@ be in self.D, self.I.""" def __init__(self, nq, k, keep_max=False): - " nq: number of query vectors, k: number of results per query " + """ + nq: number of query vectors, + k: number of results per query + keep_max: keep the top-k maximum values instead of the minima + """ self.I = np.zeros((nq, k), dtype='int64') self.D = np.zeros((nq, k), dtype='float32') self.nq, self.k = nq, k @@ -154,7 +240,11 @@ self.heaps = heaps def add_result(self, D, I): - """D, I do not need to be in a particular order (heap or sorted)""" + """ + Add results for all heaps + D, I should be of size (nh, nres) + D, I do not need to be in a particular order (heap or sorted) + """ nq, kd = D.shape D = np.ascontiguousarray(D, dtype='float32') I = np.ascontiguousarray(I, dtype='int64') @@ -164,10 +254,48 @@ kd, swig_ptr(D), swig_ptr(I), kd) + def add_result_subset(self, subset, D, I): + """ + Add results for a subset of heaps.
+ D, I should hold results for the whole subset + as a special case, if I is 1D, then all ids are assumed to be the same + """ + nsubset, kd = D.shape + assert nsubset == len(subset) + assert ( + I.ndim == 2 and D.shape == I.shape or + I.ndim == 1 and I.shape == (kd, ) + ) + D = np.ascontiguousarray(D, dtype='float32') + I = np.ascontiguousarray(I, dtype='int64') + subset = np.ascontiguousarray(subset, dtype='int64') + id_stride = 0 if I.ndim == 1 else kd + self.heaps.addn_query_subset_with_ids( + nsubset, swig_ptr(subset), + kd, swig_ptr(D), swig_ptr(I), id_stride + ) + def finalize(self): self.heaps.reorder() +def merge_knn_results(Dall, Iall, keep_max=False): + """ + Merge a set of sorted knn-results obtained from different shards of a dataset. + Dall and Iall are of size (nshard, nq, k); each Dall[i, j] should be sorted. + Returns D, I of size (nq, k) as the merged result set. + """ + assert Iall.shape == Dall.shape + nshard, n, k = Dall.shape + Dnew = np.empty((n, k), dtype=Dall.dtype) + Inew = np.empty((n, k), dtype=Iall.dtype) + func = merge_knn_results_CMax if keep_max else merge_knn_results_CMin + func( + n, k, nshard, + swig_ptr(Dall), swig_ptr(Iall), + swig_ptr(Dnew), swig_ptr(Inew) + ) + return Dnew, Inew ###################################################### # KNN function diff -Nru faiss-1.7.3/faiss/python/gpu_wrappers.py faiss-1.7.4/faiss/python/gpu_wrappers.py --- faiss-1.7.3/faiss/python/gpu_wrappers.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/python/gpu_wrappers.py 2023-04-19 13:18:30.000000000 +0000 @@ -40,7 +40,9 @@ def index_cpu_to_gpus_list(index, co=None, gpus=None, ngpu=-1): """ Here we can pass list of GPU ids as a parameter or ngpu to - use first n GPU's. gpus mut be a list or None""" + use first n GPU's. gpus must be a list or None. + co is a GpuMultipleClonerOptions + """ if (gpus is None) and (ngpu == -1): # All blank gpus = range(get_num_gpus()) elif (gpus is None) and (ngpu != -1): # Get number of GPU's only @@ -52,7 +54,7 @@ # allows numpy ndarray usage with bfKnn -def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2): +def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1): """ Compute the k nearest neighbors of a vector on one GPU without constructing an index @@ -72,8 +74,14 @@ Output array for distances of the nearest neighbors, shape (nq, k) I : array_like, optional Output array for the nearest neighbors, shape (nq, k) - distance_type : MetricType, optional - distance measure to use (either METRIC_L2 or METRIC_INNER_PRODUCT) + metric : MetricType, optional + Distance measure to use (either METRIC_L2 or METRIC_INNER_PRODUCT) + device: int, optional + Which CUDA device in the system to run the search on.
-1 indicates that + the current thread-local device state (via cudaGetDevice) should be used + (can also be set via torch.cuda.set_device in PyTorch) + Otherwise, an integer 0 <= device < numDevices indicates the GPU on which + the computation should be run Returns ------- @@ -159,6 +167,7 @@ args.outDistances = D_ptr args.outIndices = I_ptr args.outIndicesType = I_type + args.device = device # no stream synchronization needed, inputs and outputs are guaranteed to # be on the CPU (numpy arrays) @@ -169,7 +178,7 @@ # allows numpy ndarray usage with bfKnn for all pairwise distances -def pairwise_distance_gpu(res, xq, xb, D=None, metric=METRIC_L2): +def pairwise_distance_gpu(res, xq, xb, D=None, metric=METRIC_L2, device=-1): """ Compute all pairwise distances between xq and xb on one GPU without constructing an index @@ -185,8 +194,14 @@ `dtype` must be float32. D : array_like, optional Output array for all pairwise distances, shape (nq, nb) - distance_type : MetricType, optional - distance measure to use (either METRIC_L2 or METRIC_INNER_PRODUCT) + metric : MetricType, optional + Distance measure to use (either METRIC_L2 or METRIC_INNER_PRODUCT) + device: int, optional + Which CUDA device in the system to run the search on. -1 indicates that + the current thread-local device state (via cudaGetDevice) should be used + (can also be set via torch.cuda.set_device in PyTorch) + Otherwise, an integer 0 <= device < numDevices indicates the GPU on which + the computation should be run Returns ------- @@ -255,6 +270,7 @@ args.queryType = xq_type args.numQueries = nq args.outDistances = D_ptr + args.device = device # no stream synchronization needed, inputs and outputs are guaranteed to # be on the CPU (numpy arrays) diff -Nru faiss-1.7.3/faiss/python/__init__.py faiss-1.7.4/faiss/python/__init__.py --- faiss-1.7.3/faiss/python/__init__.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/python/__init__.py 2023-04-19 13:18:30.000000000 +0000 @@ -21,7 +21,8 @@ from faiss.array_conversions import * from faiss.extra_wrappers import kmin, kmax, pairwise_distances, rand, randint, \ lrand, randn, rand_smooth_vectors, eval_intersection, normalize_L2, \ - ResultHeap, knn, Kmeans + ResultHeap, knn, Kmeans, checksum, matrix_bucket_sort_inplace, bucket_sort, \ + merge_knn_results __version__ = "%d.%d.%d" % (FAISS_VERSION_MAJOR, @@ -68,6 +69,9 @@ if issubclass(the_class, SearchParameters): class_wrappers.handle_SearchParameters(the_class) + if issubclass(the_class, CodePacker): + class_wrappers.handle_CodePacker(the_class) + ############################################################################## # For some classes (IndexIVF, IDSelector), the object holds a reference to # a C++ object (eg. the quantizer object of IndexIVF). We don't transfer the @@ -185,6 +189,9 @@ add_ref_in_constructor(BufferedIOReader, 0) add_ref_in_constructor(IDSelectorNot, 0) +add_ref_in_constructor(IDSelectorAnd, slice(2)) +add_ref_in_constructor(IDSelectorOr, slice(2)) +add_ref_in_constructor(IDSelectorXOr, slice(2)) # seems really marginal... 
# remove_ref_from_method(IndexReplicas, 'removeIndex', 0) diff -Nru faiss-1.7.3/faiss/python/python_callbacks.cpp faiss-1.7.4/faiss/python/python_callbacks.cpp --- faiss-1.7.3/faiss/python/python_callbacks.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/python/python_callbacks.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -118,7 +118,7 @@ Py_INCREF(callback); } -bool PyCallbackIDSelector::is_member(idx_t id) const { +bool PyCallbackIDSelector::is_member(faiss::idx_t id) const { FAISS_THROW_IF_NOT((id >> 32) == 0); PyThreadLock gil; PyObject* result = PyObject_CallFunction(callback, "(n)", int(id)); diff -Nru faiss-1.7.3/faiss/python/python_callbacks.h faiss-1.7.4/faiss/python/python_callbacks.h --- faiss-1.7.3/faiss/python/python_callbacks.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/python/python_callbacks.h 2023-04-19 13:18:30.000000000 +0000 @@ -54,7 +54,7 @@ explicit PyCallbackIDSelector(PyObject* callback); - bool is_member(idx_t id) const override; + bool is_member(faiss::idx_t id) const override; ~PyCallbackIDSelector() override; }; diff -Nru faiss-1.7.3/faiss/python/setup.py faiss-1.7.4/faiss/python/setup.py --- faiss-1.7.3/faiss/python/setup.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/python/setup.py 2023-04-19 13:18:30.000000000 +0000 @@ -53,7 +53,7 @@ """ setup( name='faiss', - version='1.7.3', + version='1.7.4', description='A library for efficient similarity search and clustering of dense vectors', long_description=long_description, url='https://github.com/facebookresearch/faiss', diff -Nru faiss-1.7.3/faiss/python/swigfaiss.swig faiss-1.7.4/faiss/python/swigfaiss.swig --- faiss-1.7.3/faiss/python/swigfaiss.swig 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/python/swigfaiss.swig 2023-04-19 13:18:30.000000000 +0000 @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include @@ -122,6 +123,7 @@ #include #include +#include #include #include #include @@ -135,6 +137,7 @@ #include #include #include +#include #include @@ -145,6 +148,7 @@ #include #include +#include #include #include @@ -244,6 +248,7 @@ %template(InvertedListsPtrVector) std::vector; %template(RepeatVector) std::vector; %template(ClusteringIterationStatsVector) std::vector; +%template(ParameterRangeVector) std::vector; #ifndef SWIGWIN %template(OnDiskOneListVector) std::vector; @@ -273,6 +278,7 @@ %include %include %include +%include int get_num_gpus(); void gpu_profiler_start(); @@ -380,10 +386,13 @@ %include %include %include +%include %include %newobject *::get_distance_computer() const; +%newobject *::get_CodePacker() const; + %include %include @@ -403,6 +412,7 @@ %include %include %include +%include %include %include @@ -477,6 +487,7 @@ %include %template(IndexShards) faiss::IndexShardsTemplate; %template(IndexBinaryShards) faiss::IndexShardsTemplate; +%include %include %template(IndexReplicas) faiss::IndexReplicasTemplate; @@ -502,7 +513,7 @@ %include %include - +%include #ifdef GPU_WRAPPER @@ -571,6 +582,7 @@ %typemap(out) faiss::Index * { DOWNCAST2 ( IndexIDMap2, IndexIDMap2TemplateT_faiss__Index_t ) DOWNCAST2 ( IndexIDMap, IndexIDMapTemplateT_faiss__Index_t ) + DOWNCAST ( IndexShardsIVF ) DOWNCAST2 ( IndexShards, IndexShardsTemplateT_faiss__Index_t ) DOWNCAST2 ( IndexReplicas, IndexReplicasTemplateT_faiss__Index_t ) DOWNCAST ( IndexIVFPQR ) @@ -612,12 +624,14 @@ DOWNCAST ( IndexHNSWFlat ) DOWNCAST ( IndexHNSWPQ ) DOWNCAST ( IndexHNSWSQ ) + DOWNCAST ( IndexHNSW ) DOWNCAST ( IndexHNSW2Level ) DOWNCAST ( IndexNNDescentFlat ) DOWNCAST ( 
IndexNSGFlat ) DOWNCAST ( IndexNSGPQ ) DOWNCAST ( IndexNSGSQ ) DOWNCAST ( Index2Layer ) + DOWNCAST ( IndexRandom ) DOWNCAST ( IndexRowwiseMinMax ) DOWNCAST ( IndexRowwiseMinMaxFP16 ) #ifdef GPU_WRAPPER @@ -930,7 +944,6 @@ %template(float_minheap_array_t) faiss::HeapArray >; %template(int_minheap_array_t) faiss::HeapArray >; - %template(float_maxheap_array_t) faiss::HeapArray >; %template(int_maxheap_array_t) faiss::HeapArray >; @@ -943,46 +956,55 @@ %template(AlignedTableUint16) faiss::AlignedTable; %template(AlignedTableFloat32) faiss::AlignedTable; -%inline %{ -// SWIG seems to have has some trouble resolving the template type here, so +// SWIG seems to have some trouble resolving function template types here, so // declare explicitly -uint16_t CMax_uint16_partition_fuzzy( - uint16_t *vals, int64_t *ids, size_t n, - size_t q_min, size_t q_max, size_t * q_out) -{ - return faiss::partition_fuzzy >( - vals, ids, n, q_min, q_max, q_out); -} -uint16_t CMin_uint16_partition_fuzzy( - uint16_t *vals, int64_t *ids, size_t n, - size_t q_min, size_t q_max, size_t * q_out) -{ - return faiss::partition_fuzzy >( - vals, ids, n, q_min, q_max, q_out); -} +%define INSTANTIATE_uint16_partition_fuzzy(C, id_t) -// and overload with the int32 version +%inline %{ -uint16_t CMax_uint16_partition_fuzzy( - uint16_t *vals, int *ids, size_t n, +uint16_t C ## _uint16_partition_fuzzy( + uint16_t *vals, id_t *ids, size_t n, size_t q_min, size_t q_max, size_t * q_out) { - return faiss::partition_fuzzy >( + return faiss::partition_fuzzy >( vals, ids, n, q_min, q_max, q_out); } -uint16_t CMin_uint16_partition_fuzzy( - uint16_t *vals, int *ids, size_t n, - size_t q_min, size_t q_max, size_t * q_out) +%} + +%enddef + +INSTANTIATE_uint16_partition_fuzzy(CMin, int64_t) +INSTANTIATE_uint16_partition_fuzzy(CMax, int64_t) +INSTANTIATE_uint16_partition_fuzzy(CMin, int) +INSTANTIATE_uint16_partition_fuzzy(CMax, int) + +// Same for merge_knn_results + +// same define as explicit instanciation in Heap.cpp +%define INSTANTIATE_merge_knn_results(C, distance_t) + +%inline %{ +void merge_knn_results_ ## C( + size_t n, size_t k, int nshard, + const distance_t *all_distances, const faiss::idx_t *all_labels, + distance_t *distances, faiss::idx_t *labels) { - return faiss::partition_fuzzy >( - vals, ids, n, q_min, q_max, q_out); + faiss::merge_knn_results>( + n, k, nshard, all_distances, all_labels, distances, labels); } - %} +%enddef + +INSTANTIATE_merge_knn_results(CMin, float); +INSTANTIATE_merge_knn_results(CMax, float); +INSTANTIATE_merge_knn_results(CMin, int32_t); +INSTANTIATE_merge_knn_results(CMax, int32_t); + + /******************************************************************* * Expose a few basic functions *******************************************************************/ @@ -1006,8 +1028,8 @@ return (float*)x; } -faiss::Index::idx_t* cast_integer_to_idx_t_ptr (int64_t x) { - return (faiss::Index::idx_t*)x; +faiss::idx_t* cast_integer_to_idx_t_ptr (int64_t x) { + return (faiss::idx_t*)x; } int * cast_integer_to_int_ptr (int64_t x) { diff -Nru faiss-1.7.3/faiss/utils/AlignedTable.h faiss-1.7.4/faiss/utils/AlignedTable.h --- faiss-1.7.3/faiss/utils/AlignedTable.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/AlignedTable.h 2023-04-19 13:18:30.000000000 +0000 @@ -98,7 +98,9 @@ AlignedTableTightAlloc& operator=( const AlignedTableTightAlloc& other) { resize(other.numel); - memcpy(ptr, other.ptr, sizeof(T) * numel); + if (numel > 0) { + memcpy(ptr, other.ptr, sizeof(T) * numel); + } return *this; } diff -Nru 
faiss-1.7.3/faiss/utils/approx_topk/approx_topk.h faiss-1.7.4/faiss/utils/approx_topk/approx_topk.h --- faiss-1.7.3/faiss/utils/approx_topk/approx_topk.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/approx_topk/approx_topk.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,84 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file contains an implementation of approximate top-k search +// using a heap. It was initially created for a beam search. +// +// The core idea is the following. +// Say we need to find beam_size indices with the minimal distance +// values. It is done via a heap (priority_queue) using the following +// pseudocode: +// +// def baseline(): +// distances = np.empty([beam_size * n], dtype=float) +// indices = np.empty([beam_size * n], dtype=int) +// +// heap = Heap(max_heap_size=beam_size) +// +// for i in range(0, beam_size * n): +// heap.push(distances[i], indices[i]) +// +// Basically, this is what the heap_addn() function from utils/Heap.h does. +// +// The following scheme can be used for approximate beam search. +// Say we need to find elements with min distance. +// Basically, we split the n elements of every beam into NBUCKETS buckets +// and track the index with the minimal distance for every bucket. +// This can be effectively SIMD-ed and significantly lowers the number +// of operations, but yields approximate results for beam_size >= 2. +// +// def approximate_v1(): +// distances = np.empty([beam_size * n], dtype=float) +// indices = np.empty([beam_size * n], dtype=int) +// +// heap = Heap(max_heap_size=beam_size) +// +// for beam in range(0, beam_size): +// # The value of 32 is just an example. +// # The value may be varied: the larger the value is, +// # the slower and the more precise vs baseline beam search is +// NBUCKETS = 32 +// +// local_min_distances = [HUGE_VALF] * NBUCKETS +// local_min_indices = [0] * NBUCKETS +// +// for i in range(0, n / NBUCKETS): +// for j in range(0, NBUCKETS): +// idx = beam * n + i * NBUCKETS + j +// if distances[idx] < local_min_distances[j]: +// local_min_distances[j] = distances[idx] +// local_min_indices[j] = indices[idx] +// +// for j in range(0, NBUCKETS): +// heap.push(local_min_distances[j], local_min_indices[j]) +// +// The accuracy can be improved by tracking the min-2 elements for every +// bucket. Such a min-2 implementation with NBUCKETS buckets provides +// better accuracy than a top-1 implementation with 2 * NBUCKETS buckets. +// Min-3 is also doable. One can use a min-N approach, but I'm not sure +// whether min-4 and above are practical, because of the lack of SIMD +// registers (unless the AVX-512 version is used). +// +// A C++ template for the top-N implementation is provided. The code +// assumes that indices[idx] == idx. One can easily write code that lifts +// this assumption. +// +// Currently, the code that tracks elements with min distances is implemented +// (Max Heap). A Min Heap option can be added easily.
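For readers who want to see the bucketing scheme above in executable form, here is a small NumPy sketch (not part of the patch; the helper name approx_topk_buckets, the bucket count and the test values are made up for illustration). It mirrors approximate_v1 for a single beam with top-1 tracking per bucket: keep the per-bucket minimum plus the leftover tail, then run an exact top-k over those candidates only.

import numpy as np

def approx_topk_buckets(distances, k, nbuckets=32):
    # Illustrative sketch only: bucketed approximate top-k, single beam,
    # top-1 per bucket, as described in the comment above.
    n = distances.size
    nb = (n // nbuckets) * nbuckets
    # column j of `chunk` holds the elements assigned to bucket j
    chunk = distances[:nb].reshape(-1, nbuckets)
    # flat index of the smallest element of each bucket
    bucket_min = chunk.argmin(axis=0) * nbuckets + np.arange(nbuckets)
    # leftover elements that do not fill a complete round stay candidates
    candidates = np.concatenate([bucket_min, np.arange(nb, n)])
    # exact top-k restricted to the candidates (this plays the role of the heap)
    order = np.argsort(distances[candidates])[:k]
    return candidates[order]

rng = np.random.default_rng(0)
dist = rng.random(10000).astype('float32')
approx = approx_topk_buckets(dist, k=10)
exact = np.argsort(dist)[:10]
print("recall@10 vs exact top-k:", len(set(approx) & set(exact)), "/ 10")

With top-1 per bucket the result is exact for k = 1 and only approximate for larger k, since two of the true top-k can land in the same bucket; tracking min-2 or min-3 per bucket, as suggested above, reduces that loss.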
+ +#pragma once + +#include + +// the list of available modes is in the following file +#include + +#ifdef __AVX2__ +#include +#else +#include +#endif diff -Nru faiss-1.7.3/faiss/utils/approx_topk/avx2-inl.h faiss-1.7.4/faiss/utils/approx_topk/avx2-inl.h --- faiss-1.7.3/faiss/utils/approx_topk/avx2-inl.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/approx_topk/avx2-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,196 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +#include +#include + +namespace faiss { + +template +struct HeapWithBuckets { + // this case was not implemented yet. +}; + +template +struct HeapWithBuckets, NBUCKETS, N> { + static constexpr uint32_t NBUCKETS_8 = NBUCKETS / 8; + static_assert( + (NBUCKETS) > 0 && ((NBUCKETS % 8) == 0), + "Number of buckets needs to be 8, 16, 24, ..."); + + static void addn( + // number of elements + const uint32_t n, + // distances. It is assumed to have n elements. + const float* const __restrict distances, + // number of best elements to keep + const uint32_t k, + // output distances + float* const __restrict bh_val, + // output indices, each being within [0, n) range + int32_t* const __restrict bh_ids) { + // forward a call to bs_addn with 1 beam + bs_addn(1, n, distances, k, bh_val, bh_ids); + } + + static void bs_addn( + // beam_size parameter of Beam Search algorithm + const uint32_t beam_size, + // number of elements per beam + const uint32_t n_per_beam, + // distances. It is assumed to have (n_per_beam * beam_size) + // elements. + const float* const __restrict distances, + // number of best elements to keep + const uint32_t k, + // output distances + float* const __restrict bh_val, + // output indices, each being within [0, n_per_beam * beam_size) + // range + int32_t* const __restrict bh_ids) { + // // Basically, the function runs beam_size iterations. + // // Every iteration NBUCKETS * N elements are added to a regular heap. + // // So, maximum number of added elements is beam_size * NBUCKETS * N. + // // This number is expected to be less or equal than k. + // FAISS_THROW_IF_NOT_FMT( + // beam_size * NBUCKETS * N >= k, + // "Cannot pick %d elements, only %d. " + // "Check the function and template arguments values.", + // k, + // beam_size * NBUCKETS * N); + + using C = CMax; + + // main loop + for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) { + __m256 min_distances_i[NBUCKETS_8][N]; + __m256i min_indices_i[NBUCKETS_8][N]; + + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + for (uint32_t p = 0; p < N; p++) { + min_distances_i[j][p] = + _mm256_set1_ps(std::numeric_limits::max()); + min_indices_i[j][p] = + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + } + } + + __m256i current_indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i indices_delta = _mm256_set1_epi32(NBUCKETS); + + const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS; + + // put the data into buckets + for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + const __m256 distances_reg = _mm256_loadu_ps( + distances + j * 8 + ip + n_per_beam * beam_index); + + // loop. 
Compiler should get rid of unneeded ops + __m256 distance_candidate = distances_reg; + __m256i indices_candidate = current_indices; + + for (uint32_t p = 0; p < N; p++) { + const __m256 comparison = _mm256_cmp_ps( + min_distances_i[j][p], + distance_candidate, + _CMP_LE_OS); + + // // blend seems to be slower that min + // const __m256 min_distances_new = _mm256_blendv_ps( + // distance_candidate, + // min_distances_i[j][p], + // comparison); + const __m256 min_distances_new = _mm256_min_ps( + distance_candidate, min_distances_i[j][p]); + const __m256i min_indices_new = + _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(indices_candidate), + _mm256_castsi256_ps( + min_indices_i[j][p]), + comparison)); + + // // blend seems to be slower that min + // const __m256 max_distances_new = _mm256_blendv_ps( + // min_distances_i[j][p], + // distance_candidate, + // comparison); + const __m256 max_distances_new = _mm256_max_ps( + min_distances_i[j][p], distances_reg); + const __m256i max_indices_new = + _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps( + min_indices_i[j][p]), + _mm256_castsi256_ps(indices_candidate), + comparison)); + + distance_candidate = max_distances_new; + indices_candidate = max_indices_new; + + min_distances_i[j][p] = min_distances_new; + min_indices_i[j][p] = min_indices_new; + } + } + + current_indices = + _mm256_add_epi32(current_indices, indices_delta); + } + + // fix the indices + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + const __m256i offset = + _mm256_set1_epi32(n_per_beam * beam_index + j * 8); + for (uint32_t p = 0; p < N; p++) { + min_indices_i[j][p] = + _mm256_add_epi32(min_indices_i[j][p], offset); + } + } + + // merge every bucket into the regular heap + for (uint32_t p = 0; p < N; p++) { + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + int32_t min_indices_scalar[8]; + float min_distances_scalar[8]; + + _mm256_storeu_si256( + (__m256i*)min_indices_scalar, min_indices_i[j][p]); + _mm256_storeu_ps( + min_distances_scalar, min_distances_i[j][p]); + + // this exact way is needed to maintain the order as if the + // input elements were pushed to the heap sequentially + for (size_t j8 = 0; j8 < 8; j8++) { + const auto value = min_distances_scalar[j8]; + const auto index = min_indices_scalar[j8]; + if (C::cmp2(bh_val[0], value, bh_ids[0], index)) { + heap_replace_top( + k, bh_val, bh_ids, value, index); + } + } + } + } + + // process leftovers + for (uint32_t ip = nb; ip < n_per_beam; ip++) { + const int32_t index = ip + n_per_beam * beam_index; + const float value = distances[index]; + + if (C::cmp(bh_val[0], value)) { + heap_replace_top(k, bh_val, bh_ids, value, index); + } + } + } + } +}; + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/utils/approx_topk/generic.h faiss-1.7.4/faiss/utils/approx_topk/generic.h --- faiss-1.7.3/faiss/utils/approx_topk/generic.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/approx_topk/generic.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,138 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace faiss { + +// This is the implementation of the idea and it is very slow, +// because a compiler is unable to vectorize it properly. + +template +struct HeapWithBuckets { + // this case was not implemented yet. 
+}; + +template +struct HeapWithBuckets, NBUCKETS, N> { + static void addn( + // number of elements + const uint32_t n, + // distances. It is assumed to have n elements. + const float* const __restrict distances, + // number of best elements to keep + const uint32_t k, + // output distances + float* const __restrict bh_val, + // output indices, each being within [0, n) range + int32_t* const __restrict bh_ids) { + // forward a call to bs_addn with 1 beam + bs_addn(1, n, distances, k, bh_val, bh_ids); + } + + static void bs_addn( + // beam_size parameter of Beam Search algorithm + const uint32_t beam_size, + // number of elements per beam + const uint32_t n_per_beam, + // distances. It is assumed to have (n_per_beam * beam_size) + // elements. + const float* const __restrict distances, + // number of best elements to keep + const uint32_t k, + // output distances + float* const __restrict bh_val, + // output indices, each being within [0, n_per_beam * beam_size) + // range + int32_t* const __restrict bh_ids) { + // // Basically, the function runs beam_size iterations. + // // Every iteration NBUCKETS * N elements are added to a regular heap. + // // So, maximum number of added elements is beam_size * NBUCKETS * N. + // // This number is expected to be less or equal than k. + // FAISS_THROW_IF_NOT_FMT( + // beam_size * NBUCKETS * N >= k, + // "Cannot pick %d elements, only %d. " + // "Check the function and template arguments values.", + // k, + // beam_size * NBUCKETS * N); + + using C = CMax; + + // main loop + for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) { + float min_distances_i[N][NBUCKETS]; + int min_indices_i[N][NBUCKETS]; + + for (uint32_t p = 0; p < N; p++) { + for (uint32_t j = 0; j < NBUCKETS; j++) { + min_distances_i[p][j] = std::numeric_limits::max(); + min_indices_i[p][j] = 0; + } + } + + const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS; + + // put the data into buckets + for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { + for (uint32_t j = 0; j < NBUCKETS; j++) { + const int index = j + ip + n_per_beam * beam_index; + const float distance = distances[index]; + + int index_candidate = index; + float distance_candidate = distance; + + for (uint32_t p = 0; p < N; p++) { + if (distance_candidate < min_distances_i[p][j]) { + std::swap( + distance_candidate, min_distances_i[p][j]); + std::swap(index_candidate, min_indices_i[p][j]); + } + } + } + } + + // merge every bucket into the regular heap + for (uint32_t p = 0; p < N; p++) { + for (uint32_t j = 0; j < NBUCKETS; j++) { + // this exact way is needed to maintain the order as if the + // input elements were pushed to the heap sequentially + + if (C::cmp2(bh_val[0], + min_distances_i[p][j], + bh_ids[0], + min_indices_i[p][j])) { + heap_replace_top( + k, + bh_val, + bh_ids, + min_distances_i[p][j], + min_indices_i[p][j]); + } + } + } + + // process leftovers + for (uint32_t ip = nb; ip < n_per_beam; ip++) { + const int32_t index = ip + n_per_beam * beam_index; + const float value = distances[index]; + + if (C::cmp(bh_val[0], value)) { + heap_replace_top(k, bh_val, bh_ids, value, index); + } + } + } + } +}; + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/utils/approx_topk/mode.h faiss-1.7.4/faiss/utils/approx_topk/mode.h --- faiss-1.7.3/faiss/utils/approx_topk/mode.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/approx_topk/mode.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,34 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +/// Represents the mode of use of approximate top-k computations, +/// which allows trading accuracy for speed. Every option +/// besides EXACT_TOPK increases the speed. +/// +/// B represents the number of buckets. +/// D is the number of min-k elements to track within every bucket. +/// +/// The default option is EXACT_TOPK. +/// APPROX_TOPK_BUCKETS_B16_D2 is worth starting from if you'd like +/// to experiment a bit. +/// +/// It seems that only a limited number of combinations are +/// meaningful, because of the limited supply of SIMD registers. +/// Also, certain combinations, such as B32_D1 and B16_D1, were found +/// to be not very precise in benchmarks, so they were not introduced. +/// +/// TODO: Consider d-ary SIMD heap. + +enum ApproxTopK_mode_t : int { + EXACT_TOPK = 0, + APPROX_TOPK_BUCKETS_B32_D2 = 1, + APPROX_TOPK_BUCKETS_B8_D3 = 2, + APPROX_TOPK_BUCKETS_B16_D2 = 3, + APPROX_TOPK_BUCKETS_B8_D2 = 4, +}; diff -Nru faiss-1.7.3/faiss/utils/approx_topk_hamming/approx_topk_hamming.h faiss-1.7.4/faiss/utils/approx_topk_hamming/approx_topk_hamming.h --- faiss-1.7.3/faiss/utils/approx_topk_hamming/approx_topk_hamming.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/approx_topk_hamming/approx_topk_hamming.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,367 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace faiss { + +// HeapWithBucketsForHamming32 uses simd8uint32 under the hood. + +template +struct HeapWithBucketsForHamming32 { + // this case was not implemented yet.
+}; + +template +struct HeapWithBucketsForHamming32< + CMax, + NBUCKETS, + N, + HammingComputerT> { + static constexpr uint32_t NBUCKETS_8 = NBUCKETS / 8; + static_assert( + (NBUCKETS) > 0 && ((NBUCKETS % 8) == 0), + "Number of buckets needs to be 8, 16, 24, ..."); + + static void addn( + // number of elements + const uint32_t n, + // Hamming computer + const HammingComputerT& hc, + // n elements that can be used with hc + const uint8_t* const __restrict binaryVectors, + // number of best elements to keep + const uint32_t k, + // output distances + int* const __restrict bh_val, + // output indices, each being within [0, n) range + int64_t* const __restrict bh_ids) { + // forward a call to bs_addn with 1 beam + bs_addn(1, n, hc, binaryVectors, k, bh_val, bh_ids); + } + + static void bs_addn( + // beam_size parameter of Beam Search algorithm + const uint32_t beam_size, + // number of elements per beam + const uint32_t n_per_beam, + // Hamming computer + const HammingComputerT& hc, + // n elements that can be used against hc + const uint8_t* const __restrict binary_vectors, + // number of best elements to keep + const uint32_t k, + // output distances + int* const __restrict bh_val, + // output indices, each being within [0, n_per_beam * beam_size) + // range + int64_t* const __restrict bh_ids) { + // + using C = CMax; + + // Hamming code size + const size_t code_size = hc.get_code_size(); + + // main loop + for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) { + simd8uint32 min_distances_i[NBUCKETS_8][N]; + simd8uint32 min_indices_i[NBUCKETS_8][N]; + + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + for (uint32_t p = 0; p < N; p++) { + min_distances_i[j][p] = + simd8uint32(std::numeric_limits::max()); + min_indices_i[j][p] = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7); + } + } + + simd8uint32 current_indices(0, 1, 2, 3, 4, 5, 6, 7); + const simd8uint32 indices_delta(NBUCKETS); + + const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS; + + // put the data into buckets + for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + uint32_t hamming_distances[8]; + for (size_t j8 = 0; j8 < 8; j8++) { + hamming_distances[j8] = hc.hamming( + binary_vectors + + (j8 + j * 8 + ip + n_per_beam * beam_index) * + code_size); + } + + // loop. 
Compiler should get rid of unneeded ops + simd8uint32 distance_candidate; + distance_candidate.loadu(hamming_distances); + simd8uint32 indices_candidate = current_indices; + + for (uint32_t p = 0; p < N; p++) { + simd8uint32 min_distances_new; + simd8uint32 min_indices_new; + simd8uint32 max_distances_new; + simd8uint32 max_indices_new; + + faiss::cmplt_min_max_fast( + distance_candidate, + indices_candidate, + min_distances_i[j][p], + min_indices_i[j][p], + min_distances_new, + min_indices_new, + max_distances_new, + max_indices_new); + + distance_candidate = max_distances_new; + indices_candidate = max_indices_new; + + min_distances_i[j][p] = min_distances_new; + min_indices_i[j][p] = min_indices_new; + } + } + + current_indices += indices_delta; + } + + // fix the indices + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + const simd8uint32 offset(n_per_beam * beam_index + j * 8); + for (uint32_t p = 0; p < N; p++) { + min_indices_i[j][p] += offset; + } + } + + // merge every bucket into the regular heap + for (uint32_t p = 0; p < N; p++) { + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + uint32_t min_indices_scalar[8]; + uint32_t min_distances_scalar[8]; + + min_indices_i[j][p].storeu(min_indices_scalar); + min_distances_i[j][p].storeu(min_distances_scalar); + + // this exact way is needed to maintain the order as if the + // input elements were pushed to the heap sequentially + for (size_t j8 = 0; j8 < 8; j8++) { + const auto value = min_distances_scalar[j8]; + const auto index = min_indices_scalar[j8]; + + if (C::cmp2(bh_val[0], value, bh_ids[0], index)) { + heap_replace_top( + k, bh_val, bh_ids, value, index); + } + } + } + } + + // process leftovers + for (uint32_t ip = nb; ip < n_per_beam; ip++) { + const auto index = ip + n_per_beam * beam_index; + const auto value = + hc.hamming(binary_vectors + (index)*code_size); + + if (C::cmp(bh_val[0], value)) { + heap_replace_top(k, bh_val, bh_ids, value, index); + } + } + } + } +}; + +// HeapWithBucketsForHamming16 uses simd16uint16 under the hood. +// Less registers needed in total, so higher values of NBUCKETS/N can be used, +// but somewhat slower. +// No more than 32K elements currently, but it can be reorganized a bit +// to be limited to 32K elements per beam. + +template +struct HeapWithBucketsForHamming16 { + // this case was not implemented yet. 
+}; + +template +struct HeapWithBucketsForHamming16< + CMax, + NBUCKETS, + N, + HammingComputerT> { + static constexpr uint32_t NBUCKETS_16 = NBUCKETS / 16; + static_assert( + (NBUCKETS) > 0 && ((NBUCKETS % 16) == 0), + "Number of buckets needs to be 16, 32, 48..."); + + static void addn( + // number of elements + const uint32_t n, + // Hamming computer + const HammingComputerT& hc, + // n elements that can be used with hc + const uint8_t* const __restrict binaryVectors, + // number of best elements to keep + const uint32_t k, + // output distances + int* const __restrict bh_val, + // output indices, each being within [0, n) range + int64_t* const __restrict bh_ids) { + // forward a call to bs_addn with 1 beam + bs_addn(1, n, hc, binaryVectors, k, bh_val, bh_ids); + } + + static void bs_addn( + // beam_size parameter of Beam Search algorithm + const uint32_t beam_size, + // number of elements per beam + const uint32_t n_per_beam, + // Hamming computer + const HammingComputerT& hc, + // n elements that can be used against hc + const uint8_t* const __restrict binary_vectors, + // number of best elements to keep + const uint32_t k, + // output distances + int* const __restrict bh_val, + // output indices, each being within [0, n_per_beam * beam_size) + // range + int64_t* const __restrict bh_ids) { + // + using C = CMax; + + // Hamming code size + const size_t code_size = hc.get_code_size(); + + // main loop + for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) { + simd16uint16 min_distances_i[NBUCKETS_16][N]; + simd16uint16 min_indices_i[NBUCKETS_16][N]; + + for (uint32_t j = 0; j < NBUCKETS_16; j++) { + for (uint32_t p = 0; p < N; p++) { + min_distances_i[j][p] = + simd16uint16(std::numeric_limits::max()); + min_indices_i[j][p] = simd16uint16( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15); + } + } + + simd16uint16 current_indices( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const simd16uint16 indices_delta((uint16_t)NBUCKETS); + + const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS; + + // put the data into buckets + for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { + for (uint32_t j = 0; j < NBUCKETS_16; j++) { + uint16_t hamming_distances[16]; + for (size_t j16 = 0; j16 < 16; j16++) { + hamming_distances[j16] = hc.hamming( + binary_vectors + + (j16 + j * 16 + ip + n_per_beam * beam_index) * + code_size); + } + + // loop. 
Compiler should get rid of unneeded ops + simd16uint16 distance_candidate; + distance_candidate.loadu(hamming_distances); + simd16uint16 indices_candidate = current_indices; + + for (uint32_t p = 0; p < N; p++) { + simd16uint16 min_distances_new; + simd16uint16 min_indices_new; + simd16uint16 max_distances_new; + simd16uint16 max_indices_new; + + faiss::cmplt_min_max_fast( + distance_candidate, + indices_candidate, + min_distances_i[j][p], + min_indices_i[j][p], + min_distances_new, + min_indices_new, + max_distances_new, + max_indices_new); + + distance_candidate = max_distances_new; + indices_candidate = max_indices_new; + + min_distances_i[j][p] = min_distances_new; + min_indices_i[j][p] = min_indices_new; + } + } + + current_indices += indices_delta; + } + + // fix the indices + for (uint32_t j = 0; j < NBUCKETS_16; j++) { + const simd16uint16 offset( + (uint16_t)(n_per_beam * beam_index + j * 16)); + for (uint32_t p = 0; p < N; p++) { + min_indices_i[j][p] += offset; + } + } + + // merge every bucket into the regular heap + for (uint32_t p = 0; p < N; p++) { + for (uint32_t j = 0; j < NBUCKETS_16; j++) { + uint16_t min_indices_scalar[16]; + uint16_t min_distances_scalar[16]; + + min_indices_i[j][p].storeu(min_indices_scalar); + min_distances_i[j][p].storeu(min_distances_scalar); + + // this exact way is needed to maintain the order as if the + // input elements were pushed to the heap sequentially + for (size_t j16 = 0; j16 < 16; j16++) { + const auto value = min_distances_scalar[j16]; + const auto index = min_indices_scalar[j16]; + + if (C::cmp2(bh_val[0], value, bh_ids[0], index)) { + heap_replace_top( + k, bh_val, bh_ids, value, index); + } + } + } + } + + // process leftovers + for (uint32_t ip = nb; ip < n_per_beam; ip++) { + const auto index = ip + n_per_beam * beam_index; + const auto value = + hc.hamming(binary_vectors + (index)*code_size); + + if (C::cmp(bh_val[0], value)) { + heap_replace_top(k, bh_val, bh_ids, value, index); + } + } + } + } +}; + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/utils/distances.cpp faiss-1.7.4/faiss/utils/distances.cpp --- faiss-1.7.3/faiss/utils/distances.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/distances.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -26,6 +26,8 @@ #include #include +#include + #ifndef FINTEGER #define FINTEGER long #endif @@ -229,7 +231,7 @@ // distance correction is an operator that can be applied to transform // the distances template -void exhaustive_L2sqr_blas( +void exhaustive_L2sqr_blas_default_impl( const float* x, const float* y, size_t d, @@ -311,10 +313,20 @@ } } +template +void exhaustive_L2sqr_blas( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + ResultHandler& res, + const float* y_norms = nullptr) { + exhaustive_L2sqr_blas_default_impl(x, y, d, nx, ny, res); +} + #ifdef __AVX2__ -// an override for AVX2 if only a single closest point is needed. -template <> -void exhaustive_L2sqr_blas>>( +void exhaustive_L2sqr_blas_cmax_avx2( const float* x, const float* y, size_t d, @@ -513,11 +525,53 @@ res.add_result(i, current_min_distance, current_min_index); } } + // Does nothing for SingleBestResultHandler, but + // keeping the call for the consistency. 
+ res.end_multiple(); InterruptCallback::check(); } } #endif +// an override if only a single closest point is needed +template <> +void exhaustive_L2sqr_blas>>( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms) { +#if defined(__AVX2__) + // use a faster fused kernel if available + if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) { + // the kernel is available and it is complete, we're done. + return; + } + + // run the specialized AVX2 implementation + exhaustive_L2sqr_blas_cmax_avx2(x, y, d, nx, ny, res, y_norms); + +#elif defined(__aarch64__) + // use a faster fused kernel if available + if (exhaustive_L2sqr_fused_cmax(x, y, d, nx, ny, res, y_norms)) { + // the kernel is available and it is complete, we're done. + return; + } + + // run the default implementation + exhaustive_L2sqr_blas_default_impl< + SingleBestResultHandler>>( + x, y, d, nx, ny, res, y_norms); +#else + // run the default implementation + exhaustive_L2sqr_blas_default_impl< + SingleBestResultHandler>>( + x, y, d, nx, ny, res, y_norms); +#endif +} + template void knn_L2sqr_select( const float* x, @@ -770,7 +824,7 @@ const float* y, const int64_t* iy, float* dis) { -#pragma omp parallel for +#pragma omp parallel for if (n > 1) for (int64_t j = 0; j < n; j++) { if (ix[j] >= 0 && iy[j] >= 0) { dis[j] = fvec_L2sqr(x + d * ix[j], y + d * iy[j], d); @@ -786,7 +840,7 @@ const float* y, const int64_t* iy, float* dis) { -#pragma omp parallel for +#pragma omp parallel for if (n > 1) for (int64_t j = 0; j < n; j++) { if (ix[j] >= 0 && iy[j] >= 0) { dis[j] = fvec_inner_product(x + d * ix[j], y + d * iy[j], d); @@ -887,7 +941,7 @@ // store in beginning of distance matrix to avoid malloc float* b_norms = dis; -#pragma omp parallel for +#pragma omp parallel for if (nb > 1) for (int64_t i = 0; i < nb; i++) b_norms[i] = fvec_norm_L2sqr(xb + i * ldb, d); diff -Nru faiss-1.7.3/faiss/utils/distances_fused/avx512.cpp faiss-1.7.4/faiss/utils/distances_fused/avx512.cpp --- faiss-1.7.3/faiss/utils/distances_fused/avx512.cpp 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/distances_fused/avx512.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,346 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#ifdef __AVX512__ + +#include + +namespace faiss { + +namespace { + +// It makes sense to like to overload certain cases because the further +// kernels are in need of AVX512 registers. So, let's tell compiler +// not to waste registers on a bit faster code, if needed. +template +float l2_sqr(const float* const x) { + // compiler should be smart enough to handle that + float output = x[0] * x[0]; + for (size_t i = 1; i < DIM; i++) { + output += x[i] * x[i]; + } + + return output; +} + +template <> +float l2_sqr<4>(const float* const x) { + __m128 v = _mm_loadu_ps(x); + __m128 v2 = _mm_mul_ps(v, v); + v2 = _mm_hadd_ps(v2, v2); + v2 = _mm_hadd_ps(v2, v2); + + return _mm_cvtss_f32(v2); +} + +template +float dot_product( + const float* const __restrict x, + const float* const __restrict y) { + // compiler should be smart enough to handle that + float output = x[0] * y[0]; + for (size_t i = 1; i < DIM; i++) { + output += x[i] * y[i]; + } + + return output; +} + +// The kernel for low dimensionality vectors. 
+// Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x +// +// DIM is the dimensionality of the data +// NX_POINTS_PER_LOOP is the number of x points that get processed +// simultaneously. +// NY_POINTS_PER_LOOP is the number of y points that get processed +// simultaneously. +template +void kernel( + const float* const __restrict x, + const float* const __restrict y, + const float* const __restrict y_transposed, + size_t ny, + SingleBestResultHandler>& res, + const float* __restrict y_norms, + size_t i) { + const size_t ny_p = + (ny / (16 * NY_POINTS_PER_LOOP)) * (16 * NY_POINTS_PER_LOOP); + + // compute + const float* const __restrict xd_0 = x + i * DIM; + + // prefetch the next point + _mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA); + + // load a single point from x + // load -2 * value + __m512 x_i[NX_POINTS_PER_LOOP][DIM]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + for (size_t dd = 0; dd < DIM; dd++) { + x_i[nx_k][dd] = _mm512_set1_ps(-2 * *(xd_0 + nx_k * DIM + dd)); + } + } + + // compute x_norm + float x_norm_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + x_norm_i[nx_k] = l2_sqr(xd_0 + nx_k * DIM); + } + + // distances and indices + __m512 min_distances_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + min_distances_i[nx_k] = + _mm512_set1_ps(res.dis_tab[i + nx_k] - x_norm_i[nx_k]); + } + + __m512i min_indices_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + min_indices_i[nx_k] = _mm512_set1_epi32(0); + } + + // + __m512i current_indices = _mm512_setr_epi32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m512i indices_delta = _mm512_set1_epi32(16); + + // main loop + size_t j = 0; + for (; j < ny_p; j += NY_POINTS_PER_LOOP * 16) { + // compute dot products for NX_POINTS from x and NY_POINTS from y + // technically, we're multiplying -2x and y + __m512 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP]; + + // DIM 0 that uses MUL + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + __m512 y_i = _mm512_loadu_ps(y_transposed + j + ny_k * 16 + ny * 0); + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = _mm512_mul_ps(x_i[nx_k][0], y_i); + } + } + + // other DIMs that use FMA + for (size_t dd = 1; dd < DIM; dd++) { + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + __m512 y_i = + _mm512_loadu_ps(y_transposed + j + ny_k * 16 + ny * dd); + + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = _mm512_fmadd_ps( + x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]); + } + } + } + + // compute y^2 - 2 * (x,y) + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + __m512 y_l2_sqr = _mm512_loadu_ps(y_norms + j + ny_k * 16); + + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = _mm512_add_ps(dp_i[nx_k][ny_k], y_l2_sqr); + } + } + + // do the comparisons and alter the min indices + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + const __mmask16 comparison = _mm512_cmp_ps_mask( + dp_i[nx_k][ny_k], min_distances_i[nx_k], _CMP_LT_OS); + min_distances_i[nx_k] = _mm512_mask_blend_ps( + comparison, min_distances_i[nx_k], dp_i[nx_k][ny_k]); + min_indices_i[nx_k] = _mm512_castps_si512(_mm512_mask_blend_ps( + comparison, + _mm512_castsi512_ps(min_indices_i[nx_k]), + _mm512_castsi512_ps(current_indices))); + } + + current_indices = 
_mm512_add_epi32(current_indices, indices_delta); + } + } + + // dump values and find the minimum distance / minimum index + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + float min_distances_scalar[16]; + uint32_t min_indices_scalar[16]; + _mm512_storeu_ps(min_distances_scalar, min_distances_i[nx_k]); + _mm512_storeu_si512( + (__m512i*)(min_indices_scalar), min_indices_i[nx_k]); + + float current_min_distance = res.dis_tab[i + nx_k]; + uint32_t current_min_index = res.ids_tab[i + nx_k]; + + // This unusual comparison is needed to maintain the behavior + // of the original implementation: if two indices are + // represented with equal distance values, then + // the index with the min value is returned. + for (size_t jv = 0; jv < 16; jv++) { + // add missing x_norms[i] + float distance_candidate = + min_distances_scalar[jv] + x_norm_i[nx_k]; + + // negative values can occur for identical vectors + // due to roundoff errors. + if (distance_candidate < 0) + distance_candidate = 0; + + const int64_t index_candidate = min_indices_scalar[jv]; + + if (current_min_distance > distance_candidate) { + current_min_distance = distance_candidate; + current_min_index = index_candidate; + } else if ( + current_min_distance == distance_candidate && + current_min_index > index_candidate) { + current_min_index = index_candidate; + } + } + + // process leftovers + for (size_t j0 = j; j0 < ny; j0++) { + const float dp = + dot_product(x + (i + nx_k) * DIM, y + j0 * DIM); + float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp; + // negative values can occur for identical vectors + // due to roundoff errors. + if (dis < 0) { + dis = 0; + } + + if (current_min_distance > dis) { + current_min_distance = dis; + current_min_index = j0; + } + } + + // done + res.add_result(i + nx_k, current_min_distance, current_min_index); + } +} + +template +void exhaustive_L2sqr_fused_cmax( + const float* const __restrict x, + const float* const __restrict y, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* __restrict y_norms) { + // BLAS does not like empty matrices + if (nx == 0 || ny == 0) { + return; + } + + // compute norms for y + std::unique_ptr del2; + if (!y_norms) { + float* y_norms2 = new float[ny]; + del2.reset(y_norms2); + + for (size_t i = 0; i < ny; i++) { + y_norms2[i] = l2_sqr(y + i * DIM); + } + + y_norms = y_norms2; + } + + // initialize res + res.begin_multiple(0, nx); + + // transpose y + std::vector y_transposed(DIM * ny); + for (size_t j = 0; j < DIM; j++) { + for (size_t i = 0; i < ny; i++) { + y_transposed[j * ny + i] = y[j + i * DIM]; + } + } + + const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP; + // the main loop. +#pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) { + kernel( + x, y, y_transposed.data(), ny, res, y_norms, i); + } + + for (size_t i = nx_p; i < nx; i++) { + kernel( + x, y, y_transposed.data(), ny, res, y_norms, i); + } + + // Does nothing for SingleBestResultHandler, but + // keeping the call for the consistency. 
+ res.end_multiple(); + InterruptCallback::check(); +} + +} // namespace + +bool exhaustive_L2sqr_fused_cmax_AVX512( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms) { + // process only cases with certain dimensionalities + +#define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP) \ + case DIM: { \ + exhaustive_L2sqr_fused_cmax< \ + DIM, \ + NX_POINTS_PER_LOOP, \ + NY_POINTS_PER_LOOP>(x, y, nx, ny, res, y_norms); \ + return true; \ + } + + switch (d) { + DISPATCH(1, 8, 1) + DISPATCH(2, 8, 1) + DISPATCH(3, 8, 1) + DISPATCH(4, 8, 1) + DISPATCH(5, 8, 1) + DISPATCH(6, 8, 1) + DISPATCH(7, 8, 1) + DISPATCH(8, 8, 1) + DISPATCH(9, 8, 1) + DISPATCH(10, 8, 1) + DISPATCH(11, 8, 1) + DISPATCH(12, 8, 1) + DISPATCH(13, 8, 1) + DISPATCH(14, 8, 1) + DISPATCH(15, 8, 1) + DISPATCH(16, 8, 1) + DISPATCH(17, 8, 1) + DISPATCH(18, 8, 1) + DISPATCH(19, 8, 1) + DISPATCH(20, 8, 1) + DISPATCH(21, 8, 1) + DISPATCH(22, 8, 1) + DISPATCH(23, 8, 1) + DISPATCH(24, 8, 1) + DISPATCH(25, 8, 1) + DISPATCH(26, 8, 1) + DISPATCH(27, 8, 1) + DISPATCH(28, 8, 1) + DISPATCH(29, 8, 1) + DISPATCH(30, 8, 1) + DISPATCH(31, 8, 1) + DISPATCH(32, 8, 1) + } + + return false; +#undef DISPATCH +} + +} // namespace faiss + +#endif diff -Nru faiss-1.7.3/faiss/utils/distances_fused/avx512.h faiss-1.7.4/faiss/utils/distances_fused/avx512.h --- faiss-1.7.3/faiss/utils/distances_fused/avx512.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/distances_fused/avx512.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,36 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// AVX512 might be not used, but this version provides ~2x speedup +// over AVX2 kernel, say, for training PQx10 or PQx12, and speeds up +// additional cases with larger dimensionalities. + +#pragma once + +#include +#include + +#include + +#ifdef __AVX512__ + +namespace faiss { + +// Returns true if the fused kernel is available and the data was processed. +// Returns false if the fused kernel is not available. +bool exhaustive_L2sqr_fused_cmax_AVX512( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms); + +} // namespace faiss + +#endif diff -Nru faiss-1.7.3/faiss/utils/distances_fused/distances_fused.cpp faiss-1.7.4/faiss/utils/distances_fused/distances_fused.cpp --- faiss-1.7.3/faiss/utils/distances_fused/distances_fused.cpp 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/distances_fused/distances_fused.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,42 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include + +namespace faiss { + +bool exhaustive_L2sqr_fused_cmax( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms) { + if (nx == 0 || ny == 0) { + // nothing to do + return true; + } + +#ifdef __AVX512__ + // avx512 kernel + return exhaustive_L2sqr_fused_cmax_AVX512(x, y, d, nx, ny, res, y_norms); +#elif defined(__AVX2__) || defined(__aarch64__) + // avx2 or arm neon kernel + return exhaustive_L2sqr_fused_cmax_simdlib(x, y, d, nx, ny, res, y_norms); +#else + // not supported, please use a general-purpose kernel + return false; +#endif +} + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/utils/distances_fused/distances_fused.h faiss-1.7.4/faiss/utils/distances_fused/distances_fused.h --- faiss-1.7.3/faiss/utils/distances_fused/distances_fused.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/distances_fused/distances_fused.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,40 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file contains a fused kernel that combines distance computation +// and the search for the CLOSEST point. Currently, this is done for small +// dimensionality vectors when it is beneficial to avoid storing temporary +// dot product information in RAM. This is particularly effective +// when training PQx10 or PQx12 with the default parameters. +// +// InterruptCallback::check() is not used, because it is assumed that the +// kernel takes a little time because of a tiny dimensionality. +// +// Later on, similar optimization can be implemented for large size vectors, +// but a different kernel is needed. +// + +#pragma once + +#include + +#include + +namespace faiss { + +// Returns true if the fused kernel is available and the data was processed. +// Returns false if the fused kernel is not available. +bool exhaustive_L2sqr_fused_cmax( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms); + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/utils/distances_fused/simdlib_based.cpp faiss-1.7.4/faiss/utils/distances_fused/simdlib_based.cpp --- faiss-1.7.3/faiss/utils/distances_fused/simdlib_based.cpp 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/distances_fused/simdlib_based.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,352 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#if defined(__AVX2__) || defined(__aarch64__) + +#include + +#if defined(__AVX2__) +#include +#endif + +namespace faiss { + +namespace { + +// It makes sense to like to overload certain cases because the further +// kernels are in need of registers. So, let's tell compiler +// not to waste registers on a bit faster code, if needed. 
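// Illustrative sketch (not FAISS code): a plain scalar reference of roughly
// what the fused "cmax" kernels in this file and in avx512.cpp compute for a
// freshly initialized result handler. For every query x it finds the single
// closest y under squared L2, using the expansion
// |x - y|^2 = |x|^2 - 2 * <x, y> + |y|^2 with the squared norms of y given
// (the kernels above compute them on the fly when not supplied). Negative
// results caused by roundoff are clamped to 0, as in the kernels. The
// function name and signature below are hypothetical.
#include <cstddef>
#include <cstdint>
#include <limits>

inline void fused_l2sqr_nearest_reference(
        const float* x,       // nx * d query vectors, row-major
        const float* y,       // ny * d database vectors, row-major
        const float* y_sqlen, // ny precomputed squared norms of y
        size_t d,
        size_t nx,
        size_t ny,
        float* out_dis,       // nx best distances
        int64_t* out_ids) {   // nx indices of the closest y
    for (size_t i = 0; i < nx; i++) {
        float x_sqlen = 0;
        for (size_t j = 0; j < d; j++) {
            x_sqlen += x[i * d + j] * x[i * d + j];
        }

        float best_dis = std::numeric_limits<float>::max();
        int64_t best_id = -1;
        for (size_t iy = 0; iy < ny; iy++) {
            float dp = 0;
            for (size_t j = 0; j < d; j++) {
                dp += x[i * d + j] * y[iy * d + j];
            }
            float dis = x_sqlen - 2 * dp + y_sqlen[iy];
            if (dis < 0) {
                dis = 0; // identical vectors may go slightly negative
            }
            // a sequential scan keeps the smallest index on ties for free;
            // the SIMD kernels need an explicit tie-break rule because lane
            // winners are merged out of candidate order
            if (dis < best_dis) {
                best_dis = dis;
                best_id = (int64_t)iy;
            }
        }
        out_dis[i] = best_dis;
        out_ids[i] = best_id;
    }
}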
+template +float l2_sqr(const float* const x) { + // compiler should be smart enough to handle that + float output = x[0] * x[0]; + for (size_t i = 1; i < DIM; i++) { + output += x[i] * x[i]; + } + + return output; +} + +template +float dot_product( + const float* const __restrict x, + const float* const __restrict y) { + // compiler should be smart enough to handle that + float output = x[0] * y[0]; + for (size_t i = 1; i < DIM; i++) { + output += x[i] * y[i]; + } + + return output; +} + +// The kernel for low dimensionality vectors. +// Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x +// +// DIM is the dimensionality of the data +// NX_POINTS_PER_LOOP is the number of x points that get processed +// simultaneously. +// NY_POINTS_PER_LOOP is the number of y points that get processed +// simultaneously. +template +void kernel( + const float* const __restrict x, + const float* const __restrict y, + const float* const __restrict y_transposed, + const size_t ny, + SingleBestResultHandler>& res, + const float* __restrict y_norms, + const size_t i) { + const size_t ny_p = + (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP); + + // compute + const float* const __restrict xd_0 = x + i * DIM; + + // prefetch the next point +#if defined(__AVX2__) + _mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA); +#endif + + // load a single point from x + // load -2 * value + simd8float32 x_i[NX_POINTS_PER_LOOP][DIM]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + for (size_t dd = 0; dd < DIM; dd++) { + x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd)); + } + } + + // compute x_norm + float x_norm_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + x_norm_i[nx_k] = l2_sqr(xd_0 + nx_k * DIM); + } + + // distances and indices + simd8float32 min_distances_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + min_distances_i[nx_k] = + simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]); + } + + simd8uint32 min_indices_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + min_indices_i[nx_k] = simd8uint32((uint32_t)0); + } + + // + simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7); + const simd8uint32 indices_delta = simd8uint32(8); + + // main loop + size_t j = 0; + for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) { + // compute dot products for NX_POINTS from x and NY_POINTS from y + // technically, we're multiplying -2x and y + simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP]; + + // DIM 0 that uses MUL + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + simd8float32 y_i = + simd8float32(y_transposed + j + ny_k * 8 + ny * 0); + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i; + } + } + + // other DIMs that use FMA + for (size_t dd = 1; dd < DIM; dd++) { + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + simd8float32 y_i = + simd8float32(y_transposed + j + ny_k * 8 + ny * dd); + + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = + fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]); + } + } + } + + // compute y^2 + (-2x,y) + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8); + + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr; + } + } + + // do the comparisons and alter the min indices + for (size_t 
ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + // cmpps + cmplt_and_blend_inplace( + dp_i[nx_k][ny_k], + current_indices, + min_distances_i[nx_k], + min_indices_i[nx_k]); + } + + current_indices = current_indices + indices_delta; + } + } + + // dump values and find the minimum distance / minimum index + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + float min_distances_scalar[8]; + uint32_t min_indices_scalar[8]; + + min_distances_i[nx_k].storeu(min_distances_scalar); + min_indices_i[nx_k].storeu(min_indices_scalar); + + float current_min_distance = res.dis_tab[i + nx_k]; + uint32_t current_min_index = res.ids_tab[i + nx_k]; + + // This unusual comparison is needed to maintain the behavior + // of the original implementation: if two indices are + // represented with equal distance values, then + // the index with the min value is returned. + for (size_t jv = 0; jv < 8; jv++) { + // add missing x_norms[i] + float distance_candidate = + min_distances_scalar[jv] + x_norm_i[nx_k]; + + // negative values can occur for identical vectors + // due to roundoff errors. + if (distance_candidate < 0) { + distance_candidate = 0; + } + + const int64_t index_candidate = min_indices_scalar[jv]; + + if (current_min_distance > distance_candidate) { + current_min_distance = distance_candidate; + current_min_index = index_candidate; + } else if ( + current_min_distance == distance_candidate && + current_min_index > index_candidate) { + current_min_index = index_candidate; + } + } + + // process leftovers + for (size_t j0 = j; j0 < ny; j0++) { + const float dp = + dot_product(x + (i + nx_k) * DIM, y + j0 * DIM); + float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp; + // negative values can occur for identical vectors + // due to roundoff errors. + if (dis < 0) { + dis = 0; + } + + if (current_min_distance > dis) { + current_min_distance = dis; + current_min_index = j0; + } + } + + // done + res.add_result(i + nx_k, current_min_distance, current_min_index); + } +} + +template +void exhaustive_L2sqr_fused_cmax( + const float* const __restrict x, + const float* const __restrict y, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* __restrict y_norms) { + // BLAS does not like empty matrices + if (nx == 0 || ny == 0) { + return; + } + + // compute norms for y + std::unique_ptr del2; + if (!y_norms) { + float* y_norms2 = new float[ny]; + del2.reset(y_norms2); + + for (size_t i = 0; i < ny; i++) { + y_norms2[i] = l2_sqr(y + i * DIM); + } + + y_norms = y_norms2; + } + + // initialize res + res.begin_multiple(0, nx); + + // transpose y + std::vector y_transposed(DIM * ny); + for (size_t j = 0; j < DIM; j++) { + for (size_t i = 0; i < ny; i++) { + y_transposed[j * ny + i] = y[j + i * DIM]; + } + } + + const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP; + // the main loop. +#pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) { + kernel( + x, y, y_transposed.data(), ny, res, y_norms, i); + } + + for (size_t i = nx_p; i < nx; i++) { + kernel( + x, y, y_transposed.data(), ny, res, y_norms, i); + } + + // Does nothing for SingleBestResultHandler, but + // keeping the call for the consistency. 
+ res.end_multiple(); + InterruptCallback::check(); +} + +} // namespace + +bool exhaustive_L2sqr_fused_cmax_simdlib( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms) { + // Process only cases with certain dimensionalities. + // An acceptable dimensionality value is limited by the number of + // available registers. + +#define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP) \ + case DIM: { \ + exhaustive_L2sqr_fused_cmax< \ + DIM, \ + NX_POINTS_PER_LOOP, \ + NY_POINTS_PER_LOOP>(x, y, nx, ny, res, y_norms); \ + return true; \ + } + + // faiss/benchs/bench_quantizer.py was used for benchmarking + // and tuning 2nd and 3rd parameters values. + // Basically, the larger the values for 2nd and 3rd parameters are, + // the faster the execution is, but the more SIMD registers are needed. + // This can be compensated with L1 cache, this is why this + // code might operate with more registers than available + // because of concurrent ports operations for ALU and LOAD/STORE. + +#if defined(__AVX2__) + // It was possible to tweak these parameters on x64 machine. + switch (d) { + DISPATCH(1, 6, 1) + DISPATCH(2, 6, 1) + DISPATCH(3, 6, 1) + DISPATCH(4, 8, 1) + DISPATCH(5, 8, 1) + DISPATCH(6, 8, 1) + DISPATCH(7, 8, 1) + DISPATCH(8, 8, 1) + DISPATCH(9, 8, 1) + DISPATCH(10, 8, 1) + DISPATCH(11, 8, 1) + DISPATCH(12, 8, 1) + DISPATCH(13, 6, 1) + DISPATCH(14, 6, 1) + DISPATCH(15, 6, 1) + DISPATCH(16, 6, 1) + } +#else + // Please feel free to alter 2nd and 3rd parameters if you have access + // to ARM-based machine so that you are able to benchmark this code. + // Or to enable other dimensions. + switch (d) { + DISPATCH(1, 4, 2) + DISPATCH(2, 2, 2) + DISPATCH(3, 2, 2) + DISPATCH(4, 2, 1) + DISPATCH(5, 1, 1) + DISPATCH(6, 1, 1) + DISPATCH(7, 1, 1) + DISPATCH(8, 1, 1) + } +#endif + + return false; +#undef DISPATCH +} + +} // namespace faiss + +#endif diff -Nru faiss-1.7.3/faiss/utils/distances_fused/simdlib_based.h faiss-1.7.4/faiss/utils/distances_fused/simdlib_based.h --- faiss-1.7.3/faiss/utils/distances_fused/simdlib_based.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/distances_fused/simdlib_based.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,32 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +#if defined(__AVX2__) || defined(__aarch64__) + +namespace faiss { + +// Returns true if the fused kernel is available and the data was processed. +// Returns false if the fused kernel is not available. +bool exhaustive_L2sqr_fused_cmax_simdlib( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms); + +} // namespace faiss + +#endif diff -Nru faiss-1.7.3/faiss/utils/distances.h faiss-1.7.4/faiss/utils/distances.h --- faiss-1.7.3/faiss/utils/distances.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/distances.h 2023-04-19 13:18:30.000000000 +0000 @@ -73,6 +73,17 @@ size_t d, size_t ny); +/* compute ny square L2 distance between x and a set of transposed contiguous + y vectors. 
squared lengths of y should be provided as well */ +void fvec_L2sqr_ny_transposed( + float* dis, + const float* x, + const float* y, + const float* y_sqlen, + size_t d, + size_t d_offset, + size_t ny); + /* compute ny square L2 distance between x and a set of contiguous y vectors and return the index of the nearest vector. return 0 if ny == 0. */ diff -Nru faiss-1.7.3/faiss/utils/distances_simd.cpp faiss-1.7.4/faiss/utils/distances_simd.cpp --- faiss-1.7.3/faiss/utils/distances_simd.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/distances_simd.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -23,6 +23,10 @@ #include #endif +#ifdef __AVX2__ +#include +#endif + #ifdef __aarch64__ #include #endif @@ -56,16 +60,6 @@ * Reference implementations */ -float fvec_L2sqr_ref(const float* x, const float* y, size_t d) { - size_t i; - float res = 0; - for (i = 0; i < d; i++) { - const float tmp = x[i] - y[i]; - res += tmp * tmp; - } - return res; -} - float fvec_L1_ref(const float* x, const float* y, size_t d) { size_t i; float res = 0; @@ -85,22 +79,6 @@ return res; } -float fvec_inner_product_ref(const float* x, const float* y, size_t d) { - size_t i; - float res = 0; - for (i = 0; i < d; i++) - res += x[i] * y[i]; - return res; -} - -float fvec_norm_L2sqr_ref(const float* x, size_t d) { - size_t i; - double res = 0; - for (i = 0; i < d; i++) - res += x[i] * x[i]; - return res; -} - void fvec_L2sqr_ny_ref( float* dis, const float* x, @@ -204,6 +182,48 @@ } /********************************************************* + * Autovectorized implementations + */ + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +float fvec_inner_product(const float* x, const float* y, size_t d) { + float res = 0.F; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i != d; ++i) { + res += x[i] * y[i]; + } + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +float fvec_norm_L2sqr(const float* x, size_t d) { + // the double in the _ref is suspected to be a typo. Some of the manual + // implementations this replaces used float. + float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i != d; ++i) { + res += x[i] * x[i]; + } + + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +float fvec_L2sqr(const float* x, const float* y, size_t d) { + size_t i; + float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (i = 0; i < d; i++) { + const float tmp = x[i] - y[i]; + res += tmp * tmp; + } + return res; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +/********************************************************* * SSE and AVX implementations */ @@ -225,25 +245,6 @@ // cannot use AVX2 _mm_mask_set1_epi32 } -float fvec_norm_L2sqr(const float* x, size_t d) { - __m128 mx; - __m128 msum1 = _mm_setzero_ps(); - - while (d >= 4) { - mx = _mm_loadu_ps(x); - x += 4; - msum1 = _mm_add_ps(msum1, _mm_mul_ps(mx, mx)); - d -= 4; - } - - mx = masked_read(d, x); - msum1 = _mm_add_ps(msum1, _mm_mul_ps(mx, mx)); - - msum1 = _mm_hadd_ps(msum1, msum1); - msum1 = _mm_hadd_ps(msum1, msum1); - return _mm_cvtss_f32(msum1); -} - namespace { /// Function that does a component-wise operation between x and y @@ -354,25 +355,25 @@ // m3 = (x[3], x[3], x[3], x[3], x[3], x[3], x[3], x[3]) const __m256 m3 = _mm256_set1_ps(x[3]); - const __m256i indices0 = - _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); - for (i = 0; i < ny8 * 8; i += 8) { - _mm_prefetch(y + 32, _MM_HINT_NTA); - _mm_prefetch(y + 48, _MM_HINT_NTA); - - // collect dim 0 for 8 D4-vectors. 
- // v0 = (y[(i * 8 + 0) * 4 + 0], ..., y[(i * 8 + 7) * 4 + 0]) - const __m256 v0 = _mm256_i32gather_ps(y, indices0, 1); - // collect dim 1 for 8 D4-vectors. - // v1 = (y[(i * 8 + 0) * 4 + 1], ..., y[(i * 8 + 7) * 4 + 1]) - const __m256 v1 = _mm256_i32gather_ps(y + 1, indices0, 1); - // collect dim 2 for 8 D4-vectors. - // v2 = (y[(i * 8 + 0) * 4 + 2], ..., y[(i * 8 + 7) * 4 + 2]) - const __m256 v2 = _mm256_i32gather_ps(y + 2, indices0, 1); - // collect dim 3 for 8 D4-vectors. - // v3 = (y[(i * 8 + 0) * 4 + 3], ..., y[(i * 8 + 7) * 4 + 3]) - const __m256 v3 = _mm256_i32gather_ps(y + 3, indices0, 1); + // load 8x4 matrix and transpose it in registers. + // the typical bottleneck is memory access, so + // let's trade instructions for the bandwidth. + + __m256 v0; + __m256 v1; + __m256 v2; + __m256 v3; + + transpose_8x4( + _mm256_loadu_ps(y + 0 * 8), + _mm256_loadu_ps(y + 1 * 8), + _mm256_loadu_ps(y + 2 * 8), + _mm256_loadu_ps(y + 3 * 8), + v0, + v1, + v2, + v3); // compute distances __m256 distances = _mm256_mul_ps(m0, v0); @@ -380,15 +381,7 @@ distances = _mm256_fmadd_ps(m2, v2, distances); distances = _mm256_fmadd_ps(m3, v3, distances); - // distances[0] = (x[0] * y[(i * 8 + 0) * 4 + 0]) + - // (x[1] * y[(i * 8 + 0) * 4 + 1]) + - // (x[2] * y[(i * 8 + 0) * 4 + 2]) + - // (x[3] * y[(i * 8 + 0) * 4 + 3]) - // ... - // distances[7] = (x[0] * y[(i * 8 + 7) * 4 + 0]) + - // (x[1] * y[(i * 8 + 7) * 4 + 1]) + - // (x[2] * y[(i * 8 + 7) * 4 + 2]) + - // (x[3] * y[(i * 8 + 7) * 4 + 3]) + // store _mm256_storeu_ps(dis + i, distances); y += 32; @@ -432,25 +425,25 @@ // m3 = (x[3], x[3], x[3], x[3], x[3], x[3], x[3], x[3]) const __m256 m3 = _mm256_set1_ps(x[3]); - const __m256i indices0 = - _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); - for (i = 0; i < ny8 * 8; i += 8) { - _mm_prefetch(y + 32, _MM_HINT_NTA); - _mm_prefetch(y + 48, _MM_HINT_NTA); - - // collect dim 0 for 8 D4-vectors. - // v0 = (y[(i * 8 + 0) * 4 + 0], ..., y[(i * 8 + 7) * 4 + 0]) - const __m256 v0 = _mm256_i32gather_ps(y, indices0, 1); - // collect dim 1 for 8 D4-vectors. - // v1 = (y[(i * 8 + 0) * 4 + 1], ..., y[(i * 8 + 7) * 4 + 1]) - const __m256 v1 = _mm256_i32gather_ps(y + 1, indices0, 1); - // collect dim 2 for 8 D4-vectors. - // v2 = (y[(i * 8 + 0) * 4 + 2], ..., y[(i * 8 + 7) * 4 + 2]) - const __m256 v2 = _mm256_i32gather_ps(y + 2, indices0, 1); - // collect dim 3 for 8 D4-vectors. - // v3 = (y[(i * 8 + 0) * 4 + 3], ..., y[(i * 8 + 7) * 4 + 3]) - const __m256 v3 = _mm256_i32gather_ps(y + 3, indices0, 1); + // load 8x4 matrix and transpose it in registers. + // the typical bottleneck is memory access, so + // let's trade instructions for the bandwidth. + + __m256 v0; + __m256 v1; + __m256 v2; + __m256 v3; + + transpose_8x4( + _mm256_loadu_ps(y + 0 * 8), + _mm256_loadu_ps(y + 1 * 8), + _mm256_loadu_ps(y + 2 * 8), + _mm256_loadu_ps(y + 3 * 8), + v0, + v1, + v2, + v3); // compute differences const __m256 d0 = _mm256_sub_ps(m0, v0); @@ -464,15 +457,7 @@ distances = _mm256_fmadd_ps(d2, d2, distances); distances = _mm256_fmadd_ps(d3, d3, distances); - // distances[0] = (x[0] - y[(i * 8 + 0) * 4 + 0]) ^ 2 + - // (x[1] - y[(i * 8 + 0) * 4 + 1]) ^ 2 + - // (x[2] - y[(i * 8 + 0) * 4 + 2]) ^ 2 + - // (x[3] - y[(i * 8 + 0) * 4 + 3]) - // ... 
- // distances[7] = (x[0] - y[(i * 8 + 7) * 4 + 0]) ^ 2 + - // (x[1] - y[(i * 8 + 7) * 4 + 1]) ^ 2 + - // (x[2] - y[(i * 8 + 7) * 4 + 2]) ^ 2 + - // (x[3] - y[(i * 8 + 7) * 4 + 3]) + // store _mm256_storeu_ps(dis + i, distances); y += 32; @@ -583,6 +568,228 @@ } #ifdef __AVX2__ +template +void fvec_L2sqr_ny_y_transposed_D( + float* distances, + const float* x, + const float* y, + const float* y_sqlen, + const size_t d_offset, + size_t ny) { + // current index being processed + size_t i = 0; + + // squared length of x + float x_sqlen = 0; + ; + for (size_t j = 0; j < DIM; j++) { + x_sqlen += x[j] * x[j]; + } + + // process 8 vectors per loop. + const size_t ny8 = ny / 8; + + if (ny8 > 0) { + // m[i] = (2 * x[i], ... 2 * x[i]) + __m256 m[DIM]; + for (size_t j = 0; j < DIM; j++) { + m[j] = _mm256_set1_ps(x[j]); + m[j] = _mm256_add_ps(m[j], m[j]); + } + + __m256 x_sqlen_ymm = _mm256_set1_ps(x_sqlen); + + for (; i < ny8 * 8; i += 8) { + // collect dim 0 for 8 D4-vectors. + const __m256 v0 = _mm256_loadu_ps(y + 0 * d_offset); + + // compute dot products + // this is x^2 - 2x[0]*y[0] + __m256 dp = _mm256_fnmadd_ps(m[0], v0, x_sqlen_ymm); + + for (size_t j = 1; j < DIM; j++) { + // collect dim j for 8 D4-vectors. + const __m256 vj = _mm256_loadu_ps(y + j * d_offset); + dp = _mm256_fnmadd_ps(m[j], vj, dp); + } + + // we've got x^2 - (2x, y) at this point + + // y^2 - (2x, y) + x^2 + __m256 distances_v = _mm256_add_ps(_mm256_loadu_ps(y_sqlen), dp); + + _mm256_storeu_ps(distances + i, distances_v); + + // scroll y and y_sqlen forward. + y += 8; + y_sqlen += 8; + } + } + + if (i < ny) { + // process leftovers + for (; i < ny; i++) { + float dp = 0; + for (size_t j = 0; j < DIM; j++) { + dp += x[j] * y[j * d_offset]; + } + + // compute y^2 - 2 * (x, y), which is sufficient for looking for the + // lowest distance. + const float distance = y_sqlen[0] - 2 * dp + x_sqlen; + distances[i] = distance; + + y += 1; + y_sqlen += 1; + } + } +} +#endif + +void fvec_L2sqr_ny_transposed( + float* dis, + const float* x, + const float* y, + const float* y_sqlen, + size_t d, + size_t d_offset, + size_t ny) { + // optimized for a few special cases + +#ifdef __AVX2__ +#define DISPATCH(dval) \ + case dval: \ + return fvec_L2sqr_ny_y_transposed_D( \ + dis, x, y, y_sqlen, d_offset, ny); + + switch (d) { + DISPATCH(1) + DISPATCH(2) + DISPATCH(4) + DISPATCH(8) + default: + return fvec_L2sqr_ny_y_transposed_ref( + dis, x, y, y_sqlen, d, d_offset, ny); + } +#undef DISPATCH +#else + // non-AVX2 case + return fvec_L2sqr_ny_y_transposed_ref(dis, x, y, y_sqlen, d, d_offset, ny); +#endif +} + +#ifdef __AVX2__ + +size_t fvec_L2sqr_ny_nearest_D2( + float* distances_tmp_buffer, + const float* x, + const float* y, + size_t ny) { + // this implementation does not use distances_tmp_buffer. + + // current index being processed + size_t i = 0; + + // min distance and the index of the closest vector so far + float current_min_distance = HUGE_VALF; + size_t current_min_index = 0; + + // process 8 D2-vectors per loop. + const size_t ny8 = ny / 8; + if (ny8 > 0) { + _mm_prefetch(y, _MM_HINT_T0); + _mm_prefetch(y + 16, _MM_HINT_T0); + + // track min distance and the closest vector independently + // for each of 8 AVX2 components. 
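// Illustrative sketch (not FAISS code): how a caller could prepare the
// dimension-major ("transposed") layout and the precomputed squared norms
// expected by fvec_L2sqr_ny_transposed above. Dimension j of vector i is
// read from y[j * d_offset + i]; with a fully packed matrix d_offset == ny,
// which is how the fused kernels build their transposed copy of y. The
// helper name build_transposed is hypothetical.
#include <cstddef>
#include <vector>

inline void build_transposed(
        const float* y,                // ny * d, row-major vectors
        size_t d,
        size_t ny,
        std::vector<float>& y_t,       // output: d * ny, dimension-major
        std::vector<float>& y_sqlen) { // output: ny squared norms
    y_t.assign(d * ny, 0.0f);
    y_sqlen.assign(ny, 0.0f);
    for (size_t i = 0; i < ny; i++) {
        for (size_t j = 0; j < d; j++) {
            const float v = y[i * d + j];
            y_t[j * ny + i] = v; // d_offset == ny here
            y_sqlen[i] += v * v;
        }
    }
    // squared L2 distances to a query x can then be computed with
    //   fvec_L2sqr_ny_transposed(dis, x, y_t.data(), y_sqlen.data(),
    //                            d, /*d_offset=*/ny, ny);
}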
+ __m256 min_distances = _mm256_set1_ps(HUGE_VALF); + __m256i min_indices = _mm256_set1_epi32(0); + + __m256i current_indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + const __m256i indices_increment = _mm256_set1_epi32(8); + + // 1 value per register + const __m256 m0 = _mm256_set1_ps(x[0]); + const __m256 m1 = _mm256_set1_ps(x[1]); + + for (; i < ny8 * 8; i += 8) { + _mm_prefetch(y + 32, _MM_HINT_T0); + + __m256 v0; + __m256 v1; + + transpose_8x2( + _mm256_loadu_ps(y + 0 * 8), + _mm256_loadu_ps(y + 1 * 8), + v0, + v1); + + // compute differences + const __m256 d0 = _mm256_sub_ps(m0, v0); + const __m256 d1 = _mm256_sub_ps(m1, v1); + + // compute squares of differences + __m256 distances = _mm256_mul_ps(d0, d0); + distances = _mm256_fmadd_ps(d1, d1, distances); + + // compare the new distances to the min distances + // for each of 8 AVX2 components. + __m256 comparison = + _mm256_cmp_ps(min_distances, distances, _CMP_LT_OS); + + // update min distances and indices with closest vectors if needed. + min_distances = _mm256_min_ps(distances, min_distances); + min_indices = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(current_indices), + _mm256_castsi256_ps(min_indices), + comparison)); + + // update current indices values. Basically, +8 to each of the + // 8 AVX2 components. + current_indices = + _mm256_add_epi32(current_indices, indices_increment); + + // scroll y forward (8 vectors 2 DIM each). + y += 16; + } + + // dump values and find the minimum distance / minimum index + float min_distances_scalar[8]; + uint32_t min_indices_scalar[8]; + _mm256_storeu_ps(min_distances_scalar, min_distances); + _mm256_storeu_si256((__m256i*)(min_indices_scalar), min_indices); + + for (size_t j = 0; j < 8; j++) { + if (current_min_distance > min_distances_scalar[j]) { + current_min_distance = min_distances_scalar[j]; + current_min_index = min_indices_scalar[j]; + } + } + } + + if (i < ny) { + // process leftovers. + // the following code is not optimal, but it is rarely invoked. + float x0 = x[0]; + float x1 = x[1]; + + for (; i < ny; i++) { + float sub0 = x0 - y[0]; + float sub1 = x1 - y[1]; + float distance = sub0 * sub0 + sub1 * sub1; + + y += 2; + + if (current_min_distance > distance) { + current_min_distance = distance; + current_min_index = i; + } + } + } + + return current_min_index; +} + size_t fvec_L2sqr_ny_nearest_D4( float* distances_tmp_buffer, const float* x, @@ -609,38 +816,27 @@ __m256i current_indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); const __m256i indices_increment = _mm256_set1_epi32(8); - // - _mm_prefetch(y, _MM_HINT_NTA); - _mm_prefetch(y + 16, _MM_HINT_NTA); - - // m0 = (x[0], x[0], x[0], x[0], x[0], x[0], x[0], x[0]) + // 1 value per register const __m256 m0 = _mm256_set1_ps(x[0]); - // m1 = (x[1], x[1], x[1], x[1], x[1], x[1], x[1], x[1]) const __m256 m1 = _mm256_set1_ps(x[1]); - // m2 = (x[2], x[2], x[2], x[2], x[2], x[2], x[2], x[2]) const __m256 m2 = _mm256_set1_ps(x[2]); - // m3 = (x[3], x[3], x[3], x[3], x[3], x[3], x[3], x[3]) const __m256 m3 = _mm256_set1_ps(x[3]); - const __m256i indices0 = - _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); - for (; i < ny8 * 8; i += 8) { - _mm_prefetch(y + 32, _MM_HINT_NTA); - _mm_prefetch(y + 48, _MM_HINT_NTA); - - // collect dim 0 for 8 D4-vectors. - // v0 = (y[(i * 8 + 0) * 4 + 0], ..., y[(i * 8 + 7) * 4 + 0]) - const __m256 v0 = _mm256_i32gather_ps(y, indices0, 1); - // collect dim 1 for 8 D4-vectors. 
- // v1 = (y[(i * 8 + 0) * 4 + 1], ..., y[(i * 8 + 7) * 4 + 1]) - const __m256 v1 = _mm256_i32gather_ps(y + 1, indices0, 1); - // collect dim 2 for 8 D4-vectors. - // v2 = (y[(i * 8 + 0) * 4 + 2], ..., y[(i * 8 + 7) * 4 + 2]) - const __m256 v2 = _mm256_i32gather_ps(y + 2, indices0, 1); - // collect dim 3 for 8 D4-vectors. - // v3 = (y[(i * 8 + 0) * 4 + 3], ..., y[(i * 8 + 7) * 4 + 3]) - const __m256 v3 = _mm256_i32gather_ps(y + 3, indices0, 1); + __m256 v0; + __m256 v1; + __m256 v2; + __m256 v3; + + transpose_8x4( + _mm256_loadu_ps(y + 0 * 8), + _mm256_loadu_ps(y + 1 * 8), + _mm256_loadu_ps(y + 2 * 8), + _mm256_loadu_ps(y + 3 * 8), + v0, + v1, + v2, + v3); // compute differences const __m256 d0 = _mm256_sub_ps(m0, v0); @@ -654,24 +850,13 @@ distances = _mm256_fmadd_ps(d2, d2, distances); distances = _mm256_fmadd_ps(d3, d3, distances); - // distances[0] = (x[0] - y[(i * 8 + 0) * 4 + 0]) ^ 2 + - // (x[1] - y[(i * 8 + 0) * 4 + 1]) ^ 2 + - // (x[2] - y[(i * 8 + 0) * 4 + 2]) ^ 2 + - // (x[3] - y[(i * 8 + 0) * 4 + 3]) - // ... - // distances[7] = (x[0] - y[(i * 8 + 7) * 4 + 0]) ^ 2 + - // (x[1] - y[(i * 8 + 7) * 4 + 1]) ^ 2 + - // (x[2] - y[(i * 8 + 7) * 4 + 2]) ^ 2 + - // (x[3] - y[(i * 8 + 7) * 4 + 3]) - // compare the new distances to the min distances // for each of 8 AVX2 components. __m256 comparison = _mm256_cmp_ps(min_distances, distances, _CMP_LT_OS); // update min distances and indices with closest vectors if needed. - min_distances = - _mm256_blendv_ps(distances, min_distances, comparison); + min_distances = _mm256_min_ps(distances, min_distances); min_indices = _mm256_castps_si256(_mm256_blendv_ps( _mm256_castsi256_ps(current_indices), _mm256_castsi256_ps(min_indices), @@ -721,7 +906,168 @@ return current_min_index; } + +size_t fvec_L2sqr_ny_nearest_D8( + float* distances_tmp_buffer, + const float* x, + const float* y, + size_t ny) { + // this implementation does not use distances_tmp_buffer. + + // current index being processed + size_t i = 0; + + // min distance and the index of the closest vector so far + float current_min_distance = HUGE_VALF; + size_t current_min_index = 0; + + // process 8 D8-vectors per loop. + const size_t ny8 = ny / 8; + if (ny8 > 0) { + // track min distance and the closest vector independently + // for each of 8 AVX2 components. 
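// Illustrative sketch (not FAISS code): the per-lane minimum pattern shared
// by fvec_L2sqr_ny_nearest_D2/D4/D8 above, written with a plain array
// standing in for the 8 AVX2 lanes. Each lane keeps its own running
// (min distance, index) pair, candidate i of each block goes to lane i % 8
// (a single compare + blend per block in the kernels), and one scalar pass
// at the end merges the lane winners and the leftover tail. The sketch takes
// precomputed distances; the real kernels compute them on the fly from
// transposed loads. The name nearest_with_lanes is hypothetical.
#include <cmath>
#include <cstddef>
#include <cstdint>

struct NearestResult {
    float dis;
    int64_t idx;
};

inline NearestResult nearest_with_lanes(const float* distances, size_t n) {
    constexpr size_t kLanes = 8;
    float lane_dis[kLanes];
    int64_t lane_idx[kLanes];
    for (size_t l = 0; l < kLanes; l++) {
        lane_dis[l] = HUGE_VALF;
        lane_idx[l] = -1;
    }

    // main blocked loop: one update per lane per block of 8 candidates
    const size_t n_main = (n / kLanes) * kLanes;
    for (size_t i = 0; i < n_main; i++) {
        const size_t l = i % kLanes;
        if (distances[i] < lane_dis[l]) {
            lane_dis[l] = distances[i];
            lane_idx[l] = (int64_t)i;
        }
    }

    // merge the 8 lane winners, then the scalar leftovers
    NearestResult best{HUGE_VALF, -1};
    for (size_t l = 0; l < kLanes; l++) {
        if (lane_dis[l] < best.dis) {
            best.dis = lane_dis[l];
            best.idx = lane_idx[l];
        }
    }
    for (size_t i = n_main; i < n; i++) {
        if (distances[i] < best.dis) {
            best.dis = distances[i];
            best.idx = (int64_t)i;
        }
    }
    return best;
}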
+ __m256 min_distances = _mm256_set1_ps(HUGE_VALF); + __m256i min_indices = _mm256_set1_epi32(0); + + __m256i current_indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + const __m256i indices_increment = _mm256_set1_epi32(8); + + // 1 value per register + const __m256 m0 = _mm256_set1_ps(x[0]); + const __m256 m1 = _mm256_set1_ps(x[1]); + const __m256 m2 = _mm256_set1_ps(x[2]); + const __m256 m3 = _mm256_set1_ps(x[3]); + + const __m256 m4 = _mm256_set1_ps(x[4]); + const __m256 m5 = _mm256_set1_ps(x[5]); + const __m256 m6 = _mm256_set1_ps(x[6]); + const __m256 m7 = _mm256_set1_ps(x[7]); + + for (; i < ny8 * 8; i += 8) { + __m256 v0; + __m256 v1; + __m256 v2; + __m256 v3; + __m256 v4; + __m256 v5; + __m256 v6; + __m256 v7; + + transpose_8x8( + _mm256_loadu_ps(y + 0 * 8), + _mm256_loadu_ps(y + 1 * 8), + _mm256_loadu_ps(y + 2 * 8), + _mm256_loadu_ps(y + 3 * 8), + _mm256_loadu_ps(y + 4 * 8), + _mm256_loadu_ps(y + 5 * 8), + _mm256_loadu_ps(y + 6 * 8), + _mm256_loadu_ps(y + 7 * 8), + v0, + v1, + v2, + v3, + v4, + v5, + v6, + v7); + + // compute differences + const __m256 d0 = _mm256_sub_ps(m0, v0); + const __m256 d1 = _mm256_sub_ps(m1, v1); + const __m256 d2 = _mm256_sub_ps(m2, v2); + const __m256 d3 = _mm256_sub_ps(m3, v3); + const __m256 d4 = _mm256_sub_ps(m4, v4); + const __m256 d5 = _mm256_sub_ps(m5, v5); + const __m256 d6 = _mm256_sub_ps(m6, v6); + const __m256 d7 = _mm256_sub_ps(m7, v7); + + // compute squares of differences + __m256 distances = _mm256_mul_ps(d0, d0); + distances = _mm256_fmadd_ps(d1, d1, distances); + distances = _mm256_fmadd_ps(d2, d2, distances); + distances = _mm256_fmadd_ps(d3, d3, distances); + distances = _mm256_fmadd_ps(d4, d4, distances); + distances = _mm256_fmadd_ps(d5, d5, distances); + distances = _mm256_fmadd_ps(d6, d6, distances); + distances = _mm256_fmadd_ps(d7, d7, distances); + + // compare the new distances to the min distances + // for each of 8 AVX2 components. + __m256 comparison = + _mm256_cmp_ps(min_distances, distances, _CMP_LT_OS); + + // update min distances and indices with closest vectors if needed. + min_distances = _mm256_min_ps(distances, min_distances); + min_indices = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(current_indices), + _mm256_castsi256_ps(min_indices), + comparison)); + + // update current indices values. Basically, +8 to each of the + // 8 AVX2 components. + current_indices = + _mm256_add_epi32(current_indices, indices_increment); + + // scroll y forward (8 vectors 8 DIM each). 
+ y += 64; + } + + // dump values and find the minimum distance / minimum index + float min_distances_scalar[8]; + uint32_t min_indices_scalar[8]; + _mm256_storeu_ps(min_distances_scalar, min_distances); + _mm256_storeu_si256((__m256i*)(min_indices_scalar), min_indices); + + for (size_t j = 0; j < 8; j++) { + if (current_min_distance > min_distances_scalar[j]) { + current_min_distance = min_distances_scalar[j]; + current_min_index = min_indices_scalar[j]; + } + } + } + + if (i < ny) { + // process leftovers + __m256 x0 = _mm256_loadu_ps(x); + + for (; i < ny; i++) { + __m256 sub = _mm256_sub_ps(x0, _mm256_loadu_ps(y)); + __m256 accu = _mm256_mul_ps(sub, sub); + y += 8; + + // horitontal sum + const __m256 h0 = _mm256_hadd_ps(accu, accu); + const __m256 h1 = _mm256_hadd_ps(h0, h0); + + // extract high and low __m128 regs from __m256 + const __m128 h2 = _mm256_extractf128_ps(h1, 1); + const __m128 h3 = _mm256_castps256_ps128(h1); + + // get a final hsum into all 4 regs + const __m128 h4 = _mm_add_ss(h2, h3); + + // extract f[0] from __m128 + const float distance = _mm_cvtss_f32(h4); + + if (current_min_distance > distance) { + current_min_distance = distance; + current_min_index = i; + } + } + } + + return current_min_index; +} + #else +size_t fvec_L2sqr_ny_nearest_D2( + float* distances_tmp_buffer, + const float* x, + const float* y, + size_t ny) { + return fvec_L2sqr_ny_nearest_ref(distances_tmp_buffer, x, y, 2, ny); +} + size_t fvec_L2sqr_ny_nearest_D4( float* distances_tmp_buffer, const float* x, @@ -729,6 +1075,14 @@ size_t ny) { return fvec_L2sqr_ny_nearest_ref(distances_tmp_buffer, x, y, 4, ny); } + +size_t fvec_L2sqr_ny_nearest_D8( + float* distances_tmp_buffer, + const float* x, + const float* y, + size_t ny) { + return fvec_L2sqr_ny_nearest_ref(distances_tmp_buffer, x, y, 8, ny); +} #endif size_t fvec_L2sqr_ny_nearest( @@ -743,7 +1097,9 @@ return fvec_L2sqr_ny_nearest_D##dval(distances_tmp_buffer, x, y, ny); switch (d) { + DISPATCH(2) DISPATCH(4) + DISPATCH(8) default: return fvec_L2sqr_ny_nearest_ref(distances_tmp_buffer, x, y, d, ny); } @@ -919,79 +1275,6 @@ } } -float fvec_inner_product(const float* x, const float* y, size_t d) { - __m256 msum1 = _mm256_setzero_ps(); - - while (d >= 8) { - __m256 mx = _mm256_loadu_ps(x); - x += 8; - __m256 my = _mm256_loadu_ps(y); - y += 8; - msum1 = _mm256_add_ps(msum1, _mm256_mul_ps(mx, my)); - d -= 8; - } - - __m128 msum2 = _mm256_extractf128_ps(msum1, 1); - msum2 = _mm_add_ps(msum2, _mm256_extractf128_ps(msum1, 0)); - - if (d >= 4) { - __m128 mx = _mm_loadu_ps(x); - x += 4; - __m128 my = _mm_loadu_ps(y); - y += 4; - msum2 = _mm_add_ps(msum2, _mm_mul_ps(mx, my)); - d -= 4; - } - - if (d > 0) { - __m128 mx = masked_read(d, x); - __m128 my = masked_read(d, y); - msum2 = _mm_add_ps(msum2, _mm_mul_ps(mx, my)); - } - - msum2 = _mm_hadd_ps(msum2, msum2); - msum2 = _mm_hadd_ps(msum2, msum2); - return _mm_cvtss_f32(msum2); -} - -float fvec_L2sqr(const float* x, const float* y, size_t d) { - __m256 msum1 = _mm256_setzero_ps(); - - while (d >= 8) { - __m256 mx = _mm256_loadu_ps(x); - x += 8; - __m256 my = _mm256_loadu_ps(y); - y += 8; - const __m256 a_m_b1 = _mm256_sub_ps(mx, my); - msum1 = _mm256_add_ps(msum1, _mm256_mul_ps(a_m_b1, a_m_b1)); - d -= 8; - } - - __m128 msum2 = _mm256_extractf128_ps(msum1, 1); - msum2 = _mm_add_ps(msum2, _mm256_extractf128_ps(msum1, 0)); - - if (d >= 4) { - __m128 mx = _mm_loadu_ps(x); - x += 4; - __m128 my = _mm_loadu_ps(y); - y += 4; - const __m128 a_m_b1 = _mm_sub_ps(mx, my); - msum2 = _mm_add_ps(msum2, 
_mm_mul_ps(a_m_b1, a_m_b1)); - d -= 4; - } - - if (d > 0) { - __m128 mx = masked_read(d, x); - __m128 my = masked_read(d, y); - __m128 a_m_b1 = _mm_sub_ps(mx, my); - msum2 = _mm_add_ps(msum2, _mm_mul_ps(a_m_b1, a_m_b1)); - } - - msum2 = _mm_hadd_ps(msum2, msum2); - msum2 = _mm_hadd_ps(msum2, msum2); - return _mm_cvtss_f32(msum2); -} - float fvec_L1(const float* x, const float* y, size_t d) { __m256 msum1 = _mm256_setzero_ps(); __m256 signmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffffUL)); @@ -1082,113 +1365,8 @@ return fvec_Linf_ref(x, y, d); } -float fvec_L2sqr(const float* x, const float* y, size_t d) { - __m128 msum1 = _mm_setzero_ps(); - - while (d >= 4) { - __m128 mx = _mm_loadu_ps(x); - x += 4; - __m128 my = _mm_loadu_ps(y); - y += 4; - const __m128 a_m_b1 = _mm_sub_ps(mx, my); - msum1 = _mm_add_ps(msum1, _mm_mul_ps(a_m_b1, a_m_b1)); - d -= 4; - } - - if (d > 0) { - // add the last 1, 2 or 3 values - __m128 mx = masked_read(d, x); - __m128 my = masked_read(d, y); - __m128 a_m_b1 = _mm_sub_ps(mx, my); - msum1 = _mm_add_ps(msum1, _mm_mul_ps(a_m_b1, a_m_b1)); - } - - msum1 = _mm_hadd_ps(msum1, msum1); - msum1 = _mm_hadd_ps(msum1, msum1); - return _mm_cvtss_f32(msum1); -} - -float fvec_inner_product(const float* x, const float* y, size_t d) { - __m128 mx, my; - __m128 msum1 = _mm_setzero_ps(); - - while (d >= 4) { - mx = _mm_loadu_ps(x); - x += 4; - my = _mm_loadu_ps(y); - y += 4; - msum1 = _mm_add_ps(msum1, _mm_mul_ps(mx, my)); - d -= 4; - } - - // add the last 1, 2, or 3 values - mx = masked_read(d, x); - my = masked_read(d, y); - __m128 prod = _mm_mul_ps(mx, my); - - msum1 = _mm_add_ps(msum1, prod); - - msum1 = _mm_hadd_ps(msum1, msum1); - msum1 = _mm_hadd_ps(msum1, msum1); - return _mm_cvtss_f32(msum1); -} - #elif defined(__aarch64__) -float fvec_L2sqr(const float* x, const float* y, size_t d) { - float32x4_t accux4 = vdupq_n_f32(0); - const size_t d_simd = d - (d & 3); - size_t i; - for (i = 0; i < d_simd; i += 4) { - float32x4_t xi = vld1q_f32(x + i); - float32x4_t yi = vld1q_f32(y + i); - float32x4_t sq = vsubq_f32(xi, yi); - accux4 = vfmaq_f32(accux4, sq, sq); - } - float32_t accux1 = vaddvq_f32(accux4); - for (; i < d; ++i) { - float32_t xi = x[i]; - float32_t yi = y[i]; - float32_t sq = xi - yi; - accux1 += sq * sq; - } - return accux1; -} - -float fvec_inner_product(const float* x, const float* y, size_t d) { - float32x4_t accux4 = vdupq_n_f32(0); - const size_t d_simd = d - (d & 3); - size_t i; - for (i = 0; i < d_simd; i += 4) { - float32x4_t xi = vld1q_f32(x + i); - float32x4_t yi = vld1q_f32(y + i); - accux4 = vfmaq_f32(accux4, xi, yi); - } - float32_t accux1 = vaddvq_f32(accux4); - for (; i < d; ++i) { - float32_t xi = x[i]; - float32_t yi = y[i]; - accux1 += xi * yi; - } - return accux1; -} - -float fvec_norm_L2sqr(const float* x, size_t d) { - float32x4_t accux4 = vdupq_n_f32(0); - const size_t d_simd = d - (d & 3); - size_t i; - for (i = 0; i < d_simd; i += 4) { - float32x4_t xi = vld1q_f32(x + i); - accux4 = vfmaq_f32(accux4, xi, xi); - } - float32_t accux1 = vaddvq_f32(accux4); - for (; i < d; ++i) { - float32_t xi = x[i]; - accux1 += xi * xi; - } - return accux1; -} - // not optimized for ARM void fvec_L2sqr_ny( float* dis, @@ -1199,6 +1377,17 @@ fvec_L2sqr_ny_ref(dis, x, y, d, ny); } +void fvec_L2sqr_ny_transposed( + float* dis, + const float* x, + const float* y, + const float* y_sqlen, + size_t d, + size_t d_offset, + size_t ny) { + return fvec_L2sqr_ny_y_transposed_ref(dis, x, y, y_sqlen, d, d_offset, ny); +} + size_t fvec_L2sqr_ny_nearest( float* 
distances_tmp_buffer, const float* x, @@ -1240,10 +1429,6 @@ #else // scalar implementation -float fvec_L2sqr(const float* x, const float* y, size_t d) { - return fvec_L2sqr_ref(x, y, d); -} - float fvec_L1(const float* x, const float* y, size_t d) { return fvec_L1_ref(x, y, d); } @@ -1252,14 +1437,6 @@ return fvec_Linf_ref(x, y, d); } -float fvec_inner_product(const float* x, const float* y, size_t d) { - return fvec_inner_product_ref(x, y, d); -} - -float fvec_norm_L2sqr(const float* x, size_t d) { - return fvec_norm_L2sqr_ref(x, d); -} - void fvec_L2sqr_ny( float* dis, const float* x, @@ -1269,6 +1446,17 @@ fvec_L2sqr_ny_ref(dis, x, y, d, ny); } +void fvec_L2sqr_ny_transposed( + float* dis, + const float* x, + const float* y, + const float* y_sqlen, + size_t d, + size_t d_offset, + size_t ny) { + return fvec_L2sqr_ny_y_transposed_ref(dis, x, y, y_sqlen, d, d_offset, ny); +} + size_t fvec_L2sqr_ny_nearest( float* distances_tmp_buffer, const float* x, diff -Nru faiss-1.7.3/faiss/utils/extra_distances.cpp faiss-1.7.4/faiss/utils/extra_distances.cpp --- faiss-1.7.3/faiss/utils/extra_distances.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/extra_distances.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -50,14 +50,14 @@ } } -template +template void knn_extra_metrics_template( VD vd, const float* x, const float* y, size_t nx, size_t ny, - float_maxheap_array_t* res) { + HeapArray* res) { size_t k = res->k; size_t d = vd.d; size_t check_period = InterruptCallback::get_period_hint(ny * d); @@ -74,16 +74,21 @@ float* simi = res->get_val(i); int64_t* idxi = res->get_ids(i); - maxheap_heapify(k, simi, idxi); + // maxheap_heapify(k, simi, idxi); + heap_heapify(k, simi, idxi); for (j = 0; j < ny; j++) { float disij = vd(x_i, y_j); - if (disij < simi[0]) { - maxheap_replace_top(k, simi, idxi, disij, j); + // if (disij < simi[0]) { + if ((!vd.is_similarity && (disij < simi[0])) || + (vd.is_similarity && (disij > simi[0]))) { + // maxheap_replace_top(k, simi, idxi, disij, j); + heap_replace_top(k, simi, idxi, disij, j); } y_j += d; } - maxheap_reorder(k, simi, idxi); + // maxheap_reorder(k, simi, idxi); + heap_reorder(k, simi, idxi); } InterruptCallback::check(); } @@ -92,7 +97,7 @@ template struct ExtraDistanceComputer : FlatCodesDistanceComputer { VD vd; - Index::idx_t nb; + idx_t nb; const float* q; const float* b; @@ -158,12 +163,14 @@ HANDLE_VAR(BrayCurtis); HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); + HANDLE_VAR(Jaccard); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); } } +template void knn_extra_metrics( const float* x, const float* y, @@ -172,7 +179,7 @@ size_t ny, MetricType mt, float metric_arg, - float_maxheap_array_t* res) { + HeapArray* res) { switch (mt) { #define HANDLE_VAR(kw) \ case METRIC_##kw: { \ @@ -187,12 +194,33 @@ HANDLE_VAR(BrayCurtis); HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); + HANDLE_VAR(Jaccard); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); } } +template void knn_extra_metrics>( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + MetricType mt, + float metric_arg, + HeapArray>* res); + +template void knn_extra_metrics>( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + MetricType mt, + float metric_arg, + HeapArray>* res); + FlatCodesDistanceComputer* get_extra_distance_computer( size_t d, MetricType mt, @@ -213,6 +241,7 @@ HANDLE_VAR(BrayCurtis); HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); + HANDLE_VAR(Jaccard); #undef HANDLE_VAR default: 
FAISS_THROW_MSG("metric type not implemented"); diff -Nru faiss-1.7.3/faiss/utils/extra_distances.h faiss-1.7.4/faiss/utils/extra_distances.h --- faiss-1.7.3/faiss/utils/extra_distances.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/extra_distances.h 2023-04-19 13:18:30.000000000 +0000 @@ -33,6 +33,7 @@ int64_t ldb = -1, int64_t ldd = -1); +template void knn_extra_metrics( const float* x, const float* y, @@ -41,7 +42,7 @@ size_t ny, MetricType mt, float metric_arg, - float_maxheap_array_t* res); + HeapArray* res); /** get a DistanceComputer that refers to this type of distance and * indexes a flat array of size nb */ diff -Nru faiss-1.7.3/faiss/utils/extra_distances-inl.h faiss-1.7.4/faiss/utils/extra_distances-inl.h --- faiss-1.7.3/faiss/utils/extra_distances-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/extra_distances-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -8,6 +8,7 @@ /** In this file are the implementations of extra metrics beyond L2 * and inner product */ +#include #include #include @@ -17,12 +18,13 @@ struct VectorDistance { size_t d; float metric_arg; + static constexpr bool is_similarity = is_similarity_metric(mt); inline float operator()(const float* x, const float* y) const; // heap template to use for this type of metric using C = typename std::conditional< - mt == METRIC_INNER_PRODUCT, + is_similarity_metric(mt), CMin, CMax>::type; }; @@ -114,4 +116,18 @@ return 0.5 * accu; } +template <> +inline float VectorDistance::operator()( + const float* x, + const float* y) const { + // WARNING: this distance is defined only for positive input vectors. + // Providing vectors with negative values would lead to incorrect results. + float accu_num = 0, accu_den = 0; + for (size_t i = 0; i < d; i++) { + accu_num += fmin(x[i], y[i]); + accu_den += fmax(x[i], y[i]); + } + return accu_num / accu_den; +} + } // namespace faiss diff -Nru faiss-1.7.3/faiss/utils/fp16-fp16c.h faiss-1.7.4/faiss/utils/fp16-fp16c.h --- faiss-1.7.3/faiss/utils/fp16-fp16c.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/fp16-fp16c.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #pragma once #include diff -Nru faiss-1.7.3/faiss/utils/fp16.h faiss-1.7.4/faiss/utils/fp16.h --- faiss-1.7.3/faiss/utils/fp16.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/fp16.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + #pragma once #include diff -Nru faiss-1.7.3/faiss/utils/fp16-inl.h faiss-1.7.4/faiss/utils/fp16-inl.h --- faiss-1.7.3/faiss/utils/fp16-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/fp16-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + #pragma once #include diff -Nru faiss-1.7.3/faiss/utils/hamming.cpp faiss-1.7.4/faiss/utils/hamming.cpp --- faiss-1.7.3/faiss/utils/hamming.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/hamming.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -35,6 +35,7 @@ #include #include #include +#include #include static const size_t BLOCKSIZE_QUERY = 8192; @@ -43,26 +44,13 @@ size_t hamming_batch_size = 65536; -const uint8_t hamdis_tab_ham_bytes[256] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; - template void hammings( - const uint64_t* bs1, - const uint64_t* bs2, + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, size_t n1, size_t n2, - hamdis_t* dis) + hamdis_t* __restrict dis) { size_t i, j; @@ -76,8 +64,8 @@ } void hammings( - const uint64_t* bs1, - const uint64_t* bs2, + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, size_t n1, size_t n2, size_t nwords, @@ -95,12 +83,12 @@ /* Count number of matches given a max threshold */ template void hamming_count_thres( - const uint64_t* bs1, - const uint64_t* bs2, + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, size_t n1, size_t n2, hamdis_t ht, - size_t* nptr) { + size_t* __restrict nptr) { const size_t nwords = nbits / 64; size_t i, j, posm = 0; const uint64_t* bs2_ = bs2; @@ -120,10 +108,10 @@ template void crosshamming_count_thres( - const uint64_t* dbs, + const uint64_t* __restrict dbs, size_t n, int ht, - size_t* nptr) { + size_t* __restrict nptr) { const size_t nwords = nbits / 64; size_t i, j, posm = 0; const uint64_t* bs1 = dbs; @@ -142,13 +130,13 @@ template size_t match_hamming_thres( - const uint64_t* bs1, - const uint64_t* bs2, + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, size_t n1, size_t n2, int ht, - int64_t* idx, - hamdis_t* hams) { + int64_t* __restrict idx, + hamdis_t* __restrict hams) { const size_t nwords = nbits / 64; size_t i, j, posm = 0; hamdis_t h; @@ -181,12 +169,13 @@ template static void hammings_knn_hc( int bytes_per_code, - int_maxheap_array_t* ha, - const uint8_t* bs1, - const uint8_t* bs2, + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict bs1, + const uint8_t* __restrict bs2, size_t n2, bool order = true, - bool init_heap = true) { + bool init_heap = true, + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK) { size_t k = ha->k; if (init_heap) ha->heapify(); @@ -198,17 +187,44 @@ for (int64_t i = 0; i < ha->nh; i++) { HammingComputer hc(bs1 + i * bytes_per_code, bytes_per_code); - const uint8_t* bs2_ = bs2 + j0 * bytes_per_code; + const uint8_t* __restrict bs2_ = bs2 + j0 * bytes_per_code; hamdis_t dis; hamdis_t* __restrict bh_val_ = ha->val + i * k; int64_t* __restrict bh_ids_ = ha->ids + i * k; - size_t j; - for (j = j0; j < j1; j++, bs2_ += 
bytes_per_code) { - dis = hc.hamming(bs2_); - if (dis < bh_val_[0]) { - faiss::maxheap_replace_top( - k, bh_val_, bh_ids_, dis, j); - } + + // if larger number of k is required, then ::bs_addn() needs to be + // used instead of ::addn() +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + FAISS_THROW_IF_NOT_FMT( \ + k <= NB * BD, \ + "The chosen mode (%d) of approximate top-k supports " \ + "up to %d values, but %zd is requested.", \ + (int)(ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD), \ + NB * BD, \ + k); \ + HeapWithBucketsForHamming32< \ + CMax, \ + NB, \ + BD, \ + HammingComputer>:: \ + addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: { + for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { + dis = hc.hamming(bs2_); + if (dis < bh_val_[0]) { + faiss::maxheap_replace_top( + k, bh_val_, bh_ids_, dis, j); + } + } + } break; } } } @@ -220,13 +236,13 @@ template static void hammings_knn_mc( int bytes_per_code, - const uint8_t* a, - const uint8_t* b, + const uint8_t* __restrict a, + const uint8_t* __restrict b, size_t na, size_t nb, size_t k, - int32_t* distances, - int64_t* labels) { + int32_t* __restrict distances, + int64_t* __restrict labels) { const int nBuckets = bytes_per_code * 8 + 1; std::vector all_counters(na * nBuckets, 0); std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); @@ -271,44 +287,6 @@ } } -// works faster than the template version -static void hammings_knn_hc_1( - int_maxheap_array_t* ha, - const uint64_t* bs1, - const uint64_t* bs2, - size_t n2, - bool order = true, - bool init_heap = true) { - const size_t nwords = 1; - size_t k = ha->k; - - if (init_heap) { - ha->heapify(); - } - -#pragma omp parallel for - for (int64_t i = 0; i < ha->nh; i++) { - const uint64_t bs1_ = bs1[i]; - const uint64_t* bs2_ = bs2; - hamdis_t dis; - hamdis_t* bh_val_ = ha->val + i * k; - hamdis_t bh_val_0 = bh_val_[0]; - int64_t* bh_ids_ = ha->ids + i * k; - size_t j; - for (j = 0; j < n2; j++, bs2_ += nwords) { - dis = popcount64(bs1_ ^ *bs2_); - if (dis < bh_val_0) { - faiss::maxheap_replace_top( - k, bh_val_, bh_ids_, dis, j); - bh_val_0 = bh_val_[0]; - } - } - } - if (order) { - ha->reorder(); - } -} - /* Functions to maps vectors to bits. Assume proper allocation done beforehand, meaning that b should be be able to receive as many bits as x may produce. */ @@ -316,7 +294,7 @@ * dimension 0 corresponds to the least significant bit of b[0], or * equivalently to the lsb of the first byte that is stored. */ -void fvec2bitvec(const float* x, uint8_t* b, size_t d) { +void fvec2bitvec(const float* __restrict x, uint8_t* __restrict b, size_t d) { for (int i = 0; i < d; i += 8) { uint8_t w = 0; uint8_t mask = 1; @@ -333,14 +311,22 @@ /* Same but for n vectors. Ensure that the ouptut b is byte-aligned (pad with 0s). 
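Note: in the hunk above, the default: branch is the exact top-k path (approx_topk_mode == EXACT_TOPK): a max-heap of size k holds the best distances seen so far and its top is replaced whenever a closer code appears, while the HANDLE_APPROX cases hand the same loop to HeapWithBucketsForHamming32. A minimal sketch of just the exact behaviour, with std::priority_queue standing in for faiss' maxheap_* helpers:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <queue>
#include <utility>
#include <vector>

// Return the k smallest (distance, id) pairs in ascending order, mirroring
// the exact-top-k branch of hammings_knn_hc: max-heap on distance, replace
// the top when a better candidate shows up, reorder at the end.
std::vector<std::pair<int, int64_t>> exact_topk(
        const std::vector<int>& dis, size_t k) {
    std::priority_queue<std::pair<int, int64_t>> heap; // top = worst of the best-k
    for (size_t j = 0; j < dis.size(); j++) {
        if (heap.size() < k) {
            heap.emplace(dis[j], (int64_t)j);
        } else if (dis[j] < heap.top().first) {
            heap.pop();
            heap.emplace(dis[j], (int64_t)j);
        }
    }
    std::vector<std::pair<int, int64_t>> res;
    while (!heap.empty()) {
        res.push_back(heap.top());
        heap.pop();
    }
    std::reverse(res.begin(), res.end()); // ascending distance, like ha->reorder()
    return res;
}

int main() {
    std::vector<int> dis = {7, 3, 9, 1, 4};
    auto best = exact_topk(dis, 2);
    for (const auto& p : best) {
        std::printf("id %lld dist %d\n", (long long)p.second, p.first); // id 3 / 1, id 1 / 3
    }
}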
*/ -void fvecs2bitvecs(const float* x, uint8_t* b, size_t d, size_t n) { +void fvecs2bitvecs( + const float* __restrict x, + uint8_t* __restrict b, + size_t d, + size_t n) { const int64_t ncodes = ((d + 7) / 8); #pragma omp parallel for if (n > 100000) for (int64_t i = 0; i < n; i++) fvec2bitvec(x + i * d, b + i * ncodes, d); } -void bitvecs2fvecs(const uint8_t* b, float* x, size_t d, size_t n) { +void bitvecs2fvecs( + const uint8_t* __restrict b, + float* __restrict x, + size_t d, + size_t n) { const int64_t ncodes = ((d + 7) / 8); #pragma omp parallel for if (n > 100000) for (int64_t i = 0; i < n; i++) { @@ -378,9 +364,9 @@ size_t n, size_t da, size_t db, - const int* order, - const uint8_t* a, - uint8_t* b) { + const int* __restrict order, + const uint8_t* __restrict a, + uint8_t* __restrict b) { for (size_t i = 0; i < db; i++) { FAISS_THROW_IF_NOT(order[i] >= 0 && order[i] < da); } @@ -407,8 +393,8 @@ /* Compute a set of Hamming distances */ void hammings( - const uint8_t* a, - const uint8_t* b, + const uint8_t* __restrict a, + const uint8_t* __restrict b, size_t na, size_t nb, size_t ncodes, @@ -434,9 +420,9 @@ } void hammings_knn( - int_maxheap_array_t* ha, - const uint8_t* a, - const uint8_t* b, + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict a, + const uint8_t* __restrict b, size_t nb, size_t ncodes, int order) { @@ -444,54 +430,52 @@ } void hammings_knn_hc( - int_maxheap_array_t* ha, - const uint8_t* a, - const uint8_t* b, + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict a, + const uint8_t* __restrict b, size_t nb, size_t ncodes, - int order) { + int order, + ApproxTopK_mode_t approx_topk_mode) { switch (ncodes) { case 4: hammings_knn_hc( - 4, ha, a, b, nb, order, true); + 4, ha, a, b, nb, order, true, approx_topk_mode); break; case 8: - hammings_knn_hc_1(ha, C64(a), C64(b), nb, order, true); - // hammings_knn_hc - // (8, ha, a, b, nb, order, true); + hammings_knn_hc( + 8, ha, a, b, nb, order, true, approx_topk_mode); break; case 16: hammings_knn_hc( - 16, ha, a, b, nb, order, true); + 16, ha, a, b, nb, order, true, approx_topk_mode); break; case 32: hammings_knn_hc( - 32, ha, a, b, nb, order, true); + 32, ha, a, b, nb, order, true, approx_topk_mode); break; default: hammings_knn_hc( - ncodes, ha, a, b, nb, order, true); + ncodes, ha, a, b, nb, order, true, approx_topk_mode); break; } } void hammings_knn_mc( - const uint8_t* a, - const uint8_t* b, + const uint8_t* __restrict a, + const uint8_t* __restrict b, size_t na, size_t nb, size_t k, size_t ncodes, - int32_t* distances, - int64_t* labels) { + int32_t* __restrict distances, + int64_t* __restrict labels) { switch (ncodes) { case 4: hammings_knn_mc( 4, a, b, na, nb, k, distances, labels); break; case 8: - // TODO(hoss): Write analog to hammings_knn_hc_1 - // hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true); hammings_knn_mc( 8, a, b, na, nb, k, distances, labels); break; @@ -664,13 +648,13 @@ template static void hamming_dis_inner_loop( - const uint8_t* ca, - const uint8_t* cb, + const uint8_t* __restrict ca, + const uint8_t* __restrict cb, size_t nb, size_t code_size, int k, - hamdis_t* bh_val_, - int64_t* bh_ids_) { + hamdis_t* __restrict bh_val_, + int64_t* __restrict bh_ids_) { HammingComputer hc(ca, code_size); for (size_t j = 0; j < nb; j++) { @@ -683,9 +667,9 @@ } void generalized_hammings_knn_hc( - int_maxheap_array_t* ha, - const uint8_t* a, - const uint8_t* b, + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict a, + const uint8_t* __restrict b, size_t nb, size_t 
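Note: a hedged usage sketch of the updated hammings_knn_hc entry point shown above. The HeapArray field names (nh, k, val, ids) follow faiss/utils/Heap.h, and the call relies on the defaulted ApproxTopK_mode_t parameter added in this patch; treat it as an illustration against faiss 1.7.4 rather than canonical usage.

// Hedged usage sketch (assumes faiss 1.7.4 headers are available).
#include <cstddef>
#include <cstdint>

#include <faiss/utils/Heap.h>
#include <faiss/utils/hamming.h>

void knn_binary(const uint8_t* xq, const uint8_t* xb,
                size_t nq, size_t nb, size_t code_size, size_t k,
                int* distances, int64_t* labels) {
    faiss::int_maxheap_array_t ha;
    ha.nh = nq;          // one heap per query
    ha.k = k;            // neighbours per query
    ha.val = distances;  // nq * k Hamming distances (ascending after reorder)
    ha.ids = labels;     // nq * k database ids
    // ordered = 1 sorts each result list; a seventh ApproxTopK_mode_t
    // argument (default EXACT_TOPK) would select the approximate top-k path.
    faiss::hammings_knn_hc(&ha, xq, xb, nb, code_size, /*ordered=*/1);
}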
code_size, int ordered) { @@ -697,11 +681,11 @@ #pragma omp parallel for for (int i = 0; i < na; i++) { - const uint8_t* ca = a + i * code_size; - const uint8_t* cb = b; + const uint8_t* __restrict ca = a + i * code_size; + const uint8_t* __restrict cb = b; - hamdis_t* bh_val_ = ha->val + i * k; - int64_t* bh_ids_ = ha->ids + i * k; + hamdis_t* __restrict bh_val_ = ha->val + i * k; + int64_t* __restrict bh_ids_ = ha->ids + i * k; switch (code_size) { case 8: diff -Nru faiss-1.7.3/faiss/utils/hamming_distance/avx2-inl.h faiss-1.7.4/faiss/utils/hamming_distance/avx2-inl.h --- faiss-1.7.3/faiss/utils/hamming_distance/avx2-inl.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/hamming_distance/avx2-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,535 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef HAMMING_AVX2_INL_H +#define HAMMING_AVX2_INL_H + +// AVX2 version + +#include +#include +#include + +#include + +#include + +namespace faiss { + +/* Elementary Hamming distance computation: unoptimized */ +template +inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { + const size_t nbytes = nbits / 8; + size_t i; + T h = 0; + for (i = 0; i < nbytes; i++) { + h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; + } + return h; +} + +/* Hamming distances for multiples of 64 bits */ +template +inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { + const size_t nwords = nbits / 64; + size_t i; + hamdis_t h = 0; + for (i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/* specialized (optimized) functions */ +template <> +inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]); +} + +template <> +inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); +} + +template <> +inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + + popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); +} + +/* Hamming distances for multiple of 64 bits */ +inline hamdis_t hamming( + const uint64_t* bs1, + const uint64_t* bs2, + size_t nwords) { + hamdis_t h = 0; + for (size_t i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/****************************************************************** + * The HammingComputer series of classes compares a single code of + * size 4 to 32 to incoming codes. They are intended for use as a + * template class where it would be inefficient to switch on the code + * size in the inner loop. Hopefully the compiler will inline the + * hamming() functions and put the a0, a1, ... in registers. 
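Note: every hamming<nbits>() template and its 64/128/256-bit specializations above boil down to the same scalar kernel: XOR corresponding 64-bit words and accumulate popcounts. A self-contained version with a small numeric check (illustrative names, not faiss code):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hamming distance between two binary codes stored as nwords 64-bit words.
static int hamming_words(const uint64_t* a, const uint64_t* b, size_t nwords) {
    int h = 0;
    for (size_t i = 0; i < nwords; i++) {
        h += __builtin_popcountll(a[i] ^ b[i]); // bits that differ in word i
    }
    return h;
}

int main() {
    uint64_t a[4] = {0xFFULL, 0, 0, 1};
    uint64_t b[4] = {0x0FULL, 0, 1, 1};
    // word 0 differs in 4 bits (0xF0), word 2 differs in 1 bit
    std::printf("%d\n", hamming_words(a, b, 4)); // 5
}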
+ ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 4); + a0 = *(uint32_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return popcount64(*(uint32_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return popcount64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct HammingComputer16 { + uint64_t a0, a1; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 16); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. +struct HammingComputer20 { + uint64_t a0, a1; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 20); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(*(uint32_t*)(b + 2) ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint64_t a0, a1, a2, a3; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 32); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct HammingComputer64 { + uint64_t a0, a1, a2, a3, a4, a5, a6, a7; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 64); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + + popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + + popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); + } + + inline static constexpr int get_code_size() { + return 64; + } +}; + +struct 
HammingComputerDefault { + const uint8_t* a8; + int quotient8; + int remainder8; + + HammingComputerDefault() {} + + HammingComputerDefault(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + this->a8 = a8; + quotient8 = code_size / 8; + remainder8 = code_size % 8; + } + + int hamming(const uint8_t* b8) const { + int accu = 0; + + const uint64_t* a64 = reinterpret_cast(a8); + const uint64_t* b64 = reinterpret_cast(b8); + int i = 0, len = quotient8; + switch (len & 7) { + default: + while (len > 7) { + len -= 8; + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 7: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 6: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 5: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 4: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 3: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 2: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 1: + accu += popcount64(a64[i] ^ b64[i]); + i++; + } + } + if (remainder8) { + const uint8_t* a = a8 + 8 * quotient8; + const uint8_t* b = b8 + 8 * quotient8; + switch (remainder8) { + case 7: + accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + default: + break; + } + } + + return accu; + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +// more inefficient than HammingComputerDefault (obsolete) +struct HammingComputerM8 { + const uint64_t* a; + int n; + + HammingComputerM8() {} + + HammingComputerM8(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += popcount64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 8; + } +}; + +// more inefficient than HammingComputerDefault (obsolete) +struct HammingComputerM4 { + const uint32_t* a; + int n; + + HammingComputerM4() {} + + HammingComputerM4(const uint8_t* a4, int code_size) { + set(a4, code_size); + } + + void set(const uint8_t* a4, int code_size) { + assert(code_size % 4 == 0); + a = (uint32_t*)a4; + n = code_size / 4; + } + + int hamming(const uint8_t* b8) const { + const uint32_t* b = (uint32_t*)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += popcount64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 4; + } +}; + +/*************************************************************************** + * Equivalence with a template class when code size is known at compile time + **************************************************************************/ + +// default template +template +struct HammingComputer : HammingComputerDefault { + HammingComputer(const uint8_t* a, int code_size) + : HammingComputerDefault(a, code_size) {} +}; + +#define SPECIALIZED_HC(CODE_SIZE) \ + template <> \ + struct HammingComputer : HammingComputer##CODE_SIZE { \ + HammingComputer(const uint8_t* a) \ + : HammingComputer##CODE_SIZE(a, CODE_SIZE) {} \ + } + +SPECIALIZED_HC(4); +SPECIALIZED_HC(8); +SPECIALIZED_HC(16); 
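Note: HammingComputerDefault handles arbitrary code sizes by walking quotient8 64-bit words (unrolled with a Duff's-device switch) and finishing the remaining 0-7 bytes with the byte popcount table. The same logic as a plain loop, without the manual unrolling (a sketch, not the faiss implementation):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Hamming distance for an arbitrary code size: bulk 64-bit words first,
// then a 0-7 byte tail. faiss uses hamdis_tab_ham_bytes for the tail;
// the builtin per-byte popcount is equivalent.
static int hamming_any_size(const uint8_t* a, const uint8_t* b, int code_size) {
    int accu = 0;
    int nwords = code_size / 8;
    for (int i = 0; i < nwords; i++) {
        uint64_t wa, wb;
        std::memcpy(&wa, a + 8 * i, 8); // memcpy avoids unaligned 64-bit loads
        std::memcpy(&wb, b + 8 * i, 8);
        accu += __builtin_popcountll(wa ^ wb);
    }
    for (int i = 8 * nwords; i < code_size; i++) {
        accu += __builtin_popcount((unsigned)(a[i] ^ b[i]));
    }
    return accu;
}

int main() {
    uint8_t a[11] = {0xFF, 0, 0, 0, 0, 0, 0, 0, 0x01, 0x00, 0x80};
    uint8_t b[11] = {0x0F, 0, 0, 0, 0, 0, 0, 0, 0x00, 0x00, 0x80};
    std::printf("%d\n", hamming_any_size(a, b, 11)); // 4 + 1 = 5
}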
+SPECIALIZED_HC(20); +SPECIALIZED_HC(32); +SPECIALIZED_HC(64); + +#undef SPECIALIZED_HC + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. + ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +struct GenHammingComputer8 { + uint64_t a0; + + GenHammingComputer8(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return generalized_hamming_64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +// I'm not sure whether this version is faster of slower, tbh +// todo: test on different CPUs +struct GenHammingComputer16 { + __m128i a; + + GenHammingComputer16(const uint8_t* a8, int code_size) { + assert(code_size == 16); + a = _mm_loadu_si128((const __m128i_u*)a8); + } + + inline int hamming(const uint8_t* b8) const { + const __m128i b = _mm_loadu_si128((const __m128i_u*)b8); + const __m128i cmp = _mm_cmpeq_epi8(a, b); + const auto movemask = _mm_movemask_epi8(cmp); + return 16 - popcount32(movemask); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +struct GenHammingComputer32 { + __m256i a; + + GenHammingComputer32(const uint8_t* a8, int code_size) { + assert(code_size == 32); + a = _mm256_loadu_si256((const __m256i_u*)a8); + } + + inline int hamming(const uint8_t* b8) const { + const __m256i b = _mm256_loadu_si256((const __m256i_u*)b8); + const __m256i cmp = _mm256_cmpeq_epi8(a, b); + const uint32_t movemask = _mm256_movemask_epi8(cmp); + return 32 - popcount32(movemask); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +// A specialized version might be needed for the very long +// GenHamming code_size. In such a case, one may accumulate +// counts using _mm256_sub_epi8 and then compute a horizontal +// sum (using _mm256_sad_epu8, maybe, in blocks of no larger +// than 256 * 32 bytes). + +struct GenHammingComputerM8 { + const uint64_t* a; + int n; + + GenHammingComputerM8(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + + int i = 0; + int n4 = (n / 4) * 4; + for (; i < n4; i += 4) { + const __m256i av = _mm256_loadu_si256((const __m256i_u*)(a + i)); + const __m256i bv = _mm256_loadu_si256((const __m256i_u*)(b + i)); + const __m256i cmp = _mm256_cmpeq_epi8(av, bv); + const uint32_t movemask = _mm256_movemask_epi8(cmp); + accu += 32 - popcount32(movemask); + } + + for (; i < n; i++) + accu += generalized_hamming_64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 8; + } +}; + +} // namespace faiss + +#endif diff -Nru faiss-1.7.3/faiss/utils/hamming_distance/common.h faiss-1.7.4/faiss/utils/hamming_distance/common.h --- faiss-1.7.3/faiss/utils/hamming_distance/common.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/hamming_distance/common.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,48 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
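Note: generalized_hamming_64 counts differing bytes by OR-smearing each byte of the XOR into its lowest bit and masking with 0x0101...01, so one popcount yields the byte count; the SSE/AVX2 computers above obtain the same number from a byte-equality compare plus movemask. A scalar check of the smearing trick against a direct per-byte comparison:

#include <cstdint>
#include <cstdio>

// Number of non-zero bytes in a 64-bit word, same bit-smearing trick as
// generalized_hamming_64.
static int nonzero_bytes(uint64_t a) {
    a |= a >> 1;
    a |= a >> 2;
    a |= a >> 4; // now bit 0 of each byte is set iff that byte was non-zero
    a &= 0x0101010101010101ULL;
    return __builtin_popcountll(a);
}

int main() {
    uint64_t x = 0x1122334455667788ULL;
    uint64_t y = 0x1122AA4455667700ULL;
    uint64_t d = x ^ y; // non-zero bytes of d mark the bytes that differ
    int direct = 0;
    for (int i = 0; i < 8; i++) {
        direct += ((d >> (8 * i)) & 0xFF) != 0;
    }
    std::printf("%d %d\n", nonzero_bytes(d), direct); // both print 2
}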
+ */ + +#ifndef FAISS_hamming_common_h +#define FAISS_hamming_common_h + +#include + +#include + +/* The Hamming distance type */ +using hamdis_t = int32_t; + +namespace faiss { + +inline int popcount32(uint32_t x) { + return __builtin_popcount(x); +} + +// popcount +inline int popcount64(uint64_t x) { + return __builtin_popcountl(x); +} + +// This table was moved from .cpp to .h file, because +// otherwise it was causing compilation errors while trying to +// compile swig modules on Windows. +// todo for C++17: switch to 'inline constexpr' +static constexpr uint8_t hamdis_tab_ham_bytes[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + +} // namespace faiss + +#endif diff -Nru faiss-1.7.3/faiss/utils/hamming_distance/generic-inl.h faiss-1.7.4/faiss/utils/hamming_distance/generic-inl.h --- faiss-1.7.3/faiss/utils/hamming_distance/generic-inl.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/hamming_distance/generic-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,519 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef HAMMING_GENERIC_INL_H +#define HAMMING_GENERIC_INL_H + +// A general-purpose version of hamming distance computation. 
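Note: hamdis_tab_ham_bytes is simply the 8-bit popcount table (tab[v] = number of set bits in v), moved into this header so the Windows SWIG build can see it; tab[b1 ^ b2] is therefore the Hamming distance between two bytes. A quick local check of that invariant, rebuilding the table instead of including the faiss header:

#include <cstdint>
#include <cstdio>

int main() {
    // Rebuild an 8-bit popcount table laid out like hamdis_tab_ham_bytes.
    uint8_t tab[256];
    for (int v = 0; v < 256; v++) {
        tab[v] = (uint8_t)__builtin_popcount((unsigned)v);
    }
    // spot checks matching the first and last entries of the table above:
    // tab[0]=0, tab[1]=1, tab[2]=1, tab[3]=2, ..., tab[255]=8
    std::printf("%d %d %d %d %d\n", tab[0], tab[1], tab[2], tab[3], tab[255]);
    std::printf("dist(0xF0, 0x0F) = %d\n", tab[0xF0 ^ 0x0F]); // 8
}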
+ +#include +#include +#include + +#include + +namespace faiss { + +/* Elementary Hamming distance computation: unoptimized */ +template +inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { + const size_t nbytes = nbits / 8; + size_t i; + T h = 0; + for (i = 0; i < nbytes; i++) { + h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; + } + return h; +} + +/* Hamming distances for multiples of 64 bits */ +template +inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { + const size_t nwords = nbits / 64; + size_t i; + hamdis_t h = 0; + for (i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/* specialized (optimized) functions */ +template <> +inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]); +} + +template <> +inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); +} + +template <> +inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + + popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); +} + +/* Hamming distances for multiple of 64 bits */ +inline hamdis_t hamming( + const uint64_t* bs1, + const uint64_t* bs2, + size_t nwords) { + hamdis_t h = 0; + for (size_t i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/****************************************************************** + * The HammingComputer series of classes compares a single code of + * size 4 to 32 to incoming codes. They are intended for use as a + * template class where it would be inefficient to switch on the code + * size in the inner loop. Hopefully the compiler will inline the + * hamming() functions and put the a0, a1, ... in registers. + ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 4); + a0 = *(uint32_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return popcount64(*(uint32_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return popcount64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct HammingComputer16 { + uint64_t a0, a1; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 16); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. 
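Note: the point of the fixed-size HammingComputerNN classes is that the query code is loaded once into member words (ideally registers), after which each database code costs only XORs and popcounts with the code size fixed at compile time. A minimal standalone analogue of the pattern, scanning a small database for the nearest 8-byte code (names are illustrative, not faiss API):

#include <climits>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Query-side state cached once, in the spirit of HammingComputer8.
struct MiniComputer8 {
    uint64_t a0;
    explicit MiniComputer8(const uint8_t* a) {
        std::memcpy(&a0, a, 8);
    }
    int hamming(const uint8_t* b) const {
        uint64_t b0;
        std::memcpy(&b0, b, 8);
        return __builtin_popcountll(a0 ^ b0);
    }
};

// Generic scan loop: works with any computer exposing hamming(const uint8_t*).
template <class Computer>
int64_t nearest(const Computer& hc, const uint8_t* db, size_t nb, size_t code_size) {
    int best = INT_MAX;
    int64_t best_id = -1;
    for (size_t j = 0; j < nb; j++) {
        int d = hc.hamming(db + j * code_size);
        if (d < best) {
            best = d;
            best_id = (int64_t)j;
        }
    }
    return best_id;
}

int main() {
    uint8_t q[8]   = {0xFF, 0, 0, 0, 0, 0, 0, 0};
    uint8_t db[16] = {0x00, 0, 0, 0, 0, 0, 0, 0,   // distance 8
                      0xF0, 0, 0, 0, 0, 0, 0, 0};  // distance 4
    MiniComputer8 hc(q);
    std::printf("%lld\n", (long long)nearest(hc, db, 2, 8)); // 1
}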
+struct HammingComputer20 { + uint64_t a0, a1; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 20); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(*(uint32_t*)(b + 2) ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint64_t a0, a1, a2, a3; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 32); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct HammingComputer64 { + uint64_t a0, a1, a2, a3, a4, a5, a6, a7; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 64); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + + popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + + popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); + } + + inline static constexpr int get_code_size() { + return 64; + } +}; + +struct HammingComputerDefault { + const uint8_t* a8; + int quotient8; + int remainder8; + + HammingComputerDefault() {} + + HammingComputerDefault(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + this->a8 = a8; + quotient8 = code_size / 8; + remainder8 = code_size % 8; + } + + int hamming(const uint8_t* b8) const { + int accu = 0; + + const uint64_t* a64 = reinterpret_cast(a8); + const uint64_t* b64 = reinterpret_cast(b8); + int i = 0, len = quotient8; + switch (len & 7) { + default: + while (len > 7) { + len -= 8; + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 7: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 6: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 5: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 4: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 3: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 2: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 1: + accu += popcount64(a64[i] ^ b64[i]); + i++; + } + } + if (remainder8) { + const uint8_t* a = a8 + 8 * quotient8; + const uint8_t* b = b8 + 8 * quotient8; + switch (remainder8) { + case 7: + accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + default: + break; + } + } + + return 
accu; + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +// more inefficient than HammingComputerDefault (obsolete) +struct HammingComputerM8 { + const uint64_t* a; + int n; + + HammingComputerM8() {} + + HammingComputerM8(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += popcount64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 8; + } +}; + +// more inefficient than HammingComputerDefault (obsolete) +struct HammingComputerM4 { + const uint32_t* a; + int n; + + HammingComputerM4() {} + + HammingComputerM4(const uint8_t* a4, int code_size) { + set(a4, code_size); + } + + void set(const uint8_t* a4, int code_size) { + assert(code_size % 4 == 0); + a = (uint32_t*)a4; + n = code_size / 4; + } + + int hamming(const uint8_t* b8) const { + const uint32_t* b = (uint32_t*)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += popcount64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 4; + } +}; + +/*************************************************************************** + * Equivalence with a template class when code size is known at compile time + **************************************************************************/ + +// default template +template +struct HammingComputer : HammingComputerDefault { + HammingComputer(const uint8_t* a, int code_size) + : HammingComputerDefault(a, code_size) {} +}; + +#define SPECIALIZED_HC(CODE_SIZE) \ + template <> \ + struct HammingComputer : HammingComputer##CODE_SIZE { \ + HammingComputer(const uint8_t* a) \ + : HammingComputer##CODE_SIZE(a, CODE_SIZE) {} \ + } + +SPECIALIZED_HC(4); +SPECIALIZED_HC(8); +SPECIALIZED_HC(16); +SPECIALIZED_HC(20); +SPECIALIZED_HC(32); +SPECIALIZED_HC(64); + +#undef SPECIALIZED_HC + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. 
+ ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +struct GenHammingComputer8 { + uint64_t a0; + + GenHammingComputer8(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return generalized_hamming_64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct GenHammingComputer16 { + uint64_t a0, a1; + GenHammingComputer16(const uint8_t* a8, int code_size) { + assert(code_size == 16); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return generalized_hamming_64(b[0] ^ a0) + + generalized_hamming_64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +struct GenHammingComputer32 { + uint64_t a0, a1, a2, a3; + + GenHammingComputer32(const uint8_t* a8, int code_size) { + assert(code_size == 32); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return generalized_hamming_64(b[0] ^ a0) + + generalized_hamming_64(b[1] ^ a1) + + generalized_hamming_64(b[2] ^ a2) + + generalized_hamming_64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct GenHammingComputerM8 { + const uint64_t* a; + int n; + + GenHammingComputerM8(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += generalized_hamming_64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 8; + } +}; + +} // namespace faiss + +#endif diff -Nru faiss-1.7.3/faiss/utils/hamming_distance/hamdis-inl.h faiss-1.7.4/faiss/utils/hamming_distance/hamdis-inl.h --- faiss-1.7.3/faiss/utils/hamming_distance/hamdis-inl.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/hamming_distance/hamdis-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,26 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file contains low level inline facilities for computing +// Hamming distances, such as HammingComputerXX and GenHammingComputerXX. + +#ifndef FAISS_hamming_inl_h +#define FAISS_hamming_inl_h + +#include + +#ifdef __aarch64__ +// ARM compilers may produce inoptimal code for Hamming distance somewhy. +#include +#elif __AVX2__ +// better versions for GenHammingComputer +#include +#else +#include +#endif + +#endif diff -Nru faiss-1.7.3/faiss/utils/hamming_distance/neon-inl.h faiss-1.7.4/faiss/utils/hamming_distance/neon-inl.h --- faiss-1.7.3/faiss/utils/hamming_distance/neon-inl.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/hamming_distance/neon-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,614 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
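Note: hamdis-inl.h above selects one implementation header per target, presumably neon-inl.h on __aarch64__, avx2-inl.h under __AVX2__, and generic-inl.h otherwise, matching the three files added in this patch. The same dispatch idea in a single self-contained file, with only a NEON branch and a portable fallback (the AVX2 path is omitted here for brevity; intrinsic usage is a sketch, not the faiss code):

#include <cstdint>
#include <cstdio>
#include <cstring>

#if defined(__aarch64__)
#include <arm_neon.h>
// NEON: XOR the two 16-byte codes, per-byte popcount, horizontal sum.
static int hamming16(const uint8_t* a, const uint8_t* b) {
    uint8x16_t x = veorq_u8(vld1q_u8(a), vld1q_u8(b));
    return vaddvq_u8(vcntq_u8(x));
}
#else
// Portable fallback: two 64-bit XOR + popcount steps.
static int hamming16(const uint8_t* a, const uint8_t* b) {
    uint64_t wa[2], wb[2];
    std::memcpy(wa, a, 16);
    std::memcpy(wb, b, 16);
    return __builtin_popcountll(wa[0] ^ wb[0]) +
           __builtin_popcountll(wa[1] ^ wb[1]);
}
#endif

int main() {
    uint8_t a[16] = {0xFF, 0x0F};
    uint8_t b[16] = {0x0F, 0x0F};
    std::printf("%d\n", hamming16(a, b)); // 4
}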
+ */ + +#ifndef HAMMING_NEON_INL_H +#define HAMMING_NEON_INL_H + +// a specialized version of hamming is needed here, because both +// gcc, clang and msvc seem to generate suboptimal code sometimes. + +#ifdef __aarch64__ + +#include + +#include +#include +#include + +#include + +#include + +namespace faiss { + +/* Elementary Hamming distance computation: unoptimized */ +template +inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { + const size_t nbytes = nbits / 8; + size_t i; + T h = 0; + for (i = 0; i < nbytes; i++) { + h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; + } + return h; +} + +/* Hamming distances for multiples of 64 bits */ +template +inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb) { + constexpr size_t nwords256 = nbits / 256; + constexpr size_t nwords128 = (nbits - nwords256 * 256) / 128; + constexpr size_t nwords64 = + (nbits - nwords256 * 256 - nwords128 * 128) / 64; + + hamdis_t h = 0; + if (nwords256 > 0) { + for (size_t i = 0; i < nwords256; i++) { + h += hamming<256>(pa, pb); + pa += 4; + pb += 4; + } + } + + if (nwords128 > 0) { + h += hamming<128>(pa, pb); + pa += 2; + pb += 2; + } + + if (nwords64 > 0) { + h += hamming<64>(pa, pb); + } + + return h; +} + +/* specialized (optimized) functions */ +template <> +inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]); +} + +template <> +inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { + const uint8_t* pa8 = reinterpret_cast(pa); + const uint8_t* pb8 = reinterpret_cast(pb); + uint8x16_t or0 = veorq_u8(vld1q_u8(pa8), vld1q_u8(pb8)); + uint8x16_t c0 = vcntq_u8(or0); + auto dis = vaddvq_u8(c0); + return dis; +} + +template <> +inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { + const uint8_t* pa8 = reinterpret_cast(pa); + const uint8_t* pb8 = reinterpret_cast(pb); + uint8x16_t or0 = veorq_u8(vld1q_u8(pa8), vld1q_u8(pb8)); + uint8x16_t or1 = veorq_u8(vld1q_u8(pa8 + 16), vld1q_u8(pb8 + 16)); + uint8x16_t c0 = vcntq_u8(or0); + uint8x16_t c1 = vcntq_u8(or1); + uint8x16_t ca = vpaddq_u8(c0, c1); + auto dis = vaddvq_u8(ca); + return dis; +} + +/* Hamming distances for multiple of 64 bits */ +inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb, size_t nwords) { + const size_t nwords256 = nwords / 256; + const size_t nwords128 = (nwords - nwords256 * 256) / 128; + const size_t nwords64 = (nwords - nwords256 * 256 - nwords128 * 128) / 64; + + hamdis_t h = 0; + if (nwords256 > 0) { + for (size_t i = 0; i < nwords256; i++) { + h += hamming<256>(pa, pb); + pa += 4; + pb += 4; + } + } + + if (nwords128 > 0) { + h += hamming<128>(pa, pb); + pa += 2; + pb += 2; + } + + if (nwords64 > 0) { + h += hamming<64>(pa, pb); + } + + return h; +} + +/****************************************************************** + * The HammingComputer series of classes compares a single code of + * size 4 to 32 to incoming codes. They are intended for use as a + * template class where it would be inefficient to switch on the code + * size in the inner loop. Hopefully the compiler will inline the + * hamming() functions and put the a0, a1, ... in registers. 
+ ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 4); + a0 = *(uint32_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return popcount64(*(uint32_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return popcount64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct HammingComputer16 { + uint8x16_t a0; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 16); + a0 = vld1q_u8(a8); + } + + inline int hamming(const uint8_t* b8) const { + uint8x16_t b0 = vld1q_u8(b8); + + uint8x16_t or0 = veorq_u8(a0, b0); + uint8x16_t c0 = vcntq_u8(or0); + auto dis = vaddvq_u8(c0); + return dis; + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. +struct HammingComputer20 { + uint8x16_t a0; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 20); + + a0 = vld1q_u8(a8); + + const uint32_t* a = (uint32_t*)a8; + a2 = a[4]; + } + + inline int hamming(const uint8_t* b8) const { + uint8x16_t b0 = vld1q_u8(b8); + + uint8x16_t or0 = veorq_u8(a0, b0); + uint8x16_t c0 = vcntq_u8(or0); + auto dis = vaddvq_u8(c0); + + const uint32_t* b = (uint32_t*)b8; + return dis + popcount64(b[4] ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint8x16_t a0; + uint8x16_t a1; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 32); + a0 = vld1q_u8(a8); + a1 = vld1q_u8(a8 + 16); + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + uint8x16_t b0 = vld1q_u8(b8); + uint8x16_t b1 = vld1q_u8(b8 + 16); + + uint8x16_t or0 = veorq_u8(a0, b0); + uint8x16_t or1 = veorq_u8(a1, b1); + uint8x16_t c0 = vcntq_u8(or0); + uint8x16_t c1 = vcntq_u8(or1); + uint8x16_t ca = vpaddq_u8(c0, c1); + auto dis = vaddvq_u8(ca); + return dis; + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct HammingComputer64 { + HammingComputer32 hc0, hc1; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 64); + hc0.set(a8, 32); + hc1.set(a8 + 32, 32); + } + + inline int hamming(const uint8_t* b8) const { + return hc0.hamming(b8) + hc1.hamming(b8 + 32); + } + + inline static constexpr int get_code_size() { + return 64; + } +}; + +struct HammingComputerDefault { + const uint8_t* a8; + int quotient8; + int remainder8; + + 
HammingComputerDefault() {} + + HammingComputerDefault(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + this->a8 = a8; + quotient8 = code_size / 8; + remainder8 = code_size % 8; + } + + int hamming(const uint8_t* b8) const { + int accu = 0; + + const uint64_t* a64 = reinterpret_cast(a8); + const uint64_t* b64 = reinterpret_cast(b8); + int i = 0, len = quotient8; + + int len256 = (quotient8 / 4) * 4; + for (; i < len256; i += 4) { + accu += ::faiss::hamming<256>(a64 + i, b64 + i); + len -= 4; + } + + switch (len & 7) { + default: + while (len > 7) { + len -= 8; + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 7: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 6: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 5: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 4: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 3: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 2: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 1: + accu += popcount64(a64[i] ^ b64[i]); + i++; + } + } + if (remainder8) { + const uint8_t* a = a8 + 8 * quotient8; + const uint8_t* b = b8 + 8 * quotient8; + switch (remainder8) { + case 7: + accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + default: + break; + } + } + + return accu; + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +// more inefficient than HammingComputerDefault (obsolete) +struct HammingComputerM8 { + const uint64_t* a; + int n; + + HammingComputerM8() {} + + HammingComputerM8(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int n4 = (n / 4) * 4; + int accu = 0; + + int i = 0; + for (; i < n4; i += 4) { + accu += ::faiss::hamming<256>(a + i, b + i); + } + for (; i < n; i++) { + accu += popcount64(a[i] ^ b[i]); + } + return accu; + } + + inline int get_code_size() const { + return n * 8; + } +}; + +// more inefficient than HammingComputerDefault (obsolete) +struct HammingComputerM4 { + const uint32_t* a; + int n; + + HammingComputerM4() {} + + HammingComputerM4(const uint8_t* a4, int code_size) { + set(a4, code_size); + } + + void set(const uint8_t* a4, int code_size) { + assert(code_size % 4 == 0); + a = (uint32_t*)a4; + n = code_size / 4; + } + + int hamming(const uint8_t* b8) const { + const uint32_t* b = (uint32_t*)b8; + + int n8 = (n / 8) * 8; + int accu = 0; + + int i = 0; + for (; i < n8; i += 8) { + accu += ::faiss::hamming<256>( + (const uint64_t*)(a + i), (const uint64_t*)(b + i)); + } + for (; i < n; i++) { + accu += popcount64(a[i] ^ b[i]); + } + return accu; + } + + inline int get_code_size() const { + return n * 4; + } +}; + +/*************************************************************************** + * Equivalence with a template class when code size is known at compile time + **************************************************************************/ + +// default template +template +struct HammingComputer : HammingComputerDefault { + HammingComputer(const uint8_t* a, int 
code_size) + : HammingComputerDefault(a, code_size) {} +}; + +#define SPECIALIZED_HC(CODE_SIZE) \ + template <> \ + struct HammingComputer : HammingComputer##CODE_SIZE { \ + HammingComputer(const uint8_t* a) \ + : HammingComputer##CODE_SIZE(a, CODE_SIZE) {} \ + } + +SPECIALIZED_HC(4); +SPECIALIZED_HC(8); +SPECIALIZED_HC(16); +SPECIALIZED_HC(20); +SPECIALIZED_HC(32); +SPECIALIZED_HC(64); + +#undef SPECIALIZED_HC + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. + ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +struct GenHammingComputer8 { + uint8x8_t a0; + + GenHammingComputer8(const uint8_t* a8, int code_size) { + assert(code_size == 8); + a0 = vld1_u8(a8); + } + + inline int hamming(const uint8_t* b8) const { + uint8x8_t b0 = vld1_u8(b8); + uint8x8_t reg = vceq_u8(a0, b0); + uint8x8_t c0 = vcnt_u8(reg); + return 8 - vaddv_u8(c0) / 8; + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct GenHammingComputer16 { + uint8x16_t a0; + + GenHammingComputer16(const uint8_t* a8, int code_size) { + assert(code_size == 16); + a0 = vld1q_u8(a8); + } + + inline int hamming(const uint8_t* b8) const { + uint8x16_t b0 = vld1q_u8(b8); + uint8x16_t reg = vceqq_u8(a0, b0); + uint8x16_t c0 = vcntq_u8(reg); + return 16 - vaddvq_u8(c0) / 8; + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +struct GenHammingComputer32 { + GenHammingComputer16 a0, a1; + + GenHammingComputer32(const uint8_t* a8, int code_size) + : a0(a8, 16), a1(a8 + 16, 16) { + assert(code_size == 32); + } + + inline int hamming(const uint8_t* b8) const { + return a0.hamming(b8) + a1.hamming(b8 + 16); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct GenHammingComputerM8 { + const uint64_t* a; + int n; + + GenHammingComputerM8(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + + int n2 = (n / 2) * 2; + int i = 0; + for (; i < n2; i += 2) { + uint8x16_t a0 = vld1q_u8((const uint8_t*)(a + i)); + uint8x16_t b0 = vld1q_u8((const uint8_t*)(b + i)); + uint8x16_t reg = vceqq_u8(a0, b0); + uint8x16_t c0 = vcntq_u8(reg); + auto dis = 16 - vaddvq_u8(c0) / 8; + accu += dis; + } + + for (; i < n; i++) { + uint8x8_t a0 = vld1_u8((const uint8_t*)(a + i)); + uint8x8_t b0 = vld1_u8((const uint8_t*)(b + i)); + uint8x8_t reg = vceq_u8(a0, b0); + uint8x8_t c0 = vcnt_u8(reg); + auto dis = 8 - vaddv_u8(c0) / 8; + accu += dis; + } + + return accu; + } + + inline int get_code_size() { + return n * 8; + } +}; + +} // namespace faiss + +#endif + +#endif diff -Nru faiss-1.7.3/faiss/utils/hamming.h faiss-1.7.4/faiss/utils/hamming.h --- faiss-1.7.3/faiss/utils/hamming.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/hamming.h 2023-04-19 13:18:30.000000000 +0000 @@ -19,6 +19,7 @@ * - memory usage * - cache-misses when dealing with large volumes of data (fewer bits is better) * + * hamdis_t is defined in utils/hamming_distance/common.h */ #ifndef FAISS_hamming_h @@ -29,8 +30,10 @@ #include #include -/* The Hamming distance type */ -typedef int32_t hamdis_t; +// Low-level Hamming distance computations and hamdis_t. 
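Note: the NEON GenHammingComputer classes above rely on a byte-equality identity: vceqq_u8 sets equal bytes to 0xFF (8 set bits) and differing bytes to 0x00, so popcount(cmp) / 8 is the number of equal bytes and the generalized Hamming distance is code_size minus that. A scalar illustration of the same identity:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// cmp mimics what a per-byte equality compare produces; dividing the total
// popcount by 8 recovers the count of equal bytes.
static int gen_hamming_bytes(const uint8_t* a, const uint8_t* b, size_t n) {
    int equal_bits = 0;
    for (size_t i = 0; i < n; i++) {
        uint8_t cmp = (a[i] == b[i]) ? 0xFF : 0x00;
        equal_bits += __builtin_popcount((unsigned)cmp);
    }
    return (int)n - equal_bits / 8; // number of differing bytes
}

int main() {
    uint8_t a[16] = {1, 2, 3, 4};
    uint8_t b[16] = {1, 9, 3, 7};
    std::printf("%d\n", gen_hamming_bytes(a, b, 16)); // 2 differing bytes
}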
+#include + +#include namespace faiss { @@ -99,10 +102,6 @@ FAISS_API extern size_t hamming_batch_size; -inline int popcount64(uint64_t x) { - return __builtin_popcountl(x); -} - /** Compute a set of Hamming distances between na and nb binary vectors * * @param a size na * nbytespercode @@ -125,14 +124,18 @@ * @param nb number of database vectors * @param ncodes size of the binary codes (bytes) * @param ordered if != 0: order the results by decreasing distance - * (may be bottleneck for k/n > 0.01) */ + * (may be bottleneck for k/n > 0.01) + * @param approx_topk_mode allows to use approximate top-k facilities + * to speedup heap + */ void hammings_knn_hc( int_maxheap_array_t* ha, const uint8_t* a, const uint8_t* b, size_t nb, size_t ncodes, - int ordered); + int ordered, + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK); /* Legacy alias to hammings_knn_hc. */ void hammings_knn( @@ -209,9 +212,17 @@ /* compute the Hamming distances between two codewords of nwords*64 bits */ hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2, size_t nwords); -} // namespace faiss +/** generalized Hamming distances (= count number of code bytes that + are the same) */ +void generalized_hammings_knn_hc( + int_maxheap_array_t* ha, + const uint8_t* a, + const uint8_t* b, + size_t nb, + size_t code_size, + int ordered = true); -// inlined definitions of HammingComputerXX and GenHammingComputerXX +} // namespace faiss #include diff -Nru faiss-1.7.3/faiss/utils/hamming-inl.h faiss-1.7.4/faiss/utils/hamming-inl.h --- faiss-1.7.3/faiss/utils/hamming-inl.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/hamming-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -7,61 +7,6 @@ namespace faiss { -extern const uint8_t hamdis_tab_ham_bytes[256]; - -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/* specialized (optimized) functions */ -template <> -inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -inline hamdis_t hamming( - const uint64_t* bs1, - const uint64_t* bs2, - size_t nwords) { - hamdis_t h = 0; - for (size_t i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - // BitstringWriter and BitstringReader functions inline BitstringWriter::BitstringWriter(uint8_t* code, size_t code_size) : code(code), code_size(code_size), i(0) { @@ -119,407 +64,6 @@ } } -/****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. 
They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. - ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - a0 = *(uint32_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 16); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. -// This incurs a penalty of ~10% wrt. fully aligned accesses. -struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 20); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*(uint32_t*)(b + 2) ^ a2); - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 32); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } -}; - -struct HammingComputer64 { - uint64_t a0, a1, a2, a3, a4, a5, a6, a7; - - HammingComputer64() {} - - HammingComputer64(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 64); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + - popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + - popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); - } -}; - -struct HammingComputerDefault { - const uint8_t* a8; - int quotient8; - int remainder8; - - HammingComputerDefault() {} - - HammingComputerDefault(const uint8_t* a8, int code_size) { - set(a8, 
code_size); - } - - void set(const uint8_t* a8, int code_size) { - this->a8 = a8; - quotient8 = code_size / 8; - remainder8 = code_size % 8; - } - - int hamming(const uint8_t* b8) const { - int accu = 0; - - const uint64_t* a64 = reinterpret_cast(a8); - const uint64_t* b64 = reinterpret_cast(b8); - int i = 0, len = quotient8; - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - default: - break; - } - } - - return accu; - } -}; - -// more inefficient than HammingComputerDefault (obsolete) -struct HammingComputerM8 { - const uint64_t* a; - int n; - - HammingComputerM8() {} - - HammingComputerM8(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size % 8 == 0); - a = (uint64_t*)a8; - n = code_size / 8; - } - - int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - int accu = 0; - for (int i = 0; i < n; i++) - accu += popcount64(a[i] ^ b[i]); - return accu; - } -}; - -// more inefficient than HammingComputerDefault (obsolete) -struct HammingComputerM4 { - const uint32_t* a; - int n; - - HammingComputerM4() {} - - HammingComputerM4(const uint8_t* a4, int code_size) { - set(a4, code_size); - } - - void set(const uint8_t* a4, int code_size) { - assert(code_size % 4 == 0); - a = (uint32_t*)a4; - n = code_size / 4; - } - - int hamming(const uint8_t* b8) const { - const uint32_t* b = (uint32_t*)b8; - int accu = 0; - for (int i = 0; i < n; i++) - accu += popcount64(a[i] ^ b[i]); - return accu; - } -}; - -/*************************************************************************** - * Equivalence with a template class when code size is known at compile time - **************************************************************************/ - -// default template -template -struct HammingComputer : HammingComputerDefault { - HammingComputer(const uint8_t* a, int code_size) - : HammingComputerDefault(a, code_size) {} -}; - -#define SPECIALIZED_HC(CODE_SIZE) \ - template <> \ - struct HammingComputer : HammingComputer##CODE_SIZE { \ - HammingComputer(const uint8_t* a) \ - : HammingComputer##CODE_SIZE(a, CODE_SIZE) {} \ - } - -SPECIALIZED_HC(4); -SPECIALIZED_HC(8); -SPECIALIZED_HC(16); -SPECIALIZED_HC(20); -SPECIALIZED_HC(32); -SPECIALIZED_HC(64); - -#undef SPECIALIZED_HC - -/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. 
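The byte-collapse trick that implements this (deleted just below, together with the GenHammingComputer* helpers) is easy to check in isolation: OR each byte's bits down into its lowest bit, mask, and popcount. A standalone scalar sketch, independent of the library (popcount64 is re-declared locally; __builtin_popcountll is a GCC/Clang builtin):

#include <cstdint>
#include <cstdio>

static int popcount64_ref(uint64_t x) {
    return __builtin_popcountll(x);
}

// same folding as generalized_hamming_64 below: after the three shifts, bit 0
// of every byte is set iff that byte of a (= code1 ^ code2) is non-zero
static int generalized_hamming_64_ref(uint64_t a) {
    a |= a >> 1;
    a |= a >> 2;
    a |= a >> 4;
    a &= 0x0101010101010101UL;
    return popcount64_ref(a);
}

// naive reference: count the bytes that differ between the two codes
static int count_differing_bytes(uint64_t x, uint64_t y) {
    int n = 0;
    for (int i = 0; i < 8; i++) {
        n += ((x >> (8 * i)) & 0xff) != ((y >> (8 * i)) & 0xff);
    }
    return n;
}

int main() {
    uint64_t x = 0x1122334455667788UL;
    uint64_t y = 0x1122AA4455667799UL; // differs from x in 2 bytes
    printf("%d %d\n",
           generalized_hamming_64_ref(x ^ y),
           count_differing_bytes(x, y)); // prints: 2 2
    return 0;
}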
- ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return generalized_hamming_64(*(uint64_t*)b ^ a0); - } -}; - -struct GenHammingComputer16 { - uint64_t a0, a1; - GenHammingComputer16(const uint8_t* a8, int code_size) { - assert(code_size == 16); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return generalized_hamming_64(b[0] ^ a0) + - generalized_hamming_64(b[1] ^ a1); - } -}; - -struct GenHammingComputer32 { - uint64_t a0, a1, a2, a3; - - GenHammingComputer32(const uint8_t* a8, int code_size) { - assert(code_size == 32); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return generalized_hamming_64(b[0] ^ a0) + - generalized_hamming_64(b[1] ^ a1) + - generalized_hamming_64(b[2] ^ a2) + - generalized_hamming_64(b[3] ^ a3); - } -}; - -struct GenHammingComputerM8 { - const uint64_t* a; - int n; - - GenHammingComputerM8(const uint8_t* a8, int code_size) { - assert(code_size % 8 == 0); - a = (uint64_t*)a8; - n = code_size / 8; - } - - int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - int accu = 0; - for (int i = 0; i < n; i++) - accu += generalized_hamming_64(a[i] ^ b[i]); - return accu; - } -}; - -/** generalized Hamming distances (= count number of code bytes that - are the same) */ -void generalized_hammings_knn_hc( - int_maxheap_array_t* ha, - const uint8_t* a, - const uint8_t* b, - size_t nb, - size_t code_size, - int ordered = true); - /** This class maintains a list of best distances seen so far. 
* * Since the distances are in a limited range (0 to nbit), the diff -Nru faiss-1.7.3/faiss/utils/Heap.cpp faiss-1.7.4/faiss/utils/Heap.cpp --- faiss-1.7.3/faiss/utils/Heap.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/Heap.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -9,6 +9,7 @@ /* Function for soft heap */ +#include #include namespace faiss { @@ -32,7 +33,7 @@ if (ni == -1) ni = nh; assert(i0 >= 0 && i0 + ni <= nh); -#pragma omp parallel for +#pragma omp parallel for if (ni * nj > 100000) for (int64_t i = i0; i < i0 + ni; i++) { T* __restrict simi = get_val(i); TI* __restrict idxi = get_ids(i); @@ -62,7 +63,7 @@ if (ni == -1) ni = nh; assert(i0 >= 0 && i0 + ni <= nh); -#pragma omp parallel for +#pragma omp parallel for if (ni * nj > 100000) for (int64_t i = i0; i < i0 + ni; i++) { T* __restrict simi = get_val(i); TI* __restrict idxi = get_ids(i); @@ -79,8 +80,37 @@ } template +void HeapArray::addn_query_subset_with_ids( + size_t nsubset, + const TI* subset, + size_t nj, + const T* vin, + const TI* id_in, + int64_t id_stride) { + FAISS_THROW_IF_NOT_MSG(id_in, "anonymous ids not supported"); + if (id_stride < 0) { + id_stride = nj; + } +#pragma omp parallel for if (nsubset * nj > 100000) + for (int64_t si = 0; si < nsubset; si++) { + T i = subset[si]; + T* __restrict simi = get_val(i); + TI* __restrict idxi = get_ids(i); + const T* ip_line = vin + si * nj; + const TI* id_line = id_in + si * id_stride; + + for (size_t j = 0; j < nj; j++) { + T ip = ip_line[j]; + if (C::cmp(simi[0], ip)) { + heap_replace_top(k, simi, idxi, ip, id_line[j]); + } + } + } +} + +template void HeapArray::per_line_extrema(T* out_val, TI* out_ids) const { -#pragma omp parallel for +#pragma omp parallel for if (nh * k > 100000) for (int64_t j = 0; j < nh; j++) { int64_t imin = -1; typename C::T xval = C::Crev::neutral(); @@ -109,4 +139,110 @@ template struct HeapArray>; template struct HeapArray>; +/********************************************************** + * merge knn search results + **********************************************************/ + +/** Merge result tables from several shards. The per-shard results are assumed + * to be sorted. Note that the C comparator is reversed w.r.t. the usual top-k + * element heap because we want the best (ie. lowest for L2) result to be on + * top, not the worst. + * + * @param all_distances size (nshard, n, k) + * @param all_labels size (nshard, n, k) + * @param distances output distances, size (n, k) + * @param labels output labels, size (n, k) + */ +template +void merge_knn_results( + size_t n, + size_t k, + typename C::TI nshard, + const typename C::T* all_distances, + const idx_t* all_labels, + typename C::T* distances, + idx_t* labels) { + using distance_t = typename C::T; + if (k == 0) { + return; + } + long stride = n * k; +#pragma omp parallel if (n * nshard * k > 100000) + { + std::vector buf(2 * nshard); + // index in each shard's result list + int* pointer = buf.data(); + // (shard_ids, heap_vals): heap that indexes + // shard -> current distance for this shard + int* shard_ids = pointer + nshard; + std::vector buf2(nshard); + distance_t* heap_vals = buf2.data(); +#pragma omp for + for (long i = 0; i < n; i++) { + // the heap maps values to the shard where they are + // produced. 
+ const distance_t* D_in = all_distances + i * k; + const idx_t* I_in = all_labels + i * k; + int heap_size = 0; + + // push the first element of each shard (if not -1) + for (long s = 0; s < nshard; s++) { + pointer[s] = 0; + if (I_in[stride * s] >= 0) { + heap_push( + ++heap_size, + heap_vals, + shard_ids, + D_in[stride * s], + s); + } + } + + distance_t* D = distances + i * k; + idx_t* I = labels + i * k; + + int j; + for (j = 0; j < k && heap_size > 0; j++) { + // pop element from best shard + int s = shard_ids[0]; // top of heap + int& p = pointer[s]; + D[j] = heap_vals[0]; + I[j] = I_in[stride * s + p]; + + // pop from shard, advance pointer for this shard + heap_pop(heap_size--, heap_vals, shard_ids); + p++; + if (p < k && I_in[stride * s + p] >= 0) { + heap_push( + ++heap_size, + heap_vals, + shard_ids, + D_in[stride * s + p], + s); + } + } + for (; j < k; j++) { + I[j] = -1; + D[j] = C::Crev::neutral(); + } + } + } +} + +// explicit instanciations +#define INSTANTIATE(C, distance_t) \ + template void merge_knn_results>( \ + size_t, \ + size_t, \ + int, \ + const distance_t*, \ + const int64_t*, \ + distance_t*, \ + int64_t*); + +INSTANTIATE(CMin, float); +INSTANTIATE(CMax, float); +INSTANTIATE(CMin, int32_t); +INSTANTIATE(CMax, int32_t); + } // namespace faiss diff -Nru faiss-1.7.3/faiss/utils/Heap.h faiss-1.7.4/faiss/utils/Heap.h --- faiss-1.7.3/faiss/utils/Heap.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/Heap.h 2023-04-19 13:18:30.000000000 +0000 @@ -413,6 +413,19 @@ size_t i0 = 0, int64_t ni = -1); + /** same as addn_with_ids, but for just a subset of queries + * + * @param nsubset number of query entries to update + * @param subset indexes of queries to update, in 0..nh-1, size nsubset + */ + void addn_query_subset_with_ids( + size_t nsubset, + const TI* subset, + size_t nj, + const T* vin, + const TI* id_in = nullptr, + int64_t id_stride = 0); + /// reorder all the heaps void reorder(); @@ -431,7 +444,7 @@ typedef HeapArray> float_maxheap_array_t; typedef HeapArray> int_maxheap_array_t; -// The heap templates are instanciated explicitly in Heap.cpp +// The heap templates are instantiated explicitly in Heap.cpp /********************************************************************* * Indirect heaps: instead of having @@ -492,6 +505,27 @@ bh_ids[i] = id; } +/** Merge result tables from several shards. The per-shard results are assumed + * to be sorted. Note that the C comparator is reversed w.r.t. the usual top-k + * element heap because we want the best (ie. lowest for L2) result to be on + * top, not the worst. Also, it needs to hold an index of a shard id (ie. + * usually int32 is more than enough). 
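To make the calling convention concrete, a hedged sketch of merging two shards' sorted results for a single query with k = 3 and L2-style distances (lower is better). Per the comment above the comparator is reversed, so CMin<float, int> is used here (int being the shard-id type of the explicit instantiations above, and idx_t being int64_t):

#include <faiss/utils/Heap.h>

#include <cstdint>
#include <cstdio>

int main() {
    const size_t n = 1, k = 3;
    const int nshard = 2;

    // layout (nshard, n, k), each per-shard list already sorted best-first
    const float all_distances[2 * 1 * 3] = {
            0.1f, 0.4f, 0.9f,  // shard 0
            0.2f, 0.3f, 1.5f}; // shard 1
    const int64_t all_labels[2 * 1 * 3] = {
            10, 11, 12,  // shard 0
            20, 21, 22}; // shard 1

    float distances[1 * 3];
    int64_t labels[1 * 3];

    faiss::merge_knn_results<faiss::CMin<float, int>>(
            n, k, nshard, all_distances, all_labels, distances, labels);

    for (size_t j = 0; j < k; j++) {
        // expected: (10, 0.1) (20, 0.2) (21, 0.3)
        printf("(%lld, %.1f) ", (long long)labels[j], distances[j]);
    }
    printf("\n");
    return 0;
}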
+ * + * @param all_distances size (nshard, n, k) + * @param all_labels size (nshard, n, k) + * @param distances output distances, size (n, k) + * @param labels output labels, size (n, k) + */ +template +void merge_knn_results( + size_t n, + size_t k, + typename C::TI nshard, + const typename C::T* all_distances, + const idx_t* all_labels, + typename C::T* distances, + idx_t* labels); + } // namespace faiss #endif /* FAISS_Heap_h */ diff -Nru faiss-1.7.3/faiss/utils/partitioning.cpp faiss-1.7.4/faiss/utils/partitioning.cpp --- faiss-1.7.3/faiss/utils/partitioning.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/partitioning.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -817,7 +817,7 @@ * Histogram subroutines ******************************************************************/ -#ifdef __AVX2__ +#if defined(__AVX2__) || defined(__aarch64__) /// FIXME when MSB of uint16 is set // this code does not compile properly with GCC 7.4.0 @@ -833,7 +833,7 @@ simd16uint16 a8_0 = a4 & mask4; simd16uint16 a8_1 = (a4 >> 4) & mask4; - return simd32uint8(_mm256_hadd_epi16(a8_0.i, a8_1.i)); + return simd32uint8(hadd(a8_0, a8_1)); } simd16uint16 accu8to16(simd32uint8 a8) { @@ -842,10 +842,10 @@ simd16uint16 a8_0 = simd16uint16(a8) & mask8; simd16uint16 a8_1 = (simd16uint16(a8) >> 8) & mask8; - return simd16uint16(_mm256_hadd_epi16(a8_0.i, a8_1.i)); + return hadd(a8_0, a8_1); } -static const simd32uint8 shifts(_mm256_setr_epi8( +static const simd32uint8 shifts = simd32uint8::create< 1, 16, 0, @@ -877,7 +877,7 @@ 0, 0, 4, - 64)); + 64>(); // 2-bit accumulator: we can add only up to 3 elements // on output we return 2*4-bit results @@ -937,7 +937,7 @@ simd16uint16 a16lo = accu8to16(a8lo); simd16uint16 a16hi = accu8to16(a8hi); - simd16uint16 a16 = simd16uint16(_mm256_hadd_epi16(a16lo.i, a16hi.i)); + simd16uint16 a16 = hadd(a16lo, a16hi); // the 2 lanes must still be combined return a16; @@ -947,7 +947,7 @@ * 16 bins ************************************************************/ -static const simd32uint8 shifts2(_mm256_setr_epi8( +static const simd32uint8 shifts2 = simd32uint8::create< 1, 2, 4, @@ -955,7 +955,7 @@ 16, 32, 64, - (char)128, + 128, 1, 2, 4, @@ -963,7 +963,7 @@ 16, 32, 64, - (char)128, + 128, 1, 2, 4, @@ -971,7 +971,7 @@ 16, 32, 64, - (char)128, + 128, 1, 2, 4, @@ -979,19 +979,12 @@ 16, 32, 64, - (char)128)); + 128>(); simd32uint8 shiftr_16(simd32uint8 x, int n) { return simd32uint8(simd16uint16(x) >> n); } -inline simd32uint8 combine_2x2(simd32uint8 a, simd32uint8 b) { - __m256i a1b0 = _mm256_permute2f128_si256(a.i, b.i, 0x21); - __m256i a0b1 = _mm256_blend_epi32(a.i, b.i, 0xF0); - - return simd32uint8(a1b0) + simd32uint8(a0b1); -} - // 2-bit accumulator: we can add only up to 3 elements // on output we return 2*4-bit results template @@ -1018,7 +1011,7 @@ // contains 0s for out-of-bounds elements simd16uint16 lt8 = (v >> 3) == simd16uint16(0); - lt8.i = _mm256_xor_si256(lt8.i, _mm256_set1_epi16(0xff00)); + lt8 = lt8 ^ simd16uint16(0xff00); a1 = a1 & lt8; @@ -1036,11 +1029,15 @@ simd32uint8 accu4to8_2(simd32uint8 a4_0, simd32uint8 a4_1) { simd32uint8 mask4(0x0f); - simd32uint8 a8_0 = combine_2x2(a4_0 & mask4, shiftr_16(a4_0, 4) & mask4); - - simd32uint8 a8_1 = combine_2x2(a4_1 & mask4, shiftr_16(a4_1, 4) & mask4); + simd16uint16 a8_0 = combine2x2( + (simd16uint16)(a4_0 & mask4), + (simd16uint16)(shiftr_16(a4_0, 4) & mask4)); + + simd16uint16 a8_1 = combine2x2( + (simd16uint16)(a4_1 & mask4), + (simd16uint16)(shiftr_16(a4_1, 4) & mask4)); - return simd32uint8(_mm256_hadd_epi16(a8_0.i, 
a8_1.i)); + return simd32uint8(hadd(a8_0, a8_1)); } template @@ -1079,10 +1076,9 @@ simd16uint16 a16lo = accu8to16(a8lo); simd16uint16 a16hi = accu8to16(a8hi); - simd16uint16 a16 = simd16uint16(_mm256_hadd_epi16(a16lo.i, a16hi.i)); + simd16uint16 a16 = hadd(a16lo, a16hi); - __m256i perm32 = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); - a16.i = _mm256_permutevar8x32_epi32(a16.i, perm32); + a16 = simd16uint16{simd8uint32{a16}.unzip()}; return a16; } diff -Nru faiss-1.7.3/faiss/utils/simdlib_avx2.h faiss-1.7.4/faiss/utils/simdlib_avx2.h --- faiss-1.7.3/faiss/utils/simdlib_avx2.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/simdlib_avx2.h 2023-04-19 13:18:30.000000000 +0000 @@ -70,6 +70,13 @@ bin(bits); return std::string(bits); } + + // Checks whether the other holds exactly the same bytes. + bool is_same_as(simd256bit other) const { + const __m256i pcmp = _mm256_cmpeq_epi32(i, other.i); + unsigned bitmask = _mm256_movemask_epi8(pcmp); + return (bitmask == 0xffffffffU); + } }; /// vector of 16 elements in uint16 @@ -86,6 +93,41 @@ explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {} + explicit simd16uint16( + uint16_t u0, + uint16_t u1, + uint16_t u2, + uint16_t u3, + uint16_t u4, + uint16_t u5, + uint16_t u6, + uint16_t u7, + uint16_t u8, + uint16_t u9, + uint16_t u10, + uint16_t u11, + uint16_t u12, + uint16_t u13, + uint16_t u14, + uint16_t u15) + : simd256bit(_mm256_setr_epi16( + u0, + u1, + u2, + u3, + u4, + u5, + u6, + u7, + u8, + u9, + u10, + u11, + u12, + u13, + u14, + u15)) {} + std::string elements_to_string(const char* fmt) const { uint16_t bytes[16]; storeu((void*)bytes); @@ -151,9 +193,19 @@ return simd16uint16(_mm256_or_si256(i, other.i)); } + simd16uint16 operator^(simd256bit other) const { + return simd16uint16(_mm256_xor_si256(i, other.i)); + } + // returns binary masks - simd16uint16 operator==(simd256bit other) const { - return simd16uint16(_mm256_cmpeq_epi16(i, other.i)); + friend simd16uint16 operator==(const simd256bit lhs, const simd256bit rhs) { + return simd16uint16(_mm256_cmpeq_epi16(lhs.i, rhs.i)); + } + + bool is_same(simd16uint16 other) const { + const __m256i pcmp = _mm256_cmpeq_epi16(i, other.i); + unsigned bitmask = _mm256_movemask_epi8(pcmp); + return (bitmask == 0xffffffffU); } simd16uint16 operator~() const { @@ -255,6 +307,45 @@ return ge; } +inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) { + return simd16uint16(_mm256_hadd_epi16(a.i, b.i)); +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +// +// Works in i16 mode in order to save instructions. One may +// switch from i16 to u16. 
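A small demo of the contract described above, using the portable wrappers so the same code runs on the AVX2, NEON or emulated backend; faiss/utils/simdlib.h is assumed to be the umbrella header that picks the backend. Note the documented tie behaviour: for equal values the max side reports the candidate's index, the min side the current one.

#include <faiss/utils/simdlib.h>

#include <cstdint>
#include <cstdio>

int main() {
    using faiss::simd16uint16;

    simd16uint16 candVal(3, 5, 7, 7, 0, 9, 2, 2, 3, 5, 7, 7, 0, 9, 2, 2);
    simd16uint16 candIdx(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    simd16uint16 curVal(4, 4, 7, 8, 1, 1, 2, 3, 4, 4, 7, 8, 1, 1, 2, 3);
    simd16uint16 curIdx(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);

    simd16uint16 minVal, minIdx, maxVal, maxIdx;
    faiss::cmplt_min_max_fast(
            candVal, candIdx, curVal, curIdx, minVal, minIdx, maxVal, maxIdx);

    uint16_t mv[16], mi[16], xv[16], xi[16];
    minVal.storeu(mv);
    minIdx.storeu(mi);
    maxVal.storeu(xv);
    maxIdx.storeu(xi);

    for (int i = 0; i < 16; i++) {
        // e.g. lane 2 is a tie (7 vs 7): min = (7, 18), max = (7, 2)
        printf("lane %2d: min=(%d,%d) max=(%d,%d)\n",
               i, (int)mv[i], (int)mi[i], (int)xv[i], (int)xi[i]);
    }
    return 0;
}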
+inline void cmplt_min_max_fast( + const simd16uint16 candidateValues, + const simd16uint16 candidateIndices, + const simd16uint16 currentValues, + const simd16uint16 currentIndices, + simd16uint16& minValues, + simd16uint16& minIndices, + simd16uint16& maxValues, + simd16uint16& maxIndices) { + // there's no lt instruction, so we'll need to emulate one + __m256i comparison = _mm256_cmpgt_epi16(currentValues.i, candidateValues.i); + comparison = _mm256_andnot_si256(comparison, _mm256_set1_epi16(-1)); + + minValues.i = _mm256_min_epi16(candidateValues.i, currentValues.i); + minIndices.i = _mm256_blendv_epi8( + candidateIndices.i, currentIndices.i, comparison); + maxValues.i = _mm256_max_epi16(candidateValues.i, currentValues.i); + maxIndices.i = _mm256_blendv_epi8( + currentIndices.i, candidateIndices.i, comparison); +} + // vector of 32 unsigned 8-bit integers struct simd32uint8 : simd256bit { simd32uint8() {} @@ -265,6 +356,75 @@ explicit simd32uint8(uint8_t x) : simd256bit(_mm256_set1_epi8(x)) {} + template < + uint8_t _0, + uint8_t _1, + uint8_t _2, + uint8_t _3, + uint8_t _4, + uint8_t _5, + uint8_t _6, + uint8_t _7, + uint8_t _8, + uint8_t _9, + uint8_t _10, + uint8_t _11, + uint8_t _12, + uint8_t _13, + uint8_t _14, + uint8_t _15, + uint8_t _16, + uint8_t _17, + uint8_t _18, + uint8_t _19, + uint8_t _20, + uint8_t _21, + uint8_t _22, + uint8_t _23, + uint8_t _24, + uint8_t _25, + uint8_t _26, + uint8_t _27, + uint8_t _28, + uint8_t _29, + uint8_t _30, + uint8_t _31> + static simd32uint8 create() { + return simd32uint8(_mm256_setr_epi8( + (char)_0, + (char)_1, + (char)_2, + (char)_3, + (char)_4, + (char)_5, + (char)_6, + (char)_7, + (char)_8, + (char)_9, + (char)_10, + (char)_11, + (char)_12, + (char)_13, + (char)_14, + (char)_15, + (char)_16, + (char)_17, + (char)_18, + (char)_19, + (char)_20, + (char)_21, + (char)_22, + (char)_23, + (char)_24, + (char)_25, + (char)_26, + (char)_27, + (char)_28, + (char)_29, + (char)_30, + (char)_31)); + } + explicit simd32uint8(simd256bit x) : simd256bit(x) {} explicit simd32uint8(const uint8_t* x) : simd256bit((const void*)x) {} @@ -359,6 +519,40 @@ explicit simd8uint32(const uint8_t* x) : simd256bit((const void*)x) {} + explicit simd8uint32( + uint32_t u0, + uint32_t u1, + uint32_t u2, + uint32_t u3, + uint32_t u4, + uint32_t u5, + uint32_t u6, + uint32_t u7) + : simd256bit(_mm256_setr_epi32(u0, u1, u2, u3, u4, u5, u6, u7)) {} + + simd8uint32 operator+(simd8uint32 other) const { + return simd8uint32(_mm256_add_epi32(i, other.i)); + } + + simd8uint32 operator-(simd8uint32 other) const { + return simd8uint32(_mm256_sub_epi32(i, other.i)); + } + + simd8uint32& operator+=(const simd8uint32& other) { + i = _mm256_add_epi32(i, other.i); + return *this; + } + + bool operator==(simd8uint32 other) const { + const __m256i pcmp = _mm256_cmpeq_epi32(i, other.i); + unsigned bitmask = _mm256_movemask_epi8(pcmp); + return (bitmask == 0xffffffffU); + } + + bool operator!=(simd8uint32 other) const { + return !(*this == other); + } + std::string elements_to_string(const char* fmt) const { uint32_t bytes[8]; storeu((void*)bytes); @@ -383,8 +577,49 @@ void set1(uint32_t x) { i = _mm256_set1_epi32((int)x); } + + simd8uint32 unzip() const { + return simd8uint32(_mm256_permutevar8x32_epi32( + i, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7))); + } }; +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? 
candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8uint32 candidateValues, + const simd8uint32 candidateIndices, + const simd8uint32 currentValues, + const simd8uint32 currentIndices, + simd8uint32& minValues, + simd8uint32& minIndices, + simd8uint32& maxValues, + simd8uint32& maxIndices) { + // there's no lt instruction, so we'll need to emulate one + __m256i comparison = _mm256_cmpgt_epi32(currentValues.i, candidateValues.i); + comparison = _mm256_andnot_si256(comparison, _mm256_set1_epi32(-1)); + + minValues.i = _mm256_min_epi32(candidateValues.i, currentValues.i); + minIndices.i = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(candidateIndices.i), + _mm256_castsi256_ps(currentIndices.i), + _mm256_castsi256_ps(comparison))); + maxValues.i = _mm256_max_epi32(candidateValues.i, currentValues.i); + maxIndices.i = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(currentIndices.i), + _mm256_castsi256_ps(candidateIndices.i), + _mm256_castsi256_ps(comparison))); +} + struct simd8float32 : simd256bit { simd8float32() {} @@ -394,7 +629,18 @@ explicit simd8float32(float x) : simd256bit(_mm256_set1_ps(x)) {} - explicit simd8float32(const float* x) : simd256bit(_mm256_load_ps(x)) {} + explicit simd8float32(const float* x) : simd256bit(_mm256_loadu_ps(x)) {} + + explicit simd8float32( + float f0, + float f1, + float f2, + float f3, + float f4, + float f5, + float f6, + float f7) + : simd256bit(_mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7)) {} simd8float32 operator*(simd8float32 other) const { return simd8float32(_mm256_mul_ps(f, other.f)); @@ -408,6 +654,22 @@ return simd8float32(_mm256_sub_ps(f, other.f)); } + simd8float32& operator+=(const simd8float32& other) { + f = _mm256_add_ps(f, other.f); + return *this; + } + + bool operator==(simd8float32 other) const { + const __m256i pcmp = + _mm256_castps_si256(_mm256_cmp_ps(f, other.f, _CMP_EQ_OQ)); + unsigned bitmask = _mm256_movemask_epi8(pcmp); + return (bitmask == 0xffffffffU); + } + + bool operator!=(simd8float32 other) const { + return !(*this == other); + } + std::string tostring() const { float tab[8]; storeu((void*)tab); @@ -439,6 +701,85 @@ return simd8float32(_mm256_fmadd_ps(a.f, b.f, c.f)); } +// The following primitive is a vectorized version of the following code +// snippet: +// float lowestValue = HUGE_VAL; +// uint lowestIndex = 0; +// for (size_t i = 0; i < n; i++) { +// if (values[i] < lowestValue) { +// lowestValue = values[i]; +// lowestIndex = i; +// } +// } +// Vectorized version can be implemented via two operations: cmp and blend +// with something like this: +// lowestValues = [HUGE_VAL; 8]; +// lowestIndices = {0, 1, 2, 3, 4, 5, 6, 7}; +// for (size_t i = 0; i < n; i += 8) { +// auto comparison = cmp(values + i, lowestValues); +// lowestValues = blend( +// comparison, +// values + i, +// lowestValues); +// lowestIndices = blend( +// comparison, +// i + {0, 1, 2, 3, 4, 5, 6, 7}, +// lowestIndices); +// lowestIndices += {8, 8, 8, 8, 8, 8, 8, 8}; +// } +// The problem is that blend primitive needs very different instruction +// order for AVX and ARM. 
+// So, let's introduce a combination of these two in order to avoid +// confusion for ppl who write in low-level SIMD instructions. Additionally, +// these two ops (cmp and blend) are very often used together. +inline void cmplt_and_blend_inplace( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + simd8float32& lowestValues, + simd8uint32& lowestIndices) { + const __m256 comparison = + _mm256_cmp_ps(lowestValues.f, candidateValues.f, _CMP_LE_OS); + lowestValues.f = _mm256_min_ps(candidateValues.f, lowestValues.f); + lowestIndices.i = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(candidateIndices.i), + _mm256_castsi256_ps(lowestIndices.i), + comparison)); +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + const simd8float32 currentValues, + const simd8uint32 currentIndices, + simd8float32& minValues, + simd8uint32& minIndices, + simd8float32& maxValues, + simd8uint32& maxIndices) { + const __m256 comparison = + _mm256_cmp_ps(currentValues.f, candidateValues.f, _CMP_LE_OS); + minValues.f = _mm256_min_ps(candidateValues.f, currentValues.f); + minIndices.i = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(candidateIndices.i), + _mm256_castsi256_ps(currentIndices.i), + comparison)); + maxValues.f = _mm256_max_ps(candidateValues.f, currentValues.f); + maxIndices.i = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(currentIndices.i), + _mm256_castsi256_ps(candidateIndices.i), + comparison)); +} + namespace { // get even float32's of a and b, interleaved diff -Nru faiss-1.7.3/faiss/utils/simdlib_emulated.h faiss-1.7.4/faiss/utils/simdlib_emulated.h --- faiss-1.7.3/faiss/utils/simdlib_emulated.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/simdlib_emulated.h 2023-04-19 13:18:30.000000000 +0000 @@ -57,6 +57,17 @@ bin(bits); return std::string(bits); } + + // Checks whether the other holds exactly the same bytes. 
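Putting the cmplt_and_blend_inplace primitive introduced above to work: a hedged sketch (not library code) of the 8-lane running argmin the comment outlines, followed by the scalar cross-lane reduction that the comment leaves implicit. faiss/utils/simdlib.h is assumed to select the backend, and the input length is assumed to be a multiple of 8 (the AVX2 load constructor now tolerates unaligned pointers, per the change earlier in this diff).

#include <faiss/utils/simdlib.h>

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> values(64);
    for (size_t i = 0; i < values.size(); i++) {
        values[i] = std::fabs(float(i) - 37.0f); // unique minimum at i = 37
    }

    faiss::simd8float32 lowestValues(HUGE_VALF);
    faiss::simd8uint32 lowestIndices(0, 1, 2, 3, 4, 5, 6, 7);
    faiss::simd8uint32 currentIndices = lowestIndices;
    const faiss::simd8uint32 step(8, 8, 8, 8, 8, 8, 8, 8);

    for (size_t i = 0; i < values.size(); i += 8) {
        faiss::simd8float32 candidates(values.data() + i);
        faiss::cmplt_and_blend_inplace(
                candidates, currentIndices, lowestValues, lowestIndices);
        currentIndices += step;
    }

    // reduce the 8 surviving (value, index) pairs to a single winner
    float v[8];
    uint32_t idx[8];
    lowestValues.storeu(v);
    lowestIndices.storeu(idx);
    float bestVal = v[0];
    uint32_t bestIdx = idx[0];
    for (int j = 1; j < 8; j++) {
        if (v[j] < bestVal) {
            bestVal = v[j];
            bestIdx = idx[j];
        }
    }
    printf("argmin = %u, value = %g\n", (unsigned)bestIdx, bestVal);
    return 0;
}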
+ bool is_same_as(simd256bit other) const { + for (size_t i = 0; i < 8; i++) { + if (u32[i] != other.u32[i]) { + return false; + } + } + + return true; + } }; /// vector of 16 elements in uint16 @@ -75,6 +86,41 @@ explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {} + explicit simd16uint16( + uint16_t u0, + uint16_t u1, + uint16_t u2, + uint16_t u3, + uint16_t u4, + uint16_t u5, + uint16_t u6, + uint16_t u7, + uint16_t u8, + uint16_t u9, + uint16_t u10, + uint16_t u11, + uint16_t u12, + uint16_t u13, + uint16_t u14, + uint16_t u15) { + this->u16[0] = u0; + this->u16[1] = u1; + this->u16[2] = u2; + this->u16[3] = u3; + this->u16[4] = u4; + this->u16[5] = u5; + this->u16[6] = u6; + this->u16[7] = u7; + this->u16[8] = u8; + this->u16[9] = u9; + this->u16[10] = u10; + this->u16[11] = u11; + this->u16[12] = u12; + this->u16[13] = u13; + this->u16[14] = u14; + this->u16[15] = u15; + } + std::string elements_to_string(const char* fmt) const { char res[1000], *ptr = res; for (int i = 0; i < 16; i++) { @@ -169,6 +215,13 @@ }); } + simd16uint16 operator^(const simd256bit& other) const { + return binary_func( + *this, simd16uint16(other), [](uint16_t a, uint16_t b) { + return a ^ b; + }); + } + // returns binary masks simd16uint16 operator==(const simd16uint16& other) const { return binary_func(*this, other, [](uint16_t a, uint16_t b) { @@ -288,6 +341,62 @@ return gem; } +// hadd does not cross lanes +inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) { + simd16uint16 c; + c.u16[0] = a.u16[0] + a.u16[1]; + c.u16[1] = a.u16[2] + a.u16[3]; + c.u16[2] = a.u16[4] + a.u16[5]; + c.u16[3] = a.u16[6] + a.u16[7]; + c.u16[4] = b.u16[0] + b.u16[1]; + c.u16[5] = b.u16[2] + b.u16[3]; + c.u16[6] = b.u16[4] + b.u16[5]; + c.u16[7] = b.u16[6] + b.u16[7]; + + c.u16[8] = a.u16[8] + a.u16[9]; + c.u16[9] = a.u16[10] + a.u16[11]; + c.u16[10] = a.u16[12] + a.u16[13]; + c.u16[11] = a.u16[14] + a.u16[15]; + c.u16[12] = b.u16[8] + b.u16[9]; + c.u16[13] = b.u16[10] + b.u16[11]; + c.u16[14] = b.u16[12] + b.u16[13]; + c.u16[15] = b.u16[14] + b.u16[15]; + + return c; +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd16uint16 candidateValues, + const simd16uint16 candidateIndices, + const simd16uint16 currentValues, + const simd16uint16 currentIndices, + simd16uint16& minValues, + simd16uint16& minIndices, + simd16uint16& maxValues, + simd16uint16& maxIndices) { + for (size_t i = 0; i < 16; i++) { + bool flag = (candidateValues.u16[i] < currentValues.u16[i]); + minValues.u16[i] = flag ? candidateValues.u16[i] : currentValues.u16[i]; + minIndices.u16[i] = + flag ? candidateIndices.u16[i] : currentIndices.u16[i]; + maxValues.u16[i] = + !flag ? candidateValues.u16[i] : currentValues.u16[i]; + maxIndices.u16[i] = + !flag ? 
candidateIndices.u16[i] : currentIndices.u16[i]; + } +} + // vector of 32 unsigned 8-bit integers struct simd32uint8 : simd256bit { simd32uint8() {} @@ -299,6 +408,75 @@ explicit simd32uint8(uint8_t x) { set1(x); } + template < + uint8_t _0, + uint8_t _1, + uint8_t _2, + uint8_t _3, + uint8_t _4, + uint8_t _5, + uint8_t _6, + uint8_t _7, + uint8_t _8, + uint8_t _9, + uint8_t _10, + uint8_t _11, + uint8_t _12, + uint8_t _13, + uint8_t _14, + uint8_t _15, + uint8_t _16, + uint8_t _17, + uint8_t _18, + uint8_t _19, + uint8_t _20, + uint8_t _21, + uint8_t _22, + uint8_t _23, + uint8_t _24, + uint8_t _25, + uint8_t _26, + uint8_t _27, + uint8_t _28, + uint8_t _29, + uint8_t _30, + uint8_t _31> + static simd32uint8 create() { + simd32uint8 ret; + ret.u8[0] = _0; + ret.u8[1] = _1; + ret.u8[2] = _2; + ret.u8[3] = _3; + ret.u8[4] = _4; + ret.u8[5] = _5; + ret.u8[6] = _6; + ret.u8[7] = _7; + ret.u8[8] = _8; + ret.u8[9] = _9; + ret.u8[10] = _10; + ret.u8[11] = _11; + ret.u8[12] = _12; + ret.u8[13] = _13; + ret.u8[14] = _14; + ret.u8[15] = _15; + ret.u8[16] = _16; + ret.u8[17] = _17; + ret.u8[18] = _18; + ret.u8[19] = _19; + ret.u8[20] = _20; + ret.u8[21] = _21; + ret.u8[22] = _22; + ret.u8[23] = _23; + ret.u8[24] = _24; + ret.u8[25] = _25; + ret.u8[26] = _26; + ret.u8[27] = _27; + ret.u8[28] = _28; + ret.u8[29] = _29; + ret.u8[30] = _30; + ret.u8[31] = _31; + return ret; + } explicit simd32uint8(const simd256bit& x) : simd256bit(x) {} @@ -440,6 +618,62 @@ explicit simd8uint32(const uint32_t* x) : simd256bit((const void*)x) {} + explicit simd8uint32( + uint32_t u0, + uint32_t u1, + uint32_t u2, + uint32_t u3, + uint32_t u4, + uint32_t u5, + uint32_t u6, + uint32_t u7) { + u32[0] = u0; + u32[1] = u1; + u32[2] = u2; + u32[3] = u3; + u32[4] = u4; + u32[5] = u5; + u32[6] = u6; + u32[7] = u7; + } + + simd8uint32 operator+(simd8uint32 other) const { + simd8uint32 result; + for (int i = 0; i < 8; i++) { + result.u32[i] = u32[i] + other.u32[i]; + } + return result; + } + + simd8uint32 operator-(simd8uint32 other) const { + simd8uint32 result; + for (int i = 0; i < 8; i++) { + result.u32[i] = u32[i] - other.u32[i]; + } + return result; + } + + simd8uint32& operator+=(const simd8uint32& other) { + for (int i = 0; i < 8; i++) { + u32[i] += other.u32[i]; + } + return *this; + } + + bool operator==(simd8uint32 other) const { + for (size_t i = 0; i < 8; i++) { + if (u32[i] != other.u32[i]) { + return false; + } + } + + return true; + } + + bool operator!=(simd8uint32 other) const { + return !(*this == other); + } + std::string elements_to_string(const char* fmt) const { char res[1000], *ptr = res; for (int i = 0; i < 8; i++) { @@ -463,8 +697,46 @@ u32[i] = x; } } + + simd8uint32 unzip() const { + const uint32_t ret[] = { + u32[0], u32[2], u32[4], u32[6], u32[1], u32[3], u32[5], u32[7]}; + return simd8uint32{ret}; + } }; +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. 
+inline void cmplt_min_max_fast( + const simd8uint32 candidateValues, + const simd8uint32 candidateIndices, + const simd8uint32 currentValues, + const simd8uint32 currentIndices, + simd8uint32& minValues, + simd8uint32& minIndices, + simd8uint32& maxValues, + simd8uint32& maxIndices) { + for (size_t i = 0; i < 8; i++) { + bool flag = (candidateValues.u32[i] < currentValues.u32[i]); + minValues.u32[i] = flag ? candidateValues.u32[i] : currentValues.u32[i]; + minIndices.u32[i] = + flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + maxValues.u32[i] = + !flag ? candidateValues.u32[i] : currentValues.u32[i]; + maxIndices.u32[i] = + !flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + } +} + struct simd8float32 : simd256bit { simd8float32() {} @@ -484,6 +756,25 @@ } } + explicit simd8float32( + float f0, + float f1, + float f2, + float f3, + float f4, + float f5, + float f6, + float f7) { + f32[0] = f0; + f32[1] = f1; + f32[2] = f2; + f32[3] = f3; + f32[4] = f4; + f32[5] = f5; + f32[6] = f6; + f32[7] = f7; + } + template static simd8float32 binary_func( const simd8float32& a, @@ -511,6 +802,28 @@ *this, other, [](float a, float b) { return a - b; }); } + simd8float32& operator+=(const simd8float32& other) { + for (size_t i = 0; i < 8; i++) { + f32[i] += other.f32[i]; + } + + return *this; + } + + bool operator==(simd8float32 other) const { + for (size_t i = 0; i < 8; i++) { + if (f32[i] != other.f32[i]) { + return false; + } + } + + return true; + } + + bool operator!=(simd8float32 other) const { + return !(*this == other); + } + std::string tostring() const { char res[1000], *ptr = res; for (int i = 0; i < 8; i++) { @@ -650,6 +963,83 @@ return c; } +// The following primitive is a vectorized version of the following code +// snippet: +// float lowestValue = HUGE_VAL; +// uint lowestIndex = 0; +// for (size_t i = 0; i < n; i++) { +// if (values[i] < lowestValue) { +// lowestValue = values[i]; +// lowestIndex = i; +// } +// } +// Vectorized version can be implemented via two operations: cmp and blend +// with something like this: +// lowestValues = [HUGE_VAL; 8]; +// lowestIndices = {0, 1, 2, 3, 4, 5, 6, 7}; +// for (size_t i = 0; i < n; i += 8) { +// auto comparison = cmp(values + i, lowestValues); +// lowestValues = blend( +// comparison, +// values + i, +// lowestValues); +// lowestIndices = blend( +// comparison, +// i + {0, 1, 2, 3, 4, 5, 6, 7}, +// lowestIndices); +// lowestIndices += {8, 8, 8, 8, 8, 8, 8, 8}; +// } +// The problem is that blend primitive needs very different instruction +// order for AVX and ARM. +// So, let's introduce a combination of these two in order to avoid +// confusion for ppl who write in low-level SIMD instructions. Additionally, +// these two ops (cmp and blend) are very often used together. +inline void cmplt_and_blend_inplace( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + simd8float32& lowestValues, + simd8uint32& lowestIndices) { + for (size_t j = 0; j < 8; j++) { + bool comparison = (candidateValues.f32[j] < lowestValues.f32[j]); + if (comparison) { + lowestValues.f32[j] = candidateValues.f32[j]; + lowestIndices.u32[j] = candidateIndices.u32[j]; + } + } +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? 
candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + const simd8float32 currentValues, + const simd8uint32 currentIndices, + simd8float32& minValues, + simd8uint32& minIndices, + simd8float32& maxValues, + simd8uint32& maxIndices) { + for (size_t i = 0; i < 8; i++) { + bool flag = (candidateValues.f32[i] < currentValues.f32[i]); + minValues.f32[i] = flag ? candidateValues.f32[i] : currentValues.f32[i]; + minIndices.u32[i] = + flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + maxValues.f32[i] = + !flag ? candidateValues.f32[i] : currentValues.f32[i]; + maxIndices.u32[i] = + !flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + } +} + } // namespace } // namespace faiss diff -Nru faiss-1.7.3/faiss/utils/simdlib_neon.h faiss-1.7.4/faiss/utils/simdlib_neon.h --- faiss-1.7.3/faiss/utils/simdlib_neon.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/simdlib_neon.h 2023-04-19 13:18:30.000000000 +0000 @@ -18,6 +18,8 @@ #include +#include + namespace faiss { namespace detail { @@ -88,6 +90,23 @@ return v; } +// Surprisingly, vdupq_n_u16 has the type of +// uint16x8_t (std::uint32_t) , and vdupq_n_u8 also has +// uint8x16_t (std::uint32_t) on **some environments**. +// We want argument type as same as the type of element +// of result vector type (std::uint16_t for uint16x8_t, +// and std::uint8_t for uint8x16_t) instead of +// std::uint32_t due to using set1 function templates, +// so let's fix the argument type here and use these +// overload below. 
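A reduced model, outside the library, of the binding problem this comment describes: the set1 helpers below pass the intrinsic as a non-type template parameter, so its declared parameter type must match std::uint16_t exactly; toolchains that declare ::vdupq_n_u16 with a 32-bit parameter therefore need the thin wrapper. ARM-only sketch, with the demo:: names invented for illustration:

#include <arm_neon.h>

#include <cstdint>
#include <cstdio>

namespace demo {

// exact-signature wrapper, mirroring the one added below
inline uint16x8_t vdupq_n_u16(std::uint16_t v) {
    return ::vdupq_n_u16(v);
}

// simplified stand-in for the set1_impl::call pattern: F must have exactly
// the signature uint16x8_t(std::uint16_t) to be usable as a template argument
template <uint16x8_t (*F)(std::uint16_t)>
void set1(uint16x8x2_t& d, std::uint16_t t) {
    const auto v = F(t);
    d.val[0] = v;
    d.val[1] = v;
}

} // namespace demo

int main() {
    uint16x8x2_t d;
    demo::set1<&demo::vdupq_n_u16>(d, 42); // always well-formed
    // passing ::vdupq_n_u16 directly would fail on toolchains where its
    // parameter type is not exactly uint16_t (or where it is not addressable)
    printf("%d\n", (int)vgetq_lane_u16(d.val[0], 0)); // 42
    return 0;
}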
+static inline ::uint16x8_t vdupq_n_u16(std::uint16_t v) { + return ::vdupq_n_u16(v); +} + +static inline ::uint8x16_t vdupq_n_u8(std::uint8_t v) { + return ::vdupq_n_u8(v); +} + template < typename T, typename U = decltype(reinterpret_u8(std::declval().data))> @@ -119,11 +138,25 @@ return std::string(bits); } -template -static inline void set1(D& d, F&& f, T t) { - const auto v = f(t); - d.val[0] = v; - d.val[1] = v; +template +using remove_cv_ref_t = + typename std::remove_reference::type>::type; + +template +struct set1_impl { + D& d; + T t; + template ().val[0])> (*F)(T)> + inline void call() { + const auto v = F(t); + d.val[0] = v; + d.val[1] = v; + } +}; + +template +static inline set1_impl, T> set1(D& d, T t) { + return {d, t}; } template @@ -142,20 +175,57 @@ return std::string(res); } -template -static inline T unary_func(const T& a, F&& f) { - T t; - t.val[0] = f(a.val[0]); - t.val[1] = f(a.val[1]); - return t; +template +struct unary_func_impl { + const U& a; + using Telem = remove_cv_ref_t().val[0])>; + using Uelem = remove_cv_ref_t().val[0])>; + template + inline T call() { + T t; + t.val[0] = F(a.val[0]); + t.val[1] = F(a.val[1]); + return t; + } +}; + +template +static inline unary_func_impl, remove_cv_ref_t> unary_func( + const T& a) { + return {a}; +} + +template +static inline unary_func_impl, remove_cv_ref_t> unary_func( + const U& a) { + return {a}; +} + +template +struct binary_func_impl { + const U& a; + const U& b; + using Telem = remove_cv_ref_t().val[0])>; + using Uelem = remove_cv_ref_t().val[0])>; + template + inline T call() { + T t; + t.val[0] = F(a.val[0], b.val[0]); + t.val[1] = F(a.val[1], b.val[1]); + return t; + } +}; + +template +static inline binary_func_impl, remove_cv_ref_t> +binary_func(const T& a, const T& b) { + return {a, b}; } -template -static inline T binary_func(const T& a, const T& b, F&& f) { - T t; - t.val[0] = f(a.val[0], b.val[0]); - t.val[1] = f(a.val[1], b.val[1]); - return t; +template +static inline binary_func_impl, remove_cv_ref_t> +binary_func(const U& a, const U& b) { + return {a, b}; } static inline uint16_t vmovmask_u8(const uint8x16_t& v) { @@ -172,8 +242,8 @@ const uint16x8x2_t& d0, const uint16x8x2_t& d1, const uint16x8x2_t& thr) { - const auto d0_thr = detail::simdlib::binary_func(d0, thr, F); - const auto d1_thr = detail::simdlib::binary_func(d1, thr, F); + const auto d0_thr = detail::simdlib::binary_func(d0, thr).call(); + const auto d1_thr = detail::simdlib::binary_func(d1, thr).call(); const auto d0_mask = vmovmask_u8( vmovn_high_u16(vmovn_u16(d0_thr.val[0]), d0_thr.val[1])); const auto d1_mask = vmovmask_u8( @@ -207,6 +277,44 @@ explicit simd16uint16(const uint16x8x2_t& v) : data{v} {} + explicit simd16uint16( + uint16_t u0, + uint16_t u1, + uint16_t u2, + uint16_t u3, + uint16_t u4, + uint16_t u5, + uint16_t u6, + uint16_t u7, + uint16_t u8, + uint16_t u9, + uint16_t u10, + uint16_t u11, + uint16_t u12, + uint16_t u13, + uint16_t u14, + uint16_t u15) { + uint16_t temp[16] = { + u0, + u1, + u2, + u3, + u4, + u5, + u6, + u7, + u8, + u9, + u10, + u11, + u12, + u13, + u14, + u15}; + data.val[0] = vld1q_u16(temp); + data.val[1] = vld1q_u16(temp + 8); + } + template < typename T, typename std::enable_if< @@ -219,7 +327,8 @@ : data{vld1q_u16(x), vld1q_u16(x + 8)} {} void clear() { - detail::simdlib::set1(data, &vdupq_n_u16, static_cast(0)); + detail::simdlib::set1(data, static_cast(0)) + .call<&detail::simdlib::vdupq_n_u16>(); } void storeu(uint16_t* ptr) const { @@ -257,12 +366,12 @@ } void set1(uint16_t x) { - 
detail::simdlib::set1(data, &vdupq_n_u16, x); + detail::simdlib::set1(data, x).call<&detail::simdlib::vdupq_n_u16>(); } simd16uint16 operator*(const simd16uint16& other) const { - return simd16uint16{ - detail::simdlib::binary_func(data, other.data, &vmulq_u16)}; + return simd16uint16{detail::simdlib::binary_func(data, other.data) + .call<&vmulq_u16>()}; } // shift must be known at compile time @@ -271,50 +380,56 @@ case 0: return *this; case 1: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<1>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 2: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<2>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 3: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<3>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 4: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<4>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 5: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<5>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 6: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<6>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 7: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<7>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 8: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<8>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 9: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<9>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 10: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<10>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 11: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<11>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 12: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<12>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 13: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<13>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 14: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<14>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 15: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<15>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; default: FAISS_THROW_FMT("Invalid shift %d", shift); } @@ -326,50 +441,56 @@ case 0: return *this; case 1: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<1>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 2: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<2>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 3: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<3>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 4: - 
return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<4>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 5: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<5>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 6: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<6>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 7: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<7>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 8: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<8>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 9: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<9>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 10: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<10>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 11: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<11>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 12: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<12>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 13: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<13>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 14: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<14>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 15: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<15>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; default: FAISS_THROW_FMT("Invalid shift %d", shift); } @@ -386,13 +507,13 @@ } simd16uint16 operator+(const simd16uint16& other) const { - return simd16uint16{ - detail::simdlib::binary_func(data, other.data, &vaddq_u16)}; + return simd16uint16{detail::simdlib::binary_func(data, other.data) + .call<&vaddq_u16>()}; } simd16uint16 operator-(const simd16uint16& other) const { - return simd16uint16{ - detail::simdlib::binary_func(data, other.data, &vsubq_u16)}; + return simd16uint16{detail::simdlib::binary_func(data, other.data) + .call<&vsubq_u16>()}; } template < @@ -401,10 +522,10 @@ detail::simdlib::is_simd256bit::value, std::nullptr_t>::type = nullptr> simd16uint16 operator&(const T& other) const { - return simd16uint16{detail::simdlib::binary_func( - data, - detail::simdlib::reinterpret_u16(other.data), - &vandq_u16)}; + return simd16uint16{ + detail::simdlib::binary_func( + data, detail::simdlib::reinterpret_u16(other.data)) + .template call<&vandq_u16>()}; } template < @@ -413,20 +534,45 @@ detail::simdlib::is_simd256bit::value, std::nullptr_t>::type = nullptr> simd16uint16 operator|(const T& other) const { - return simd16uint16{detail::simdlib::binary_func( - data, - detail::simdlib::reinterpret_u16(other.data), - &vorrq_u16)}; + return simd16uint16{ + detail::simdlib::binary_func( + data, detail::simdlib::reinterpret_u16(other.data)) + .template call<&vorrq_u16>()}; + } + + template < + typename T, + typename std::enable_if< + detail::simdlib::is_simd256bit::value, + std::nullptr_t>::type = nullptr> + simd16uint16 
operator^(const T& other) const { + return simd16uint16{ + detail::simdlib::binary_func( + data, detail::simdlib::reinterpret_u16(other.data)) + .template call<&veorq_u16>()}; } // returns binary masks simd16uint16 operator==(const simd16uint16& other) const { - return simd16uint16{ - detail::simdlib::binary_func(data, other.data, &vceqq_u16)}; + return simd16uint16{detail::simdlib::binary_func(data, other.data) + .call<&vceqq_u16>()}; + } + + // Checks whether the other holds exactly the same bytes. + bool is_same_as(simd16uint16 other) const { + const bool equal0 = + (vminvq_u16(vceqq_u16(data.val[0], other.data.val[0])) == + 0xffff); + const bool equal1 = + (vminvq_u16(vceqq_u16(data.val[1], other.data.val[1])) == + 0xffff); + + return equal0 && equal1; } simd16uint16 operator~() const { - return simd16uint16{detail::simdlib::unary_func(data, &vmvnq_u16)}; + return simd16uint16{ + detail::simdlib::unary_func(data).call<&vmvnq_u16>()}; } // get scalar at index 0 @@ -437,8 +583,8 @@ // mask of elements where this >= thresh // 2 bit per component: 16 * 2 = 32 bit uint32_t ge_mask(const simd16uint16& thresh) const { - const auto input = - detail::simdlib::binary_func(data, thresh.data, &vcgeq_u16); + const auto input = detail::simdlib::binary_func(data, thresh.data) + .call<&vcgeq_u16>(); const auto vmovmask_u16 = [](uint16x8_t v) -> uint16_t { uint16_t d[8]; const auto v2 = vreinterpretq_u32_u16(vshrq_n_u16(v, 14)); @@ -471,23 +617,25 @@ } void accu_min(const simd16uint16& incoming) { - data = detail::simdlib::binary_func(incoming.data, data, &vminq_u16); + data = detail::simdlib::binary_func(incoming.data, data) + .call<&vminq_u16>(); } void accu_max(const simd16uint16& incoming) { - data = detail::simdlib::binary_func(incoming.data, data, &vmaxq_u16); + data = detail::simdlib::binary_func(incoming.data, data) + .call<&vmaxq_u16>(); } }; // not really a std::min because it returns an elementwise min inline simd16uint16 min(const simd16uint16& av, const simd16uint16& bv) { return simd16uint16{ - detail::simdlib::binary_func(av.data, bv.data, &vminq_u16)}; + detail::simdlib::binary_func(av.data, bv.data).call<&vminq_u16>()}; } inline simd16uint16 max(const simd16uint16& av, const simd16uint16& bv) { return simd16uint16{ - detail::simdlib::binary_func(av.data, bv.data, &vmaxq_u16)}; + detail::simdlib::binary_func(av.data, bv.data).call<&vmaxq_u16>()}; } // decompose in 128-lanes: a = (a0, a1), b = (b0, b1) @@ -515,6 +663,63 @@ return detail::simdlib::cmp_xe32<&vcleq_u16>(d0.data, d1.data, thr.data); } +// hadd does not cross lanes +inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) { + return simd16uint16{ + detail::simdlib::binary_func(a.data, b.data).call<&vpaddq_u16>()}; +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. 
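The hadd overload added in this file (and its AVX2 and emulated counterparts earlier in the diff) sums adjacent pairs without crossing 128-bit lanes, so the output interleaves blocks of a-pairs and b-pairs per lane. A quick, hedged check of that element order; faiss/utils/simdlib.h is assumed to select the backend:

#include <faiss/utils/simdlib.h>

#include <cstdint>
#include <cstdio>

int main() {
    faiss::simd16uint16 a(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    faiss::simd16uint16 b(100, 100, 200, 200, 300, 300, 400, 400,
                          500, 500, 600, 600, 700, 700, 800, 800);

    faiss::simd16uint16 c = faiss::hadd(a, b);

    uint16_t out[16];
    c.storeu(out);
    // expected: 1 5 9 13  200 400 600 800  17 21 25 29  1000 1200 1400 1600
    for (int i = 0; i < 16; i++) {
        printf("%d ", (int)out[i]);
    }
    printf("\n");
    return 0;
}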
+inline void cmplt_min_max_fast( + const simd16uint16 candidateValues, + const simd16uint16 candidateIndices, + const simd16uint16 currentValues, + const simd16uint16 currentIndices, + simd16uint16& minValues, + simd16uint16& minIndices, + simd16uint16& maxValues, + simd16uint16& maxIndices) { + const uint16x8x2_t comparison = uint16x8x2_t{ + vcltq_u16(candidateValues.data.val[0], currentValues.data.val[0]), + vcltq_u16(candidateValues.data.val[1], currentValues.data.val[1])}; + + minValues.data = uint16x8x2_t{ + vminq_u16(candidateValues.data.val[0], currentValues.data.val[0]), + vminq_u16(candidateValues.data.val[1], currentValues.data.val[1])}; + minIndices.data = uint16x8x2_t{ + vbslq_u16( + comparison.val[0], + candidateIndices.data.val[0], + currentIndices.data.val[0]), + vbslq_u16( + comparison.val[1], + candidateIndices.data.val[1], + currentIndices.data.val[1])}; + + maxValues.data = uint16x8x2_t{ + vmaxq_u16(candidateValues.data.val[0], currentValues.data.val[0]), + vmaxq_u16(candidateValues.data.val[1], currentValues.data.val[1])}; + maxIndices.data = uint16x8x2_t{ + vbslq_u16( + comparison.val[0], + currentIndices.data.val[0], + candidateIndices.data.val[0]), + vbslq_u16( + comparison.val[1], + currentIndices.data.val[1], + candidateIndices.data.val[1])}; +} + // vector of 32 unsigned 8-bit integers struct simd32uint8 { uint8x16x2_t data; @@ -528,6 +733,47 @@ explicit simd32uint8(const uint8x16x2_t& v) : data{v} {} template < + uint8_t _0, + uint8_t _1, + uint8_t _2, + uint8_t _3, + uint8_t _4, + uint8_t _5, + uint8_t _6, + uint8_t _7, + uint8_t _8, + uint8_t _9, + uint8_t _10, + uint8_t _11, + uint8_t _12, + uint8_t _13, + uint8_t _14, + uint8_t _15, + uint8_t _16, + uint8_t _17, + uint8_t _18, + uint8_t _19, + uint8_t _20, + uint8_t _21, + uint8_t _22, + uint8_t _23, + uint8_t _24, + uint8_t _25, + uint8_t _26, + uint8_t _27, + uint8_t _28, + uint8_t _29, + uint8_t _30, + uint8_t _31> + static simd32uint8 create() { + constexpr uint8_t ds[32] = {_0, _1, _2, _3, _4, _5, _6, _7, + _8, _9, _10, _11, _12, _13, _14, _15, + _16, _17, _18, _19, _20, _21, _22, _23, + _24, _25, _26, _27, _28, _29, _30, _31}; + return simd32uint8{ds}; + } + + template < typename T, typename std::enable_if< detail::simdlib::is_simd256bit::value, @@ -539,7 +785,8 @@ : data{vld1q_u8(x), vld1q_u8(x + 16)} {} void clear() { - detail::simdlib::set1(data, &vdupq_n_u8, static_cast(0)); + detail::simdlib::set1(data, static_cast(0)) + .call<&detail::simdlib::vdupq_n_u8>(); } void storeu(uint8_t* ptr) const { @@ -582,7 +829,7 @@ } void set1(uint8_t x) { - detail::simdlib::set1(data, &vdupq_n_u8, x); + detail::simdlib::set1(data, x).call<&detail::simdlib::vdupq_n_u8>(); } template < @@ -591,19 +838,21 @@ detail::simdlib::is_simd256bit::value, std::nullptr_t>::type = nullptr> simd32uint8 operator&(const T& other) const { - return simd32uint8{detail::simdlib::binary_func( - data, detail::simdlib::reinterpret_u8(other.data), &vandq_u8)}; + return simd32uint8{ + detail::simdlib::binary_func( + data, detail::simdlib::reinterpret_u8(other.data)) + .template call<&vandq_u8>()}; } simd32uint8 operator+(const simd32uint8& other) const { - return simd32uint8{ - detail::simdlib::binary_func(data, other.data, &vaddq_u8)}; + return simd32uint8{detail::simdlib::binary_func(data, other.data) + .call<&vaddq_u8>()}; } // The very important operation that everything relies on simd32uint8 lookup_2_lanes(const simd32uint8& idx) const { - return simd32uint8{ - detail::simdlib::binary_func(data, idx.data, &vqtbl1q_u8)}; + return 
simd32uint8{detail::simdlib::binary_func(data, idx.data) + .call<&vqtbl1q_u8>()}; } simd32uint8 operator+=(const simd32uint8& other) { @@ -618,6 +867,16 @@ vst1q_u8(tab, data.val[high]); return tab[i - high * 16]; } + + // Checks whether the other holds exactly the same bytes. + bool is_same_as(simd32uint8 other) const { + const bool equal0 = + (vminvq_u8(vceqq_u8(data.val[0], other.data.val[0])) == 0xff); + const bool equal1 = + (vminvq_u8(vceqq_u8(data.val[1], other.data.val[1])) == 0xff); + + return equal0 && equal1; + } }; // convert with saturation @@ -671,8 +930,62 @@ explicit simd8uint32(const uint8_t* x) : simd8uint32(simd32uint8(x)) {} + explicit simd8uint32( + uint32_t u0, + uint32_t u1, + uint32_t u2, + uint32_t u3, + uint32_t u4, + uint32_t u5, + uint32_t u6, + uint32_t u7) { + uint32_t temp[8] = {u0, u1, u2, u3, u4, u5, u6, u7}; + data.val[0] = vld1q_u32(temp); + data.val[1] = vld1q_u32(temp + 4); + } + + simd8uint32 operator+(simd8uint32 other) const { + return simd8uint32{detail::simdlib::binary_func(data, other.data) + .call<&vaddq_u32>()}; + } + + simd8uint32 operator-(simd8uint32 other) const { + return simd8uint32{detail::simdlib::binary_func(data, other.data) + .call<&vsubq_u32>()}; + } + + simd8uint32& operator+=(const simd8uint32& other) { + data.val[0] = vaddq_u32(data.val[0], other.data.val[0]); + data.val[1] = vaddq_u32(data.val[1], other.data.val[1]); + return *this; + } + + bool operator==(simd8uint32 other) const { + const auto equals = detail::simdlib::binary_func(data, other.data) + .call<&vceqq_u32>(); + const auto equal = vandq_u32(equals.val[0], equals.val[1]); + return vminvq_u32(equal) == 0xffffffff; + } + + bool operator!=(simd8uint32 other) const { + return !(*this == other); + } + + // Checks whether the other holds exactly the same bytes. + bool is_same_as(simd8uint32 other) const { + const bool equal0 = + (vminvq_u32(vceqq_u32(data.val[0], other.data.val[0])) == + 0xffffffff); + const bool equal1 = + (vminvq_u32(vceqq_u32(data.val[1], other.data.val[1])) == + 0xffffffff); + + return equal0 && equal1; + } + void clear() { - detail::simdlib::set1(data, &vdupq_n_u32, static_cast(0)); + detail::simdlib::set1(data, static_cast(0)) + .call<&vdupq_n_u32>(); } void storeu(uint32_t* ptr) const { @@ -710,10 +1023,67 @@ } void set1(uint32_t x) { - detail::simdlib::set1(data, &vdupq_n_u32, x); + detail::simdlib::set1(data, x).call<&vdupq_n_u32>(); + } + + simd8uint32 unzip() const { + return simd8uint32{uint32x4x2_t{ + vuzp1q_u32(data.val[0], data.val[1]), + vuzp2q_u32(data.val[0], data.val[1])}}; } }; +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. 
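// A minimal usage sketch for the 8-lane variant that follows (standalone, not
// part of the patch): one vectorized compare-exchange of 8 (value, index)
// pairs. It assumes that including <faiss/utils/simdlib.h> selects a backend
// that provides simd8uint32 and this cmplt_min_max_fast overload, as the NEON
// implementation defined here does.
#include <cstdint>

#include <faiss/utils/simdlib.h>

inline void compare_exchange_8_example() {
    using faiss::simd8uint32;

    const simd8uint32 candV(5, 1, 9, 7, 2, 8, 3, 6);
    const simd8uint32 candI(0, 1, 2, 3, 4, 5, 6, 7);
    const simd8uint32 currV(4, 4, 4, 4, 4, 4, 4, 4);
    const simd8uint32 currI(8, 9, 10, 11, 12, 13, 14, 15);

    simd8uint32 minV = candV, minI = candI, maxV = currV, maxI = currI;
    cmplt_min_max_fast(candV, candI, currV, currI, minV, minI, maxV, maxI);

    // Lane 0: candV=5 is not < currV=4, so minV[0]=4 keeps its index 8 and
    // maxV[0]=5 keeps its index 0; indices follow their values in every lane.
    uint32_t mins[8];
    uint32_t minIds[8];
    minV.storeu(mins);
    minI.storeu(minIds);
    (void)mins;
    (void)minIds; // silence unused-variable warnings in this sketch
}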
+inline void cmplt_min_max_fast( + const simd8uint32 candidateValues, + const simd8uint32 candidateIndices, + const simd8uint32 currentValues, + const simd8uint32 currentIndices, + simd8uint32& minValues, + simd8uint32& minIndices, + simd8uint32& maxValues, + simd8uint32& maxIndices) { + const uint32x4x2_t comparison = uint32x4x2_t{ + vcltq_u32(candidateValues.data.val[0], currentValues.data.val[0]), + vcltq_u32(candidateValues.data.val[1], currentValues.data.val[1])}; + + minValues.data = uint32x4x2_t{ + vminq_u32(candidateValues.data.val[0], currentValues.data.val[0]), + vminq_u32(candidateValues.data.val[1], currentValues.data.val[1])}; + minIndices.data = uint32x4x2_t{ + vbslq_u32( + comparison.val[0], + candidateIndices.data.val[0], + currentIndices.data.val[0]), + vbslq_u32( + comparison.val[1], + candidateIndices.data.val[1], + currentIndices.data.val[1])}; + + maxValues.data = uint32x4x2_t{ + vmaxq_u32(candidateValues.data.val[0], currentValues.data.val[0]), + vmaxq_u32(candidateValues.data.val[1], currentValues.data.val[1])}; + maxIndices.data = uint32x4x2_t{ + vbslq_u32( + comparison.val[0], + currentIndices.data.val[0], + candidateIndices.data.val[0]), + vbslq_u32( + comparison.val[1], + currentIndices.data.val[1], + candidateIndices.data.val[1])}; +} + struct simd8float32 { float32x4x2_t data; @@ -734,8 +1104,22 @@ explicit simd8float32(const float* x) : data{vld1q_f32(x), vld1q_f32(x + 4)} {} + explicit simd8float32( + float f0, + float f1, + float f2, + float f3, + float f4, + float f5, + float f6, + float f7) { + float temp[8] = {f0, f1, f2, f3, f4, f5, f6, f7}; + data.val[0] = vld1q_f32(temp); + data.val[1] = vld1q_f32(temp + 4); + } + void clear() { - detail::simdlib::set1(data, &vdupq_n_f32, 0.f); + detail::simdlib::set1(data, 0.f).call<&vdupq_n_f32>(); } void storeu(float* ptr) const { @@ -761,18 +1145,50 @@ } simd8float32 operator*(const simd8float32& other) const { - return simd8float32{ - detail::simdlib::binary_func(data, other.data, &vmulq_f32)}; + return simd8float32{detail::simdlib::binary_func(data, other.data) + .call<&vmulq_f32>()}; } simd8float32 operator+(const simd8float32& other) const { - return simd8float32{ - detail::simdlib::binary_func(data, other.data, &vaddq_f32)}; + return simd8float32{detail::simdlib::binary_func(data, other.data) + .call<&vaddq_f32>()}; } simd8float32 operator-(const simd8float32& other) const { - return simd8float32{ - detail::simdlib::binary_func(data, other.data, &vsubq_f32)}; + return simd8float32{detail::simdlib::binary_func(data, other.data) + .call<&vsubq_f32>()}; + } + + simd8float32& operator+=(const simd8float32& other) { + // In this context, it is more compiler friendly to write intrinsics + // directly instead of using binary_func + data.val[0] = vaddq_f32(data.val[0], other.data.val[0]); + data.val[1] = vaddq_f32(data.val[1], other.data.val[1]); + return *this; + } + + bool operator==(simd8float32 other) const { + const auto equals = + detail::simdlib::binary_func<::uint32x4x2_t>(data, other.data) + .call<&vceqq_f32>(); + const auto equal = vandq_u32(equals.val[0], equals.val[1]); + return vminvq_u32(equal) == 0xffffffff; + } + + bool operator!=(simd8float32 other) const { + return !(*this == other); + } + + // Checks whether the other holds exactly the same bytes. 
+ bool is_same_as(simd8float32 other) const { + const bool equal0 = + (vminvq_u32(vceqq_f32(data.val[0], other.data.val[0])) == + 0xffffffff); + const bool equal1 = + (vminvq_u32(vceqq_f32(data.val[1], other.data.val[1])) == + 0xffffffff); + + return equal0 && equal1; } std::string tostring() const { @@ -783,17 +1199,17 @@ // hadd does not cross lanes inline simd8float32 hadd(const simd8float32& a, const simd8float32& b) { return simd8float32{ - detail::simdlib::binary_func(a.data, b.data, &vpaddq_f32)}; + detail::simdlib::binary_func(a.data, b.data).call<&vpaddq_f32>()}; } inline simd8float32 unpacklo(const simd8float32& a, const simd8float32& b) { return simd8float32{ - detail::simdlib::binary_func(a.data, b.data, &vzip1q_f32)}; + detail::simdlib::binary_func(a.data, b.data).call<&vzip1q_f32>()}; } inline simd8float32 unpackhi(const simd8float32& a, const simd8float32& b) { return simd8float32{ - detail::simdlib::binary_func(a.data, b.data, &vzip2q_f32)}; + detail::simdlib::binary_func(a.data, b.data).call<&vzip2q_f32>()}; } // compute a * b + c @@ -806,20 +1222,129 @@ vfmaq_f32(c.data.val[1], a.data.val[1], b.data.val[1])}}; } +// The following primitive is a vectorized version of the following code +// snippet: +// float lowestValue = HUGE_VAL; +// uint lowestIndex = 0; +// for (size_t i = 0; i < n; i++) { +// if (values[i] < lowestValue) { +// lowestValue = values[i]; +// lowestIndex = i; +// } +// } +// Vectorized version can be implemented via two operations: cmp and blend +// with something like this: +// lowestValues = [HUGE_VAL; 8]; +// lowestIndices = {0, 1, 2, 3, 4, 5, 6, 7}; +// for (size_t i = 0; i < n; i += 8) { +// auto comparison = cmp(values + i, lowestValues); +// lowestValues = blend( +// comparison, +// values + i, +// lowestValues); +// lowestIndices = blend( +// comparison, +// i + {0, 1, 2, 3, 4, 5, 6, 7}, +// lowestIndices); +// lowestIndices += {8, 8, 8, 8, 8, 8, 8, 8}; +// } +// The problem is that blend primitive needs very different instruction +// order for AVX and ARM. +// So, let's introduce a combination of these two in order to avoid +// confusion for ppl who write in low-level SIMD instructions. Additionally, +// these two ops (cmp and blend) are very often used together. +inline void cmplt_and_blend_inplace( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + simd8float32& lowestValues, + simd8uint32& lowestIndices) { + const auto comparison = detail::simdlib::binary_func<::uint32x4x2_t>( + candidateValues.data, lowestValues.data) + .call<&vcltq_f32>(); + + lowestValues.data = float32x4x2_t{ + vbslq_f32( + comparison.val[0], + candidateValues.data.val[0], + lowestValues.data.val[0]), + vbslq_f32( + comparison.val[1], + candidateValues.data.val[1], + lowestValues.data.val[1])}; + lowestIndices.data = uint32x4x2_t{ + vbslq_u32( + comparison.val[0], + candidateIndices.data.val[0], + lowestIndices.data.val[0]), + vbslq_u32( + comparison.val[1], + candidateIndices.data.val[1], + lowestIndices.data.val[1])}; +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? 
candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + const simd8float32 currentValues, + const simd8uint32 currentIndices, + simd8float32& minValues, + simd8uint32& minIndices, + simd8float32& maxValues, + simd8uint32& maxIndices) { + const uint32x4x2_t comparison = uint32x4x2_t{ + vcltq_f32(candidateValues.data.val[0], currentValues.data.val[0]), + vcltq_f32(candidateValues.data.val[1], currentValues.data.val[1])}; + + minValues.data = float32x4x2_t{ + vminq_f32(candidateValues.data.val[0], currentValues.data.val[0]), + vminq_f32(candidateValues.data.val[1], currentValues.data.val[1])}; + minIndices.data = uint32x4x2_t{ + vbslq_u32( + comparison.val[0], + candidateIndices.data.val[0], + currentIndices.data.val[0]), + vbslq_u32( + comparison.val[1], + candidateIndices.data.val[1], + currentIndices.data.val[1])}; + + maxValues.data = float32x4x2_t{ + vmaxq_f32(candidateValues.data.val[0], currentValues.data.val[0]), + vmaxq_f32(candidateValues.data.val[1], currentValues.data.val[1])}; + maxIndices.data = uint32x4x2_t{ + vbslq_u32( + comparison.val[0], + currentIndices.data.val[0], + candidateIndices.data.val[0]), + vbslq_u32( + comparison.val[1], + currentIndices.data.val[1], + candidateIndices.data.val[1])}; +} + namespace { // get even float32's of a and b, interleaved simd8float32 geteven(const simd8float32& a, const simd8float32& b) { - return simd8float32{float32x4x2_t{ - vuzp1q_f32(a.data.val[0], b.data.val[0]), - vuzp1q_f32(a.data.val[1], b.data.val[1])}}; + return simd8float32{ + detail::simdlib::binary_func(a.data, b.data).call<&vuzp1q_f32>()}; } // get odd float32's of a and b, interleaved simd8float32 getodd(const simd8float32& a, const simd8float32& b) { - return simd8float32{float32x4x2_t{ - vuzp2q_f32(a.data.val[0], b.data.val[0]), - vuzp2q_f32(a.data.val[1], b.data.val[1])}}; + return simd8float32{ + detail::simdlib::binary_func(a.data, b.data).call<&vuzp2q_f32>()}; } // 3 cycles diff -Nru faiss-1.7.3/faiss/utils/sorting.cpp faiss-1.7.4/faiss/utils/sorting.cpp --- faiss-1.7.3/faiss/utils/sorting.cpp 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/sorting.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,692 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + +#include +#include + +#include +#include + +namespace faiss { + +/***************************************************************************** + * Argsort + ****************************************************************************/ + +namespace { +struct ArgsortComparator { + const float* vals; + bool operator()(const size_t a, const size_t b) const { + return vals[a] < vals[b]; + } +}; + +struct SegmentS { + size_t i0; // begin pointer in the permutation array + size_t i1; // end + size_t len() const { + return i1 - i0; + } +}; + +// see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge +// extended to > 1 merge thread + +// merges 2 ranges that should be consecutive on the source into +// the union of the two on the destination +template +void parallel_merge( + const T* src, + T* dst, + SegmentS& s1, + SegmentS& s2, + int nt, + const ArgsortComparator& comp) { + if (s2.len() > s1.len()) { // make sure that s1 larger than s2 + std::swap(s1, s2); + } + + // compute sub-ranges for each thread + std::vector s1s(nt), s2s(nt), sws(nt); + s2s[0].i0 = s2.i0; + s2s[nt - 1].i1 = s2.i1; + + // not sure parallel actually helps here +#pragma omp parallel for num_threads(nt) + for (int t = 0; t < nt; t++) { + s1s[t].i0 = s1.i0 + s1.len() * t / nt; + s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt; + + if (t + 1 < nt) { + T pivot = src[s1s[t].i1]; + size_t i0 = s2.i0, i1 = s2.i1; + while (i0 + 1 < i1) { + size_t imed = (i1 + i0) / 2; + if (comp(pivot, src[imed])) { + i1 = imed; + } else { + i0 = imed; + } + } + s2s[t].i1 = s2s[t + 1].i0 = i1; + } + } + s1.i0 = std::min(s1.i0, s2.i0); + s1.i1 = std::max(s1.i1, s2.i1); + s2 = s1; + sws[0].i0 = s1.i0; + for (int t = 0; t < nt; t++) { + sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len(); + if (t + 1 < nt) { + sws[t + 1].i0 = sws[t].i1; + } + } + assert(sws[nt - 1].i1 == s1.i1); + + // do the actual merging +#pragma omp parallel for num_threads(nt) + for (int t = 0; t < nt; t++) { + SegmentS sw = sws[t]; + SegmentS s1t = s1s[t]; + SegmentS s2t = s2s[t]; + if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) { + for (;;) { + // assert (sw.len() == s1t.len() + s2t.len()); + if (comp(src[s1t.i0], src[s2t.i0])) { + dst[sw.i0++] = src[s1t.i0++]; + if (s1t.i0 == s1t.i1) { + break; + } + } else { + dst[sw.i0++] = src[s2t.i0++]; + if (s2t.i0 == s2t.i1) { + break; + } + } + } + } + if (s1t.len() > 0) { + assert(s1t.len() == sw.len()); + memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0])); + } else if (s2t.len() > 0) { + assert(s2t.len() == sw.len()); + memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0])); + } + } +} + +}; // namespace + +void fvec_argsort(size_t n, const float* vals, size_t* perm) { + for (size_t i = 0; i < n; i++) { + perm[i] = i; + } + ArgsortComparator comp = {vals}; + std::sort(perm, perm + n, comp); +} + +void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) { + size_t* perm2 = new size_t[n]; + // 2 result tables, during merging, flip between them + size_t *permB = perm2, *permA = perm; + + int nt = omp_get_max_threads(); + { // prepare correct permutation so that the result ends in perm + // at final iteration + int nseg = nt; + while (nseg > 1) { + nseg = (nseg + 1) / 2; + std::swap(permA, permB); + } + } + +#pragma omp parallel + for (size_t i = 0; i < n; i++) { + permA[i] = i; + } + + ArgsortComparator comp = {vals}; + + std::vector segs(nt); + + // independent sorts +#pragma omp parallel for + for (int t = 0; t < nt; t++) { + size_t i0 = t * n / nt; + size_t i1 = (t + 
1) * n / nt; + SegmentS seg = {i0, i1}; + std::sort(permA + seg.i0, permA + seg.i1, comp); + segs[t] = seg; + } + int prev_nested = omp_get_nested(); + omp_set_nested(1); + + int nseg = nt; + while (nseg > 1) { + int nseg1 = (nseg + 1) / 2; + int sub_nt = nseg % 2 == 0 ? nt : nt - 1; + int sub_nseg1 = nseg / 2; + +#pragma omp parallel for num_threads(nseg1) + for (int s = 0; s < nseg; s += 2) { + if (s + 1 == nseg) { // otherwise isolated segment + memcpy(permB + segs[s].i0, + permA + segs[s].i0, + segs[s].len() * sizeof(size_t)); + } else { + int t0 = s * sub_nt / sub_nseg1; + int t1 = (s + 1) * sub_nt / sub_nseg1; + printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0); + parallel_merge( + permA, permB, segs[s], segs[s + 1], t1 - t0, comp); + } + } + for (int s = 0; s < nseg; s += 2) { + segs[s / 2] = segs[s]; + } + nseg = nseg1; + std::swap(permA, permB); + } + assert(permA == perm); + omp_set_nested(prev_nested); + delete[] perm2; +} + +/***************************************************************************** + * Bucket sort + ****************************************************************************/ + +// extern symbol in the .h +int bucket_sort_verbose = 0; + +namespace { + +void bucket_sort_ref( + size_t nval, + const uint64_t* vals, + uint64_t vmax, + int64_t* lims, + int64_t* perm) { + double t0 = getmillisecs(); + memset(lims, 0, sizeof(*lims) * (vmax + 1)); + for (size_t i = 0; i < nval; i++) { + FAISS_THROW_IF_NOT(vals[i] < vmax); + lims[vals[i] + 1]++; + } + double t1 = getmillisecs(); + // compute cumulative sum + for (size_t i = 0; i < vmax; i++) { + lims[i + 1] += lims[i]; + } + FAISS_THROW_IF_NOT(lims[vmax] == nval); + double t2 = getmillisecs(); + // populate buckets + for (size_t i = 0; i < nval; i++) { + perm[lims[vals[i]]++] = i; + } + double t3 = getmillisecs(); + // reset pointers + for (size_t i = vmax; i > 0; i--) { + lims[i] = lims[i - 1]; + } + lims[0] = 0; + double t4 = getmillisecs(); + if (bucket_sort_verbose) { + printf("times %.3f %.3f %.3f %.3f\n", + t1 - t0, + t2 - t1, + t3 - t2, + t4 - t3); + } +} + +void bucket_sort_parallel( + size_t nval, + const uint64_t* vals, + uint64_t vmax, + int64_t* lims, + int64_t* perm, + int nt_in) { + memset(lims, 0, sizeof(*lims) * (vmax + 1)); +#pragma omp parallel num_threads(nt_in) + { + int nt = omp_get_num_threads(); // might be different from nt_in + int rank = omp_get_thread_num(); + std::vector local_lims(vmax + 1); + + // range of indices handled by this thread + size_t i0 = nval * rank / nt; + size_t i1 = nval * (rank + 1) / nt; + + // build histogram in local lims + double t0 = getmillisecs(); + for (size_t i = i0; i < i1; i++) { + local_lims[vals[i]]++; + } +#pragma omp critical + { // accumulate histograms (not shifted indices to prepare cumsum) + for (size_t i = 0; i < vmax; i++) { + lims[i + 1] += local_lims[i]; + } + } +#pragma omp barrier + + double t1 = getmillisecs(); +#pragma omp master + { + // compute cumulative sum + for (size_t i = 0; i < vmax; i++) { + lims[i + 1] += lims[i]; + } + FAISS_THROW_IF_NOT(lims[vmax] == nval); + } +#pragma omp barrier + +#pragma omp critical + { // current thread grabs a slot in the buckets + for (size_t i = 0; i < vmax; i++) { + size_t nv = local_lims[i]; + local_lims[i] = lims[i]; // where we should start writing + lims[i] += nv; + } + } + + double t2 = getmillisecs(); +#pragma omp barrier + { // populate buckets, this is the slowest operation + for (size_t i = i0; i < i1; i++) { + perm[local_lims[vals[i]]++] = i; + } + } +#pragma omp barrier + double t3 = 
getmillisecs(); + +#pragma omp master + { // shift back lims + for (size_t i = vmax; i > 0; i--) { + lims[i] = lims[i - 1]; + } + lims[0] = 0; + double t4 = getmillisecs(); + if (bucket_sort_verbose) { + printf("times %.3f %.3f %.3f %.3f\n", + t1 - t0, + t2 - t1, + t3 - t2, + t4 - t3); + } + } + } +} + +/*********************************************** + * in-place bucket sort + */ + +template +void bucket_sort_inplace_ref( + size_t nrow, + size_t ncol, + TI* vals, + TI nbucket, + int64_t* lims) { + double t0 = getmillisecs(); + size_t nval = nrow * ncol; + FAISS_THROW_IF_NOT( + nbucket < nval); // unclear what would happen in this case... + + memset(lims, 0, sizeof(*lims) * (nbucket + 1)); + for (size_t i = 0; i < nval; i++) { + FAISS_THROW_IF_NOT(vals[i] < nbucket); + lims[vals[i] + 1]++; + } + double t1 = getmillisecs(); + // compute cumulative sum + for (size_t i = 0; i < nbucket; i++) { + lims[i + 1] += lims[i]; + } + FAISS_THROW_IF_NOT(lims[nbucket] == nval); + double t2 = getmillisecs(); + + std::vector ptrs(nbucket); + for (size_t i = 0; i < nbucket; i++) { + ptrs[i] = lims[i]; + } + + // find loops in the permutation and follow them + TI row = -1; + TI init_bucket_no = 0, bucket_no = 0; + for (;;) { + size_t idx = ptrs[bucket_no]; + if (row >= 0) { + ptrs[bucket_no] += 1; + } + assert(idx < lims[bucket_no + 1]); + TI next_bucket_no = vals[idx]; + vals[idx] = row; + if (next_bucket_no != -1) { + row = idx / ncol; + bucket_no = next_bucket_no; + } else { + // start new loop + for (; init_bucket_no < nbucket; init_bucket_no++) { + if (ptrs[init_bucket_no] < lims[init_bucket_no + 1]) { + break; + } + } + if (init_bucket_no == nbucket) { // we're done + break; + } + bucket_no = init_bucket_no; + row = -1; + } + } + + for (size_t i = 0; i < nbucket; i++) { + assert(ptrs[i] == lims[i + 1]); + } + double t3 = getmillisecs(); + if (bucket_sort_verbose) { + printf("times %.3f %.3f %.3f\n", t1 - t0, t2 - t1, t3 - t2); + } +} + +// collects row numbers to write into buckets +template +struct ToWrite { + TI nbucket; + std::vector buckets; + std::vector rows; + std::vector lims; + + explicit ToWrite(TI nbucket) : nbucket(nbucket) { + lims.resize(nbucket + 1); + } + + /// add one element (row) to write in bucket b + void add(TI row, TI b) { + assert(b >= 0 && b < nbucket); + rows.push_back(row); + buckets.push_back(b); + } + + void bucket_sort() { + FAISS_THROW_IF_NOT(buckets.size() == rows.size()); + lims.resize(nbucket + 1); + memset(lims.data(), 0, sizeof(lims[0]) * (nbucket + 1)); + + for (size_t i = 0; i < buckets.size(); i++) { + assert(buckets[i] >= 0 && buckets[i] < nbucket); + lims[buckets[i] + 1]++; + } + // compute cumulative sum + for (size_t i = 0; i < nbucket; i++) { + lims[i + 1] += lims[i]; + } + FAISS_THROW_IF_NOT(lims[nbucket] == buckets.size()); + + // could also do a circular perm... 
+ std::vector new_rows(rows.size()); + std::vector ptrs = lims; + for (size_t i = 0; i < buckets.size(); i++) { + TI b = buckets[i]; + assert(ptrs[b] < lims[b + 1]); + new_rows[ptrs[b]++] = rows[i]; + } + buckets.resize(0); + std::swap(rows, new_rows); + } + + void swap(ToWrite& other) { + assert(nbucket == other.nbucket); + buckets.swap(other.buckets); + rows.swap(other.rows); + lims.swap(other.lims); + } +}; + +template +void bucket_sort_inplace_parallel( + size_t nrow, + size_t ncol, + TI* vals, + TI nbucket, + int64_t* lims, + int nt_in) { + int verbose = bucket_sort_verbose; + memset(lims, 0, sizeof(*lims) * (nbucket + 1)); + std::vector> all_to_write; + size_t nval = nrow * ncol; + FAISS_THROW_IF_NOT( + nbucket < nval); // unclear what would happen in this case... + + // try to keep size of all_to_write < 5GiB + // but we need at least one element per bucket + size_t init_to_write = std::max( + size_t(nbucket), + std::min(nval / 10, ((size_t)5 << 30) / (sizeof(TI) * 3 * nt_in))); + if (verbose > 0) { + printf("init_to_write=%zd\n", init_to_write); + } + + std::vector ptrs(nbucket); // ptrs is shared across all threads + std::vector did_wrap( + nbucket); // DON'T use std::vector that cannot be accessed + // safely from multiple threads!!! + +#pragma omp parallel num_threads(nt_in) + { + int nt = omp_get_num_threads(); // might be different from nt_in (?) + int rank = omp_get_thread_num(); + std::vector local_lims(nbucket + 1); + + // range of indices handled by this thread + size_t i0 = nval * rank / nt; + size_t i1 = nval * (rank + 1) / nt; + + // build histogram in local lims + for (size_t i = i0; i < i1; i++) { + local_lims[vals[i]]++; + } +#pragma omp critical + { // accumulate histograms (not shifted indices to prepare cumsum) + for (size_t i = 0; i < nbucket; i++) { + lims[i + 1] += local_lims[i]; + } + all_to_write.push_back(ToWrite(nbucket)); + } + +#pragma omp barrier + // this thread's things to write + ToWrite& to_write = all_to_write[rank]; + +#pragma omp master + { + // compute cumulative sum + for (size_t i = 0; i < nbucket; i++) { + lims[i + 1] += lims[i]; + } + FAISS_THROW_IF_NOT(lims[nbucket] == nval); + // at this point lims is final (read only!) + + memcpy(ptrs.data(), lims, sizeof(lims[0]) * nbucket); + + // initial values to write (we write -1s to get the process running) + // make sure at least one element per bucket + size_t written = 0; + for (TI b = 0; b < nbucket; b++) { + size_t l0 = lims[b], l1 = lims[b + 1]; + size_t target_to_write = l1 * init_to_write / nval; + do { + if (l0 == l1) { + break; + } + to_write.add(-1, b); + l0++; + written++; + } while (written < target_to_write); + } + + to_write.bucket_sort(); + } + + // this thread writes only buckets b0:b1 + size_t b0 = (rank * nbucket + nt - 1) / nt; + size_t b1 = ((rank + 1) * nbucket + nt - 1) / nt; + + // in this loop, we write elements collected in the previous round + // and collect the elements that are overwritten for the next round + size_t tot_written = 0; + int round = 0; + for (;;) { +#pragma omp barrier + + size_t n_to_write = 0; + for (const ToWrite& to_write_2 : all_to_write) { + n_to_write += to_write_2.lims.back(); + } + + tot_written += n_to_write; + // assert(tot_written <= nval); + +#pragma omp master + { + if (verbose >= 1) { + printf("ROUND %d n_to_write=%zd\n", round, n_to_write); + } + if (verbose > 2) { + for (size_t b = 0; b < nbucket; b++) { + printf(" b=%zd [", b); + for (size_t i = lims[b]; i < lims[b + 1]; i++) { + printf(" %s%d", + ptrs[b] == i ? 
">" : "", + int(vals[i])); + } + printf(" %s] %s\n", + ptrs[b] == lims[b + 1] ? ">" : "", + did_wrap[b] ? "w" : ""); + } + printf("To write\n"); + for (size_t b = 0; b < nbucket; b++) { + printf(" b=%zd ", b); + const char* sep = "["; + for (const ToWrite& to_write_2 : all_to_write) { + printf("%s", sep); + sep = " |"; + size_t l0 = to_write_2.lims[b]; + size_t l1 = to_write_2.lims[b + 1]; + for (size_t i = l0; i < l1; i++) { + printf(" %d", int(to_write_2.rows[i])); + } + } + printf(" ]\n"); + } + } + } + if (n_to_write == 0) { + break; + } + round++; + +#pragma omp barrier + + ToWrite next_to_write(nbucket); + + for (size_t b = b0; b < b1; b++) { + for (const ToWrite& to_write_2 : all_to_write) { + size_t l0 = to_write_2.lims[b]; + size_t l1 = to_write_2.lims[b + 1]; + for (size_t i = l0; i < l1; i++) { + TI row = to_write_2.rows[i]; + size_t idx = ptrs[b]; + if (verbose > 2) { + printf(" bucket %d (rank %d) idx %zd\n", + int(row), + rank, + idx); + } + if (idx < lims[b + 1]) { + ptrs[b]++; + } else { + // wrapping around + assert(!did_wrap[b]); + did_wrap[b] = true; + idx = lims[b]; + ptrs[b] = idx + 1; + } + + // check if we need to remember the overwritten number + if (vals[idx] >= 0) { + TI new_row = idx / ncol; + next_to_write.add(new_row, vals[idx]); + if (verbose > 2) { + printf(" new_row=%d\n", int(new_row)); + } + } else { + assert(did_wrap[b]); + } + + vals[idx] = row; + } + } + } + next_to_write.bucket_sort(); +#pragma omp barrier + all_to_write[rank].swap(next_to_write); + } + } +} + +} // anonymous namespace + +void bucket_sort( + size_t nval, + const uint64_t* vals, + uint64_t vmax, + int64_t* lims, + int64_t* perm, + int nt) { + if (nt == 0) { + bucket_sort_ref(nval, vals, vmax, lims, perm); + } else { + bucket_sort_parallel(nval, vals, vmax, lims, perm, nt); + } +} + +void matrix_bucket_sort_inplace( + size_t nrow, + size_t ncol, + int32_t* vals, + int32_t vmax, + int64_t* lims, + int nt) { + if (nt == 0) { + bucket_sort_inplace_ref(nrow, ncol, vals, vmax, lims); + } else { + bucket_sort_inplace_parallel(nrow, ncol, vals, vmax, lims, nt); + } +} + +void matrix_bucket_sort_inplace( + size_t nrow, + size_t ncol, + int64_t* vals, + int64_t vmax, + int64_t* lims, + int nt) { + if (nt == 0) { + bucket_sort_inplace_ref(nrow, ncol, vals, vmax, lims); + } else { + bucket_sort_inplace_parallel(nrow, ncol, vals, vmax, lims, nt); + } +} + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/utils/sorting.h faiss-1.7.4/faiss/utils/sorting.h --- faiss-1.7.3/faiss/utils/sorting.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/sorting.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,71 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace faiss { + +/** Indirect sort of a floating-point array + * + * @param n size of the array + * @param vals array to sort, size n + * @param perm output: permutation of [0..n-1], st. 
+ * vals[perm[i + 1]] >= vals[perm[i]] + */ +void fvec_argsort(size_t n, const float* vals, size_t* perm); + +/** Same as fvec_argsort, parallelized */ +void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm); + +/// increase verbosity of the bucket_sort functions +FAISS_API extern int bucket_sort_verbose; + +/** Bucket sort of a list of values + * + * @param vals values to sort, size nval, max value nbucket - 1 + * @param lims output limits of buckets, size nbucket + 1 + * @param perm output buckets, the elements of bucket + * i are in perm[lims[i]:lims[i + 1]] + * @param nt number of threads (0 = pure sequential code) + */ +void bucket_sort( + size_t nval, + const uint64_t* vals, + uint64_t nbucket, + int64_t* lims, + int64_t* perm, + int nt = 0); + +/** in-place bucket sort (with attention to memory=>int32) + * on input the values are in a nrow * col matrix + * we want to store the row numbers in the output. + * + * @param vals positive values to sort, size nrow * ncol, + * max value nbucket - 1 + * @param lims output limits of buckets, size nbucket + 1 + * @param nt number of threads (0 = pure sequential code) + */ +void matrix_bucket_sort_inplace( + size_t nrow, + size_t ncol, + int32_t* vals, + int32_t nbucket, + int64_t* lims, + int nt = 0); + +/// same with int64 elements +void matrix_bucket_sort_inplace( + size_t nrow, + size_t ncol, + int64_t* vals, + int64_t nbucket, + int64_t* lims, + int nt = 0); + +} // namespace faiss diff -Nru faiss-1.7.3/faiss/utils/transpose/transpose-avx2-inl.h faiss-1.7.4/faiss/utils/transpose/transpose-avx2-inl.h --- faiss-1.7.3/faiss/utils/transpose/transpose-avx2-inl.h 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/faiss/utils/transpose/transpose-avx2-inl.h 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,165 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// This file contains transposing kernels for AVX2 for +// tiny float/int32 matrices, such as 8x2. 
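// A minimal usage sketch for the 8x2 kernel defined below (standalone, not
// part of the patch): transpose an 8x2 row-major float matrix into its two
// columns. It assumes an AVX2 build and that this header is reachable under
// the include path introduced by the patch.
#include <immintrin.h>

#include <faiss/utils/transpose/transpose-avx2-inl.h>

inline void transpose_8x2_example(const float* mat8x2, float* cols2x8) {
    // mat8x2[2 * r + c] holds element (r, c) for r = 0..7, c = 0..1
    const __m256 i0 = _mm256_loadu_ps(mat8x2);     // 00 01 10 11 20 21 30 31
    const __m256 i1 = _mm256_loadu_ps(mat8x2 + 8); // 40 41 50 51 60 61 70 71

    __m256 o0;
    __m256 o1;
    faiss::transpose_8x2(i0, i1, o0, o1);

    _mm256_storeu_ps(cols2x8, o0);     // column 0: 00 10 20 30 40 50 60 70
    _mm256_storeu_ps(cols2x8 + 8, o1); // column 1: 01 11 21 31 41 51 61 71
}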
+ +#ifdef __AVX2__ + +#include + +namespace faiss { + +// 8x2 -> 2x8 +inline void transpose_8x2( + const __m256 i0, + const __m256 i1, + __m256& o0, + __m256& o1) { + // say, we have the following as in input: + // i0: 00 01 10 11 20 21 30 31 + // i1: 40 41 50 51 60 61 70 71 + + // 00 01 10 11 40 41 50 51 + const __m256 r0 = _mm256_permute2f128_ps(i0, i1, _MM_SHUFFLE(0, 2, 0, 0)); + // 20 21 30 31 60 61 70 71 + const __m256 r1 = _mm256_permute2f128_ps(i0, i1, _MM_SHUFFLE(0, 3, 0, 1)); + + // 00 10 20 30 40 50 60 70 + o0 = _mm256_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 0, 2, 0)); + // 01 11 21 31 41 51 61 71 + o1 = _mm256_shuffle_ps(r0, r1, _MM_SHUFFLE(3, 1, 3, 1)); +} + +// 8x4 -> 4x8 +inline void transpose_8x4( + const __m256 i0, + const __m256 i1, + const __m256 i2, + const __m256 i3, + __m256& o0, + __m256& o1, + __m256& o2, + __m256& o3) { + // say, we have the following as an input: + // i0: 00 01 02 03 10 11 12 13 + // i1: 20 21 22 23 30 31 32 33 + // i2: 40 41 42 43 50 51 52 53 + // i3: 60 61 62 63 70 71 72 73 + + // 00 01 02 03 40 41 42 43 + const __m256 r0 = _mm256_permute2f128_ps(i0, i2, _MM_SHUFFLE(0, 2, 0, 0)); + // 20 21 22 23 60 61 62 63 + const __m256 r1 = _mm256_permute2f128_ps(i1, i3, _MM_SHUFFLE(0, 2, 0, 0)); + // 10 11 12 13 50 51 52 53 + const __m256 r2 = _mm256_permute2f128_ps(i0, i2, _MM_SHUFFLE(0, 3, 0, 1)); + // 30 31 32 33 70 71 72 73 + const __m256 r3 = _mm256_permute2f128_ps(i1, i3, _MM_SHUFFLE(0, 3, 0, 1)); + + // 00 02 10 12 40 42 50 52 + const __m256 t0 = _mm256_shuffle_ps(r0, r2, _MM_SHUFFLE(2, 0, 2, 0)); + // 01 03 11 13 41 43 51 53 + const __m256 t1 = _mm256_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 1, 3, 1)); + // 20 22 30 32 60 62 70 72 + const __m256 t2 = _mm256_shuffle_ps(r1, r3, _MM_SHUFFLE(2, 0, 2, 0)); + // 21 23 31 33 61 63 71 73 + const __m256 t3 = _mm256_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 1, 3, 1)); + + // 00 10 20 30 40 50 60 70 + o0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(2, 0, 2, 0)); + // 01 11 21 31 41 51 61 71 + o1 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(2, 0, 2, 0)); + // 02 12 22 32 42 52 62 72 + o2 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 1, 3, 1)); + // 03 13 23 33 43 53 63 73 + o3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 1, 3, 1)); +} + +inline void transpose_8x8( + const __m256 i0, + const __m256 i1, + const __m256 i2, + const __m256 i3, + const __m256 i4, + const __m256 i5, + const __m256 i6, + const __m256 i7, + __m256& o0, + __m256& o1, + __m256& o2, + __m256& o3, + __m256& o4, + __m256& o5, + __m256& o6, + __m256& o7) { + // say, we have the following as an input: + // i0: 00 01 02 03 04 05 06 07 + // i1: 10 11 12 13 14 15 16 17 + // i2: 20 21 22 23 24 25 26 27 + // i3: 30 31 32 33 34 35 36 37 + // i4: 40 41 42 43 44 45 46 47 + // i5: 50 51 52 53 54 55 56 57 + // i6: 60 61 62 63 64 65 66 67 + // i7: 70 71 72 73 74 75 76 77 + + // 00 10 01 11 04 14 05 15 + const __m256 r0 = _mm256_unpacklo_ps(i0, i1); + // 02 12 03 13 06 16 07 17 + const __m256 r1 = _mm256_unpackhi_ps(i0, i1); + // 20 30 21 31 24 34 25 35 + const __m256 r2 = _mm256_unpacklo_ps(i2, i3); + // 22 32 23 33 26 36 27 37 + const __m256 r3 = _mm256_unpackhi_ps(i2, i3); + // 40 50 41 51 44 54 45 55 + const __m256 r4 = _mm256_unpacklo_ps(i4, i5); + // 42 52 43 53 46 56 47 57 + const __m256 r5 = _mm256_unpackhi_ps(i4, i5); + // 60 70 61 71 64 74 65 75 + const __m256 r6 = _mm256_unpacklo_ps(i6, i7); + // 62 72 63 73 66 76 67 77 + const __m256 r7 = _mm256_unpackhi_ps(i6, i7); + + // 00 10 20 30 04 14 24 34 + const __m256 rr0 = _mm256_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); + // 01 11 
21 31 05 15 25 35 + const __m256 rr1 = _mm256_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); + // 02 12 22 32 06 16 26 36 + const __m256 rr2 = _mm256_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); + // 03 13 23 33 07 17 27 37 + const __m256 rr3 = _mm256_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); + // 40 50 60 70 44 54 64 74 + const __m256 rr4 = _mm256_shuffle_ps(r4, r6, _MM_SHUFFLE(1, 0, 1, 0)); + // 41 51 61 71 45 55 65 75 + const __m256 rr5 = _mm256_shuffle_ps(r4, r6, _MM_SHUFFLE(3, 2, 3, 2)); + // 42 52 62 72 46 56 66 76 + const __m256 rr6 = _mm256_shuffle_ps(r5, r7, _MM_SHUFFLE(1, 0, 1, 0)); + // 43 53 63 73 47 57 67 77 + const __m256 rr7 = _mm256_shuffle_ps(r5, r7, _MM_SHUFFLE(3, 2, 3, 2)); + + // 00 10 20 30 40 50 60 70 + o0 = _mm256_permute2f128_ps(rr0, rr4, 0x20); + // 01 11 21 31 41 51 61 71 + o1 = _mm256_permute2f128_ps(rr1, rr5, 0x20); + // 02 12 22 32 42 52 62 72 + o2 = _mm256_permute2f128_ps(rr2, rr6, 0x20); + // 03 13 23 33 43 53 63 73 + o3 = _mm256_permute2f128_ps(rr3, rr7, 0x20); + // 04 14 24 34 44 54 64 74 + o4 = _mm256_permute2f128_ps(rr0, rr4, 0x31); + // 05 15 25 35 45 55 65 75 + o5 = _mm256_permute2f128_ps(rr1, rr5, 0x31); + // 06 16 26 36 46 56 66 76 + o6 = _mm256_permute2f128_ps(rr2, rr6, 0x31); + // 07 17 27 37 47 57 67 77 + o7 = _mm256_permute2f128_ps(rr3, rr7, 0x31); +} + +} // namespace faiss + +#endif diff -Nru faiss-1.7.3/faiss/utils/utils.cpp faiss-1.7.4/faiss/utils/utils.cpp --- faiss-1.7.3/faiss/utils/utils.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/utils.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -423,185 +423,13 @@ } } -size_t ivec_checksum(size_t n, const int* a) { +size_t ivec_checksum(size_t n, const int32_t* asigned) { + const uint32_t* a = reinterpret_cast(asigned); size_t cs = 112909; - while (n--) + while (n--) { cs = cs * 65713 + a[n] * 1686049; - return cs; -} - -namespace { -struct ArgsortComparator { - const float* vals; - bool operator()(const size_t a, const size_t b) const { - return vals[a] < vals[b]; - } -}; - -struct SegmentS { - size_t i0; // begin pointer in the permutation array - size_t i1; // end - size_t len() const { - return i1 - i0; - } -}; - -// see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge -// extended to > 1 merge thread - -// merges 2 ranges that should be consecutive on the source into -// the union of the two on the destination -template -void parallel_merge( - const T* src, - T* dst, - SegmentS& s1, - SegmentS& s2, - int nt, - const ArgsortComparator& comp) { - if (s2.len() > s1.len()) { // make sure that s1 larger than s2 - std::swap(s1, s2); } - - // compute sub-ranges for each thread - std::vector s1s(nt), s2s(nt), sws(nt); - s2s[0].i0 = s2.i0; - s2s[nt - 1].i1 = s2.i1; - - // not sure parallel actually helps here -#pragma omp parallel for num_threads(nt) - for (int t = 0; t < nt; t++) { - s1s[t].i0 = s1.i0 + s1.len() * t / nt; - s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt; - - if (t + 1 < nt) { - T pivot = src[s1s[t].i1]; - size_t i0 = s2.i0, i1 = s2.i1; - while (i0 + 1 < i1) { - size_t imed = (i1 + i0) / 2; - if (comp(pivot, src[imed])) { - i1 = imed; - } else { - i0 = imed; - } - } - s2s[t].i1 = s2s[t + 1].i0 = i1; - } - } - s1.i0 = std::min(s1.i0, s2.i0); - s1.i1 = std::max(s1.i1, s2.i1); - s2 = s1; - sws[0].i0 = s1.i0; - for (int t = 0; t < nt; t++) { - sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len(); - if (t + 1 < nt) { - sws[t + 1].i0 = sws[t].i1; - } - } - assert(sws[nt - 1].i1 == s1.i1); - - // do the actual merging -#pragma omp parallel for num_threads(nt) - for 
(int t = 0; t < nt; t++) { - SegmentS sw = sws[t]; - SegmentS s1t = s1s[t]; - SegmentS s2t = s2s[t]; - if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) { - for (;;) { - // assert (sw.len() == s1t.len() + s2t.len()); - if (comp(src[s1t.i0], src[s2t.i0])) { - dst[sw.i0++] = src[s1t.i0++]; - if (s1t.i0 == s1t.i1) - break; - } else { - dst[sw.i0++] = src[s2t.i0++]; - if (s2t.i0 == s2t.i1) - break; - } - } - } - if (s1t.len() > 0) { - assert(s1t.len() == sw.len()); - memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0])); - } else if (s2t.len() > 0) { - assert(s2t.len() == sw.len()); - memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0])); - } - } -} - -}; // namespace - -void fvec_argsort(size_t n, const float* vals, size_t* perm) { - for (size_t i = 0; i < n; i++) - perm[i] = i; - ArgsortComparator comp = {vals}; - std::sort(perm, perm + n, comp); -} - -void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) { - size_t* perm2 = new size_t[n]; - // 2 result tables, during merging, flip between them - size_t *permB = perm2, *permA = perm; - - int nt = omp_get_max_threads(); - { // prepare correct permutation so that the result ends in perm - // at final iteration - int nseg = nt; - while (nseg > 1) { - nseg = (nseg + 1) / 2; - std::swap(permA, permB); - } - } - -#pragma omp parallel - for (size_t i = 0; i < n; i++) - permA[i] = i; - - ArgsortComparator comp = {vals}; - - std::vector segs(nt); - - // independent sorts -#pragma omp parallel for - for (int t = 0; t < nt; t++) { - size_t i0 = t * n / nt; - size_t i1 = (t + 1) * n / nt; - SegmentS seg = {i0, i1}; - std::sort(permA + seg.i0, permA + seg.i1, comp); - segs[t] = seg; - } - int prev_nested = omp_get_nested(); - omp_set_nested(1); - - int nseg = nt; - while (nseg > 1) { - int nseg1 = (nseg + 1) / 2; - int sub_nt = nseg % 2 == 0 ? nt : nt - 1; - int sub_nseg1 = nseg / 2; - -#pragma omp parallel for num_threads(nseg1) - for (int s = 0; s < nseg; s += 2) { - if (s + 1 == nseg) { // otherwise isolated segment - memcpy(permB + segs[s].i0, - permA + segs[s].i0, - segs[s].len() * sizeof(size_t)); - } else { - int t0 = s * sub_nt / sub_nseg1; - int t1 = (s + 1) * sub_nt / sub_nseg1; - printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0); - parallel_merge( - permA, permB, segs[s], segs[s + 1], t1 - t0, comp); - } - } - for (int s = 0; s < nseg; s += 2) - segs[s / 2] = segs[s]; - nseg = nseg1; - std::swap(permA, permB); - } - assert(permA == perm); - omp_set_nested(prev_nested); - delete[] perm2; + return cs; } const float* fvecs_maybe_subsample( diff -Nru faiss-1.7.3/faiss/utils/utils.h faiss-1.7.4/faiss/utils/utils.h --- faiss-1.7.3/faiss/utils/utils.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/utils/utils.h 2023-04-19 13:18:30.000000000 +0000 @@ -19,10 +19,7 @@ #include #include -#ifdef _MSC_VER -#define strtok_r strtok_s -#endif // _MSC_VER - +#include #include namespace faiss { @@ -113,10 +110,6 @@ /// same, takes a histogram as input double imbalance_factor(int k, const int* hist); -void fvec_argsort(size_t n, const float* vals, size_t* perm); - -void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm); - /// compute histogram on v int ivec_hist(size_t n, const int* v, int vmax, int* hist); @@ -128,7 +121,7 @@ void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist); /// compute a checksum on a table. 
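// For readability, the updated ivec_checksum from the utils.cpp hunk above,
// restated as a standalone sketch with the cast target spelled out (the
// reinterpret_cast appears above without its template argument, which is
// reconstructed here). The arithmetic is performed on the unsigned
// reinterpretation of the int32 table, so the per-element multiplication
// wraps in unsigned arithmetic instead of overflowing a signed int.
#include <cstddef>
#include <cstdint>

inline size_t ivec_checksum_sketch(size_t n, const int32_t* asigned) {
    const uint32_t* a = reinterpret_cast<const uint32_t*>(asigned);
    size_t cs = 112909;
    while (n--) {
        cs = cs * 65713 + a[n] * 1686049;
    }
    return cs;
}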
-size_t ivec_checksum(size_t n, const int* a); +size_t ivec_checksum(size_t n, const int32_t* a); /** random subsamples a set of vectors if there are too many of them * diff -Nru faiss-1.7.3/faiss/VectorTransform.cpp faiss-1.7.4/faiss/VectorTransform.cpp --- faiss-1.7.3/faiss/VectorTransform.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/VectorTransform.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -135,7 +135,7 @@ * VectorTransform *********************************************/ -float* VectorTransform::apply(Index::idx_t n, const float* x) const { +float* VectorTransform::apply(idx_t n, const float* x) const { float* xt = new float[n * d_out]; apply_noalloc(n, x, xt); return xt; @@ -166,8 +166,7 @@ is_trained = false; // will be trained when A and b are initialized } -void LinearTransform::apply_noalloc(Index::idx_t n, const float* x, float* xt) - const { +void LinearTransform::apply_noalloc(idx_t n, const float* x, float* xt) const { FAISS_THROW_IF_NOT_MSG(is_trained, "Transformation not trained yet"); float c_factor; @@ -348,7 +347,7 @@ is_trained = true; } -void RandomRotationMatrix::train(Index::idx_t /*n*/, const float* /*x*/) { +void RandomRotationMatrix::train(idx_t /*n*/, const float* /*x*/) { // initialize with some arbitrary seed init(12345); } @@ -442,7 +441,7 @@ } // namespace -void PCAMatrix::train(Index::idx_t n, const float* x) { +void PCAMatrix::train(idx_t n, const float* x) { const float* x_in = x; x = fvecs_maybe_subsample( @@ -733,7 +732,7 @@ : LinearTransform(d, d, false), max_iter(50), seed(123) {} /** translated from fbcode/deeplearning/catalyzer/catalyzer/quantizers.py */ -void ITQMatrix::train(Index::idx_t n, const float* xf) { +void ITQMatrix::train(idx_t n, const float* xf) { size_t d = d_in; std::vector rotation(d * d); @@ -957,8 +956,7 @@ is_trained = true; } -void ITQTransform::apply_noalloc(Index::idx_t n, const float* x, float* xt) - const { +void ITQTransform::apply_noalloc(idx_t n, const float* x, float* xt) const { FAISS_THROW_IF_NOT_MSG(is_trained, "Transformation not trained yet"); std::unique_ptr x_norm(new float[n * d_in]); @@ -1003,7 +1001,7 @@ pq = nullptr; } -void OPQMatrix::train(Index::idx_t n, const float* x) { +void OPQMatrix::train(idx_t n, const float* x) { const float* x_in = x; x = fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x, verbose); @@ -1261,7 +1259,7 @@ is_trained = false; } -void CenteringTransform::train(Index::idx_t n, const float* x) { +void CenteringTransform::train(idx_t n, const float* x) { FAISS_THROW_IF_NOT_MSG(n > 0, "need at least one training vector"); mean.resize(d_in, 0); for (idx_t i = 0; i < n; i++) { diff -Nru faiss-1.7.3/faiss/VectorTransform.h faiss-1.7.4/faiss/VectorTransform.h --- faiss-1.7.3/faiss/VectorTransform.h 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/faiss/VectorTransform.h 2023-04-19 13:18:30.000000000 +0000 @@ -23,8 +23,6 @@ /** Any transformation applied on a set of vectors */ struct VectorTransform { - typedef Index::idx_t idx_t; - int d_in; ///! input dimension int d_out; ///! 
output dimension @@ -122,7 +120,7 @@ /// must be called before the transform is used void init(int seed); - // intializes with an arbitrary seed + // initializes with an arbitrary seed void train(idx_t n, const float* x) override; RandomRotationMatrix() {} diff -Nru faiss-1.7.3/INSTALL.md faiss-1.7.4/INSTALL.md --- faiss-1.7.3/INSTALL.md 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/INSTALL.md 2023-04-19 13:18:30.000000000 +0000 @@ -31,6 +31,17 @@ $ conda install -c pytorch/label/nightly faiss-gpu ``` +A combination of versions that works with Pytorch (as of 2022-11-23): +``` +conda create -n faiss_1.7.3 python=3.8 +conda activate faiss_1.7.3 +conda install pytorch==1.11.0 cudatoolkit=11.3 -c pytorch +conda install numpy +conda install -c pytorch faiss-gpu=1.7.3 cudatoolkit=11.3 +conda install -c conda-forge notebook +conda install -y matplotlib +``` + ## Installing from conda-forge Faiss is also being packaged by [conda-forge](https://conda-forge.org/), the @@ -97,6 +108,8 @@ - `-DBUILD_TESTING=OFF` in order to disable building C++ tests, - `-DBUILD_SHARED_LIBS=ON` in order to build a shared library (possible values are `ON` and `OFF`), + - `-DFAISS_ENABLE_C_API=ON` in order to enable building [C API](c_api/INSTALL.md) (possible values + are `ON` and `OFF`), - optimization-related options: - `-DCMAKE_BUILD_TYPE=Release` in order to enable generic compiler optimization options (enables `-O3` on gcc for instance), diff -Nru faiss-1.7.3/README.md faiss-1.7.4/README.md --- faiss-1.7.3/README.md 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/README.md 2023-04-19 13:18:30.000000000 +0000 @@ -1,6 +1,6 @@ # Faiss -Faiss is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. Faiss is written in C++ with complete wrappers for Python/numpy. Some of the most useful algorithms are implemented on the GPU. It is developed primarily at [Facebook AI Research](https://ai.facebook.com/). +Faiss is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. Faiss is written in C++ with complete wrappers for Python/numpy. Some of the most useful algorithms are implemented on the GPU. It is developed primarily at Meta's [Fundamental AI Research](https://ai.facebook.com/) group. ## News @@ -48,6 +48,7 @@ - [Jeff Johnson](https://github.com/wickedfoo) implemented all of the GPU Faiss - [Lucas Hosseini](https://github.com/beauby) implemented the binary indexes and the build system - [Chengqi Deng](https://github.com/KinglittleQ) implemented NSG, NNdescent and much of the additive quantization code. +- [Alexandr Guzhva](https://github.com/alexanderguzhva) many optimizations: SIMD, memory allocation and layout, fast decoding kernels for vector codecs, etc. ## Reference @@ -73,6 +74,8 @@ We monitor the [issues page](http://github.com/facebookresearch/faiss/issues) of the repository. You can report bugs, ask questions, etc. -## License +## Legal -Faiss is MIT-licensed. +Faiss is MIT-licensed, refer to the [LICENSE file](https://github.com/facebookresearch/faiss/blob/main/LICENSE) in the top level directory. + +Copyright © Meta Platforms, Inc. 
See the [Terms of Use](https://opensource.fb.com/legal/terms/) and [Privacy Policy](https://opensource.fb.com/legal/privacy/) for this project. diff -Nru faiss-1.7.3/tests/CMakeLists.txt faiss-1.7.4/tests/CMakeLists.txt --- faiss-1.7.3/tests/CMakeLists.txt 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/CMakeLists.txt 2023-04-19 13:18:30.000000000 +0000 @@ -22,6 +22,9 @@ test_mem_leak.cpp test_cppcontrib_sa_decode.cpp test_cppcontrib_uintreader.cpp + test_simdlib.cpp + test_approx_topk.cpp + test_RCQ_cropping.cpp ) add_executable(faiss_test ${FAISS_TEST_SRC}) diff -Nru faiss-1.7.3/tests/test_approx_topk.cpp faiss-1.7.4/tests/test_approx_topk.cpp --- faiss-1.7.3/tests/test_approx_topk.cpp 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/tests/test_approx_topk.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,225 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +// +using namespace faiss; + +// +template +void test_approx_topk( + const uint32_t beamSize, + const uint32_t nPerBeam, + const uint32_t k, + const uint32_t nDatasetsToTest, + const bool verbose) { + if (verbose) { + printf("-----------\n"); + } + + // generate random data + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + // matches + size_t nMatches = 0; + // the element was completely missed in approx version. + size_t nMissed = 0; + // the element is available + size_t nAvailable = 0; + // the distance is the same, but the index is different. + size_t nSoftMismatches = 0; + // the distances are different + size_t nHardMismatches = 0; + // error of distances + double sqrError = 0.0; + + // + double timeBaseline = 0.0; + double timeApprox = 0.0; + + for (size_t iDataset = 0; iDataset < nDatasetsToTest; iDataset++) { + const size_t n = (size_t)(nPerBeam)*beamSize; + std::vector distances(n, 0); + for (size_t i = 0; i < n; i++) { + distances[i] = u(rng); + } + + // + using C = CMax; + + // do a regular beam search + std::vector baselineDistances(k, C::neutral()); + std::vector baselineIndices(k, -1); + + auto startBaseline = std::chrono::high_resolution_clock::now(); + heap_addn( + k, + baselineDistances.data(), + baselineIndices.data(), + distances.data(), + nullptr, + nPerBeam * beamSize); + auto endBaseline = std::chrono::high_resolution_clock::now(); + std::chrono::duration diffBaseline = + endBaseline - startBaseline; + timeBaseline += diffBaseline.count(); + + heap_reorder(k, baselineDistances.data(), baselineIndices.data()); + + // do an approximate beam search + std::vector approxDistances(k, C::neutral()); + std::vector approxIndices(k, -1); + + auto startApprox = std::chrono::high_resolution_clock::now(); + try { + HeapWithBuckets::bs_addn( + beamSize, + nPerBeam, + distances.data(), + k, + approxDistances.data(), + approxIndices.data()); + } catch (const faiss::FaissException& ex) { + // + if (verbose) { + printf("Skipping the case.\n"); + } + return; + } + + auto endApprox = std::chrono::high_resolution_clock::now(); + std::chrono::duration diffApprox = endApprox - startApprox; + timeApprox += diffApprox.count(); + + heap_reorder(k, approxDistances.data(), approxIndices.data()); + + bool bGotMismatches = false; + + // the error + for (uint32_t i = 0; i < k; i++) { + if (baselineDistances[i] != 
approxDistances[i]) { + nHardMismatches += 1; + + double diff = baselineDistances[i] - approxDistances[i]; + sqrError += diff * diff; + + bGotMismatches = true; + + if (verbose) { + printf("i=%d, bs.d=%f, bs.i=%d, app.d=%f, app.i=%d\n", + i, + baselineDistances[i], + baselineIndices[i], + approxDistances[i], + approxIndices[i]); + } + } else { + if (baselineIndices[i] != approxIndices[i]) { + nSoftMismatches += 1; + } else { + nMatches += 1; + } + } + } + + if (bGotMismatches) { + if (verbose) { + printf("\n"); + } + } + + // + std::unordered_set bsIndicesHS( + baselineIndices.cbegin(), baselineIndices.cend()); + for (uint32_t i = 0; i < k; i++) { + auto itr = bsIndicesHS.find(approxIndices[i]); + if (itr != bsIndicesHS.cend()) { + nAvailable += 1; + } else { + nMissed += 1; + } + } + } + + if (verbose) { + printf("%d, %d, %d, %d, %d, %d: %ld, %ld, %ld, %f, %ld, %ld, %f, %f\n", + NBUCKETS, + N, + beamSize, + nPerBeam, + k, + nDatasetsToTest, + nMatches, + nSoftMismatches, + nHardMismatches, + sqrError, + nAvailable, + nMissed, + timeBaseline, + timeApprox); + } + + // just confirm that the error is not crazy + if (NBUCKETS * N * beamSize >= k) { + EXPECT_TRUE(nAvailable > nMissed); + } else { + // it is possible that the results are crazy here. Skip it. + } +} + +// +TEST(TEST_APPROX_TOPK, COMMON) { + constexpr bool verbose = false; + + // + const uint32_t nDifferentDatasets = 8; + + uint32_t kValues[] = {1, 2, 3, 5, 8, 13, 21, 34}; + + for (size_t codebookBitSize = 8; codebookBitSize <= 10; codebookBitSize++) { + const uint32_t codebookSize = 1 << codebookBitSize; + for (const auto k : kValues) { + test_approx_topk<1 * 8, 3>( + 1, codebookSize, k, nDifferentDatasets, verbose); + test_approx_topk<1 * 8, 3>( + k, codebookSize, k, nDifferentDatasets, verbose); + + test_approx_topk<1 * 8, 2>( + 1, codebookSize, k, nDifferentDatasets, verbose); + test_approx_topk<1 * 8, 2>( + k, codebookSize, k, nDifferentDatasets, verbose); + + test_approx_topk<2 * 8, 2>( + 1, codebookSize, k, nDifferentDatasets, verbose); + test_approx_topk<2 * 8, 2>( + k, codebookSize, k, nDifferentDatasets, verbose); + + test_approx_topk<4 * 8, 2>( + 1, codebookSize, k, nDifferentDatasets, verbose); + test_approx_topk<4 * 8, 2>( + k, codebookSize, k, nDifferentDatasets, verbose); + } + } +} + +// diff -Nru faiss-1.7.3/tests/test_binary_flat.cpp faiss-1.7.4/tests/test_binary_flat.cpp --- faiss-1.7.3/tests/test_binary_flat.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_binary_flat.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -42,7 +42,7 @@ } int k = 5; - std::vector nns(k * nq); + std::vector nns(k * nq); std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff -Nru faiss-1.7.3/tests/test_build_blocks.py faiss-1.7.4/tests/test_build_blocks.py --- faiss-1.7.3/tests/test_build_blocks.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_build_blocks.py 2023-04-19 13:18:30.000000000 +0000 @@ -597,3 +597,99 @@ RuntimeError, lambda : index.reconstruct_batch(subset), ) + + +class TestBucketSort(unittest.TestCase): + + def do_test_bucket_sort(self, nt): + rs = np.random.RandomState(123) + tab = rs.randint(100, size=1000, dtype='int64') + lims, perm = faiss.bucket_sort(tab, nt=nt) + for i in range(max(tab) + 1): + assert np.all(tab[perm[lims[i]: lims[i + 1]]] == i) + + def test_bucket_sort(self): + self.do_test_bucket_sort(0) + + def test_bucket_sort_parallel(self): + self.do_test_bucket_sort(4) + + def do_test_bucket_sort_inplace( + self, nt, nrow=500, ncol=20, 
nbucket=300, repro=False, + dtype='int32'): + rs = np.random.RandomState(123) + tab = rs.randint(nbucket, size=(nrow, ncol), dtype=dtype) + + tab2 = tab.copy() + faiss.cvar.bucket_sort_verbose + faiss.cvar.bucket_sort_verbose = 1 + + lims = faiss.matrix_bucket_sort_inplace(tab2, nt=nt) + tab2 = tab2.ravel() + + for b in range(nbucket): + rows, _ = np.where(tab == b) + rows.sort() + tab2[lims[b]:lims[b + 1]].sort() + # print(rows, tab2[lims[b] : lims[b + 1]]) + rows = set(rows) + self.assertEqual(rows, set(tab2[lims[b]:lims[b + 1]])) + + def test_bucket_sort_inplace(self): + self.do_test_bucket_sort_inplace(0) + + def test_bucket_sort_inplace_parallel(self): + self.do_test_bucket_sort_inplace(4) + + def test_bucket_sort_inplace_parallel_fewcol(self): + self.do_test_bucket_sort_inplace(4, ncol=3) + + def test_bucket_sort_inplace_parallel_fewbucket(self): + self.do_test_bucket_sort_inplace(4, nbucket=5) + + def test_bucket_sort_inplace_int64(self): + self.do_test_bucket_sort_inplace(0, dtype='int64') + + def test_bucket_sort_inplace_parallel_int64(self): + self.do_test_bucket_sort_inplace(4, dtype='int64') + +class TestMergeKNNResults(unittest.TestCase): + + def do_test(self, ismax, dtype): + rs = np.random.RandomState() + n, k, nshard = 10, 5, 3 + all_ids = rs.randint(100000, size=(nshard, n, k)).astype('int64') + all_dis = rs.rand(nshard, n, k) + if dtype == 'int32': + all_dis = (all_dis * 1000000).astype("int32") + else: + all_dis = all_dis.astype(dtype) + for i in range(nshard): + for j in range(n): + all_dis[i, j].sort() + if ismax: + all_dis[i, j] = all_dis[i, j][::-1] + Dref = np.zeros((n, k), dtype=dtype) + Iref = np.zeros((n, k), dtype='int64') + + for i in range(n): + dis = all_dis[:, i, :].ravel() + ids = all_ids[:, i, :].ravel() + o = dis.argsort() + if ismax: + o = o[::-1] + Dref[i] = dis[o[:k]] + Iref[i] = ids[o[:k]] + + Dnew, Inew = faiss.merge_knn_results(all_dis, all_ids, keep_max=ismax) + np.testing.assert_array_equal(Dnew, Dref) + np.testing.assert_array_equal(Inew, Iref) + + def test_min_float(self): + self.do_test(ismax=False, dtype='float32') + + def test_max_int(self): + self.do_test(ismax=True, dtype='int32') + + def test_max_float(self): + self.do_test(ismax=True, dtype='float32') diff -Nru faiss-1.7.3/tests/test_clone.py faiss-1.7.4/tests/test_clone.py --- faiss-1.7.3/tests/test_clone.py 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/tests/test_clone.py 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,88 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import faiss +import numpy as np + +from faiss.contrib import datasets + +faiss.omp_set_num_threads(4) + + +class TestClone(unittest.TestCase): + """ + Test clone_index for various index combinations. 
+ """ + + def do_test_clone(self, factory, with_ids=False): + """ + Verify that cloning works for a given index type + """ + d = 32 + ds = datasets.SyntheticDataset(d, 1000, 2000, 10) + index1 = faiss.index_factory(d, factory) + index1.train(ds.get_train()) + if with_ids: + index1.add_with_ids(ds.get_database(), + np.arange(ds.nb).astype("int64")) + else: + index1.add(ds.get_database()) + k = 5 + Dref1, Iref1 = index1.search(ds.get_queries(), k) + + index2 = faiss.clone_index(index1) + self.assertEqual(type(index1), type(index2)) + index1 = None + + Dref2, Iref2 = index2.search(ds.get_queries(), k) + np.testing.assert_array_equal(Dref1, Dref2) + np.testing.assert_array_equal(Iref1, Iref2) + + def test_RFlat(self): + self.do_test_clone("SQ4,RFlat") + + def test_Refine(self): + self.do_test_clone("SQ4,Refine(SQ8)") + + def test_IVF(self): + self.do_test_clone("IVF16,Flat") + + def test_PCA(self): + self.do_test_clone("PCA8,Flat") + + def test_IDMap(self): + self.do_test_clone("IVF16,Flat,IDMap", with_ids=True) + + def test_IDMap2(self): + self.do_test_clone("IVF16,Flat,IDMap2", with_ids=True) + + def test_NSGPQ(self): + self.do_test_clone("NSG32,Flat") + + def test_IVFAdditiveQuantizer(self): + self.do_test_clone("IVF16,LSQ5x6_Nqint8") + self.do_test_clone("IVF16,RQ5x6_Nqint8") + self.do_test_clone("IVF16,PLSQ4x3x5_Nqint8") + self.do_test_clone("IVF16,PRQ4x3x5_Nqint8") + + def test_IVFAdditiveQuantizerFastScan(self): + self.do_test_clone("IVF16,LSQ3x4fs_32_Nlsq2x4") + self.do_test_clone("IVF16,RQ3x4fs_32_Nlsq2x4") + self.do_test_clone("IVF16,PLSQ2x3x4fs_Nlsq2x4") + self.do_test_clone("IVF16,PRQ2x3x4fs_Nrq2x4") + + def test_AdditiveQuantizer(self): + self.do_test_clone("LSQ5x6_Nqint8") + self.do_test_clone("RQ5x6_Nqint8") + self.do_test_clone("PLSQ4x3x5_Nqint8") + self.do_test_clone("PRQ4x3x5_Nqint8") + + def test_AdditiveQuantizerFastScan(self): + self.do_test_clone("LSQ3x4fs_32_Nlsq2x4") + self.do_test_clone("RQ3x4fs_32_Nlsq2x4") + self.do_test_clone("PLSQ2x3x4fs_Nlsq2x4") + self.do_test_clone("PRQ2x3x4fs_Nrq2x4") diff -Nru faiss-1.7.3/tests/test_contrib.py faiss-1.7.4/tests/test_contrib.py --- faiss-1.7.3/tests/test_contrib.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_contrib.py 2023-04-19 13:18:30.000000000 +0000 @@ -7,6 +7,8 @@ import unittest import numpy as np import platform +import os +import random from faiss.contrib import datasets from faiss.contrib import inspect_tools @@ -19,13 +21,11 @@ from faiss.contrib.exhaustive_search import \ knn_ground_truth, knn, range_ground_truth, \ range_search_max_results, exponential_query_iterator - except: pass # Submodule import broken in python 2. 
- -@unittest.skipIf(platform.python_version_tuple()[0] < '3', \ +@unittest.skipIf(platform.python_version_tuple()[0] < '3', 'Submodule import broken in python 2.') class TestComputeGT(unittest.TestCase): @@ -43,7 +43,8 @@ for i0 in range(0, xb.shape[0], bs): yield xb[i0:i0 + bs] - Dnew, Inew = knn_ground_truth(xq, matrix_iterator(xb, 1000), 10, metric) + Dnew, Inew = knn_ground_truth( + xq, matrix_iterator(xb, 1000), 10, metric) np.testing.assert_array_equal(Iref, Inew) # decimal = 4 required when run on GPU @@ -104,7 +105,6 @@ assert np.all(Inew == Iref) assert np.allclose(Dref, Dnew) - index = faiss.IndexFlatIP(32) index.add(xb) Dref, Iref = index.search(xq, 10) @@ -311,14 +311,16 @@ a = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel() ivf_tools.add_preassigned(index, xb, a) - # search elements xq, increase nprobe, check 4 first results w/ groundtruth + # search elements xq, increase nprobe, check 4 first results w/ + # groundtruth prev_inter_perf = 0 for nprobe in 1, 10, 20: index.nprobe = nprobe a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1] D, I = ivf_tools.search_preassigned(index, xq, 4, a) - inter_perf = faiss.eval_intersection(I, ds.get_groundtruth()[:, :4]) + inter_perf = faiss.eval_intersection( + I, ds.get_groundtruth()[:, :4]) self.assertTrue(inter_perf >= prev_inter_perf) prev_inter_perf = inter_perf @@ -334,7 +336,8 @@ lims, DR, IR = ivf_tools.range_search_preassigned(index, xq, radius, a) - # with that radius the k-NN results are a subset of the range search results + # with that radius the k-NN results are a subset of the range search + # results for q in range(len(xq)): l0, l1 = lims[q], lims[q + 1] self.assertTrue(set(I[q]) <= set(IR[l0:l1])) @@ -347,7 +350,8 @@ xq = ds.get_queries() xb = ds.get_database() - # define alternative quantizer on the 20 first dims of vectors (will be in float) + # define alternative quantizer on the 20 first dims of vectors + # (will be in float) km = faiss.Kmeans(20, 50) km.train(xt[:, :20].copy()) alt_quantizer = km.index @@ -374,7 +378,8 @@ ib.add(xb_bin) Dgt, Igt = ib.search(xq_bin, k) - # search elements xq, increase nprobe, check 4 first results w/ groundtruth + # search elements xq, increase nprobe, check 4 first results w/ + # groundtruth prev_inter_perf = 0 for nprobe in 1, 10, 20: @@ -395,9 +400,11 @@ D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a) radius = int(D.max() + 1) - lims, DR, IR = ivf_tools.range_search_preassigned(index, xq_bin, radius, a) + lims, DR, IR = ivf_tools.range_search_preassigned( + index, xq_bin, radius, a) - # with that radius the k-NN results are a subset of the range search results + # with that radius the k-NN results are a subset of the range + # search results for q in range(len(xq)): l0, l1 = lims[q], lims[q + 1] self.assertTrue(set(I[q]) <= set(IR[l0:l1])) @@ -417,13 +424,14 @@ # baseline = search with that radius lims_ref, Dref, Iref = index.range_search(ds.get_queries(), radius0) - # now see if using just the total number of results, we can get back the same - # result table + # now see if using just the total number of results, we can get back + # the same result table query_iterator = exponential_query_iterator(ds.get_queries()) init_radius = 1e10 if metric_type == faiss.METRIC_L2 else -1e10 radius1, lims_new, Dnew, Inew = range_search_max_results( - index, query_iterator, init_radius, min_results=Dref.size, clip_to_min=True + index, query_iterator, init_radius, + min_results=Dref.size, clip_to_min=True ) evaluation.test_ref_range_results( @@ -448,7 +456,7 @@ 
km_ref.train(xt) err = faiss.knn(xt, km_ref.centroids, 1)[0].sum() - centroids2, _ = clustering.two_level_clustering(xt, 10, 10) + centroids2, _ = clustering.two_level_clustering(xt, 10, 100) err2 = faiss.knn(xt, centroids2, 1)[0].sum() self.assertLess(err2, err * 1.1) @@ -464,10 +472,85 @@ index = faiss.index_factory(ds.d, "PCA16,IVF100,SQ8") faiss.extract_index_ivf(index).nprobe = 10 - clustering.train_ivf_index_with_2level(index, ds.get_train(), verbose=True) + clustering.train_ivf_index_with_2level( + index, ds.get_train(), verbose=True, rebalance=False) index.add(ds.get_database()) Dnew, Inew = index.search(ds.get_queries(), 1) # normally 47 / 200 differences ndiff = (Iref != Inew).sum() self.assertLess(ndiff, 50) + + +class TestBigBatchSearch(unittest.TestCase): + + def do_test(self, factory_string, metric=faiss.METRIC_L2): + # ds = datasets.SyntheticDataset(32, 2000, 4000, 1000) + ds = datasets.SyntheticDataset(32, 2000, 400, 500) + k = 10 + index = faiss.index_factory(ds.d, factory_string, metric) + assert index.metric_type == metric + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 5 + Dref, Iref = index.search(ds.get_queries(), k) + # faiss.omp_set_num_threads(1) + for method in ("pairwise_distances", "knn_function", "index"): + for threaded in 0, 1, 3, 8: + Dnew, Inew = ivf_tools.big_batch_search( + index, ds.get_queries(), + k, method=method, + threaded=threaded + ) + self.assertLess((Inew != Iref).sum() / Iref.size, 1e-4) + np.testing.assert_almost_equal(Dnew, Dref, decimal=4) + + def test_Flat(self): + self.do_test("IVF64,Flat") + + def test_Flat_IP(self): + self.do_test("IVF64,Flat", metric=faiss.METRIC_INNER_PRODUCT) + + def test_PQ(self): + self.do_test("IVF64,PQ4np") + + def test_SQ(self): + self.do_test("IVF64,SQ8") + + def test_checkpoint(self): + ds = datasets.SyntheticDataset(32, 2000, 400, 500) + k = 10 + index = faiss.index_factory(ds.d, "IVF64,SQ8") + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 5 + Dref, Iref = index.search(ds.get_queries(), k) + + r = random.randrange(1<<60) + checkpoint = "/tmp/test_big_batch_checkpoint.%d" % r + try: + # First big batch search + try: + Dnew, Inew = ivf_tools.big_batch_search( + index, ds.get_queries(), + k, method="knn_function", + threaded=4, + checkpoint=checkpoint, checkpoint_freq=4, + crash_at=20 + ) + except ZeroDivisionError: + pass + else: + self.assertFalse("should have crashed") + # Second big batch search + Dnew, Inew = ivf_tools.big_batch_search( + index, ds.get_queries(), + k, method="knn_function", + threaded=4, + checkpoint=checkpoint, checkpoint_freq=4 + ) + self.assertLess((Inew != Iref).sum() / Iref.size, 1e-4) + np.testing.assert_almost_equal(Dnew, Dref, decimal=4) + finally: + if os.path.exists(checkpoint): + os.unlink(checkpoint) diff -Nru faiss-1.7.3/tests/test_contrib_with_scipy.py faiss-1.7.4/tests/test_contrib_with_scipy.py --- faiss-1.7.3/tests/test_contrib_with_scipy.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_contrib_with_scipy.py 2023-04-19 13:18:30.000000000 +0000 @@ -56,13 +56,13 @@ D, I = clustering.sparse_assign_to_dense(xsparse, centroids) np.testing.assert_array_equal(Iref.ravel(), I) - np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=4) + np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=3) D, I = clustering.sparse_assign_to_dense_blocks( xsparse, centroids, qbs=123, bbs=33, nt=4) np.testing.assert_array_equal(Iref.ravel(), I) - np.testing.assert_array_almost_equal(Dref.ravel(), D, 
decimal=4) + np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=3) def test_sparse_kmeans(self): """ demo on how to cluster sparse data into dense clusters """ diff -Nru faiss-1.7.3/tests/test_cppcontrib_sa_decode.cpp faiss-1.7.4/tests/test_cppcontrib_sa_decode.cpp --- faiss-1.7.3/tests/test_cppcontrib_sa_decode.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_cppcontrib_sa_decode.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -1,4 +1,9 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ #include @@ -1026,7 +1031,7 @@ verifyMinMaxIndexPQDecoder(n, d, index, encodedData); } -constexpr size_t NSAMPLES = 4096; +constexpr size_t NSAMPLES = 256; // TEST(TEST_CPPCONTRIB_SA_DECODE, D256_IVF256_PQ16) { @@ -1166,7 +1171,7 @@ // It is acceptable to use COARSE_BITS=16 in this case, // because there's only one coarse quantizer element. using T = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 16>; - testIndex2LevelDecoder(NSAMPLES, 256, "IVF1024,PQ16np"); + testIndex2LevelDecoder(NSAMPLES * 4, 256, "IVF1024,PQ16np"); } TEST(TEST_CPPCONTRIB_SA_DECODE, D64_Residual1x9_PQ8) { @@ -1174,7 +1179,7 @@ // because there's only one coarse quantizer element. // It won't work for "Residual2x9,PQ8". using T = faiss::cppcontrib::Index2LevelDecoder<64, 64, 8, 16>; - testIndex2LevelDecoder(NSAMPLES, 64, "Residual1x9,PQ8"); + testIndex2LevelDecoder(NSAMPLES * 2, 64, "Residual1x9,PQ8"); } // @@ -1226,17 +1231,58 @@ #if defined(__AVX2__) || defined(__ARM_NEON) TEST(TEST_CPPCONTRIB_SA_DECODE, D256_PQ16x10) { using T = faiss::cppcontrib::IndexPQDecoder<256, 16, 10>; - testIndexPQDecoder(NSAMPLES, 256, "PQ16x10np"); + testIndexPQDecoder(NSAMPLES * 4, 256, "PQ16x10np"); +} + +TEST(TEST_CPPCONTRIB_SA_DECODE, D256_PQ16x12) { + using T = faiss::cppcontrib::IndexPQDecoder<256, 16, 12>; + testIndexPQDecoder(NSAMPLES * 16, 256, "PQ16x12np"); } TEST(TEST_CPPCONTRIB_SA_DECODE, D160_PQ20x10) { using T = faiss::cppcontrib::IndexPQDecoder<160, 8, 10>; - testIndexPQDecoder(NSAMPLES, 160, "PQ20x10np"); + testIndexPQDecoder(NSAMPLES * 4, 160, "PQ20x10np"); +} + +TEST(TEST_CPPCONTRIB_SA_DECODE, D160_PQ20x12) { + using T = faiss::cppcontrib::IndexPQDecoder<160, 8, 12>; + testIndexPQDecoder(NSAMPLES * 16, 160, "PQ20x12np"); +} + +TEST(TEST_CPPCONTRIB_SA_DECODE, D256_IVF256_PQ16x10) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 8, 10>; + testIndex2LevelDecoder(NSAMPLES * 4, 256, "IVF256,PQ16x10np"); +} + +TEST(TEST_CPPCONTRIB_SA_DECODE, D256_IVF256_PQ16x12) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 8, 12>; + testIndex2LevelDecoder(NSAMPLES * 16, 256, "IVF256,PQ16x12np"); +} + +TEST(TEST_CPPCONTRIB_SA_DECODE, D256_MINMAXFP16_IVF256_PQ16x10) { + using SubT = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 8, 10>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + NSAMPLES * 4, 256, "MinMaxFP16,IVF256,PQ16x10np"); +} + +TEST(TEST_CPPCONTRIB_SA_DECODE, D256_MINMAXFP16_IVF1024_PQ16x10) { + using SubT = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 10, 10>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + NSAMPLES * 4, 256, "MinMaxFP16,IVF1024,PQ16x10np"); +} + +TEST(TEST_CPPCONTRIB_SA_DECODE, D256_MINMAXFP16_IVF1024_PQ16x10_ALTERNATIVE) { + using SubT = 
faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 16, 10>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + NSAMPLES * 4, 256, "MinMaxFP16,IVF1024,PQ16x10np"); } TEST(TEST_CPPCONTRIB_SA_DECODE, D160_Residual4x8_PQ8x10) { using T = faiss::cppcontrib::Index2LevelDecoder<160, 40, 20, 8, 10>; - testIndex2LevelDecoder(NSAMPLES, 160, "Residual4x8,PQ8x10"); + testIndex2LevelDecoder(NSAMPLES * 4, 160, "Residual4x8,PQ8x10"); } TEST(TEST_CPPCONTRIB_SA_DECODE, D256_Residual1x9_PQ16x10) { @@ -1244,12 +1290,17 @@ // because there's only one coarse quantizer element. // It won't work for "Residual2x9,PQ16x10". using T = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 16, 10>; - testIndex2LevelDecoder(NSAMPLES, 256, "Residual1x9,PQ16x10"); + testIndex2LevelDecoder(NSAMPLES * 4, 256, "Residual1x9,PQ16x10"); } TEST(TEST_CPPCONTRIB_SA_DECODE, D256_Residual4x10_PQ16x10) { using T = faiss::cppcontrib::Index2LevelDecoder<256, 64, 16, 10, 10>; - testIndex2LevelDecoder(NSAMPLES, 256, "Residual4x10,PQ16x10"); + testIndex2LevelDecoder(NSAMPLES * 4, 256, "Residual4x10,PQ16x10"); +} + +TEST(TEST_CPPCONTRIB_SA_DECODE, D256_Residual4x12_PQ16x12) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 64, 16, 12, 12>; + testIndex2LevelDecoder(NSAMPLES * 16, 256, "Residual4x12,PQ16x12"); } #endif diff -Nru faiss-1.7.3/tests/test_cppcontrib_uintreader.cpp faiss-1.7.4/tests/test_cppcontrib_uintreader.cpp --- faiss-1.7.3/tests/test_cppcontrib_uintreader.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_cppcontrib_uintreader.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,10 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + // This test was designed to be run using valgrind or ASAN to test the // correctness of memory accesses. 
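The new 12-bit decoder cases above (PQ16x12np, IVF256,PQ16x12np, Residual4x12,PQ16x12 and the MinMaxFP16 wrappers) all consume codes produced by the regular standalone-codec API of the corresponding index. A hedged sketch of how such codes can be produced from Python, using the same factory string as the D256_PQ16x12 case; the training-set size here is deliberately small and only illustrative (12-bit codebooks would normally be trained on far more vectors):

    import faiss
    import numpy as np

    d = 256
    rs = np.random.RandomState(123)
    xt = rs.rand(8192, d).astype('float32')   # undersized for 2^12 centroids, illustration only
    xb = rs.rand(1000, d).astype('float32')

    index = faiss.index_factory(d, "PQ16x12np")
    index.train(xt)

    codes = index.sa_encode(xb)        # uint8 array of shape (1000, index.sa_code_size())
    xrec = index.sa_decode(codes)      # approximate reconstruction
    print(codes.shape, np.linalg.norm(xb - xrec) / np.linalg.norm(xb))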
@@ -98,6 +105,10 @@ TestUintReaderBits<10>(); } +TEST(TEST_CPPCONTRIB_UINTREADER, Test12bit) { + TestUintReaderBits<12>(); +} + TEST(TEST_CPPCONTRIB_UINTREADER, Test16bit) { TestUintReaderBits<16>(); } diff -Nru faiss-1.7.3/tests/test_dealloc_invlists.cpp faiss-1.7.4/tests/test_dealloc_invlists.cpp --- faiss-1.7.3/tests/test_dealloc_invlists.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_dealloc_invlists.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -24,8 +24,6 @@ namespace { -typedef Index::idx_t idx_t; - // dimension of the vectors to index int d = 32; diff -Nru faiss-1.7.3/tests/test_extra_distances.py faiss-1.7.4/tests/test_extra_distances.py --- faiss-1.7.3/tests/test_extra_distances.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_extra_distances.py 2023-04-19 13:18:30.000000000 +0000 @@ -82,6 +82,18 @@ self.run_simple_dis_test(scipy.spatial.distance.jensenshannon, faiss.METRIC_JensenShannon) + def test_jaccard(self): + xq, yb = self.make_example() + ref_dis = np.array([ + [ + (np.min([x, y], axis=0).sum() / np.max([x, y], axis=0).sum()) + for y in yb + ] + for x in xq + ]) + new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_Jaccard) + self.assertTrue(np.allclose(ref_dis, new_dis)) + class TestKNN(unittest.TestCase): """ test that the knn search gives the same as distance matrix + argmin """ diff -Nru faiss-1.7.3/tests/test_factory.py faiss-1.7.4/tests/test_factory.py --- faiss-1.7.3/tests/test_factory.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_factory.py 2023-04-19 13:18:30.000000000 +0000 @@ -299,7 +299,7 @@ np.testing.assert_array_equal(codes, codes2) -class TestIVFSpectralHashOwnerhsip(unittest.TestCase): +class TestIVFSpectralHashOwnership(unittest.TestCase): def test_constructor(self): index = faiss.IndexIVFSpectralHash(faiss.IndexFlat(10), 10, 20, 10, 1) diff -Nru faiss-1.7.3/tests/test_fast_scan.py faiss-1.7.4/tests/test_fast_scan.py --- faiss-1.7.3/tests/test_fast_scan.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_fast_scan.py 2023-04-19 13:18:30.000000000 +0000 @@ -82,7 +82,7 @@ t1 = time.time() pqfs_t = t1 - t0 print('PQ16x4fs search time:', pqfs_t) - self.assertLess(pqfs_t * 5, pq_t) + self.assertLess(pqfs_t * 4, pq_t) class TestRounding(unittest.TestCase): @@ -284,6 +284,17 @@ index2.implem = 4 Dref, Iref = index2.search(ds.get_queries(), 10) + # check CodePacker + codes_ref = faiss.vector_to_array(index.codes) + codes_ref = codes_ref.reshape(-1, index.code_size) + index2codes = faiss.vector_to_array(index2.codes) + code_packer = index2.get_CodePacker() + index2codes = index2codes.reshape(-1, code_packer.block_size) + + for i in range(0, len(codes_ref), 13): + code_new = code_packer.unpack_1(index2codes, i) + np.testing.assert_array_equal(codes_ref[i], code_new) + self.cache[(d, metric)] = (ds, index, Dref, Iref) return self.cache[(d, metric)] @@ -300,14 +311,12 @@ verify_with_draws(self, Dref, Iref, Dnew, Inew) - def build_fast_scan_index(self, index, params): index2 = faiss.IndexPQFastScan(index) index2.implem = 5 return index2 - class TestImplem12(TestImplems): def build_fast_scan_index(self, index, qbs): @@ -403,6 +412,7 @@ def test_2_64(self): self.do_with_params(32, (2, 64)) + class TestAdd(unittest.TestCase): def do_test_add(self, d, bbs): @@ -661,7 +671,7 @@ def test_accuracy_PLSQ(self): self.subtest_accuracy("PLSQ") - + def test_accuracy_PRQ(self): self.subtest_accuracy("PRQ") @@ -698,3 +708,18 @@ def test_io(self): self.subtest_io('PLSQ2x3x4fs_Nlsq2x4') 
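The test_jaccard case added to test_extra_distances.py above compares faiss.pairwise_distances under faiss.METRIC_Jaccard with a NumPy reference (sum of element-wise minima over sum of element-wise maxima). A minimal sketch mirroring that reference; non-negative inputs are assumed, as in the test:

    import faiss
    import numpy as np

    rs = np.random.RandomState(123)
    xq = rs.rand(5, 16).astype('float32')    # non-negative vectors
    yb = rs.rand(20, 16).astype('float32')

    dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_Jaccard)

    ref = np.array([[np.minimum(x, y).sum() / np.maximum(x, y).sum()
                     for y in yb] for x in xq])
    print(np.allclose(dis, ref, atol=1e-5))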
self.subtest_io('PRQ2x3x4fs_Nrq2x4') + + +class TestBlockDecode(unittest.TestCase): + + def test_issue_2739(self): + ds = datasets.SyntheticDataset(960, 200, 1, 0) + M = 32 + index = faiss.index_factory(ds.d, f"PQ{M}x4fs") + index.train(ds.get_train()) + index.add(ds.get_database()) + + np.testing.assert_array_equal( + index.pq.decode(index.pq.compute_codes(ds.get_database()))[0, ::100], + index.reconstruct(0)[::100] + ) diff -Nru faiss-1.7.3/tests/test_index_accuracy.py faiss-1.7.4/tests/test_index_accuracy.py --- faiss-1.7.3/tests/test_index_accuracy.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_index_accuracy.py 2023-04-19 13:18:30.000000000 +0000 @@ -588,6 +588,16 @@ max_diff_D = np.abs(ref_D - new_D).max() assert max_diff_D < 1e-5 + def test_size_0(self): + # just make sure it does not crash on small nb + index = faiss.IndexFlat1D() + rs = np.random.RandomState(123) + for i in range(3): + x = np.array([[rs.rand()]]) + D, I = index.search(x, 10) + self.assertEqual((I == -1).sum(), 10 - i) + index.add(x) + class OPQRelativeAccuracy(unittest.TestCase): # translated from test_opq.lua diff -Nru faiss-1.7.3/tests/test_index_binary.py faiss-1.7.4/tests/test_index_binary.py --- faiss-1.7.3/tests/test_index_binary.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_index_binary.py 2023-04-19 13:18:30.000000000 +0000 @@ -289,6 +289,21 @@ assert np.all(D == ref_D) # assert np.all(I == ref_I) # id may be different + def test_search_per_invlist(self): + d = self.xq.shape[1] * 8 + + quantizer = faiss.IndexBinaryFlat(d) + index = faiss.IndexBinaryIVF(quantizer, d, 10) + index.cp.min_points_per_centroid = 5 # quiet warning + index.train(self.xt) + index.add(self.xb) + index.nprobe = 3 + + Dref, Iref = index.search(self.xq, 10) + index.per_invlist_search = True + D2, I2 = index.search(self.xq, 10) + compare_binary_result_lists(Dref, Iref, D2, I2) + class TestHNSW(unittest.TestCase): @@ -337,7 +352,6 @@ self.assertTrue((Dref == Dbin).all()) - class TestReplicasAndShards(unittest.TestCase): @unittest.skipIf(os.name == "posix" and os.uname().sysname == "Darwin", diff -Nru faiss-1.7.3/tests/test_index_composite.py faiss-1.7.4/tests/test_index_composite.py --- faiss-1.7.3/tests/test_index_composite.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_index_composite.py 2023-04-19 13:18:30.000000000 +0000 @@ -677,5 +677,47 @@ index.replace_invlists(il, True) -if __name__ == '__main__': - unittest.main() +class TestSplitMerge(unittest.TestCase): + + def do_test(self, index_key, subset_type): + xt, xb, xq = get_dataset_2(32, 1000, 100, 10) + index = faiss.index_factory(32, index_key) + index.train(xt) + nsplit = 3 + sub_indexes = [faiss.clone_index(index) for i in range(nsplit)] + index.add(xb) + Dref, Iref = index.search(xq, 10) + nlist = index.nlist + for i in range(nsplit): + if subset_type in (1, 3): + index.copy_subset_to(sub_indexes[i], subset_type, nsplit, i) + elif subset_type in (0, 2): + j0 = index.ntotal * i // nsplit + j1 = index.ntotal * (i + 1) // nsplit + index.copy_subset_to(sub_indexes[i], subset_type, j0, j1) + elif subset_type == 4: + index.copy_subset_to( + sub_indexes[i], subset_type, + i * nlist // nsplit, (i + 1) * nlist // nsplit) + + index_shards = faiss.IndexShards(False, False) + for i in range(nsplit): + index_shards.add_shard(sub_indexes[i]) + Dnew, Inew = index_shards.search(xq, 10) + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_array_equal(Dref, Dnew) + + def test_Flat_subset_type_0(self): + 
self.do_test("IVF30,Flat", subset_type=0) + + def test_Flat_subset_type_1(self): + self.do_test("IVF30,Flat", subset_type=1) + + def test_Flat_subset_type_2(self): + self.do_test("IVF30,PQ4np", subset_type=2) + + def test_Flat_subset_type_3(self): + self.do_test("IVF30,Flat", subset_type=3) + + def test_Flat_subset_type_4(self): + self.do_test("IVF30,Flat", subset_type=4) diff -Nru faiss-1.7.3/tests/test_index.py faiss-1.7.4/tests/test_index.py --- faiss-1.7.3/tests/test_index.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_index.py 2023-04-19 13:18:30.000000000 +0000 @@ -46,7 +46,9 @@ Iref = all_dis.argsort(axis=1)[:, ::-1][:, :k] Dref = all_dis[np.arange(nq)[:, None], Iref] - self.assertLessEqual((Iref != I1).sum(), Iref.size * 0.0001) + + # not too many elements are off. + self.assertLessEqual((Iref != I1).sum(), Iref.size * 0.0002) # np.testing.assert_equal(Iref, I1) np.testing.assert_almost_equal(Dref, D1, decimal=5) @@ -776,7 +778,7 @@ """Test IndexNSGPQ""" d = self.xq.shape[1] R, pq_M = 32, 4 - index = faiss.index_factory(d, f"NSG{R}_PQ{pq_M}") + index = faiss.index_factory(d, f"NSG{R}_PQ{pq_M}np") assert isinstance(index, faiss.IndexNSGPQ) idxpq = faiss.downcast_index(index.storage) assert index.nsg.R == R and idxpq.pq.M == pq_M @@ -1158,3 +1160,19 @@ lims, D, I = index.range_search(xq, 1.0) assert len(D) == len(xb) * len(xq) + + +class TestRandomIndex(unittest.TestCase): + + def test_random(self): + """ just check if several runs of search retrieve the + same results """ + index = faiss.IndexRandom(32, 1000000000) + (xt, xb, xq) = get_dataset_2(32, 0, 0, 10) + + Dref, Iref = index.search(xq, 10) + self.assertTrue(np.all(Dref[:, 1:] >= Dref[:, :-1])) + + Dnew, Inew = index.search(xq, 10) + np.testing.assert_array_equal(Dref, Dnew) + np.testing.assert_array_equal(Iref, Inew) diff -Nru faiss-1.7.3/tests/test_io.py faiss-1.7.4/tests/test_io.py --- faiss-1.7.3/tests/test_io.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_io.py 2023-04-19 13:18:30.000000000 +0000 @@ -11,7 +11,7 @@ import io import sys import pickle -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool class TestIOVariants(unittest.TestCase): diff -Nru faiss-1.7.3/tests/test_ivflib.py faiss-1.7.4/tests/test_ivflib.py --- faiss-1.7.3/tests/test_ivflib.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_ivflib.py 2023-04-19 13:18:30.000000000 +0000 @@ -47,10 +47,8 @@ sub_assign[skip_rows, skip_cols] = -1 index.search_preassigned( - nq, faiss.swig_ptr(xq), k, - faiss.swig_ptr(sub_assign), faiss.swig_ptr(coarse_dis), - faiss.swig_ptr(rh.D), faiss.swig_ptr(rh.I), - False, None + xq, k, sub_assign, coarse_dis, + D=rh.D, I=rh.I ) rh.finalize() diff -Nru faiss-1.7.3/tests/test_ivfpq_codec.cpp faiss-1.7.4/tests/test_ivfpq_codec.cpp --- faiss-1.7.3/tests/test_ivfpq_codec.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_ivfpq_codec.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -34,7 +34,7 @@ // encode and decode to compute reconstruction error - std::vector keys(nb); + std::vector keys(nb); std::vector codes(nb * m); index.encode_multiple(nb, keys.data(), v.data(), codes.data(), true); diff -Nru faiss-1.7.3/tests/test_ivfpq_indexing.cpp faiss-1.7.4/tests/test_ivfpq_indexing.cpp --- faiss-1.7.3/tests/test_ivfpq_indexing.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_ivfpq_indexing.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -71,14 +71,14 @@ queries[i] = distrib(rng); } - std::vector gt_nns(nq); + 
std::vector gt_nns(nq); std::vector gt_dis(nq); index_gt.search(nq, queries.data(), 1, gt_dis.data(), gt_nns.data()); index.nprobe = 5; int k = 5; - std::vector nns(k * nq); + std::vector nns(k * nq); std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff -Nru faiss-1.7.3/tests/test_local_search_quantizer.py faiss-1.7.4/tests/test_local_search_quantizer.py --- faiss-1.7.3/tests/test_local_search_quantizer.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_local_search_quantizer.py 2023-04-19 13:18:30.000000000 +0000 @@ -252,11 +252,21 @@ rs = np.random.RandomState(123) - # randomly generate codes, binary terms and unary terms + # randomly generate codes and unary terms codes = rs.randint(0, K, (n, M)).astype(np.int32) new_codes = codes.copy() unaries = rs.rand(M, n, K).astype(np.float32) - binaries = rs.rand(M, M, K, K).astype(np.float32) + + # binary terms should be symmetric, because binary terms + # represent cached dot products between the code C1 in codebook M1 + # and the code C2 in codebook M2. + # so, binaries[M1, M2, C1, C2] == binaries[M2, M1, C2, C1] + # + # generate binary terms in a standard way that provides + # the needed symmetry + codebooks = rs.rand(M, K, d).astype(np.float32) + binaries = compute_binary_terms_ref(codebooks) + binaries = np.ascontiguousarray(binaries) # do icm encoding given binary and unary terms lsq = faiss.LocalSearchQuantizer(d, M, nbits) diff -Nru faiss-1.7.3/tests/test_lowlevel_ivf.cpp faiss-1.7.4/tests/test_lowlevel_ivf.cpp --- faiss-1.7.3/tests/test_lowlevel_ivf.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_lowlevel_ivf.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -29,8 +29,6 @@ namespace { -typedef Index::idx_t idx_t; - // dimension of the vectors to index int d = 32; diff -Nru faiss-1.7.3/tests/test_mem_leak.cpp faiss-1.7.4/tests/test_mem_leak.cpp --- faiss-1.7.3/tests/test_mem_leak.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_mem_leak.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -40,7 +40,7 @@ double t0 = getmillisecs(); for (int i = 0; i < N2; i++) { - std::vector I(10 * bs); + std::vector I(10 * bs); std::vector D(10 * bs); tfidf_faiss_index.search( diff -Nru faiss-1.7.3/tests/test_merge.cpp faiss-1.7.4/tests/test_merge.cpp --- faiss-1.7.3/tests/test_merge.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_merge.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -48,7 +48,7 @@ pthread_mutex_t Tempfilename::mutex = PTHREAD_MUTEX_INITIALIZER; -typedef faiss::Index::idx_t idx_t; +typedef faiss::idx_t idx_t; // parameters to use for the test int d = 64; @@ -148,7 +148,7 @@ // test on IVFFlat with implicit numbering TEST(MERGE, merge_flat_no_ids) { faiss::IndexShards index_shards(d); - index_shards.own_fields = true; + index_shards.own_indices = true; for (int i = 0; i < nindex; i++) { index_shards.add_shard( new faiss::IndexIVFFlat(&cd.quantizer, d, nlist)); @@ -164,7 +164,7 @@ // test on IVFFlat, explicit ids TEST(MERGE, merge_flat) { faiss::IndexShards index_shards(d, false, false); - index_shards.own_fields = true; + index_shards.own_indices = true; for (int i = 0; i < nindex; i++) { index_shards.add_shard( @@ -180,7 +180,7 @@ // test on IVFFlat and a VectorTransform TEST(MERGE, merge_flat_vt) { faiss::IndexShards index_shards(d, false, false); - index_shards.own_fields = true; + index_shards.own_indices = true; // here we have to retrain because of the vectorTransform faiss::RandomRotationMatrix rot(d, d); @@ -211,7 +211,7 @@ // put the merged 
invfile on disk TEST(MERGE, merge_flat_ondisk) { faiss::IndexShards index_shards(d, false, false); - index_shards.own_fields = true; + index_shards.own_indices = true; Tempfilename filename; for (int i = 0; i < nindex; i++) { @@ -234,7 +234,7 @@ // now use ondisk specific merge TEST(MERGE, merge_flat_ondisk_2) { faiss::IndexShards index_shards(d, false, false); - index_shards.own_fields = true; + index_shards.own_indices = true; for (int i = 0; i < nindex; i++) { index_shards.add_shard( diff -Nru faiss-1.7.3/tests/test_merge_index.py faiss-1.7.4/tests/test_merge_index.py --- faiss-1.7.3/tests/test_merge_index.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_merge_index.py 2023-04-19 13:18:30.000000000 +0000 @@ -1,3 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + import unittest import faiss import numpy as np @@ -163,7 +168,7 @@ self.do_flat_codes_test("Flat") def test_merge_IndexPQ(self): - self.do_flat_codes_test("PQ8") + self.do_flat_codes_test("PQ8np") def test_merge_IndexLSH(self): self.do_flat_codes_test("LSHr") @@ -174,18 +179,28 @@ def test_merge_PreTransform(self): self.do_flat_codes_test("PCA16,SQ4") - def do_fast_scan_test(self, factory_key, size1): + def do_fast_scan_test(self, factory_key, size1, with_add_id=False): ds = SyntheticDataset(110, 1000, 1000, 100) - index1 = faiss.index_factory(ds.d, factory_key) - index1.train(ds.get_train()) + index_trained = faiss.index_factory(ds.d, factory_key) + index_trained.train(ds.get_train()) + # test both clone and index_read/write + if True: + index1 = faiss.deserialize_index( + faiss.serialize_index(index_trained)) + else: + index1 = faiss.clone_index(index_trained) + # assert index1.aq.qnorm.ntotal == index_trained.aq.qnorm.ntotal + index1.add(ds.get_database()) _, Iref = index1.search(ds.get_queries(), 5) index1.reset() - index2 = faiss.index_factory(ds.d, factory_key) - index2.train(ds.get_train()) + index2 = faiss.clone_index(index_trained) index1.add(ds.get_database()[:size1]) index2.add(ds.get_database()[size1:]) - index1.merge_from(index2) + if with_add_id: + index1.merge_from(index2, add_id=index1.ntotal) + else: + index1.merge_from(index2) _, Inew = index1.search(ds.get_queries(), 5) np.testing.assert_array_equal(Inew, Iref) @@ -201,6 +216,9 @@ def test_merge_IndexAdditiveQuantizerFastScan(self): self.do_fast_scan_test("RQ10x4fs_32_Nrq2x4", 330) + def test_merge_IVFFastScan(self): + self.do_fast_scan_test("IVF20,PQ5x4fs", 123, with_add_id=True) + def do_test_with_ids(self, factory_key): ds = SyntheticDataset(32, 300, 300, 100) rs = np.random.RandomState(123) @@ -224,3 +242,23 @@ def test_merge_IDMap2(self): self.do_test_with_ids("Flat,IDMap2") + + +class TestRemoveFastScan(unittest.TestCase): + + def do_fast_scan_test(self, factory_key, size1): + ds = SyntheticDataset(110, 1000, 1000, 100) + index1 = faiss.index_factory(ds.d, factory_key) + index1.train(ds.get_train()) + index1.reset() + tokeep = [i % 3 == 0 for i in range(ds.nb)] + index1.add(ds.get_database()[tokeep]) + _, Iref = index1.search(ds.get_queries(), 5) + index1.reset() + index1.add(ds.get_database()) + index1.remove_ids(np.where(np.logical_not(tokeep))[0]) + _, Inew = index1.search(ds.get_queries(), 5) + np.testing.assert_array_equal(Inew, Iref) + + def test_remove(self): + self.do_fast_scan_test("PQ5x4fs", 320) diff -Nru faiss-1.7.3/tests/test_meta_index.py faiss-1.7.4/tests/test_meta_index.py --- 
faiss-1.7.3/tests/test_meta_index.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_meta_index.py 2023-04-19 13:18:30.000000000 +0000 @@ -10,6 +10,8 @@ from common_faiss_tests import Randu10k +from faiss.contrib.datasets import SyntheticDataset + ru = Randu10k() xb = ru.xb @@ -130,6 +132,52 @@ print('%d / %d differences' % (ndiff, nq * k)) assert (ndiff < nq * k / 1000.) + def test_shards_ivf(self): + ds = SyntheticDataset(32, 1000, 100, 20) + ref_index = faiss.index_factory(ds.d, "IVF32,SQ8") + ref_index.train(ds.get_train()) + xb = ds.get_database() + ref_index.add(ds.get_database()) + + Dref, Iref = ref_index.search(ds.get_database(), 10) + ref_index.reset() + + sharded_index = faiss.IndexShardsIVF( + ref_index.quantizer, ref_index.nlist, False, True) + for shard in range(3): + index_i = faiss.clone_index(ref_index) + index_i.add(xb[shard * nb // 3: (shard + 1)* nb // 3]) + sharded_index.add_shard(index_i) + + Dnew, Inew = sharded_index.search(ds.get_database(), 10) + + np.testing.assert_equal(Inew, Iref) + np.testing.assert_allclose(Dnew, Dref) + + def test_shards_ivf_train_add(self): + ds = SyntheticDataset(32, 1000, 600, 20) + quantizer = faiss.IndexFlatL2(ds.d) + sharded_index = faiss.IndexShardsIVF(quantizer, 40, False, False) + + for _ in range(3): + sharded_index.add_shard(faiss.index_factory(ds.d, "IVF40,Flat")) + + sharded_index.train(ds.get_train()) + sharded_index.add(ds.get_database()) + Dnew, Inew = sharded_index.search(ds.get_queries(), 10) + + index_ref = faiss.IndexIVFFlat(quantizer, ds.d, sharded_index.nlist) + index_ref.train(ds.get_train()) + index_ref.add(ds.get_database()) + Dref, Iref = index_ref.search(ds.get_queries(), 10) + np.testing.assert_equal(Inew, Iref) + np.testing.assert_allclose(Dnew, Dref) + + # mess around with the quantizer's centroids + centroids = quantizer.reconstruct_n() + centroids = centroids[::-1].copy() + quantizer.reset() + quantizer.add(centroids) -if __name__ == '__main__': - unittest.main() + D2, I2 = sharded_index.search(ds.get_queries(), 10) + self.assertFalse(np.all(I2 == Inew)) diff -Nru faiss-1.7.3/tests/test_ondisk_ivf.cpp faiss-1.7.4/tests/test_ondisk_ivf.cpp --- faiss-1.7.3/tests/test_ondisk_ivf.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_ondisk_ivf.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -80,10 +80,10 @@ int ntot = 0; for (int i = 0; i < nlist; i++) { int size = ivf.list_size(i); - const faiss::Index::idx_t* ids = ivf.get_ids(i); + const faiss::idx_t* ids = ivf.get_ids(i); const uint8_t* codes = ivf.get_codes(i); for (int j = 0; j < size; j++) { - faiss::Index::idx_t id = ids[j]; + faiss::idx_t id = ids[j]; const int* ar = (const int*)&codes[code_size * j]; EXPECT_EQ(ar[0], id); EXPECT_EQ(ar[1], i); @@ -113,7 +113,7 @@ faiss::float_rand(xq.data(), d * nq, 34567); std::vector ref_D(nq * k); - std::vector ref_I(nq * k); + std::vector ref_I(nq * k); index.search(nq, xq.data(), k, ref_D.data(), ref_I.data()); @@ -131,7 +131,7 @@ index2.add(nb, xb.data()); std::vector new_D(nq * k); - std::vector new_I(nq * k); + std::vector new_I(nq * k); index2.search(nq, xq.data(), k, new_D.data(), new_I.data()); @@ -146,7 +146,7 @@ faiss::Index* index3 = faiss::read_index(filename2.c_str()); std::vector new_D(nq * k); - std::vector new_I(nq * k); + std::vector new_I(nq * k); index3->search(nq, xq.data(), k, new_D.data(), new_I.data()); @@ -192,10 +192,10 @@ int ntot = 0; for (int i = 0; i < nlist; i++) { int size = ivf.list_size(i); - const faiss::Index::idx_t* ids = ivf.get_ids(i); + const faiss::idx_t* 
ids = ivf.get_ids(i); const uint8_t* codes = ivf.get_codes(i); for (int j = 0; j < size; j++) { - faiss::Index::idx_t id = ids[j]; + faiss::idx_t id = ids[j]; const int* ar = (const int*)&codes[code_size * j]; EXPECT_EQ(ar[0], id); EXPECT_EQ(ar[1], i); diff -Nru faiss-1.7.3/tests/test_pairs_decoding.cpp faiss-1.7.4/tests/test_pairs_decoding.cpp --- faiss-1.7.3/tests/test_pairs_decoding.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_pairs_decoding.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -21,7 +21,7 @@ namespace { -typedef faiss::Index::idx_t idx_t; +typedef faiss::idx_t idx_t; /************************************************************* * Test utils diff -Nru faiss-1.7.3/tests/test_params_override.cpp faiss-1.7.4/tests/test_params_override.cpp --- faiss-1.7.3/tests/test_params_override.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_params_override.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -27,8 +27,6 @@ namespace { -typedef Index::idx_t idx_t; - // dimension of the vectors to index int d = 32; diff -Nru faiss-1.7.3/tests/test_RCQ_cropping.cpp faiss-1.7.4/tests/test_RCQ_cropping.cpp --- faiss-1.7.3/tests/test_RCQ_cropping.cpp 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/tests/test_RCQ_cropping.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,131 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +/* This test creates a 3-level RCQ and performs a search on it. + * Then it crops the RCQ to just the 2 first levels and verifies that + * the 3-level vectors are in a subtree that was visited in the 2-level RCQ. */ +TEST(RCQ_cropping, test_cropping) { + size_t nq = 10, nt = 2000, nb = 1000, d = 32; + + using idx_t = faiss::idx_t; + + std::vector buf((nq + nb + nt) * d); + faiss::rand_smooth_vectors(nq + nb + nt, d, buf.data(), 1234); + const float* xt = buf.data(); + const float* xb = xt + nt * d; + const float* xq = xb + nb * d; + + std::vector nbits = {5, 4, 4}; + faiss::ResidualCoarseQuantizer rcq(d, nbits); + + rcq.train(nt, xt); + // fprintf(stderr, "nb centroids: %zd\n", rcq.ntotal); + + // the test below works only for beam size == nprobe + rcq.set_beam_factor(1.0); + + // perform search + int nprobe = 15; + std::vector Iref(nq * nprobe); + std::vector Dref(nq * nprobe); + rcq.search(nq, xq, nprobe, Dref.data(), Iref.data()); + + // crop to the first 2 quantization levels + int last_nbits = nbits.back(); + nbits.pop_back(); + faiss::ResidualCoarseQuantizer rcq_cropped(d, nbits); + rcq_cropped.initialize_from(rcq); + // fprintf(stderr, "cropped nb centroids: %zd\n", rcq_cropped.ntotal); + + EXPECT_EQ(rcq_cropped.ntotal, rcq.ntotal >> last_nbits); + + // perform search + std::vector Inew(nq * nprobe); + std::vector Dnew(nq * nprobe); + rcq_cropped.search(nq, xq, nprobe, Dnew.data(), Inew.data()); + + // these bits are in common between the two RCQs + idx_t mask = ((idx_t)1 << rcq_cropped.rq.tot_bits) - 1; + for (int q = 0; q < nq; q++) { + for (int i = 0; i < nprobe; i++) { + idx_t fine = Iref[q * nprobe + i]; + EXPECT_GE(fine, 0); + bool found = false; + + // fine should be generated from a path that passes through coarse + for (int j = 0; j < nprobe; j++) { + idx_t coarse = Inew[q * nprobe + j]; + if ((fine & mask) == coarse) { + found = true; + break; + } + } + EXPECT_TRUE(found); + } + } +} + +TEST(RCQ_cropping, search_params) { + size_t nq = 10, 
nt = 2000, nb = 1000, d = 32;
+
+    using idx_t = faiss::idx_t;
+
+    std::vector<float> buf((nq + nb + nt) * d);
+    faiss::rand_smooth_vectors(nq + nb + nt, d, buf.data(), 1234);
+    const float* xt = buf.data();
+    const float* xb = xt + nt * d;
+    const float* xq = xb + nb * d;
+
+    std::vector<size_t> nbits = {3, 6, 3};
+    faiss::ResidualCoarseQuantizer quantizer(d, nbits);
+    size_t ntotal = (size_t)1 << quantizer.rq.tot_bits;
+    faiss::IndexIVFScalarQuantizer index(
+            &quantizer, d, ntotal, faiss::ScalarQuantizer::QT_8bit);
+    index.quantizer_trains_alone = true;
+
+    index.train(nt, xt);
+    index.add(nb, xb);
+
+    index.nprobe = 10;
+
+    int k = 4;
+    float beam_factor_1 = 8.0;
+    quantizer.set_beam_factor(beam_factor_1);
+    std::vector<idx_t> I1(nq * k);
+    std::vector<float> D1(nq * k);
+    index.search(nq, xq, k, D1.data(), I1.data());
+
+    // change from 8 to 1
+    quantizer.set_beam_factor(1.0f);
+    std::vector<idx_t> I2(nq * k);
+    std::vector<float> D2(nq * k);
+    index.search(nq, xq, k, D2.data(), I2.data());
+
+    // make sure it changes the result
+    EXPECT_NE(I1, I2);
+    EXPECT_NE(D1, D2);
+
+    // override the class level beam factor
+    faiss::SearchParametersResidualCoarseQuantizer params1;
+    params1.beam_factor = beam_factor_1;
+    faiss::SearchParametersIVF params;
+    params.nprobe = index.nprobe;
+    params.quantizer_params = &params1;
+
+    std::vector<idx_t> I3(nq * k);
+    std::vector<float> D3(nq * k);
+    index.search(nq, xq, k, D3.data(), I3.data(), &params);
+
+    // make sure we find back the original results
+    EXPECT_EQ(I1, I3);
+    EXPECT_EQ(D1, D3);
+}
diff -Nru faiss-1.7.3/tests/test_residual_quantizer.py faiss-1.7.4/tests/test_residual_quantizer.py
--- faiss-1.7.3/tests/test_residual_quantizer.py 2022-11-08 11:14:13.000000000 +0000
+++ faiss-1.7.4/tests/test_residual_quantizer.py 2023-04-19 13:18:30.000000000 +0000
@@ -1252,7 +1252,8 @@
             inters.append(inter)

        inters = np.array(inters)
-        self.assertTrue(np.all(inters[1:] >= inters[:-1]))
+        # 1.05: test relaxed for OSX on ARM
+        self.assertTrue(np.all(inters[1:] * 1.05 >= inters[:-1]))

        # do a little I/O test
        index2 = faiss.deserialize_index(faiss.serialize_index(index))
diff -Nru faiss-1.7.3/tests/test_search_params.py faiss-1.7.4/tests/test_search_params.py
--- faiss-1.7.3/tests/test_search_params.py 2022-11-08 11:14:13.000000000 +0000
+++ faiss-1.7.4/tests/test_search_params.py 2023-04-19 13:18:30.000000000 +0000
@@ -23,7 +23,7 @@
     def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2):
         """ Verify that the id selector returns the subset of results that are
         members according to the IDSelector.
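The selector types added below ("and", "or", "xor") combine two IDSelectorBatch objects with IDSelectorAnd, IDSelectorOr and IDSelectorXOr. A minimal sketch of passing such a composed selector to an IVF search; note that the sel keyword of SearchParametersIVF is an assumption here (it does not appear in this hunk), following the keyword-argument style of the SearchParametersIVF(nprobe=...) calls used elsewhere in this file:

    import faiss
    import numpy as np

    d = 32
    rs = np.random.RandomState(123)
    xb = rs.rand(1000, d).astype('float32')
    xq = rs.rand(5, d).astype('float32')

    index = faiss.index_factory(d, "IVF32,Flat")
    index.train(xb)
    index.add(xb)

    lhs = np.arange(0, 500, dtype='int64')       # ids 0..499
    rhs = np.arange(250, 750, dtype='int64')     # ids 250..749

    # restrict the search to ids in lhs OR rhs, i.e. 0..749
    sel = faiss.IDSelectorOr(
        faiss.IDSelectorBatch(lhs),
        faiss.IDSelectorBatch(rhs),
    )
    # sel= is assumed to be accepted like nprobe=; adjust if the wrapper differs
    params = faiss.SearchParametersIVF(sel=sel, nprobe=8)
    D, I = index.search(xq, 10, params=params)
    assert np.all((I == -1) | (I < 750))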
- Supports id_selector_type="batch", "bitmap", "range", "range_sorted", "not" + Supports id_selector_type="batch", "bitmap", "range", "range_sorted", "and", "or", "xor" """ ds = datasets.SyntheticDataset(32, 1000, 100, 20) index = faiss.index_factory(ds.d, index_key, mt) @@ -33,6 +33,24 @@ # reference result if "range" in id_selector_type: subset = np.arange(30, 80).astype('int64') + elif id_selector_type == "or": + lhs_rs = np.random.RandomState(123) + lhs_subset = lhs_rs.choice(ds.nb, 50, replace=False).astype("int64") + rhs_rs = np.random.RandomState(456) + rhs_subset = rhs_rs.choice(ds.nb, 20, replace=False).astype("int64") + subset = np.union1d(lhs_subset, rhs_subset) + elif id_selector_type == "and": + lhs_rs = np.random.RandomState(123) + lhs_subset = lhs_rs.choice(ds.nb, 50, replace=False).astype("int64") + rhs_rs = np.random.RandomState(456) + rhs_subset = rhs_rs.choice(ds.nb, 10, replace=False).astype("int64") + subset = np.intersect1d(lhs_subset, rhs_subset) + elif id_selector_type == "xor": + lhs_rs = np.random.RandomState(123) + lhs_subset = lhs_rs.choice(ds.nb, 50, replace=False).astype("int64") + rhs_rs = np.random.RandomState(456) + rhs_subset = rhs_rs.choice(ds.nb, 40, replace=False).astype("int64") + subset = np.setxor1d(lhs_subset, rhs_subset) else: rs = np.random.RandomState(123) subset = rs.choice(ds.nb, 50, replace=False).astype("int64") @@ -81,6 +99,21 @@ if i not in ssubset ]).astype('int64') sel = faiss.IDSelectorNot(faiss.IDSelectorBatch(inverse_subset)) + elif id_selector_type == "or": + sel = faiss.IDSelectorOr( + faiss.IDSelectorBatch(lhs_subset), + faiss.IDSelectorBatch(rhs_subset) + ) + elif id_selector_type == "and": + sel = faiss.IDSelectorAnd( + faiss.IDSelectorBatch(lhs_subset), + faiss.IDSelectorBatch(rhs_subset) + ) + elif id_selector_type == "xor": + sel = faiss.IDSelectorXOr( + faiss.IDSelectorBatch(lhs_subset), + faiss.IDSelectorBatch(rhs_subset) + ) else: sel = faiss.IDSelectorBatch(subset) @@ -148,6 +181,9 @@ def test_Flat_id_not(self): self.do_test_id_selector("Flat", id_selector_type="not") + + def test_Flat_id_or(self): + self.do_test_id_selector("Flat", id_selector_type="or") # not implemented @@ -248,6 +284,33 @@ ) ) + def test_max_codes(self): + " tests whether the max nb codes is taken into account " + ds = datasets.SyntheticDataset(32, 1000, 100, 20) + index = faiss.index_factory(ds.d, "IVF32,Flat") + index.train(ds.get_train()) + index.add(ds.get_database()) + + stats = faiss.cvar.indexIVF_stats + stats.reset() + D0, I0 = index.search( + ds.get_queries(), 10, + params=faiss.SearchParametersIVF(nprobe=8) + ) + ndis0 = stats.ndis + target_ndis = ndis0 // ds.nq # a few queries will be below, a few above + for q in range(ds.nq): + stats.reset() + Dq, Iq = index.search( + ds.get_queries()[q:q + 1], 10, + params=faiss.SearchParametersIVF( + nprobe=8, max_codes=target_ndis + ) + ) + self.assertLessEqual(stats.ndis, target_ndis) + if stats.ndis < target_ndis: + np.testing.assert_equal(I0[q], Iq[0]) + class TestSelectorCallback(unittest.TestCase): diff -Nru faiss-1.7.3/tests/test_simdlib.cpp faiss-1.7.4/tests/test_simdlib.cpp --- faiss-1.7.3/tests/test_simdlib.cpp 1970-01-01 00:00:00.000000000 +0000 +++ faiss-1.7.4/tests/test_simdlib.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -0,0 +1,264 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +using namespace faiss; + +TEST(TEST_SIMDLIB, TestCmpltAndBlendInplace) { + simd8float32 lowestValues(0, 1, 2, 3, 4, 5, 6, 7); + simd8uint32 lowestIndices(0, 1, 2, 3, 4, 5, 6, 7); + + simd8float32 candidateValues0(5, 5, 5, 5, 5, 5, 5, 5); + simd8uint32 candidateIndices0(10, 11, 12, 13, 14, 15, 16, 17); + cmplt_and_blend_inplace( + candidateValues0, candidateIndices0, lowestValues, lowestIndices); + + simd8float32 candidateValues1(6, 6, 6, 6, 6, 6, 6, 6); + simd8uint32 candidateIndices1(20, 21, 22, 23, 24, 25, 26, 27); + cmplt_and_blend_inplace( + candidateValues1, candidateIndices1, lowestValues, lowestIndices); + + simd8float32 candidateValues2(0, 1, 2, 3, 4, 5, 5, 5); + simd8uint32 candidateIndices2(30, 31, 32, 33, 34, 35, 36, 37); + cmplt_and_blend_inplace( + candidateValues2, candidateIndices2, lowestValues, lowestIndices); + + simd8float32 expectedValues(0, 1, 2, 3, 4, 5, 5, 5); + simd8uint32 expectedIndices(0, 1, 2, 3, 4, 5, 16, 17); + ASSERT_TRUE(lowestValues.is_same_as(expectedValues)); + ASSERT_TRUE(lowestIndices.is_same_as(expectedIndices)); +} + +TEST(TEST_SIMDLIB, TestCmpltMinMaxFloat) { + simd8float32 minValues(0, 0, 0, 0, 0, 0, 0, 0); + simd8uint32 minIndices(0, 0, 0, 0, 0, 0, 0, 0); + simd8float32 maxValues(0, 0, 0, 0, 0, 0, 0, 0); + simd8uint32 maxIndices(0, 0, 0, 0, 0, 0, 0, 0); + + simd8float32 candidateValues0(5, 5, 5, 5, 5, 5, 5, 5); + simd8uint32 candidateIndices0(10, 11, 12, 13, 14, 15, 16, 17); + simd8float32 currentValues0(0, 1, 2, 3, 4, 5, 6, 7); + simd8uint32 currentIndices0(0, 1, 2, 3, 4, 5, 6, 7); + + cmplt_min_max_fast( + candidateValues0, + candidateIndices0, + currentValues0, + currentIndices0, + minValues, + minIndices, + maxValues, + maxIndices); + + simd8float32 expectedMinValues(0, 1, 2, 3, 4, 5, 5, 5); + simd8uint32 expectedMinIndices(0, 1, 2, 3, 4, 5, 16, 17); + ASSERT_TRUE(minValues.is_same_as(expectedMinValues)); + ASSERT_TRUE(minIndices.is_same_as(expectedMinIndices)); + + simd8float32 expectedMaxValues(5, 5, 5, 5, 5, 5, 6, 7); + // the result is not 10,11,12,13,14,5,6,7 because it is _fast version + simd8uint32 expectedMaxIndices(10, 11, 12, 13, 14, 15, 6, 7); + ASSERT_TRUE(maxValues.is_same_as(expectedMaxValues)); + ASSERT_TRUE(maxIndices.is_same_as(expectedMaxIndices)); +} + +TEST(TEST_SIMDLIB, TestCmpltMinMaxInt) { + simd8uint32 minValues(0, 0, 0, 0, 0, 0, 0, 0); + simd8uint32 minIndices(0, 0, 0, 0, 0, 0, 0, 0); + simd8uint32 maxValues(0, 0, 0, 0, 0, 0, 0, 0); + simd8uint32 maxIndices(0, 0, 0, 0, 0, 0, 0, 0); + + simd8uint32 candidateValues0(5, 5, 5, 5, 5, 5, 5, 5); + simd8uint32 candidateIndices0(10, 11, 12, 13, 14, 15, 16, 17); + simd8uint32 currentValues0(0, 1, 2, 3, 4, 5, 6, 7); + simd8uint32 currentIndices0(0, 1, 2, 3, 4, 5, 6, 7); + + cmplt_min_max_fast( + candidateValues0, + candidateIndices0, + currentValues0, + currentIndices0, + minValues, + minIndices, + maxValues, + maxIndices); + + simd8uint32 expectedMinValues(0, 1, 2, 3, 4, 5, 5, 5); + simd8uint32 expectedMinIndices(0, 1, 2, 3, 4, 5, 16, 17); + ASSERT_TRUE(minValues.is_same_as(expectedMinValues)); + ASSERT_TRUE(minIndices.is_same_as(expectedMinIndices)); + + simd8uint32 expectedMaxValues(5, 5, 5, 5, 5, 5, 6, 7); + // the result is not 10,11,12,13,14,5,6,7 because it is _fast version + simd8uint32 expectedMaxIndices(10, 11, 12, 13, 14, 15, 6, 7); + ASSERT_TRUE(maxValues.is_same_as(expectedMaxValues)); + ASSERT_TRUE(maxIndices.is_same_as(expectedMaxIndices)); +} + +TEST(TEST_SIMDLIB, TestCmpltMinMaxInt16) { + simd16uint16 minValues(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0); + simd16uint16 minIndices(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + simd16uint16 maxValues(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + simd16uint16 maxIndices(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + simd16uint16 candidateValues0( + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 1005, + 1005, + 1005, + 1005, + 1005, + 1005, + 1005, + 1005); + simd16uint16 candidateIndices0( + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 1010, + 1011, + 1012, + 1013, + 1014, + 1015, + 1016, + 1017); + simd16uint16 currentValues0( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 1000, + 1001, + 1002, + 1003, + 1004, + 1005, + 1006, + 1007); + simd16uint16 currentIndices0( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 1000, + 1001, + 1002, + 1003, + 1004, + 1005, + 1006, + 1007); + + cmplt_min_max_fast( + candidateValues0, + candidateIndices0, + currentValues0, + currentIndices0, + minValues, + minIndices, + maxValues, + maxIndices); + + simd16uint16 expectedMinValues( + 0, + 1, + 2, + 3, + 4, + 5, + 5, + 5, + 1000, + 1001, + 1002, + 1003, + 1004, + 1005, + 1005, + 1005); + simd16uint16 expectedMinIndices( + 0, + 1, + 2, + 3, + 4, + 5, + 16, + 17, + 1000, + 1001, + 1002, + 1003, + 1004, + 1005, + 1016, + 1017); + ASSERT_TRUE(minValues.is_same_as(expectedMinValues)); + ASSERT_TRUE(minIndices.is_same_as(expectedMinIndices)); + + simd16uint16 expectedMaxValues( + 5, + 5, + 5, + 5, + 5, + 5, + 6, + 7, + 1005, + 1005, + 1005, + 1005, + 1005, + 1005, + 1006, + 1007); + // the result is not 10,11,12,13,14,5,6,7 because it is _fast version + simd16uint16 expectedMaxIndices( + 10, + 11, + 12, + 13, + 14, + 15, + 6, + 7, + 1010, + 1011, + 1012, + 1013, + 1014, + 1015, + 1006, + 1007); + ASSERT_TRUE(maxValues.is_same_as(expectedMaxValues)); + ASSERT_TRUE(maxIndices.is_same_as(expectedMaxIndices)); +} diff -Nru faiss-1.7.3/tests/test_sliding_ivf.cpp faiss-1.7.4/tests/test_sliding_ivf.cpp --- faiss-1.7.3/tests/test_sliding_ivf.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_sliding_ivf.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -22,8 +22,6 @@ using namespace faiss; -typedef Index::idx_t idx_t; - // dimension of the vectors to index int d = 32; @@ -81,7 +79,7 @@ Index* index = sub_indexes.back().get(); auto xb = make_data(nb * d); - std::vector ids(nb); + std::vector ids(nb); std::mt19937 rng; std::uniform_int_distribution<> distrib; for (int j = 0; j < nb; j++) { diff -Nru faiss-1.7.3/tests/test_standalone_codec.py faiss-1.7.4/tests/test_standalone_codec.py --- faiss-1.7.3/tests/test_standalone_codec.py 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_standalone_codec.py 2023-04-19 13:18:30.000000000 +0000 @@ -35,22 +35,23 @@ codes2 = codec.sa_encode(x2) - if 'IVF' not in factory_key: - self.assertTrue(np.all(codes == codes2)) - else: + if 'IVF' in factory_key or 'RQ' in factory_key: # some rows are not reconstructed exactly because they # flip into another quantization cell nrowdiff = (codes != codes2).any(axis=1).sum() self.assertTrue(nrowdiff < 10) + else: + self.assertTrue(np.all(codes == codes2)) x3 = codec.sa_decode(codes2) - if 'IVF' not in factory_key: - self.assertTrue(np.allclose(x2, x3)) - else: + + if 'IVF' in factory_key or 'RQ' in factory_key: diffs = np.abs(x2 - x3).sum(axis=1) avg = np.abs(x2).sum(axis=1).mean() diffs.sort() assert diffs[-10] < avg * 1e-5 + else: + self.assertTrue(np.allclose(x2, x3)) def test_SQ8(self): self.do_encode_twice('SQ8') diff -Nru faiss-1.7.3/tests/test_threaded_index.cpp 
faiss-1.7.4/tests/test_threaded_index.cpp --- faiss-1.7.3/tests/test_threaded_index.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_threaded_index.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -19,6 +19,8 @@ struct TestException : public std::exception {}; +using idx_t = faiss::idx_t; + struct MockIndex : public faiss::Index { explicit MockIndex(idx_t d) : faiss::Index(d) { resetMock(); @@ -66,7 +68,7 @@ template struct MockThreadedIndex : public faiss::ThreadedIndex { - using idx_t = faiss::Index::idx_t; + using idx_t = faiss::idx_t; explicit MockThreadedIndex(bool threaded) : faiss::ThreadedIndex(threaded) {} @@ -178,7 +180,7 @@ std::vector x(n * d); std::vector distances(n * k); - std::vector labels(n * k); + std::vector labels(n * k); replica.add(n, x.data()); @@ -227,7 +229,7 @@ std::vector x(n * d); std::vector distances(n * k); - std::vector labels(n * k); + std::vector labels(n * k); shards.add(n, x.data()); diff -Nru faiss-1.7.3/tests/test_transfer_invlists.cpp faiss-1.7.4/tests/test_transfer_invlists.cpp --- faiss-1.7.3/tests/test_transfer_invlists.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tests/test_transfer_invlists.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -33,7 +33,7 @@ using namespace faiss; -typedef faiss::Index::idx_t idx_t; +typedef faiss::idx_t idx_t; std::vector get_data(size_t nb, int seed) { std::vector x(nb * d); diff -Nru faiss-1.7.3/tutorial/cpp/1-Flat.cpp faiss-1.7.4/tutorial/cpp/1-Flat.cpp --- faiss-1.7.3/tutorial/cpp/1-Flat.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tutorial/cpp/1-Flat.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -12,7 +12,7 @@ #include // 64-bit int -using idx_t = faiss::Index::idx_t; +using idx_t = faiss::idx_t; int main() { int d = 64; // dimension diff -Nru faiss-1.7.3/tutorial/cpp/2-IVFFlat.cpp faiss-1.7.4/tutorial/cpp/2-IVFFlat.cpp --- faiss-1.7.3/tutorial/cpp/2-IVFFlat.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tutorial/cpp/2-IVFFlat.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -13,7 +13,7 @@ #include #include -using idx_t = faiss::Index::idx_t; +using idx_t = faiss::idx_t; int main() { int d = 64; // dimension diff -Nru faiss-1.7.3/tutorial/cpp/3-IVFPQ.cpp faiss-1.7.4/tutorial/cpp/3-IVFPQ.cpp --- faiss-1.7.3/tutorial/cpp/3-IVFPQ.cpp 2022-11-08 11:14:13.000000000 +0000 +++ faiss-1.7.4/tutorial/cpp/3-IVFPQ.cpp 2023-04-19 13:18:30.000000000 +0000 @@ -12,7 +12,7 @@ #include #include -using idx_t = faiss::Index::idx_t; +using idx_t = faiss::idx_t; int main() { int d = 64; // dimension