diff --git a/thirdparty/faiss/.circleci/config.yml b/thirdparty/faiss/.circleci/config.yml
index e105d7914..9ddcb5ba8 100644
--- a/thirdparty/faiss/.circleci/config.yml
+++ b/thirdparty/faiss/.circleci/config.yml
@@ -9,7 +9,7 @@ executors:
     environment:
       CONDA_ARCH: Linux-x86_64
     machine:
-      image: linux-cuda-11:2023.02.1
+      image: linux-cuda-11:default
     resource_class: gpu.nvidia.medium
   linux-arm64-cpu:
     environment:
@@ -91,7 +91,7 @@ jobs:
       - run:
           name: Install conda build tools
           command: |
-            conda config --set solver libmamba
+            # conda config --set solver libmamba
             # conda config --set verbosity 3
             conda update -y -q conda
             conda install -y -q conda-build
@@ -171,7 +171,7 @@ jobs:
            sudo update-alternatives --set cuda /usr/local/cuda-<<parameters.cuda>>
            cd conda
            conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<<parameters.cuda>>", "c_compiler_version": "<<parameters.compiler_version>>", "cxx_compiler_version": "<<parameters.compiler_version>>" }' \
-              -c pytorch -c nvidia -c rapidsai -c conda-forge
+              -c pytorch -c nvidia -c rapidsai-nightly -c conda-forge
      - when:
          condition:
            and:
@@ -186,7 +186,7 @@ jobs:
            sudo update-alternatives --set cuda /usr/local/cuda-<<parameters.cuda>>
            cd conda
            conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<<parameters.cuda>>", "c_compiler_version": "<<parameters.compiler_version>>", "cxx_compiler_version": "<<parameters.compiler_version>>" }' \
-              --user pytorch --label <<parameters.label>> -c pytorch -c nvidia -c rapidsai -c conda-forge
+              --user pytorch --label <<parameters.label>> -c pytorch -c nvidia -c rapidsai-nightly -c conda-forge

   build_cmake:
     parameters:
@@ -236,7 +236,7 @@ jobs:
      - run:
          name: Install libraft
          command: |
-            conda install -y -q libraft cudatoolkit=11.4 -c rapidsai-nightly -c nvidia -c pkgs/main -c conda-forge
+            conda install -y -q libraft cuda-version=11.4 -c rapidsai-nightly -c nvidia -c pkgs/main -c conda-forge
      - run:
          name: Build all targets
          no_output_timeout: 30m
@@ -283,7 +283,7 @@ jobs:
      - run:
          name: Python tests (CPU + GPU)
          command: |
-            conda install -y -q pytorch pytorch-cuda -c pytorch -c nvidia
+            conda install -y -q pytorch pytorch-cuda=11 -c pytorch -c nvidia
            pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
            pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
            cp tests/common_faiss_tests.py faiss/gpu/test
@@ -350,7 +350,7 @@ workflows:
          exec: linux-x86_64-gpu
          label: main
          cuda: "11.4"
-         cuda_archs: "60;61;70;72;75;80;86"
+         cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
          compiler_version: "11.2"
          filters:
            tags:
@@ -363,7 +363,7 @@ workflows:
          label: main
          raft: "ON"
          cuda: "11.4"
-         cuda_archs: "60;61;70;72;75;80;86"
+         cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
          compiler_version: "11.2"
          filters:
            tags:
@@ -415,7 +415,7 @@ workflows:
          name: Linux x86_64 GPU nightlies (CUDA 11.4)
          exec: linux-x86_64-gpu
          cuda: "11.4"
-         cuda_archs: "60;61;70;72;75;80;86"
+         cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
          compiler_version: "11.2"
          label: nightly
      - build_conda:
@@ -423,7 +423,7 @@ workflows:
          exec: linux-x86_64-gpu
          raft: "ON"
          cuda: "11.4"
-         cuda_archs: "60;61;70;72;75;80;86"
+         cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real"
          compiler_version: "11.2"
          label: nightly
      - build_conda:
diff --git a/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py b/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py
index e098e9527..cb4e097a0 100644
--- a/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py
+++ b/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py
@@ -7,6 +7,7 @@
 import os
 import sys
 import time
+import json

 import faiss
 import numpy as np
@@ -19,105 +20,6 @@
 sanitize = datasets.sanitize
-######################################################
-# Command-line parsing
-######################################################
-
-
-parser = argparse.ArgumentParser()
-
-
-def aa(*args, **kwargs):
-    group.add_argument(*args, **kwargs)
-
-
-group = parser.add_argument_group('dataset options')
-
-aa('--db', default='deep1M', help='dataset')
-aa('--compute_gt', default=False, action='store_true',
-   help='compute and store the groundtruth')
-aa('--force_IP', default=False, action="store_true",
-   help='force IP search instead of L2')
-
-group = parser.add_argument_group('index consturction')
-
-aa('--indexkey', default='HNSW32', help='index_factory type')
-aa('--maxtrain', default=256 * 256, type=int,
-   help='maximum number of training points (0 to set automatically)')
-aa('--indexfile', default='', help='file to read or write index from')
-aa('--add_bs', default=-1, type=int,
-   help='add elements index by batches of this size')
-
-
-group = parser.add_argument_group('IVF options')
-aa('--by_residual', default=-1, type=int,
-   help="set if index should use residuals (default=unchanged)")
-aa('--no_precomputed_tables', action='store_true', default=False,
-   help='disable precomputed tables (uses less memory)')
-aa('--get_centroids_from', default='',
-   help='get the centroids from this index (to speed up training)')
-aa('--clustering_niter', default=-1, type=int,
-   help='number of clustering iterations (-1 = leave default)')
-aa('--train_on_gpu', default=False, action='store_true',
-   help='do training on GPU')
-
-
-group = parser.add_argument_group('index-specific options')
-aa('--M0', default=-1, type=int, help='size of base level for HNSW')
-aa('--RQ_train_default', default=False, action="store_true",
-   help='disable progressive dim training for RQ')
-aa('--RQ_beam_size', default=-1, type=int,
-   help='set beam size at add time')
-aa('--LSQ_encode_ils_iters', default=-1, type=int,
-   help='ILS iterations for LSQ')
-aa('--RQ_use_beam_LUT', default=-1, type=int,
-   help='use beam LUT at add time')
-
-group = parser.add_argument_group('searching')
-
-aa('--k', default=100, type=int, help='nb of nearest neighbors')
-aa('--inter', default=False, action='store_true',
-   help='use intersection measure instead of 1-recall as metric')
-aa('--searchthreads', default=-1, type=int,
-   help='nb of threads to use at search time')
-aa('--searchparams', nargs='+', default=['autotune'],
-   help="search parameters to use (can be autotune or a list of params)")
-aa('--n_autotune', default=500, type=int,
-   help="max nb of autotune experiments")
-aa('--autotune_max', default=[], nargs='*',
-   help='set max value for autotune variables format "var:val" (exclusive)')
-aa('--autotune_range', default=[], nargs='*',
-   help='set complete autotune range, format "var:val1,val2,..."')
-aa('--min_test_duration', default=3.0, type=float,
-   help='run test at least for so long to avoid jitter')
-
-args = parser.parse_args()
-
-print("args:", args)
-
-os.system('echo -n "nb processors "; '
-          'cat /proc/cpuinfo | grep ^processor | wc -l; '
-          'cat /proc/cpuinfo | grep ^"model name" | tail -1')
-
-######################################################
-# Load dataset
-######################################################
-
-ds = datasets.load_dataset(
-    dataset=args.db, compute_gt=args.compute_gt)
-
-if args.force_IP:
-    ds.metric = "IP"
-
-print(ds)
-
-nq, d = ds.nq, ds.d
-nb, d = ds.nq, ds.d
-
-
-######################################################
-# Make index
-######################################################

 def unwind_index_ivf(index):
     if isinstance(index, faiss.IndexPreTransform):
@@ -125,6 +27,10 @@ def unwind_index_ivf(index):
         vt = index.chain.at(0)
         index_ivf, vt2 = unwind_index_ivf(faiss.downcast_index(index.index))
         assert vt2 is None
+        if vt is None:
+            vt = lambda x: x
+        else:
+            vt = faiss.downcast_VectorTransform(vt)
         return index_ivf, vt
     if hasattr(faiss, "IndexRefine") and isinstance(index, faiss.IndexRefine):
         return unwind_index_ivf(faiss.downcast_index(index.base_index))
@@ -157,16 +63,50 @@ def apply_AQ_options(index, args):
         index.rq.use_beam_LUT = args.RQ_use_beam_LUT


-if args.indexfile and os.path.exists(args.indexfile):
-    print("reading", args.indexfile)
-    index = faiss.read_index(args.indexfile)
+def eval_setting(index, xq, gt, k, inter, min_time):
+    """ evaluate searching in terms of precision vs. speed """
+    nq = xq.shape[0]
+    ivf_stats = faiss.cvar.indexIVF_stats
+    ivf_stats.reset()
+    nrun = 0
+    t0 = time.time()
+    while True:
+        D, I = index.search(xq, k)
+        nrun += 1
+        t1 = time.time()
+        if t1 - t0 > min_time:
+            break
+    ms_per_query = ((t1 - t0) * 1000.0 / nq / nrun)
+    res = {
+        "ms_per_query": ms_per_query,
+        "nrun": nrun
+    }
+    res["n"] = ms_per_query
+    if inter:
+        rank = k
+        inter_measure = faiss.eval_intersection(gt[:, :rank], I[:, :rank]) / (nq * rank)
+        print("%.4f" % inter_measure, end=' ')
+        res["inter_measure"] = inter_measure
+    else:
+        res["recalls"] = {}
+        for rank in 1, 10, 100:
+            recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
+            print("%.4f" % recall, end=' ')
+            res["recalls"][rank] = recall
+    print(" %9.5f " % ms_per_query, end=' ')
+    print("%12d " % (ivf_stats.ndis / nrun), end=' ')
+    print(nrun)
+    res["ndis"] = ivf_stats.ndis / nrun
+    return res
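
As a usage illustration (not part of the patch): a minimal sketch of driving the eval_setting() helper above on random data. The sizes, the "IVF100,Flat" factory string and the use of faiss.knn() for the exact ground truth are assumptions for the example only.

    import faiss
    import numpy as np

    d, nb, nq, k = 32, 10000, 100, 10
    rs = np.random.RandomState(0)
    xb = rs.rand(nb, d).astype('float32')
    xq = rs.rand(nq, d).astype('float32')

    index = faiss.index_factory(d, "IVF100,Flat")
    index.train(xb)
    index.add(xb)
    index.nprobe = 8

    gt = faiss.knn(xq, xb, k)[1]   # exact ground-truth ids, shape (nq, k)
    stats = eval_setting(index, xq, gt, k, inter=False, min_time=0.5)
    print(stats["ms_per_query"], stats["recalls"])

The helper reruns index.search() until min_time has elapsed, so ms_per_query is averaged over enough runs to smooth out timing jitter.
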
-    index_ivf, vec_transform = unwind_index_ivf(index)
-    if vec_transform is None:
-        vec_transform = lambda x: x

+######################################################
+# Training
+######################################################

-else:
+def run_train(args, ds, res):
+    nq, d = ds.nq, ds.d
+    nb, d = ds.nb, ds.d

     print("build index, key=", args.indexkey)
@@ -176,10 +116,6 @@ def apply_AQ_options(index, args):
     )
     index_ivf, vec_transform = unwind_index_ivf(index)
-    if vec_transform is None:
-        vec_transform = lambda x: x
-    else:
-        vec_transform = faiss.downcast_VectorTransform(vec_transform)

     if args.by_residual != -1:
         by_residual = args.by_residual == 1
@@ -205,9 +141,14 @@ def apply_AQ_options(index, args):
             64)
         print(base_index.nprobe)
     elif isinstance(quantizer, faiss.IndexHNSW):
-        print("  update quantizer efSearch=", quantizer.hnsw.efSearch, end=" -> ")
-        quantizer.hnsw.efSearch = 40 if index_ivf.nlist < 4e6 else 64
-        print(quantizer.hnsw.efSearch)
+        hnsw = quantizer.hnsw
+        print(
+            f"  update HNSW quantizer options, before: "
+            f"{hnsw.efSearch=:} {hnsw.efConstruction=:}"
+        )
+        hnsw.efSearch = 40 if index_ivf.nlist < 4e6 else 64
+        hnsw.efConstruction = 200
+        print(f"  after: {hnsw.efSearch=:} {hnsw.efConstruction=:}")

     apply_AQ_options(index_ivf or index, args)
@@ -286,182 +227,341 @@ def apply_AQ_options(index, args):
     t0 = time.time()
     index.train(xt2)
-    print("  train in %.3f s" % (time.time() - t0))
+    res.train_time = time.time() - t0
+    print("  train in %.3f s" % res.train_time)
+    return index

+######################################################
+# Populating index
+######################################################

+def run_add(args, ds, index, res):
     print("adding")
     t0 = time.time()
     if args.add_bs == -1:
+        assert args.split == [1, 0], "split not supported with full batch add"
         index.add(sanitize(ds.get_database()))
     else:
+        totn = ds.nb // args.split[0]  # approximate
         i0 = 0
-        for xblock in ds.database_iterator(bs=args.add_bs):
+        print(f"Adding in block sizes {args.add_bs} with split {args.split}")
+        for xblock in ds.database_iterator(bs=args.add_bs, split=args.split):
             i1 = i0 + len(xblock)
             print("  adding %d:%d / %d [%.3f s, RSS %d kiB] " % (
-                i0, i1, ds.nb, time.time() - t0,
+                i0, i1, totn, time.time() - t0,
                 faiss.get_mem_usage_kb()))
             index.add(xblock)
             i0 = i1

-    print("  add in %.3f s" % (time.time() - t0))
-    if args.indexfile:
-        print("storing", args.indexfile)
-        faiss.write_index(index, args.indexfile)
+    res.t_add = time.time() - t0
+    print(f"  add in {res.t_add:.3f} s index size {index.ntotal}")

-if args.no_precomputed_tables:
-    if isinstance(index_ivf, faiss.IndexIVFPQ):
-        print("disabling precomputed table")
-        index_ivf.use_precomputed_table = -1
-        index_ivf.precomputed_table.clear()
-
-if args.indexfile:
-    print("index size on disk: ", os.stat(args.indexfile).st_size)
-
-if hasattr(index, "code_size"):
-    print("vector code_size", index.code_size)
-
-if hasattr(index_ivf, "code_size"):
-    print("vector code_size (IVF)", index_ivf.code_size)
-
-print("current RSS:", faiss.get_mem_usage_kb() * 1024)
-
-precomputed_table_size = 0
-if hasattr(index_ivf, 'precomputed_table'):
-    precomputed_table_size = index_ivf.precomputed_table.size() * 4
-
-print("precomputed tables size:", precomputed_table_size)
-
-#############################################################
-# Index is ready
-#############################################################
-
-xq = sanitize(ds.get_queries())
-gt = ds.get_groundtruth(k=args.k)
-assert gt.shape[1] == args.k
-
-if args.searchthreads != -1:
-    print("Setting nb of threads to", args.searchthreads)
-    faiss.omp_set_num_threads(args.searchthreads)
-else:
-    print("nb search threads: ", faiss.omp_get_max_threads())
-
-ps = faiss.ParameterSpace()
-ps.initialize(index)
-
-parametersets = args.searchparams
-
-if args.inter:
-    header = (
-        '%-40s inter@%3d time(ms/q) nb distances #runs' %
-        ("parameters", args.k)
-    )
-else:
-    header = (
-        '%-40s R@1 R@10 R@100 time(ms/q) nb distances #runs' %
-        "parameters"
-    )
-
-def compute_inter(a, b):
-    nq, rank = a.shape
-    ninter = sum(
-        np.intersect1d(a[i, :rank], b[i, :rank]).size
-        for i in range(nq)
-    )
-    return ninter / a.size
-
-def eval_setting(index, xq, gt, k, inter, min_time):
-    nq = xq.shape[0]
-    ivf_stats = faiss.cvar.indexIVF_stats
-    ivf_stats.reset()
-    nrun = 0
-    t0 = time.time()
-    while True:
-        D, I = index.search(xq, k)
-        nrun += 1
-        t1 = time.time()
-        if t1 - t0 > min_time:
-            break
-    ms_per_query = ((t1 - t0) * 1000.0 / nq / nrun)
-    if inter:
-        rank = k
-        inter_measure = compute_inter(gt[:, :rank], I[:, :rank])
-        print("%.4f" % inter_measure, end=' ')
-    else:
-        for rank in 1, 10, 100:
-            n_ok = (I[:, :rank] == gt[:, :1]).sum()
-            print("%.4f" % (n_ok / float(nq)), end=' ')
-    print(" %9.5f " % ms_per_query, end=' ')
-    print("%12d " % (ivf_stats.ndis / nrun), end=' ')
-    print(nrun)
-
-if parametersets == ['autotune']:
-
-    ps.n_experiments = args.n_autotune
-    ps.min_test_duration = args.min_test_duration
-
-    for kv in args.autotune_max:
-        k, vmax = kv.split(':')
-        vmax = float(vmax)
-        print("limiting %s to %g" % (k, vmax))
-        pr = ps.add_range(k)
-        values = faiss.vector_to_array(pr.values)
-        values = np.array([v for v in values if v < vmax])
-        faiss.copy_array_to_vector(values, pr.values)
-
-    for kv in args.autotune_range:
-        k, vals = kv.split(':')
-        vals = np.fromstring(vals, sep=',')
-        print("setting %s to %s" % (k, vals))
-        pr = ps.add_range(k)
-        faiss.copy_array_to_vector(vals, pr.values)
-
-    # setup the Criterion object
-    if args.inter:
-        print("Optimize for intersection @ ", args.k)
-        crit = faiss.IntersectionCriterion(nq, args.k)
-    else:
-        print("Optimize for 1-recall @ 1")
-        crit = faiss.OneRecallAtRCriterion(nq, 1)
-
-    # by default, the criterion will request only 1 NN
-    crit.nnn = args.k
-    crit.set_groundtruth(None, gt.astype('int64'))
-
-    # then we let Faiss find the optimal parameters by itself
-    print("exploring operating points, %d threads" % faiss.omp_get_max_threads());
-    ps.display()
-
-    t0 = time.time()
-    op = ps.explore(index, xq, crit)
-    print("Done in %.3f s, available OPs:" % (time.time() - t0))
-
-    op.display()
-
-    print("Re-running evaluation on selected OPs")
-    print(header)
-    opv = op.optimal_pts
-    maxw = max(max(len(opv.at(i).key) for i in range(opv.size())), 40)
-    for i in range(opv.size()):
-        opt = opv.at(i)
-
-        ps.set_index_parameters(index, opt.key)
-
-        print(opt.key.ljust(maxw), end=' ')
-        sys.stdout.flush()
-
-        eval_setting(index, xq, gt, args.k, args.inter, args.min_test_duration)
-
-else:
-    print(header)
-    for param in parametersets:
-        print("%-40s " % param, end=' ')
-        sys.stdout.flush()
-        ps.set_index_parameters(index, param)
-
-        eval_setting(index, xq, gt, args.k, args.inter, args.min_test_duration)

+######################################################
+# Search
+######################################################

+def run_search(args, ds, index, res):
+
+    index_ivf, vec_transform = unwind_index_ivf(index)
+
+    if args.no_precomputed_tables:
+        if isinstance(index_ivf, faiss.IndexIVFPQ):
+            print("disabling precomputed table")
+            index_ivf.use_precomputed_table = -1
+            index_ivf.precomputed_table.clear()
+
+    if args.indexfile:
+        print("index size on disk: ", os.stat(args.indexfile).st_size)
+
+    if hasattr(index, "code_size"):
+        print("vector code_size", index.code_size)
+
+    if hasattr(index_ivf, "code_size"):
+        print("vector code_size (IVF)", index_ivf.code_size)
+
+    print("current RSS:", faiss.get_mem_usage_kb() * 1024)
+
+    precomputed_table_size = 0
+    if hasattr(index_ivf, 'precomputed_table'):
+        precomputed_table_size = index_ivf.precomputed_table.size() * 4
+
+    print("precomputed tables size:", precomputed_table_size)
+
+    # Index is ready
+
+    xq = sanitize(ds.get_queries())
+    nq, d = xq.shape
+    gt = ds.get_groundtruth(k=args.k)
+    if not args.accept_short_gt:  # Deep1B has only a single NN per query
+        assert gt.shape[1] == args.k
+
+    if args.searchthreads != -1:
+        print("Setting nb of threads to", args.searchthreads)
+        faiss.omp_set_num_threads(args.searchthreads)
+    else:
+        print("nb search threads: ", faiss.omp_get_max_threads())
+
+    ps = faiss.ParameterSpace()
+    ps.initialize(index)
+
+    parametersets = args.searchparams
+
+    if args.inter:
+        header = (
+            '%-40s inter@%3d time(ms/q) nb distances #runs' %
+            ("parameters", args.k)
+        )
+    else:
+        header = (
+            '%-40s R@1 R@10 R@100 time(ms/q) nb distances #runs' %
+            "parameters"
+        )
+
+    res.search_results = {}
+    if parametersets == ['autotune']:
+
+        ps.n_experiments = args.n_autotune
+        ps.min_test_duration = args.min_test_duration
+
+        for kv in args.autotune_max:
+            k, vmax = kv.split(':')
+            vmax = float(vmax)
+            print("limiting %s to %g" % (k, vmax))
+            pr = ps.add_range(k)
+            values = faiss.vector_to_array(pr.values)
+            values = np.array([v for v in values if v < vmax])
+            faiss.copy_array_to_vector(values, pr.values)
+
+        for kv in args.autotune_range:
+            k, vals = kv.split(':')
+            vals = np.fromstring(vals, sep=',')
+            print("setting %s to %s" % (k, vals))
+            pr = ps.add_range(k)
+            faiss.copy_array_to_vector(vals, pr.values)
+
+        # setup the Criterion object
+        if args.inter:
+            print("Optimize for intersection @ ", args.k)
+            crit = faiss.IntersectionCriterion(nq, args.k)
+        else:
+            print("Optimize for 1-recall @ 1")
+            crit = faiss.OneRecallAtRCriterion(nq, 1)
+
+        # by default, the criterion will request only 1 NN
+        crit.nnn = args.k
+        crit.set_groundtruth(None, gt.astype('int64'))
+
+        # then we let Faiss find the optimal parameters by itself
+        print("exploring operating points, %d threads" % faiss.omp_get_max_threads())
+        ps.display()
+
+        t0 = time.time()
+        op = ps.explore(index, xq, crit)
+        res.t_explore = time.time() - t0
+        print("Done in %.3f s, available OPs:" % res.t_explore)
+
+        op.display()
+
+        print("Re-running evaluation on selected OPs")
+        print(header)
+        opv = op.optimal_pts
+        maxw = max(max(len(opv.at(i).key) for i in range(opv.size())), 40)
+        for i in range(opv.size()):
+            opt = opv.at(i)
+
+            ps.set_index_parameters(index, opt.key)
+
+            print(opt.key.ljust(maxw), end=' ')
+            sys.stdout.flush()
+
+            res_i = eval_setting(index, xq, gt, args.k, args.inter, args.min_test_duration)
+            res.search_results[opt.key] = res_i
+
+    else:
+        print(header)
+        for param in parametersets:
+            print("%-40s " % param, end=' ')
+            sys.stdout.flush()
+            ps.set_index_parameters(index, param)
+
+            res_i = eval_setting(index, xq, gt, args.k, args.inter, args.min_test_duration)
+            res.search_results[param] = res_i


+######################################################
+# Driver function
+######################################################

+def main():
+
+    parser = argparse.ArgumentParser()
+
+    def aa(*args, **kwargs):
+        group.add_argument(*args, **kwargs)
+
+    group = parser.add_argument_group('general options')
+    aa('--nthreads', default=-1, type=int,
+       help='nb of threads to use at train and add time')
+    aa('--json', default=False, action="store_true",
+       help="output stats in JSON format at the end")
+    aa('--todo', default=["check_files"],
+       choices=["train", "add", "search", "check_files"],
+       nargs="+", help='what to do (check_files means decide depending on which index files exist)')
+
+    group = parser.add_argument_group('dataset options')
+    aa('--db', default='deep1M', help='dataset')
+    aa('--compute_gt', default=False, action='store_true',
+       help='compute and store the groundtruth')
+    aa('--force_IP', default=False, action="store_true",
+       help='force IP search instead of L2')
+    aa('--accept_short_gt', default=False, action='store_true',
+       help='work around a problem with Deep1B GT')
+
+    group = parser.add_argument_group('index construction')
+    aa('--indexkey', default='HNSW32', help='index_factory type')
+    aa('--trained_indexfile', default='',
+       help='file to read or write a trained index from')
+    aa('--maxtrain', default=256 * 256, type=int,
+       help='maximum number of training points (0 to set automatically)')
+    aa('--indexfile', default='', help='file to read or write index from')
+    aa('--split', default=[1, 0], type=int, nargs=2, help="database split")
+    aa('--add_bs', default=-1, type=int,
+       help='add elements index by batches of this size')
+
+    group = parser.add_argument_group('IVF options')
+    aa('--by_residual', default=-1, type=int,
+       help="set if index should use residuals (default=unchanged)")
+    aa('--no_precomputed_tables', action='store_true', default=False,
+       help='disable precomputed tables (uses less memory)')
+    aa('--get_centroids_from', default='',
+       help='get the centroids from this index (to speed up training)')
+    aa('--clustering_niter', default=-1, type=int,
+       help='number of clustering iterations (-1 = leave default)')
+    aa('--train_on_gpu', default=False, action='store_true',
+       help='do training on GPU')
+
+    group = parser.add_argument_group('index-specific options')
+    aa('--M0', default=-1, type=int, help='size of base level for HNSW')
+    aa('--RQ_train_default', default=False, action="store_true",
+       help='disable progressive dim training for RQ')
+    aa('--RQ_beam_size', default=-1, type=int,
+       help='set beam size at add time')
+    aa('--LSQ_encode_ils_iters', default=-1, type=int,
+       help='ILS iterations for LSQ')
+    aa('--RQ_use_beam_LUT', default=-1, type=int,
+       help='use beam LUT at add time')
+
+    group = parser.add_argument_group('searching')
+    aa('--k', default=100, type=int, help='nb of nearest neighbors')
+    aa('--inter', default=False, action='store_true',
+       help='use intersection measure instead of 1-recall as metric')
+    aa('--searchthreads', default=-1, type=int,
+       help='nb of threads to use at search time')
+    aa('--searchparams', nargs='+', default=['autotune'],
+       help="search parameters to use (can be autotune or a list of params)")
+    aa('--n_autotune', default=500, type=int,
+       help="max nb of autotune experiments")
+    aa('--autotune_max', default=[], nargs='*',
+       help='set max value for autotune variables format "var:val" (exclusive)')
+    aa('--autotune_range', default=[], nargs='*',
+       help='set complete autotune range, format "var:val1,val2,..."')
+    aa('--min_test_duration', default=3.0, type=float,
+       help='run test at least for so long to avoid jitter')
+    aa('--indexes_to_merge', default=[], nargs="*",
+       help="load these indexes to search and merge them before searching")
+
+    args = parser.parse_args()
+
+    if args.todo == ["check_files"]:
+        if os.path.exists(args.indexfile):
+            args.todo = ["search"]
+        elif os.path.exists(args.trained_indexfile):
+            args.todo = ["add", "search"]
+        else:
+            args.todo = ["train", "add", "search"]
+        print("setting todo to", args.todo)
+
+    print("args:", args)
+
+    os.system('echo -n "nb processors "; '
+              'cat /proc/cpuinfo | grep ^processor | wc -l; '
+              'cat /proc/cpuinfo | grep ^"model name" | tail -1')
+
+    # object to collect results
+    res = argparse.Namespace()
+    res.args = args.__dict__
+
+    res.cpu_model = [
+        l for l in open("/proc/cpuinfo", "r")
+        if "model name" in l][0]
+
+    print("Load dataset")
+
+    ds = datasets.load_dataset(
+        dataset=args.db, compute_gt=args.compute_gt)
+
+    if args.force_IP:
+        ds.metric = "IP"
+
+    print(ds)
+
+    if args.nthreads != -1:
+        print("Set nb of threads to", args.nthreads)
+        faiss.omp_set_num_threads(args.nthreads)
+    else:
+        print("nb threads: ", faiss.omp_get_max_threads())
+
+    index = None
+    if "train" in args.todo:
+        print("================== Training index")
+        index = run_train(args, ds, res)
+        if args.trained_indexfile:
+            print("storing trained index", args.trained_indexfile)
+            faiss.write_index(index, args.trained_indexfile)
+
+    if "add" in args.todo:
+        if not index:
+            assert args.trained_indexfile
+            print("reading trained index", args.trained_indexfile)
+            index = faiss.read_index(args.trained_indexfile)
+
+        print("================== Adding vectors to index")
+        run_add(args, ds, index, res)
+        if args.indexfile:
+            print("storing", args.indexfile)
+            faiss.write_index(index, args.indexfile)
+
+    if "search" in args.todo:
+        if not index:
+            if args.indexfile:
+                print("reading index", args.indexfile)
+                index = faiss.read_index(args.indexfile)
+            elif args.indexes_to_merge:
+                print(f"Merging {len(args.indexes_to_merge)} indexes")
+                sz = 0
+                for fname in args.indexes_to_merge:
+                    print(f"    reading {fname} (current size {sz})")
+                    index_i = faiss.read_index(fname)
+                    if index is None:
+                        index = index_i
+                    else:
+                        index.merge_from(index_i, index.ntotal)
+                    sz = index.ntotal
+            else:
+                assert False, "provide --indexfile"
+
+        print("================== Searching")
+        run_search(args, ds, index, res)
+
+    if args.json:
+        print("JSON results:", json.dumps(res.__dict__))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/thirdparty/faiss/benchs/bench_fw/__init__.py b/thirdparty/faiss/benchs/bench_fw/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/thirdparty/faiss/benchs/bench_fw/benchmark.py b/thirdparty/faiss/benchs/bench_fw/benchmark.py
new file mode 100644
index 000000000..0d7f1d8b0
--- /dev/null
+++ b/thirdparty/faiss/benchs/bench_fw/benchmark.py
@@ -0,0 +1,722 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+from contextlib import contextmanager
+import json
+import logging
+from dataclasses import dataclass
+from multiprocessing.pool import ThreadPool
+from operator import itemgetter
+from statistics import median, mean
+from time import perf_counter
+from typing import Any, List, Optional
+from .descriptors import DatasetDescriptor, IndexDescriptor
+
+import faiss  # @manual=//faiss/python:pyfaiss_gpu
+from faiss.contrib.evaluation import (  # @manual=//faiss/contrib:faiss_contrib_gpu
+    knn_intersection_measure,
+    OperatingPointsWithRanges,
+)
+from faiss.contrib.ivf_tools import (  # @manual=//faiss/contrib:faiss_contrib_gpu
+    add_preassigned,
+)
+
+import numpy as np
+
+from scipy.optimize import curve_fit
+
+logger = logging.getLogger(__name__)
+
+
+@contextmanager
+def timer(name) -> float:
+    logger.info(f"Measuring {name}")
+    t1 = t2 = perf_counter()
+    yield lambda: t2 - t1
+    t2 = perf_counter()
+    logger.info(f"Time for {name}: {t2 - t1:.3f} seconds")
+
+
+def refine_distances_knn(
+    D: np.ndarray, I: np.ndarray, xq: np.ndarray, xb: np.ndarray, metric
+):
+    return np.where(
+        I >= 0,
+        np.square(np.linalg.norm(xq[:, None] - xb[I], axis=2))
+        if metric == faiss.METRIC_L2
+        else np.einsum("qd,qkd->qk", xq, xb[I]),
+        D,
+    )
+
+
+def refine_distances_range(
+    lims: np.ndarray,
+    D: np.ndarray,
+    I: np.ndarray,
+    xq: np.ndarray,
+    xb: np.ndarray,
+    metric,
+):
+    with ThreadPool(32) as pool:
+        R = pool.map(
+            lambda i: (
+                np.sum(np.square(xq[i] - xb[I[lims[i]:lims[i + 1]]]), axis=1)
+                if metric == faiss.METRIC_L2
+                else np.tensordot(
+                    xq[i], xb[I[lims[i]:lims[i + 1]]], axes=(0, 1)
+                )
+            )
+            if lims[i + 1] > lims[i]
+            else [],
+            range(len(lims) - 1),
+        )
+    return np.hstack(R)
+
+
+def range_search_pr_curve(
+    dist_ann: np.ndarray, metric_score: np.ndarray, gt_rsm: float
+):
+    assert dist_ann.shape == metric_score.shape
+    assert dist_ann.ndim == 1
+    sort_by_dist_ann = dist_ann.argsort()
+    dist_ann = dist_ann[sort_by_dist_ann]
+    metric_score = metric_score[sort_by_dist_ann]
+    cum_score = np.cumsum(metric_score)
+    precision = cum_score / np.arange(1, len(cum_score) + 1)
+    recall = cum_score / gt_rsm
+    unique_key = np.round(precision * 100) * 100 + np.round(recall * 100)
+    tbl = np.vstack(
+        [dist_ann, metric_score, cum_score, precision, recall, unique_key]
+    )
+    group_by_dist_max_cum_score = np.empty(len(dist_ann), bool)
+    group_by_dist_max_cum_score[-1] = True
+    group_by_dist_max_cum_score[:-1] = dist_ann[1:] != dist_ann[:-1]
+    tbl = tbl[:, group_by_dist_max_cum_score]
+    _, unique_key_idx = np.unique(tbl[5], return_index=True)
+    dist_ann, metric_score, cum_score, precision, recall, unique_key = tbl[
+        :, np.sort(unique_key_idx)
+    ].tolist()
+    return {
+        "dist_ann": dist_ann,
+        "metric_score_sample": metric_score,
+        "cum_score": cum_score,
+        "precision": precision,
+        "recall": recall,
+        "unique_key": unique_key,
+    }
+
+
+def set_index_parameter(index, name, val):
+    index = faiss.downcast_index(index)
+
+    if isinstance(index, faiss.IndexPreTransform):
+        set_index_parameter(index.index, name, val)
+    elif name.startswith("quantizer_"):
+        index_ivf = faiss.extract_index_ivf(index)
+        set_index_parameter(
+            index_ivf.quantizer, name[name.find("_") + 1:], val
+        )
+    elif name == "efSearch":
+        index.hnsw.efSearch
+        index.hnsw.efSearch = int(val)
+    elif name == "nprobe":
+        index_ivf = faiss.extract_index_ivf(index)
+        index_ivf.nprobe
+        index_ivf.nprobe = int(val)
+    elif name == "noop":
+        pass
+    else:
+        raise RuntimeError(f"could not set param {name} on {index}")
+
+
+def optimizer(codec, search, cost_metric, perf_metric):
+    op = OperatingPointsWithRanges()
+    op.add_range("noop", [0])
+    codec_ivf = faiss.try_extract_index_ivf(codec)
+    if codec_ivf is not None:
+        op.add_range(
+            "nprobe",
+            [2**i for i in range(12) if 2**i < codec_ivf.nlist * 0.1],
+        )
+
+    totex = op.num_experiments()
+    rs = np.random.RandomState(123)
+    if totex > 1:
+        experiments = rs.permutation(totex - 2) + 1
+        experiments = [0, totex - 1] + list(experiments)
+    else:
+        experiments = [0]
+
+    logger.info(f"total nb experiments {totex}, running {len(experiments)}")
+
+    for cno in experiments:
+        key = op.cno_to_key(cno)
+        parameters = op.get_parameters(key)
+
+        (max_perf, min_cost) = op.predict_bounds(key)
+        if not op.is_pareto_optimal(max_perf, min_cost):
+            logger.info(
+                f"{cno=:4d} {str(parameters):50}: SKIP, {max_perf=:.3f} {min_cost=:.3f}",
+            )
+            continue
+
+        logger.info(f"{cno=:4d} {str(parameters):50}: RUN")
+        cost, perf = search(
+            parameters,
+            cost_metric,
+            perf_metric,
+        )
+        logger.info(
+            f"{cno=:4d} {str(parameters):50}: DONE, {cost=:.3f} {perf=:.3f}"
+        )
+        op.add_operating_point(key, perf, cost)
+
+
+def distance_ratio_measure(I, R, D_GT, metric):
+    sum_of_R = np.sum(np.where(I >= 0, R, 0))
+    sum_of_D_GT = np.sum(np.where(I >= 0, D_GT, 0))
+    if metric == faiss.METRIC_INNER_PRODUCT:
+        return (sum_of_R / sum_of_D_GT).item()
+    elif metric == faiss.METRIC_L2:
+        return (sum_of_D_GT / sum_of_R).item()
+    else:
+        raise RuntimeError(f"unknown metric {metric}")
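
For clarity, the three accepted shapes of a range-metric definition handled by the comment and function below, written out as literal values (the numbers are illustrative only):

    range_metric_a = 0.4                        # [0, 0.4) -> 1, [0.4, inf) -> 0
    range_metric_b = [[0.2, 1.0], [0.4, 0.5]]   # [0, 0.2) -> 1.0, [0.2, 0.4) -> 0.5
    range_metric_c = [[0.0, 0.2, 1.0],          # explicit [from, to) bounds
                      [0.2, 0.4, 0.5]]          # per score band
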
+# range_metric possible values:
+#
+# radius
+#    [0..radius) -> 1
+#    [radius..inf) -> 0
+#
+# [[radius1, score1], ...]
+#    [0..radius1) -> score1
+#    [radius1..radius2) -> score2
+#
+# [[radius1_from, radius1_to, score1], ...]
+#    [radius1_from, radius1_to) -> score1,
+#    [radius2_from, radius2_to) -> score2
+def get_range_search_metric_function(range_metric, D, R):
+    if D is not None:
+        assert R is not None
+        assert D.shape == R.shape
+    if isinstance(range_metric, list):
+        aradius, ascore, aradius_from, aradius_to = [], [], [], []
+        radius_to = 0
+        for rsd in range_metric:
+            assert isinstance(rsd, list)
+            if len(rsd) == 3:
+                radius_from, radius_to, score = rsd
+            elif len(rsd) == 2:
+                radius_from = radius_to
+                radius_to, score = rsd
+            else:
+                raise AssertionError(f"invalid range definition {rsd}")
+            # radius_from and radius_to are compressed distances,
+            # we need to convert them to real embedding distances.
+            if D is not None:
+                sample_idxs = np.argwhere((D <= radius_to) & (D > radius_from))
+                assert len(sample_idxs) > 0
+                real_radius = np.mean(R[sample_idxs]).item()
+            else:
+                real_radius = mean([radius_from, radius_to])
+            logger.info(
+                f"range_search_metric_function {radius_from=} {radius_to=} {real_radius=} {score=}"
+            )
+            aradius.append(real_radius)
+            ascore.append(score)
+            aradius_from.append(radius_from)
+            aradius_to.append(radius_to)
+
+        def sigmoid(x, a, b, c):
+            return a / (1 + np.exp(b * x - c))
+
+        cutoff = max(aradius)
+        popt, _ = curve_fit(sigmoid, aradius, ascore, [1, 5, 5])
+
+        for r in np.arange(0, cutoff + 0.05, 0.05):
+            logger.info(
+                f"range_search_metric_function {r=} {sigmoid(r, *popt)=}"
+            )
+
+        assert isinstance(cutoff, float)
+        return (
+            cutoff,
+            lambda x: np.where(x < cutoff, sigmoid(x, *popt), 0),
+            popt.tolist(),
+            list(zip(aradius, ascore, aradius_from, aradius_to, strict=True))
+        )
+    else:
+        # Assuming that the range_metric is a float,
+        # so the range is [0..range_metric).
+        # D is the result of a range_search with a radius of range_metric,
+        # but both range_metric and D may be compressed distances.
+        # We approximate the real embedding distance as max(R).
+        if R is not None:
+            real_range = np.max(R).item()
+        else:
+            real_range = range_metric
+        logger.info(
+            f"range_search_metric_function {range_metric=} {real_range=}"
+        )
+        assert isinstance(real_range, float)
+        return real_range * 2, lambda x: np.where(x < real_range, 1, 0), [], []
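
A quick check of the single-radius branch above, assuming the function is in scope: with a plain float range_metric and no (D, R) distance pairs, the returned score function is a hard 0/1 cutoff and the suggested search radius is twice the cutoff.

    import numpy as np

    cutoff, score_fn, popt, training = get_range_search_metric_function(0.5, None, None)
    print(cutoff)                                 # 1.0, i.e. 2 * 0.5
    print(score_fn(np.array([0.2, 0.5, 0.9])))    # [1 0 0]
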
+
+
+@dataclass
+class Benchmark:
+    training_vectors: Optional[DatasetDescriptor] = None
+    db_vectors: Optional[DatasetDescriptor] = None
+    query_vectors: Optional[DatasetDescriptor] = None
+    index_descs: Optional[List[IndexDescriptor]] = None
+    range_ref_index_desc: Optional[str] = None
+    k: Optional[int] = None
+    distance_metric: str = "METRIC_L2"
+
+    def __post_init__(self):
+        if self.distance_metric == "METRIC_INNER_PRODUCT":
+            self.distance_metric_type = faiss.METRIC_INNER_PRODUCT
+        elif self.distance_metric == "METRIC_L2":
+            self.distance_metric_type = faiss.METRIC_L2
+        else:
+            raise ValueError
+        self.cached_index_key = None
+
+    def set_io(self, benchmark_io):
+        self.io = benchmark_io
+        self.io.distance_metric = self.distance_metric
+        self.io.distance_metric_type = self.distance_metric_type
+
+    def get_index_desc(self, factory: str) -> Optional[IndexDescriptor]:
+        for desc in self.index_descs:
+            if desc.factory == factory:
+                return desc
+        return None
+
+    def get_index(self, index_desc: IndexDescriptor):
+        if self.cached_index_key != index_desc.factory:
+            xb = self.io.get_dataset(self.db_vectors)
+            index = faiss.clone_index(
+                self.io.get_codec(index_desc, xb.shape[1])
+            )
+            assert index.ntotal == 0
+            logger.info("Adding vectors to index")
+            index_ivf = faiss.try_extract_index_ivf(index)
+            if index_ivf is not None:
+                QD, QI, _, QP = self.knn_search(
+                    index_desc,
+                    parameters=None,
+                    db_vectors=None,
+                    query_vectors=self.db_vectors,
+                    k=1,
+                    index=index_ivf.quantizer,
+                    level=1,
+                )
+                print(f"{QI.ravel().shape=}")
+                add_preassigned(index_ivf, xb, QI.ravel())
+            else:
+                index.add(xb)
+            assert index.ntotal == xb.shape[0]
+            logger.info("Added vectors to index")
+            self.cached_index_key = index_desc.factory
+            self.cached_index = index
+        return self.cached_index
+
+    def range_search_reference(self, index_desc, range_metric):
+        logger.info("range_search_reference: begin")
+        if isinstance(range_metric, list):
+            assert len(range_metric) > 0
+            ri = len(range_metric[0]) - 1
+            m_radius = (
+                max(rm[ri] for rm in range_metric)
+                if self.distance_metric_type == faiss.METRIC_L2
+                else min(rm[ri] for rm in range_metric)
+            )
+        else:
+            m_radius = range_metric
+
+        lims, D, I, R, P = self.range_search(
+            index_desc,
+            index_desc.parameters,
+            radius=m_radius,
+        )
+        flat = index_desc.factory == "Flat"
+        (
+            gt_radius,
+            range_search_metric_function,
+            coefficients,
+            coefficients_training_data,
+        ) = get_range_search_metric_function(
+            range_metric,
+            D if not flat else None,
+            R if not flat else None,
+        )
+        logger.info("range_search_reference: end")
+        return gt_radius, range_search_metric_function, coefficients, coefficients_training_data
+
+    def estimate_range(self, index_desc, parameters, range_scoring_radius):
+        D, I, R, P = self.knn_search(
+            index_desc, parameters, self.db_vectors, self.query_vectors
+        )
+        samples = []
+        for i, j in np.argwhere(R < range_scoring_radius):
+            samples.append((R[i, j].item(), D[i, j].item()))
+        samples.sort(key=itemgetter(0))
+        return median(r for _, r in samples[-3:])
+
+    def range_search(
+        self,
+        index_desc: IndexDescriptor,
+        parameters: Optional[dict[str, int]],
+        radius: Optional[float] = None,
+        gt_radius: Optional[float] = None,
+    ):
+        logger.info("range_search: begin")
+        flat = index_desc.factory == "Flat"
+        if radius is None:
+            assert gt_radius is not None
+            radius = (
+                gt_radius
+                if flat
+                else self.estimate_range(index_desc, parameters, gt_radius)
+            )
+        logger.info(f"Radius={radius}")
+        filename = self.io.get_filename_range_search(
+            factory=index_desc.factory,
+            parameters=parameters,
+            level=0,
+            db_vectors=self.db_vectors,
+            query_vectors=self.query_vectors,
+            r=radius,
+        )
+        if self.io.file_exist(filename):
+            logger.info(f"Using cached results for {index_desc.factory}")
+            lims, D, I, R, P = self.io.read_file(
+                filename, ["lims", "D", "I", "R", "P"]
+            )
+        else:
+            xq = self.io.get_dataset(self.query_vectors)
+            index = self.get_index(index_desc)
+            if parameters:
+                for name, val in parameters.items():
+                    set_index_parameter(index, name, val)
+
+            index_ivf = faiss.try_extract_index_ivf(index)
+            if index_ivf is not None:
+                QD, QI, _, QP = self.knn_search(
+                    index_desc,
+                    parameters=None,
+                    db_vectors=None,
+                    query_vectors=self.query_vectors,
+                    k=index.nprobe,
+                    index=index_ivf.quantizer,
+                    level=1,
+                )
+                # QD = QD[:, :index.nprobe]
+                # QI = QI[:, :index.nprobe]
+                faiss.cvar.indexIVF_stats.reset()
+                with timer("range_search_preassigned") as t:
+                    lims, D, I = index.range_search_preassigned(xq, radius, QI, QD)
+            else:
+                with timer("range_search") as t:
+                    lims, D, I = index.range_search(xq, radius)
+            if flat:
+                R = D
+            else:
+                xb = self.io.get_dataset(self.db_vectors)
+                R = refine_distances_range(
+                    lims, D, I, xq, xb, self.distance_metric_type
+                )
+            P = {
+                "time": t(),
+                "radius": radius,
+                "count": lims[-1].item(),
+                "parameters": parameters,
+                "index": index_desc.factory,
+            }
+            if index_ivf is not None:
+                stats = faiss.cvar.indexIVF_stats
+                P |= {
+                    "quantizer": QP,
+                    "nq": stats.nq,
+                    "nlist": stats.nlist,
+                    "ndis": stats.ndis,
+                    "nheap_updates": stats.nheap_updates,
+                    "quantization_time": stats.quantization_time,
+                    "search_time": stats.search_time,
+                }
+            self.io.write_file(
+                filename, ["lims", "D", "I", "R", "P"], [lims, D, I, R, P]
+            )
+        logger.info("range_search: end")
+        return lims, D, I, R, P
+
+    def range_ground_truth(self, gt_radius, range_search_metric_function):
+        logger.info("range_ground_truth: begin")
+        flat_desc = self.get_index_desc("Flat")
+        lims, D, I, R, P = self.range_search(
+            flat_desc,
+            flat_desc.parameters,
+            radius=gt_radius,
+        )
+        gt_rsm = np.sum(range_search_metric_function(R)).item()
+        logger.info("range_ground_truth: end")
+        return gt_rsm
+
+    def range_search_benchmark(
+        self,
+        results: dict[str, Any],
+        index_desc: IndexDescriptor,
+        metric_key: str,
+        gt_radius: float,
+        range_search_metric_function,
+        gt_rsm: float,
+    ):
+        logger.info(f"range_search_benchmark: begin {index_desc.factory=}")
+        xq = self.io.get_dataset(self.query_vectors)
+        (nq, d) = xq.shape
+        logger.info(
+            f"Searching {index_desc.factory} with {nq} vectors of dimension {d}"
+        )
+        codec = self.io.get_codec(index_desc, d)
+        faiss.omp_set_num_threads(16)
+
+        def experiment(parameters, cost_metric, perf_metric):
+            nonlocal results
+            key = self.io.get_filename_evaluation_name(
+                factory=index_desc.factory,
+                parameters=parameters,
+                level=0,
+                db_vectors=self.db_vectors,
+                query_vectors=self.query_vectors,
+                evaluation_name=metric_key,
+            )
+            if key in results["experiments"]:
+                metrics = results["experiments"][key]
+            else:
+                lims, D, I, R, P = self.range_search(
+                    index_desc, parameters, gt_radius=gt_radius
+                )
+                range_search_metric = range_search_metric_function(R)
+                range_search_pr = range_search_pr_curve(
+                    D, range_search_metric, gt_rsm
+                )
+                range_score_sum = np.sum(range_search_metric).item()
+                metrics = P | {
+                    "range_score_sum": range_score_sum,
+                    "range_score_max_recall": range_score_sum / gt_rsm,
+                    "range_search_pr": range_search_pr,
+                }
+                results["experiments"][key] = metrics
+            return metrics[cost_metric], metrics[perf_metric]
+
+        for cost_metric in ["time"]:
+            for perf_metric in ["range_score_max_recall"]:
+                optimizer(
+                    codec,
+                    experiment,
+                    cost_metric,
+                    perf_metric,
+                )
+        logger.info("range_search_benchmark: end")
+        return results
+
+    def knn_search(
+        self,
+        index_desc: IndexDescriptor,
+        parameters: Optional[dict[str, int]],
+        db_vectors: Optional[DatasetDescriptor],
+        query_vectors: DatasetDescriptor,
+        k: Optional[int] = None,
+        index: Optional[faiss.Index] = None,
+        level: int = 0,
+    ):
+        assert level >= 0
+        if level == 0:
+            assert index is None
+            assert db_vectors is not None
+        else:
+            assert index is not None  # quantizer
+            assert db_vectors is None
+        logger.info("knn_search: begin")
+        k = k if k is not None else self.k
+        flat = index_desc.factory == "Flat"
+        filename = self.io.get_filename_knn_search(
+            factory=index_desc.factory,
+            parameters=parameters,
+            level=level,
+            db_vectors=db_vectors,
+            query_vectors=query_vectors,
+            k=k,
+        )
+        if self.io.file_exist(filename):
+            logger.info(f"Using cached results for {index_desc.factory}")
+            D, I, R, P = self.io.read_file(filename, ["D", "I", "R", "P"])
+        else:
+            xq = self.io.get_dataset(query_vectors)
+            if index is None:
+                index = self.get_index(index_desc)
+                if parameters:
+                    for name, val in parameters.items():
+                        set_index_parameter(index, name, val)
+
+            index_ivf = faiss.try_extract_index_ivf(index)
+            if index_ivf is not None:
+                QD, QI, _, QP = self.knn_search(
+                    index_desc,
+                    parameters=None,
+                    db_vectors=None,
+                    query_vectors=query_vectors,
+                    k=index.nprobe,
+                    index=index_ivf.quantizer,
+                    level=level + 1,
+                )
+                # QD = QD[:, :index.nprobe]
+                # QI = QI[:, :index.nprobe]
+                faiss.cvar.indexIVF_stats.reset()
+                with timer("knn search_preassigned") as t:
+                    D, I = index.search_preassigned(xq, k, QI, QD)
+            else:
+                with timer("knn search") as t:
+                    D, I = index.search(xq, k)
+            if flat or level > 0:
+                R = D
+            else:
+                xb = self.io.get_dataset(db_vectors)
+                R = refine_distances_knn(
+                    D, I, xq, xb, self.distance_metric_type
+                )
+            P = {
+                "time": t(),
+                "parameters": parameters,
+                "index": index_desc.factory,
+                "level": level,
+            }
+            if index_ivf is not None:
+                stats = faiss.cvar.indexIVF_stats
+                P |= {
+                    "quantizer": QP,
+                    "nq": stats.nq,
+                    "nlist": stats.nlist,
+                    "ndis": stats.ndis,
+                    "nheap_updates": stats.nheap_updates,
+                    "quantization_time": stats.quantization_time,
+                    "search_time": stats.search_time,
+                }
+            self.io.write_file(filename, ["D", "I", "R", "P"], [D, I, R, P])
+        logger.info("knn_search: end")
+        return D, I, R, P
+
+    def knn_ground_truth(self):
+        logger.info("knn_ground_truth: begin")
+        flat_desc = self.get_index_desc("Flat")
+        self.gt_knn_D, self.gt_knn_I, _, _ = self.knn_search(
+            flat_desc,
+            flat_desc.parameters,
+            self.db_vectors,
+            self.query_vectors,
+        )
+        logger.info("knn_ground_truth: end")
+
+    def knn_search_benchmark(
+        self, results: dict[str, Any], index_desc: IndexDescriptor
+    ):
+        logger.info(f"knn_search_benchmark: begin {index_desc.factory=}")
+        xq = self.io.get_dataset(self.query_vectors)
+        (nq, d) = xq.shape
+        logger.info(
+            f"Searching {index_desc.factory} with {nq} vectors of dimension {d}"
+        )
+        codec = self.io.get_codec(index_desc, d)
+        codec_ivf = faiss.try_extract_index_ivf(codec)
+        if codec_ivf is not None:
+            results["indices"][index_desc.factory] = {"nlist": codec_ivf.nlist}
+
+        faiss.omp_set_num_threads(16)
+
+        def experiment(parameters, cost_metric, perf_metric):
+            nonlocal results
+            key = self.io.get_filename_evaluation_name(
+                factory=index_desc.factory,
+                parameters=parameters,
+                level=0,
+                db_vectors=self.db_vectors,
+                query_vectors=self.query_vectors,
+                evaluation_name="knn",
+            )
+            if key in results["experiments"]:
+                metrics = results["experiments"][key]
+            else:
+                D, I, R, P = self.knn_search(
+                    index_desc, parameters, self.db_vectors, self.query_vectors
+                )
+                metrics = P | {
+                    "knn_intersection": knn_intersection_measure(
+                        I, self.gt_knn_I
+                    ),
+                    "distance_ratio": distance_ratio_measure(
+                        I, R, self.gt_knn_D, self.distance_metric_type
+                    ),
+                }
+                results["experiments"][key] = metrics
+            return metrics[cost_metric], metrics[perf_metric]
+
+        for cost_metric in ["time"]:
+            for perf_metric in ["knn_intersection", "distance_ratio"]:
+                optimizer(
+                    codec,
+                    experiment,
+                    cost_metric,
+                    perf_metric,
+                )
+        logger.info("knn_search_benchmark: end")
+        return results
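
knn_intersection_measure (imported at the top of this file from faiss.contrib.evaluation) is the average fraction of ids shared between two k-NN result tables, which is what knn_search_benchmark() reports as "knn_intersection". A tiny self-contained check:

    import numpy as np
    from faiss.contrib.evaluation import knn_intersection_measure

    I_ref = np.array([[1, 2, 3], [4, 5, 6]])
    I_ann = np.array([[1, 2, 9], [6, 5, 4]])
    print(knn_intersection_measure(I_ann, I_ref))  # 5 shared ids / 6 = 0.8333...
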
+
+    def benchmark(self) -> str:
+        logger.info("begin evaluate")
+        results = {"indices": {}, "experiments": {}}
+        if self.get_index_desc("Flat") is None:
+            self.index_descs.append(IndexDescriptor(factory="Flat"))
+
+        self.knn_ground_truth()
+        for index_desc in self.index_descs:
+            results = self.knn_search_benchmark(
+                results=results,
+                index_desc=index_desc,
+            )
+
+        if self.range_ref_index_desc is not None:
+            index_desc = self.get_index_desc(self.range_ref_index_desc)
+            if index_desc is None:
+                raise ValueError(
+                    f"Unknown range index {self.range_ref_index_desc}"
+                )
+            if index_desc.range_metrics is None:
+                raise ValueError(
+                    f"Range index {index_desc.factory} has no radius_score"
+                )
+            results["metrics"] = {}
+            for metric_key, range_metric in index_desc.range_metrics.items():
+                (
+                    gt_radius,
+                    range_search_metric_function,
+                    coefficients,
+                    coefficients_training_data,
+                ) = self.range_search_reference(index_desc, range_metric)
+                results["metrics"][metric_key] = {
+                    "coefficients": coefficients,
+                    "training_data": coefficients_training_data,
+                }
+                gt_rsm = self.range_ground_truth(
+                    gt_radius, range_search_metric_function
+                )
+                for index_desc in self.index_descs:
+                    results = self.range_search_benchmark(
+                        results=results,
+                        index_desc=index_desc,
+                        metric_key=metric_key,
+                        gt_radius=gt_radius,
+                        range_search_metric_function=range_search_metric_function,
+                        gt_rsm=gt_rsm,
+                    )
+        self.io.write_json(results, "result.json", overwrite=True)
+        logger.info("end evaluate")
+        return json.dumps(results)
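
A hypothetical end-to-end driver for the framework above (the module import path, file names and factory string are assumptions for illustration; nothing here is mandated by the patch):

    from bench_fw.benchmark import Benchmark
    from bench_fw.benchmark_io import BenchmarkIO
    from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor

    benchmark = Benchmark(
        db_vectors=DatasetDescriptor(tablename="db.npy"),
        query_vectors=DatasetDescriptor(tablename="queries.npy"),
        index_descs=[
            IndexDescriptor(factory="IVF1024,Flat", path="ivf1024.codec"),
        ],
        k=10,
        distance_metric="METRIC_L2",
    )
    benchmark.set_io(BenchmarkIO(path="/tmp/bench_fw"))
    print(benchmark.benchmark())   # JSON string, also written to result.json

A "Flat" descriptor is appended automatically to produce the exact ground truth, and every search result is cached through the BenchmarkIO layer defined next.
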
diff --git a/thirdparty/faiss/benchs/bench_fw/benchmark_io.py b/thirdparty/faiss/benchs/bench_fw/benchmark_io.py
new file mode 100644
index 000000000..30fda9c72
--- /dev/null
+++ b/thirdparty/faiss/benchs/bench_fw/benchmark_io.py
@@ -0,0 +1,246 @@
+import io
+import json
+import logging
+import os
+from dataclasses import dataclass
+from typing import Any, List, Optional
+from zipfile import ZipFile
+
+import faiss  # @manual=//faiss/python:pyfaiss_gpu
+
+import numpy as np
+
+from .descriptors import DatasetDescriptor, IndexDescriptor
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class BenchmarkIO:
+    path: str
+
+    def __post_init__(self):
+        self.cached_ds = {}
+        self.cached_codec_key = None
+
+    def get_filename_search(
+        self,
+        factory: str,
+        parameters: Optional[dict[str, int]],
+        level: int,
+        db_vectors: DatasetDescriptor,
+        query_vectors: DatasetDescriptor,
+        k: Optional[int] = None,
+        r: Optional[float] = None,
+        evaluation_name: Optional[str] = None,
+    ):
+        assert factory is not None
+        assert level is not None
+        assert self.distance_metric is not None
+        assert query_vectors is not None
+        filename = f"{factory.lower().replace(',', '_')}."
+        if level > 0:
+            filename += f"l_{level}."
+        if db_vectors is not None:
+            filename += db_vectors.get_filename("d")
+        filename += query_vectors.get_filename("q")
+        filename += self.distance_metric.upper() + "."
+        if k is not None:
+            filename += f"k_{k}."
+        if r is not None:
+            filename += f"r_{int(r * 1000)}."
+        if parameters is not None:
+            for name, val in parameters.items():
+                if name != "noop":
+                    filename += f"{name}_{val}."
+        if evaluation_name is None:
+            filename += "zip"
+        else:
+            filename += evaluation_name
+        return filename
+
+    def get_filename_knn_search(
+        self,
+        factory: str,
+        parameters: Optional[dict[str, int]],
+        level: int,
+        db_vectors: DatasetDescriptor,
+        query_vectors: DatasetDescriptor,
+        k: int,
+    ):
+        assert k is not None
+        return self.get_filename_search(
+            factory=factory,
+            parameters=parameters,
+            level=level,
+            db_vectors=db_vectors,
+            query_vectors=query_vectors,
+            k=k,
+        )
+
+    def get_filename_range_search(
+        self,
+        factory: str,
+        parameters: Optional[dict[str, int]],
+        level: int,
+        db_vectors: DatasetDescriptor,
+        query_vectors: DatasetDescriptor,
+        r: float,
+    ):
+        assert r is not None
+        return self.get_filename_search(
+            factory=factory,
+            parameters=parameters,
+            level=level,
+            db_vectors=db_vectors,
+            query_vectors=query_vectors,
+            r=r,
+        )
+
+    def get_filename_evaluation_name(
+        self,
+        factory: str,
+        parameters: Optional[dict[str, int]],
+        level: int,
+        db_vectors: DatasetDescriptor,
+        query_vectors: DatasetDescriptor,
+        evaluation_name: str,
+    ):
+        assert evaluation_name is not None
+        return self.get_filename_search(
+            factory=factory,
+            parameters=parameters,
+            level=level,
+            db_vectors=db_vectors,
+            query_vectors=query_vectors,
+            evaluation_name=evaluation_name,
+        )
+
+    def get_local_filename(self, filename):
+        return os.path.join(self.path, filename)
+
+    def download_file_from_blobstore(
+        self,
+        filename: str,
+        bucket: Optional[str] = None,
+        path: Optional[str] = None,
+    ):
+        return self.get_local_filename(filename)
+
+    def upload_file_to_blobstore(
+        self,
+        filename: str,
+        bucket: Optional[str] = None,
+        path: Optional[str] = None,
+        overwrite: bool = False,
+    ):
+        pass
+
+    def file_exist(self, filename: str):
+        fn = self.get_local_filename(filename)
+        exists = os.path.exists(fn)
+        logger.info(f"{filename} {exists=}")
+        return exists
+
+    def get_codec(self, index_desc: IndexDescriptor, d: int):
+        if index_desc.factory == "Flat":
+            return faiss.IndexFlat(d, self.distance_metric_type)
+        else:
+            if self.cached_codec_key != index_desc.factory:
+                codec = faiss.read_index(
+                    self.get_local_filename(index_desc.path)
+                )
+                assert (
+                    codec.metric_type == self.distance_metric_type
+                ), f"{codec.metric_type=} != {self.distance_metric_type=}"
+                logger.info(f"Loaded codec from {index_desc.path}")
+                self.cached_codec_key = index_desc.factory
+                self.cached_codec = codec
+            return self.cached_codec
+
+    def read_file(self, filename: str, keys: List[str]):
+        fn = self.download_file_from_blobstore(filename)
+        logger.info(f"Loading file {fn}")
+        results = []
+        with ZipFile(fn, "r") as zip_file:
+            for key in keys:
+                with zip_file.open(key, "r") as f:
+                    if key in ["D", "I", "R", "lims"]:
+                        results.append(np.load(f))
+                    elif key in ["P"]:
+                        t = io.TextIOWrapper(f)
+                        results.append(json.load(t))
+                    else:
+                        raise AssertionError()
+        return results
+
+    def write_file(
+        self,
+        filename: str,
+        keys: List[str],
+        values: List[Any],
+        overwrite: bool = False,
+    ):
+        fn = self.get_local_filename(filename)
+        with ZipFile(fn, "w") as zip_file:
+            for key, value in zip(keys, values, strict=True):
+                with zip_file.open(key, "w") as f:
+                    if key in ["D", "I", "R", "lims"]:
+                        np.save(f, value)
+                    elif key in ["P"]:
+                        t = io.TextIOWrapper(f, write_through=True)
+                        json.dump(value, t)
+                    else:
+                        raise AssertionError()
+        self.upload_file_to_blobstore(filename, overwrite=overwrite)
+
+    def get_dataset(self, dataset):
+        if dataset not in self.cached_ds:
+            self.cached_ds[dataset] = self.read_nparray(
+                os.path.join(self.path, dataset.tablename)
+            )
+        return self.cached_ds[dataset]
+
+    def read_nparray(
+        self,
+        filename: str,
+    ):
+        fn = self.download_file_from_blobstore(filename)
+        logger.info(f"Loading nparray from {fn}")
+        nparray = np.load(fn)
+        logger.info(f"Loaded nparray {nparray.shape} from {fn}")
+        return nparray
+
+    def write_nparray(
+        self,
+        nparray: np.ndarray,
+        filename: str,
+    ):
+        fn = self.get_local_filename(filename)
+        logger.info(f"Saving nparray {nparray.shape} to {fn}")
+        np.save(fn, nparray)
+        self.upload_file_to_blobstore(filename)
+
+    def read_json(
+        self,
+        filename: str,
+    ):
+        fn = self.download_file_from_blobstore(filename)
+        logger.info(f"Loading json {fn}")
+        with open(fn, "r") as fp:
+            json_dict = json.load(fp)
+        logger.info(f"Loaded json {json_dict} from {fn}")
+        return json_dict
+
+    def write_json(
+        self,
+        json_dict: dict[str, Any],
+        filename: str,
+        overwrite: bool = False,
+    ):
+        fn = self.get_local_filename(filename)
+        logger.info(f"Saving json {json_dict} to {fn}")
+        with open(fn, "w") as fp:
+            json.dump(json_dict, fp)
+        self.upload_file_to_blobstore(filename, overwrite=overwrite)
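
A sketch of a round trip through the zip-based cache above (the path is made up; note that BenchmarkIO does not create its directory itself):

    import os
    import numpy as np
    from bench_fw.benchmark_io import BenchmarkIO

    os.makedirs("/tmp/bench_fw", exist_ok=True)
    io = BenchmarkIO(path="/tmp/bench_fw")

    D = np.random.rand(4, 10).astype('float32')
    I = np.random.randint(100, size=(4, 10))
    P = {"time": 0.1, "parameters": {"nprobe": 8}}

    io.write_file("example.zip", ["D", "I", "P"], [D, I, P])
    D2, I2, P2 = io.read_file("example.zip", ["D", "I", "P"])
    assert (D2 == D).all() and (I2 == I).all() and P2 == P

Arrays travel as .npy members inside the zip and the "P" metadata dict as JSON, mirroring what read_file()/write_file() dispatch on.
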
diff --git a/thirdparty/faiss/benchs/bench_fw/descriptors.py b/thirdparty/faiss/benchs/bench_fw/descriptors.py
new file mode 100644
index 000000000..0268ec328
--- /dev/null
+++ b/thirdparty/faiss/benchs/bench_fw/descriptors.py
@@ -0,0 +1,55 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+from dataclasses import dataclass
+from typing import Any, List, Optional
+
+
+@dataclass
+class IndexDescriptor:
+    factory: str
+    bucket: Optional[str] = None
+    path: Optional[str] = None
+    parameters: Optional[dict[str, int]] = None
+    # range metric definitions
+    # key: name
+    # value: one of the following:
+    #
+    # radius
+    #    [0..radius) -> 1
+    #    [radius..inf) -> 0
+    #
+    # [[radius1, score1], ...]
+    #    [0..radius1) -> score1
+    #    [radius1..radius2) -> score2
+    #
+    # [[radius1_from, radius1_to, score1], ...]
+    #    [radius1_from, radius1_to) -> score1,
+    #    [radius2_from, radius2_to) -> score2
+    range_metrics: Optional[dict[str, Any]] = None
+
+
+@dataclass
+class DatasetDescriptor:
+    namespace: Optional[str] = None
+    tablename: Optional[str] = None
+    partitions: Optional[List[str]] = None
+    num_vectors: Optional[int] = None
+
+    def __hash__(self):
+        return hash(self.get_filename())
+
+    def get_filename(
+        self,
+        prefix: str = "v",
+    ) -> str:
+        filename = prefix + "_"
+        if self.namespace is not None:
+            filename += self.namespace + "_"
+        assert self.tablename is not None
+        filename += self.tablename
+        if self.partitions is not None:
+            filename += "_" + "_".join(self.partitions).replace("=", "_")
+        if self.num_vectors is not None:
+            filename += f"_{self.num_vectors}"
+        filename += "."
+        return filename
diff --git a/thirdparty/faiss/benchs/bench_hybrid_cpu_gpu.py b/thirdparty/faiss/benchs/bench_hybrid_cpu_gpu.py
index 8a509f323..779a09fef 100644
--- a/thirdparty/faiss/benchs/bench_hybrid_cpu_gpu.py
+++ b/thirdparty/faiss/benchs/bench_hybrid_cpu_gpu.py
@@ -530,14 +530,7 @@ def aa(*args, **kwargs):
         raise RuntimeError()

     totex = op.num_experiments()
-    rs = np.random.RandomState(123)
-    if totex < args.n_autotune:
-        experiments = rs.permutation(totex - 2) + 1
-    else:
-        experiments = rs.randint(
-            totex - 2, size=args.n_autotune - 2, replace=False)
-
-    experiments = [0, totex - 1] + list(experiments)
+    experiments = op.sample_experiments(args.n_autotune)

     print(f"total nb experiments {totex}, running {len(experiments)}")
     print("perform search")
diff --git a/thirdparty/faiss/conda/faiss-gpu-raft/meta.yaml b/thirdparty/faiss/conda/faiss-gpu-raft/meta.yaml
index 14a5c606b..387f8b4ac 100644
--- a/thirdparty/faiss/conda/faiss-gpu-raft/meta.yaml
+++ b/thirdparty/faiss/conda/faiss-gpu-raft/meta.yaml
@@ -47,13 +47,13 @@ outputs:
      host:
        - mkl =2023  # [x86_64]
        - openblas  # [not x86_64]
-       - cudatoolkit {{ cudatoolkit }}
-       - libraft =23.08
+       - cuda-version {{ cudatoolkit }}
+       - libraft =23.12
      run:
        - mkl =2023  # [x86_64]
        - openblas  # [not x86_64]
-       - {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
-       - libraft =23.08
+       - {{ pin_compatible('cuda-version', max_pin='x') }}
+       - libraft =23.12
    test:
      requires:
        - conda-build
@@ -90,6 +90,8 @@ outputs:
        - numpy
        - scipy
        - pytorch
+       - pytorch-cuda =11.8
+       - cuda-version =11.8
      commands:
        - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*"
        - python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*"
diff --git a/thirdparty/faiss/conda/faiss-gpu/meta.yaml b/thirdparty/faiss/conda/faiss-gpu/meta.yaml
index fcfd3b4bd..e7b839975 100644
--- a/thirdparty/faiss/conda/faiss-gpu/meta.yaml
+++ b/thirdparty/faiss/conda/faiss-gpu/meta.yaml
@@ -51,7 +51,7 @@ outputs:
      run:
        - mkl =2023  # [x86_64]
        - openblas  # [not x86_64]
-       - {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
+       - {{ pin_compatible('cudatoolkit', max_pin='x') }}
    test:
      requires:
        - conda-build
@@ -88,6 +88,8 @@ outputs:
        - numpy
        - scipy
        - pytorch
+       - pytorch-cuda =11.8
+       - cudatoolkit =11.8
      commands:
        - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*"
        - python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*"
diff --git a/thirdparty/faiss/contrib/evaluation.py b/thirdparty/faiss/contrib/evaluation.py
index 1f4068734..50e8a9331 100644
--- a/thirdparty/faiss/contrib/evaluation.py
+++ b/thirdparty/faiss/contrib/evaluation.py
@@ -380,7 +380,23 @@ def do_nothing_key(self):
         return np.zeros(len(self.ranges), dtype=int)

     def num_experiments(self):
-        return np.prod([len(values) for name, values in self.ranges])
+        return int(np.prod([len(values) for name, values in self.ranges]))
+
+    def sample_experiments(self, n_autotune, rs=np.random):
+        """ sample a set of experiments of max size n_autotune
+        (run all experiments in random order if n_autotune is 0)
+        """
+        assert n_autotune == 0 or n_autotune >= 2
+        totex = self.num_experiments()
+        if n_autotune == 0 or totex < n_autotune:
+            experiments = rs.permutation(totex - 2)
+        else:
+            experiments = rs.choice(
+                totex - 2, size=n_autotune - 2, replace=False)
+
+        experiments = [0, totex - 1] + [int(cno) + 1 for cno in experiments]
+        return experiments

     def cno_to_key(self, cno):
         """Convert a sequential experiment number to a key"""
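
To see how the new sample_experiments() is meant to be called (this is what bench_hybrid_cpu_gpu.py now uses instead of sampling inline), a small sketch; the parameter names and value ranges are arbitrary:

    import numpy as np
    from faiss.contrib.evaluation import OperatingPointsWithRanges

    op = OperatingPointsWithRanges()
    op.add_range("nprobe", [1, 2, 4, 8, 16, 32])
    op.add_range("quantizer_efSearch", [4, 8, 16])

    # 6 * 3 = 18 experiments; keep the first and last, sample 8 more
    experiments = op.sample_experiments(10, rs=np.random.RandomState(42))
    print(len(experiments), experiments[:2])   # 10 [0, 17]
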
diff --git a/thirdparty/faiss/faiss/Index.h b/thirdparty/faiss/faiss/Index.h
index ba9b6ba7d..c9b5e3022 100644
--- a/thirdparty/faiss/faiss/Index.h
+++ b/thirdparty/faiss/faiss/Index.h
@@ -16,9 +16,6 @@
 #include
 #include

-// #include
-// using knowhere::BitsetView;
-
 #define FAISS_VERSION_MAJOR 1
 #define FAISS_VERSION_MINOR 7
 #define FAISS_VERSION_PATCH 4
@@ -53,9 +50,6 @@ struct DistanceComputer;
  * Ownership of the object fields is always to the caller.
  */
 struct SearchParameters {
-    // BitsetView bitset = nullptr;
-
-    // // Disabled for Knowhere.
     /// if non-null, only these IDs will be considered during search.
     IDSelector* sel = nullptr;
@@ -109,6 +103,7 @@ struct Index {
      * Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
      * This function slices the input vectors in chunks smaller than
      * blocksize_add and calls add_core.
+     * @param n number of vectors
      * @param x input matrix, size n * d
      */
     virtual void add(idx_t n, const float* x) = 0;
@@ -118,7 +113,9 @@ struct Index {
      * The default implementation fails with an assertion, as it is
      * not supported by all indexes.
      *
-     * @param xids if non-null, ids to store for the vectors (size n)
+     * @param n    number of vectors
+     * @param x    input vectors, size n * d
+     * @param xids if non-null, ids to store for the vectors (size n)
      */
     virtual void add_with_ids(idx_t n, const float* x, const idx_t* xids);
@@ -127,9 +124,11 @@ struct Index {
      * return at most k vectors. If there are not enough results for a
      * query, the result array is padded with -1s.
      *
+     * @param n         number of vectors
      * @param x         input vectors to search, size n * d
-     * @param labels    output labels of the NNs, size n*k
+     * @param k         number of extracted vectors
      * @param distances output pairwise distances, size n*k
+     * @param labels    output labels of the NNs, size n*k
      */
     virtual void search(
             idx_t n,
@@ -145,6 +144,7 @@ struct Index {
      * indexes do not implement the range_search (only the k-NN search
      * is mandatory).
      *
+     * @param n      number of vectors
      * @param x      input vectors to search, size n * d
      * @param radius search radius
      * @param result result table
@@ -159,8 +159,10 @@ struct Index {
     /** return the indexes of the k vectors closest to the query x.
      *
      * This function is identical as search but only return labels of neighbors.
+     * @param n      number of vectors
      * @param x      input vectors to search, size n * d
      * @param labels output labels of the NNs, size n*k
+     * @param k      number of nearest neighbours
      */
     virtual void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1) const;
@@ -184,7 +186,7 @@ struct Index {
     /** Reconstruct several stored vectors (or an approximation if lossy coding)
      *
      * this function may not be defined for some indexes
-     * @param n      number of vectors to reconstruct
+     * @param n      number of vectors to reconstruct
      * @param keys   ids of the vectors to reconstruct (size n)
      * @param recons reconstucted vector (size n * d)
      */
@@ -194,6 +196,8 @@ struct Index {
     /** Reconstruct vectors i0 to i0 + ni - 1
      *
      * this function may not be defined for some indexes
+     * @param i0     index of the first vector in the sequence
+     * @param ni     number of vectors in the sequence
      * @param recons reconstucted vector (size ni * d)
      */
     virtual void reconstruct_n(idx_t i0, idx_t ni, float* recons) const;
@@ -204,6 +208,11 @@ struct Index {
      * If there are not enough results for a query, the resulting arrays
      * is padded with -1s.
      *
+     * @param n         number of vectors
+     * @param x         input vectors to search, size n * d
+     * @param k         number of extracted vectors
+     * @param distances output pairwise distances, size n*k
+     * @param labels    output labels of the NNs, size n*k
      * @param recons    reconstructed vectors size (n, k, d)
      **/
     virtual void search_and_reconstruct(
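
To make the documented search contract concrete, a small self-contained sketch using the standard faiss Python API (shapes follow the @param comments above):

    import faiss
    import numpy as np

    d, nb, nq, k = 64, 10000, 5, 10
    xb = np.random.rand(nb, d).astype('float32')
    xq = np.random.rand(nq, d).astype('float32')

    index = faiss.IndexFlatL2(d)
    index.add(xb)               # vectors get implicit labels 0 .. nb - 1

    D, I = index.search(xq, k)  # D: (nq, k) distances, I: (nq, k) labels
    assert D.shape == (nq, k) and I.shape == (nq, k)
    # I contains -1 entries where fewer than k results exist
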
diff --git a/thirdparty/faiss/faiss/IndexFlat.h b/thirdparty/faiss/faiss/IndexFlat.h
index c045f2779..165b06aaa 100644
--- a/thirdparty/faiss/faiss/IndexFlat.h
+++ b/thirdparty/faiss/faiss/IndexFlat.h
@@ -21,8 +21,10 @@ struct IndexFlat : IndexFlatCodes {
     /// database vectors, size ntotal * d
     std::vector<float> xb;

-    explicit IndexFlat(idx_t d, MetricType metric = METRIC_L2,
-            bool is_cosine = false);
+    explicit IndexFlat(
+            idx_t d, ///< dimensionality of the input vectors
+            MetricType metric = METRIC_L2,
+            bool is_cosine = false);

     // Be careful with overriding this function, because
     // renormalized x may be used inside.
@@ -89,6 +91,9 @@ struct IndexFlat : IndexFlatCodes {
 };

 struct IndexFlatIP : IndexFlat {
+    /**
+     * @param d dimensionality of the input vectors
+     */
     explicit IndexFlatIP(idx_t d) : IndexFlat(d, METRIC_INNER_PRODUCT) {}
     IndexFlatIP() {}
 };
@@ -100,6 +105,9 @@ struct IndexFlatL2 : IndexFlat {
     // and l2 norms.
     std::vector<float> cached_l2norms;

+    /**
+     * @param d dimensionality of the input vectors
+     */
     explicit IndexFlatL2(idx_t d) : IndexFlat(d, METRIC_L2) {}
     IndexFlatL2() {}
diff --git a/thirdparty/faiss/faiss/IndexFlatCodes.h b/thirdparty/faiss/faiss/IndexFlatCodes.h
index 35123ce2b..ad91c8601 100644
--- a/thirdparty/faiss/faiss/IndexFlatCodes.h
+++ b/thirdparty/faiss/faiss/IndexFlatCodes.h
@@ -36,7 +36,6 @@ struct IndexFlatCodes : Index {

     void reset() override;

-    /// reconstruction using the codec interface
     void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;

     void reconstruct(idx_t key, float* recons) const override;
diff --git a/thirdparty/faiss/faiss/IndexIVF.cpp b/thirdparty/faiss/faiss/IndexIVF.cpp
index 8f976f37c..284dd83d5 100644
--- a/thirdparty/faiss/faiss/IndexIVF.cpp
+++ b/thirdparty/faiss/faiss/IndexIVF.cpp
@@ -711,6 +711,12 @@ void IndexIVF::search_preassigned(
         }
     }

+    // // baseline Faiss version contains the following additional code,
+    // // introduced in 0a00d8137a386a0efd7f789e3e0912ab4eb73508
+    // if (ivf_stats == nullptr) {
+    //     ivf_stats = &indexIVF_stats;
+    // }
+
     if (ivf_stats) {
         ivf_stats->nq += n;
         ivf_stats->nlist += nlistv;
@@ -911,6 +917,12 @@ void IndexIVF::range_search_preassigned(
         }
     }

+    // // baseline Faiss version contains the following additional code,
+    // // introduced in 0a00d8137a386a0efd7f789e3e0912ab4eb73508
+    // if (stats == nullptr) {
+    //     stats = &indexIVF_stats;
+    // }
+
     if (stats) {
         stats->nq += nx;
         stats->nlist += nlistv;
@@ -993,16 +1005,12 @@ void IndexIVF::search_and_reconstruct(
             std::min(nlist, params ? params->nprobe : this->nprobe);
     FAISS_THROW_IF_NOT(nprobe > 0);

-    // todo aguzhva: deprecate ScopeDeleter and ScopeDeleter1
-    // in favor of std::unique_ptr
-    idx_t* idx = new idx_t[n * nprobe];
-    ScopeDeleter<idx_t> del(idx);
-    float* coarse_dis = new float[n * nprobe];
-    ScopeDeleter<float> del2(coarse_dis);
+    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
+    std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);

-    quantizer->search(n, x, nprobe, coarse_dis, idx);
+    quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());

-    invlists->prefetch_lists(idx, n * nprobe);
+    invlists->prefetch_lists(idx.get(), n * nprobe);

     // search_preassigned() with `store_pairs` enabled to obtain the list_no
     // and offset into `codes` for reconstruction
@@ -1010,29 +1018,94 @@ void IndexIVF::search_and_reconstruct(
             n,
             x,
             k,
-            idx,
-            coarse_dis,
+            idx.get(),
+            coarse_dis.get(),
             distances,
             labels,
             true /* store_pairs */,
             params);

-    for (idx_t i = 0; i < n; ++i) {
-        for (idx_t j = 0; j < k; ++j) {
-            idx_t ij = i * k + j;
-            idx_t key = labels[ij];
-            float* reconstructed = recons + ij * d;
-            if (key < 0) {
-                // Fill with NaNs
-                memset(reconstructed, -1, sizeof(*reconstructed) * d);
-            } else {
-                int list_no = lo_listno(key);
-                int offset = lo_offset(key);
+#pragma omp parallel for if (n * k > 1000)
+    for (idx_t ij = 0; ij < n * k; ij++) {
+        idx_t key = labels[ij];
+        float* reconstructed = recons + ij * d;
+        if (key < 0) {
+            // Fill with NaNs
+            memset(reconstructed, -1, sizeof(*reconstructed) * d);
+        } else {
+            int list_no = lo_listno(key);
+            int offset = lo_offset(key);
+
+            // Update label to the actual id
+            labels[ij] = invlists->get_single_id(list_no, offset);
+
+            reconstruct_from_offset(list_no, offset, reconstructed);
+        }
+    }
+}
+
+void IndexIVF::search_and_return_codes(
+        idx_t n,
+        const float* x,
+        idx_t k,
+        float* distances,
+        idx_t* labels,
+        uint8_t* codes,
+        bool include_listno,
+        const SearchParameters* params_in) const {
+    const IVFSearchParameters* params = nullptr;
+    if (params_in) {
+        params = dynamic_cast<const IVFSearchParameters*>(params_in);
+        FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type");
+    }
+    const size_t nprobe =
+            std::min(nlist, params ? params->nprobe : this->nprobe);
+    FAISS_THROW_IF_NOT(nprobe > 0);
+
+    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
+    std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+
+    quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
+
+    invlists->prefetch_lists(idx.get(), n * nprobe);
+
+    // search_preassigned() with `store_pairs` enabled to obtain the list_no
+    // and offset into `codes` for reconstruction
+    search_preassigned(
+            n,
+            x,
+            k,
+            idx.get(),
+            coarse_dis.get(),
+            distances,
+            labels,
+            true /* store_pairs */,
+            params);
+
+    size_t code_size_1 = code_size;
+    if (include_listno) {
+        code_size_1 += coarse_code_size();
+    }
+
+#pragma omp parallel for if (n * k > 1000)
+    for (idx_t ij = 0; ij < n * k; ij++) {
+        idx_t key = labels[ij];
+        uint8_t* code1 = codes + ij * code_size_1;
+
+        if (key < 0) {
+            // Fill with 0xff
+            memset(code1, -1, code_size_1);
+        } else {
+            int list_no = lo_listno(key);
+            int offset = lo_offset(key);
+            const uint8_t* cc = invlists->get_single_code(list_no, offset);

-            // Update label to the actual id
-            labels[ij] = invlists->get_single_id(list_no, offset);
+            labels[ij] = invlists->get_single_id(list_no, offset);

-            reconstruct_from_offset(list_no, offset, reconstructed);
+            if (include_listno) {
+                encode_listno(list_no, code1);
+                code1 += code_size_1 - code_size;
             }
+            memcpy(code1, cc, code_size);
         }
     }
 }
diff --git a/thirdparty/faiss/faiss/IndexIVF.h b/thirdparty/faiss/faiss/IndexIVF.h
index f8bbe6e3c..9e4bf0f62 100644
--- a/thirdparty/faiss/faiss/IndexIVF.h
+++ b/thirdparty/faiss/faiss/IndexIVF.h
@@ -358,6 +358,24 @@ struct IndexIVF : Index, IndexIVFInterface {
             float* recons,
             const SearchParameters* params = nullptr) const override;

+    /** Similar to search, but also returns the codes corresponding to the
+     * stored vectors for the search results.
+     *
+     * @param codes codes (n, k, code_size)
+     * @param include_listno
+     *                include the list ids in the code (in this case add
+     *                ceil(log2(nlist) / 8) bytes to the code size)
+     */
+    void search_and_return_codes(
+            idx_t n,
+            const float* x,
+            idx_t k,
+            float* distances,
+            idx_t* labels,
+            uint8_t* codes,
+            bool include_listno = false,
+            const SearchParameters* params = nullptr) const;
+
     /** Reconstruct a vector given the location in terms of (inv list index +
      * inv list offset) instead of the id.
      *
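
From Python, search_and_return_codes() returns the packed codes alongside distances and labels; with include_listnos=True the coarse list id is prepended, so the codes round-trip through sa_decode(). A sketch mirroring the unit test added further down (the factory string is chosen arbitrarily):

    import faiss
    import numpy as np

    d = 32
    xb = np.random.rand(1000, d).astype('float32')
    xq = np.random.rand(10, d).astype('float32')

    index = faiss.index_factory(d, 'IVF20,SQ8')
    index.train(xb)
    index.add(xb)

    D, I, codes = index.search_and_return_codes(
        xq, 10, include_listnos=True)     # codes: (nq, k, code_size) uint8
    x_approx = index.sa_decode(codes[0])  # decode query 0's result codes
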
diff --git a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp
index 9395dd6ee..7fc9b4a48 100644
--- a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp
+++ b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp
@@ -150,6 +150,7 @@ struct AQInvertedListScanner : InvertedListScanner {
     const float* q;

     /// following codes come from this inverted list
     void set_list(idx_t list_no, float coarse_dis) override {
+        this->list_no = list_no;
         if (ia.metric_type == METRIC_L2 && ia.by_residual) {
             ia.quantizer->compute_residual(q0, tmp.data(), list_no);
             q = tmp.data();
diff --git a/thirdparty/faiss/faiss/IndexRefine.cpp b/thirdparty/faiss/faiss/IndexRefine.cpp
index 2d17d33c4..2372c137a 100644
--- a/thirdparty/faiss/faiss/IndexRefine.cpp
+++ b/thirdparty/faiss/faiss/IndexRefine.cpp
@@ -97,16 +97,26 @@ void IndexRefine::search(
         idx_t k,
         float* distances,
         idx_t* labels,
-        const SearchParameters* params) const {
-    FAISS_THROW_IF_NOT_MSG(
-            !params, "search params not supported for this index");
+        const SearchParameters* params_in) const {
+    const IndexRefineSearchParameters* params = nullptr;
+    if (params_in) {
+        params = dynamic_cast<const IndexRefineSearchParameters*>(params_in);
+        FAISS_THROW_IF_NOT_MSG(
+                params, "IndexRefine params have incorrect type");
+    }
+
+    idx_t k_base = (params != nullptr) ? idx_t(k * params->k_factor)
+                                       : idx_t(k * k_factor);
+    SearchParameters* base_index_params =
+            (params != nullptr) ? params->base_index_params : nullptr;
+
+    FAISS_THROW_IF_NOT(k_base >= k);

     FAISS_THROW_IF_NOT(base_index);
     FAISS_THROW_IF_NOT(refine_index);

     FAISS_THROW_IF_NOT(k > 0);
     FAISS_THROW_IF_NOT(is_trained);
-    idx_t k_base = idx_t(k * k_factor);
     idx_t* base_labels = labels;
     float* base_distances = distances;
     ScopeDeleter<idx_t> del1;
@@ -119,7 +129,8 @@ void IndexRefine::search(
         del2.set(base_distances);
     }

-    base_index->search(n, x, k_base, base_distances, base_labels);
+    base_index->search(
+            n, x, k_base, base_distances, base_labels, base_index_params);

     for (int i = 0; i < n * k_base; i++)
         assert(base_labels[i] >= -1 && base_labels[i] < ntotal);
@@ -250,15 +261,26 @@ void IndexRefineFlat::search(
         idx_t k,
         float* distances,
         idx_t* labels,
-        const SearchParameters* params) const {
-    FAISS_THROW_IF_NOT_MSG(
-            !params, "search params not supported for this index");
+        const SearchParameters* params_in) const {
+    const IndexRefineSearchParameters* params = nullptr;
+    if (params_in) {
+        params = dynamic_cast<const IndexRefineSearchParameters*>(params_in);
+        FAISS_THROW_IF_NOT_MSG(
+                params, "IndexRefineFlat params have incorrect type");
+    }
+
+    idx_t k_base = (params != nullptr) ? idx_t(k * params->k_factor)
+                                       : idx_t(k * k_factor);
+    SearchParameters* base_index_params =
+            (params != nullptr) ? params->base_index_params : nullptr;
+
+    FAISS_THROW_IF_NOT(k_base >= k);
+
+    FAISS_THROW_IF_NOT(base_index);
     FAISS_THROW_IF_NOT(refine_index);

     FAISS_THROW_IF_NOT(k > 0);
     FAISS_THROW_IF_NOT(is_trained);
-    idx_t k_base = idx_t(k * k_factor);
     idx_t* base_labels = labels;
     float* base_distances = distances;
     ScopeDeleter<idx_t> del1;
@@ -271,7 +293,8 @@ void IndexRefineFlat::search(
         del2.set(base_distances);
     }

-    base_index->search(n, x, k_base, base_distances, base_labels);
+    base_index->search(
+            n, x, k_base, base_distances, base_labels, base_index_params);

     for (int i = 0; i < n * k_base; i++)
         assert(base_labels[i] >= -1 && base_labels[i] < ntotal);
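
The refine parameters compose with the base index's own parameters. A hedged sketch of the Python call pattern (index and xq are assumed to be an IVF+refine index and a query matrix built elsewhere; the test added below exercises this path in full):

    import faiss

    params = faiss.IndexRefineSearchParameters(
        k_factor=2.0,  # fetch 2*k candidates from the base index, then re-rank
        base_index_params=faiss.IVFSearchParameters(nprobe=8),  # non-owning
    )
    D, I = index.search(xq, 10, params=params)
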
diff --git a/thirdparty/faiss/faiss/IndexRefine.h b/thirdparty/faiss/faiss/IndexRefine.h
index 79b671b56..23687af9f 100644
--- a/thirdparty/faiss/faiss/IndexRefine.h
+++ b/thirdparty/faiss/faiss/IndexRefine.h
@@ -11,6 +11,13 @@

 namespace faiss {

+struct IndexRefineSearchParameters : SearchParameters {
+    float k_factor = 1;
+    SearchParameters* base_index_params = nullptr; // non-owning
+
+    virtual ~IndexRefineSearchParameters() = default;
+};
+
 /** Index that queries in a base_index (a fast one) and refines the
  * results with an exact search, hopefully improving the results.
  */
diff --git a/thirdparty/faiss/faiss/impl/AdditiveQuantizer.cpp b/thirdparty/faiss/faiss/impl/AdditiveQuantizer.cpp
index e72af5431..eea6be778 100644
--- a/thirdparty/faiss/faiss/impl/AdditiveQuantizer.cpp
+++ b/thirdparty/faiss/faiss/impl/AdditiveQuantizer.cpp
@@ -262,7 +262,7 @@ void AdditiveQuantizer::decode(const uint8_t* code, float* x, size_t n) const {
             is_trained, "The additive quantizer is not trained yet.");

     // standard additive quantizer decoding
-#pragma omp parallel for if (n > 1000)
+#pragma omp parallel for if (n > 100)
     for (int64_t i = 0; i < n; i++) {
         BitstringReader bsr(code + i * code_size, code_size);
         float* xi = x + i * d;
diff --git a/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp b/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp
index 06db87662..2ce6cecf1 100644
--- a/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp
+++ b/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp
@@ -307,7 +307,8 @@ void ProductQuantizer::decode(const uint8_t* code, float* x) const {
 }

 void ProductQuantizer::decode(const uint8_t* code, float* x, size_t n) const {
-    for (size_t i = 0; i < n; i++) {
+#pragma omp parallel for if (n > 100)
+    for (int64_t i = 0; i < n; i++) {
         this->decode(code + code_size * i, x + d * i);
     }
 }
diff --git a/thirdparty/faiss/faiss/utils/distances.cpp b/thirdparty/faiss/faiss/utils/distances.cpp
index 252490598..1c340a3a3 100644
--- a/thirdparty/faiss/faiss/utils/distances.cpp
+++ b/thirdparty/faiss/faiss/utils/distances.cpp
@@ -896,11 +896,16 @@ void fvec_inner_products_by_idx(

         // // baseline version
         // for (size_t i = 0; i < ny; i++) {
-        //     if (idsj[i] < 0)
-        //         continue;
-        //     ipj[i] = fvec_inner_product(xj, y + d * idsj[i], d);
+        //     if (idsj[i] < 0) {
+        //         ipj[i] = -INFINITY;
+        //     } else {
+        //         ipj[i] = fvec_inner_product(xj, y + d * idsj[i], d);
+        //     }
         // }

+        // todo aguzhva: this version deviates from the baseline
+        //   in that it does not assign -INFINITY for filtered-out ids.
+
         // the lambda that filters acceptable elements.
         auto filter = [=](const size_t i) { return (idsj[i] >= 0); };
@@ -940,11 +945,16 @@ void fvec_L2sqr_by_idx(

         // // baseline version
         // for (size_t i = 0; i < ny; i++) {
-        //     if (idsj[i] < 0)
-        //         continue;
-        //     disj[i] = fvec_L2sqr(xj, y + d * idsj[i], d);
+        //     if (idsj[i] < 0) {
+        //         disj[i] = INFINITY;
+        //     } else {
+        //         disj[i] = fvec_L2sqr(xj, y + d * idsj[i], d);
+        //     }
         // }

+        // todo aguzhva: this version deviates from the baseline
+        //   in that it does not assign INFINITY for filtered-out ids.
+
         // the lambda that filters acceptable elements.
         auto filter = [=](const size_t i) { return (idsj[i] >= 0); };
@@ -978,6 +988,8 @@ void pairwise_indexed_L2sqr(
     for (int64_t j = 0; j < n; j++) {
         if (ix[j] >= 0 && iy[j] >= 0) {
             dis[j] = fvec_L2sqr(x + d * ix[j], y + d * iy[j], d);
+        } else {
+            dis[j] = INFINITY;
         }
     }
 }
@@ -994,6 +1006,8 @@ void pairwise_indexed_inner_product(
     for (int64_t j = 0; j < n; j++) {
         if (ix[j] >= 0 && iy[j] >= 0) {
             dis[j] = fvec_inner_product(x + d * ix[j], y + d * iy[j], d);
+        } else {
+            dis[j] = -INFINITY;
         }
     }
 }
diff --git a/thirdparty/faiss/faiss/utils/distances.h b/thirdparty/faiss/faiss/utils/distances.h
index 30b631673..55e0063f7 100644
--- a/thirdparty/faiss/faiss/utils/distances.h
+++ b/thirdparty/faiss/faiss/utils/distances.h
@@ -105,8 +105,16 @@ void fvec_sub(size_t d, const float* a, const float* b, float* c);
  * Compute a subset of distances
  ***************************************************************************/

-/* compute the inner product between x and a subset y of ny vectors,
-   whose indices are given by idy. */
+/** compute the inner product between x and a subset y of ny vectors defined by
+ * ids
+ *
+ * ip(i, j) = inner_product(x(i, :), y(ids(i, j), :))
+ *
+ * @param ip  output array, size nx * ny
+ * @param x   first-term vector, size nx * d
+ * @param y   second-term vector, size (max(ids) + 1) * d
+ * @param ids ids to sample from y, size nx * ny
+ */
 void fvec_inner_products_by_idx(
         float* ip,
         const float* x,
@@ -116,7 +124,16 @@ void fvec_inner_products_by_idx(
         size_t nx,
         size_t ny);

-/* same but for a subset in y indexed by idsy (ny vectors in total) */
+/** compute the squared L2 distances between x and a subset y of ny vectors
+ * defined by ids
+ *
+ * dis(i, j) = L2sqr(x(i, :), y(ids(i, j), :))
+ *
+ * @param dis output array, size nx * ny
+ * @param x   first-term vector, size nx * d
+ * @param y   second-term vector, size (max(ids) + 1) * d
+ * @param ids ids to sample from y, size nx * ny
+ */
 void fvec_L2sqr_by_idx(
         float* dis,
         const float* x,
@@ -143,7 +160,14 @@ void pairwise_indexed_L2sqr(
         const int64_t* iy,
         float* dis);

-/* same for inner product */
+/** compute dis[j] = inner_product(x[ix[j]], y[iy[j]]) forall j=0..n-1
+ *
+ * @param x   size (max(ix) + 1, d)
+ * @param y   size (max(iy) + 1, d)
+ * @param ix  size n
+ * @param iy  size n
+ * @param dis size n
+ */
 void pairwise_indexed_inner_product(
         size_t d,
         size_t n,
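
The documented semantics of the two by-idx kernels, rendered in numpy for reference (a self-contained sketch; all names are local):

    import numpy as np

    nx, ny, d, ntot = 4, 3, 8, 16
    x = np.random.rand(nx, d).astype('float32')
    y = np.random.rand(ntot, d).astype('float32')
    ids = np.random.randint(ntot, size=(nx, ny))

    # fvec_inner_products_by_idx: ip(i, j) = <x(i, :), y(ids(i, j), :)>
    ip = np.einsum('id,ijd->ij', x, y[ids])

    # fvec_L2sqr_by_idx: dis(i, j) = || x(i, :) - y(ids(i, j), :) ||^2
    dis = ((x[:, None, :] - y[ids]) ** 2).sum(axis=-1)
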
diff --git a/thirdparty/faiss/faiss/utils/hamming.cpp b/thirdparty/faiss/faiss/utils/hamming.cpp
index 00ac8dc7a..f0242f875 100644
--- a/thirdparty/faiss/faiss/utils/hamming.cpp
+++ b/thirdparty/faiss/faiss/utils/hamming.cpp
@@ -686,4 +686,88 @@ void generalized_hammings_knn_hc(
         ha->reorder();
 }

+void pack_bitstrings(
+        size_t n,
+        size_t M,
+        int nbit,
+        const int32_t* unpacked,
+        uint8_t* packed,
+        size_t code_size) {
+    FAISS_THROW_IF_NOT(code_size >= (M * nbit + 7) / 8);
+#pragma omp parallel for if (n > 1000)
+    for (int64_t i = 0; i < n; i++) {
+        const int32_t* in = unpacked + i * M;
+        uint8_t* out = packed + i * code_size;
+        BitstringWriter wr(out, code_size);
+        for (int j = 0; j < M; j++) {
+            wr.write(in[j], nbit);
+        }
+    }
+}
+
+void pack_bitstrings(
+        size_t n,
+        size_t M,
+        const int32_t* nbit,
+        const int32_t* unpacked,
+        uint8_t* packed,
+        size_t code_size) {
+    int totbit = 0;
+    for (int j = 0; j < M; j++) {
+        totbit += nbit[j];
+    }
+    FAISS_THROW_IF_NOT(code_size >= (totbit + 7) / 8);
+#pragma omp parallel for if (n > 1000)
+    for (int64_t i = 0; i < n; i++) {
+        const int32_t* in = unpacked + i * M;
+        uint8_t* out = packed + i * code_size;
+        BitstringWriter wr(out, code_size);
+        for (int j = 0; j < M; j++) {
+            wr.write(in[j], nbit[j]);
+        }
+    }
+}
+
+void unpack_bitstrings(
+        size_t n,
+        size_t M,
+        int nbit,
+        const uint8_t* packed,
+        size_t code_size,
+        int32_t* unpacked) {
+    FAISS_THROW_IF_NOT(code_size >= (M * nbit + 7) / 8);
+#pragma omp parallel for if (n > 1000)
+    for (int64_t i = 0; i < n; i++) {
+        const uint8_t* in = packed + i * code_size;
+        int32_t* out = unpacked + i * M;
+        BitstringReader rd(in, code_size);
+        for (int j = 0; j < M; j++) {
+            out[j] = rd.read(nbit);
+        }
+    }
+}
+
+void unpack_bitstrings(
+        size_t n,
+        size_t M,
+        const int32_t* nbit,
+        const uint8_t* packed,
+        size_t code_size,
+        int32_t* unpacked) {
+    int totbit = 0;
+    for (int j = 0; j < M; j++) {
+        totbit += nbit[j];
+    }
+    FAISS_THROW_IF_NOT(code_size >= (totbit + 7) / 8);
+#pragma omp parallel for if (n > 1000)
+    for (int64_t i = 0; i < n; i++) {
+        const uint8_t* in = packed + i * code_size;
+        int32_t* out = unpacked + i * M;
+        BitstringReader rd(in, code_size);
+        for (int j = 0; j < M; j++) {
+            out[j] = rd.read(nbit[j]);
+        }
+    }
+}
+
 } // namespace faiss
diff --git a/thirdparty/faiss/faiss/utils/hamming.h b/thirdparty/faiss/faiss/utils/hamming.h
index 661b49e49..89b1dc5da 100644
--- a/thirdparty/faiss/faiss/utils/hamming.h
+++ b/thirdparty/faiss/faiss/utils/hamming.h
@@ -225,6 +225,64 @@ void generalized_hammings_knn_hc(
         size_t code_size,
         int ordered = true);

+/** Pack a set of n codes of size M * nbit
+ *
+ * @param n         number of codes to pack
+ * @param M         number of elementary codes per code
+ * @param nbit      number of bits per elementary code
+ * @param unpacked  input unpacked codes, size (n, M)
+ * @param packed    output packed codes, size (n, code_size)
+ * @param code_size should be >= ceil(M * nbit / 8)
+ */
+void pack_bitstrings(
+        size_t n,
+        size_t M,
+        int nbit,
+        const int32_t* unpacked,
+        uint8_t* packed,
+        size_t code_size);
+
+/** Pack a set of n codes of variable sizes
+ *
+ * @param nbits number of bits per entry (size M)
+ */
+void pack_bitstrings(
+        size_t n,
+        size_t M,
+        const int32_t* nbits,
+        const int32_t* unpacked,
+        uint8_t* packed,
+        size_t code_size);
+
+/** Unpack a set of n codes of size M * nbit
+ *
+ * @param n         number of codes to unpack
+ * @param M         number of elementary codes per code
+ * @param nbit      number of bits per elementary code
+ * @param packed    input packed codes, size (n, code_size)
+ * @param code_size should be >= ceil(M * nbit / 8)
+ * @param unpacked  output unpacked codes, size (n, M)
+ */
+void unpack_bitstrings(
+        size_t n,
+        size_t M,
+        int nbit,
+        const uint8_t* packed,
+        size_t code_size,
+        int32_t* unpacked);
+
+/** Unpack a set of n codes of variable sizes
+ *
+ * @param nbits number of bits per entry (size M)
+ */
+void unpack_bitstrings(
+        size_t n,
+        size_t M,
+        const int32_t* nbits,
+        const uint8_t* packed,
+        size_t code_size,
+        int32_t* unpacked);
+
 } // namespace faiss

 #include <faiss/utils/hamming-inl.h>
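
The pack/unpack round trip is easiest to see from Python. This sketch assumes the functions are exposed through SWIG with raw pointer arguments, as the test_arrays test added at the end of this patch suggests; if they are not, treat it as pseudocode for the C++ API:

    import faiss
    import numpy as np

    n, M, nbit = 20, 10, 5
    rs = np.random.RandomState(123)
    a = rs.randint(1 << nbit, size=(n, M)).astype('int32')

    code_size = (M * nbit + 7) // 8   # 50 bits -> 7 bytes per code
    packed = np.zeros((n, code_size), dtype='uint8')
    faiss.pack_bitstrings(
        n, M, nbit, faiss.swig_ptr(a), faiss.swig_ptr(packed), code_size)

    b = np.zeros((n, M), dtype='int32')
    faiss.unpack_bitstrings(
        n, M, nbit, faiss.swig_ptr(packed), code_size, faiss.swig_ptr(b))
    assert np.array_equal(a, b)
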
diff --git a/thirdparty/faiss/faiss/utils/partitioning.cpp b/thirdparty/faiss/faiss/utils/partitioning.cpp
index 955bf2da9..4070aaae6 100644
--- a/thirdparty/faiss/faiss/utils/partitioning.cpp
+++ b/thirdparty/faiss/faiss/utils/partitioning.cpp
@@ -754,8 +754,6 @@ typename C::T partition_fuzzy(
         size_t q_min,
         size_t q_max,
         size_t* q_out) {
-    // the code below compiles and runs without AVX2 but it's slower than
-    // the scalar implementation
 #ifdef __AVX2__
     constexpr bool is_uint16 = std::is_same<typename C::T, uint16_t>::value;
     if (is_uint16 && is_aligned_pointer(vals)) {
diff --git a/thirdparty/faiss/tests/test_index_composite.py b/thirdparty/faiss/tests/test_index_composite.py
index 81a00cb93..a760c0cf0 100644
--- a/thirdparty/faiss/tests/test_index_composite.py
+++ b/thirdparty/faiss/tests/test_index_composite.py
@@ -14,7 +14,7 @@
 import tempfile
 import platform

-from common_faiss_tests import get_dataset_2
+from common_faiss_tests import get_dataset_2, get_dataset

 from faiss.contrib.datasets import SyntheticDataset
 from faiss.contrib.inspect_tools import make_LinearTransform_matrix
 from faiss.contrib.evaluation import check_ref_knn_with_draws
@@ -822,3 +822,156 @@ def test_precomputed_tables(self):

         np.testing.assert_array_equal(Dnew, D2)
         np.testing.assert_array_equal(Inew, I2)
+
+
+class TestSearchAndReconstruct(unittest.TestCase):
+
+    def run_search_and_reconstruct(self, index, xb, xq, k=10, eps=None):
+        n, d = xb.shape
+        assert xq.shape[1] == d
+        assert index.d == d
+
+        D_ref, I_ref = index.search(xq, k)
+        R_ref = index.reconstruct_n(0, n)
+        D, I, R = index.search_and_reconstruct(xq, k)
+
+        np.testing.assert_almost_equal(D, D_ref, decimal=5)
+        self.assertTrue((I == I_ref).all())
+        self.assertEqual(R.shape[:2], I.shape)
+        self.assertEqual(R.shape[2], d)
+
+        # (n, k, ..) -> (n * k, ..)
+        I_flat = I.reshape(-1)
+        R_flat = R.reshape(-1, d)
+        # Filter out -1s when not enough results
+        R_flat = R_flat[I_flat >= 0]
+        I_flat = I_flat[I_flat >= 0]
+
+        recons_ref_err = np.mean(np.linalg.norm(R_flat - R_ref[I_flat]))
+        self.assertLessEqual(recons_ref_err, 1e-6)
+
+        def norm1(x):
+            return np.sqrt((x ** 2).sum(axis=1))
+
+        recons_err = np.mean(norm1(R_flat - xb[I_flat]))
+
+        print('Reconstruction error = %.3f' % recons_err)
+        if eps is not None:
+            self.assertLessEqual(recons_err, eps)
+
+        return D, I, R
+
+    def test_IndexFlat(self):
+        d = 32
+        nb = 1000
+        nt = 1500
+        nq = 200
+
+        (xt, xb, xq) = get_dataset(d, nb, nt, nq)
+
+        index = faiss.IndexFlatL2(d)
+        index.add(xb)
+
+        self.run_search_and_reconstruct(index, xb, xq, eps=0.0)
+
+    def test_IndexIVFFlat(self):
+        d = 32
+        nb = 1000
+        nt = 1500
+        nq = 200
+
+        (xt, xb, xq) = get_dataset(d, nb, nt, nq)
+
+        quantizer = faiss.IndexFlatL2(d)
+        index = faiss.IndexIVFFlat(quantizer, d, 32, faiss.METRIC_L2)
+        index.cp.min_points_per_centroid = 5    # quiet warning
+        index.nprobe = 4
+        index.train(xt)
+        index.add(xb)
+
+        self.run_search_and_reconstruct(index, xb, xq, eps=0.0)
+
+    def test_IndexIVFPQ(self):
+        d = 32
+        nb = 1000
+        nt = 1500
+        nq = 200
+
+        (xt, xb, xq) = get_dataset(d, nb, nt, nq)
+
+        quantizer = faiss.IndexFlatL2(d)
+        index = faiss.IndexIVFPQ(quantizer, d, 32, 8, 8)
+        index.cp.min_points_per_centroid = 5    # quiet warning
+        index.nprobe = 4
+        index.train(xt)
+        index.add(xb)
+
+        self.run_search_and_reconstruct(index, xb, xq, eps=1.0)
+
+    def test_MultiIndex(self):
+        d = 32
+        nb = 1000
+        nt = 1500
+        nq = 200
+
+        (xt, xb, xq) = get_dataset(d, nb, nt, nq)
+
+        index = faiss.index_factory(d, "IMI2x5,PQ8np")
+        faiss.ParameterSpace().set_index_parameter(index, "nprobe", 4)
+        index.train(xt)
+        index.add(xb)
+
+        self.run_search_and_reconstruct(index, xb, xq, eps=1.0)
+
+    def test_IndexTransform(self):
+        d = 32
+        nb = 1000
+        nt = 1500
+        nq = 200
+
+        (xt, xb, xq) = get_dataset(d, nb, nt, nq)
+
+        index = faiss.index_factory(d, "L2norm,PCA8,IVF32,PQ8np")
+        faiss.ParameterSpace().set_index_parameter(index, "nprobe", 4)
+        index.train(xt)
+        index.add(xb)
+
+        self.run_search_and_reconstruct(index, xb, xq)
+
+
+class TestSearchAndGetCodes(unittest.TestCase):
+
+    def do_test(self, factory_string):
+        ds = SyntheticDataset(32, 1000, 100, 10)
+
+        index = faiss.index_factory(ds.d, factory_string)
+
+        index.train(ds.get_train())
+        index.add(ds.get_database())
+
+        index.nprobe = 10
+        Dref, Iref = index.search(ds.get_queries(), 10)
+
+        D, I, codes = index.search_and_return_codes(
+            ds.get_queries(), 10, include_listnos=True)
+
+        np.testing.assert_array_equal(I, Iref)
+        np.testing.assert_array_equal(D, Dref)
+
+        # verify that we get the same distances when decompressing from
+        # returned codes (the codes are compatible with sa_decode)
+        for qi in range(ds.nq):
+            q = ds.get_queries()[qi]
+            xbi = index.sa_decode(codes[qi])
+            D2 = ((q - xbi) ** 2).sum(1)
+            np.testing.assert_allclose(D2, D[qi], rtol=1e-5)
+
+    def test_ivfpq(self):
+        self.do_test("IVF20,PQ4x4np")
+
+    def test_ivfsq(self):
+        self.do_test("IVF20,SQ8")
+
+    def test_ivfrq(self):
+        self.do_test("IVF20,RQ3x4")
diff --git a/thirdparty/faiss/tests/test_refine.py b/thirdparty/faiss/tests/test_refine.py
index aff285f40..4e85ee11e 100644
--- a/thirdparty/faiss/tests/test_refine.py
+++ b/thirdparty/faiss/tests/test_refine.py
@@ -64,3 +64,58 @@ def test_distance_computer_AQ_LUT(self):

     def test_distance_computer_AQ_LUT_IP(self):
         self.do_test("RQ3x4_Nqint8", faiss.METRIC_INNER_PRODUCT)
+
+
+class TestIndexRefineSearchParams(unittest.TestCase):
+
+    def do_test(self, factory_string):
+        ds = datasets.SyntheticDataset(32, 256, 100, 40)
+
+        index = faiss.index_factory(32, factory_string)
+        index.train(ds.get_train())
+        index.add(ds.get_database())
+        index.nprobe = 4
+        xq = ds.get_queries()
+
+        # do a search with k_factor = 1
+        D1, I1 = index.search(xq, 10)
+        inter1 = faiss.eval_intersection(I1, ds.get_groundtruth(10))
+
+        # do a search with k_factor = 1.1
+        params = faiss.IndexRefineSearchParameters(k_factor=1.1)
+        D2, I2 = index.search(xq, 10, params=params)
+        inter2 = faiss.eval_intersection(I2, ds.get_groundtruth(10))
+
+        # do a search with k_factor = 2
+        params = faiss.IndexRefineSearchParameters(k_factor=2)
+        D3, I3 = index.search(xq, 10, params=params)
+        inter3 = faiss.eval_intersection(I3, ds.get_groundtruth(10))
+
+        # make sure that the recall rate increases with k_factor
+        self.assertGreater(inter2, inter1)
+        self.assertGreater(inter3, inter2)
+
+        # make sure that the baseline k_factor is unchanged
+        self.assertEqual(index.k_factor, 1)
+
+        # try passing params for the baseline index, change nprobe
+        base_params = faiss.IVFSearchParameters(nprobe=10)
+        params = faiss.IndexRefineSearchParameters(k_factor=1, base_index_params=base_params)
+        D4, I4 = index.search(xq, 10, params=params)
+        inter4 = faiss.eval_intersection(I4, ds.get_groundtruth(10))
+
+        base_params = faiss.IVFSearchParameters(nprobe=2)
+        params = faiss.IndexRefineSearchParameters(k_factor=1, base_index_params=base_params)
+        D5, I5 = index.search(xq, 10, params=params)
+        inter5 = faiss.eval_intersection(I5, ds.get_groundtruth(10))
+
+        # make sure that the recall rate changes
+        self.assertNotEqual(inter4, inter5)
+
+    def test_rflat(self):
+        # flat is handled by the IndexRefineFlat class
self.do_test("IVF8,PQ2x4np,RFlat") + + def test_refine_sq8(self): + # this case uses the IndexRefine class + self.do_test("IVF8,PQ2x4np,Refine(SQ8)") diff --git a/thirdparty/faiss/tests/test_residual_quantizer.py b/thirdparty/faiss/tests/test_residual_quantizer.py index 2ca3e4c32..e37ee3efe 100644 --- a/thirdparty/faiss/tests/test_residual_quantizer.py +++ b/thirdparty/faiss/tests/test_residual_quantizer.py @@ -1117,7 +1117,7 @@ def test_precomp(self): K = 1 << rq.nbits.at(m) cpp_table = codebook_cross_prods[ofs:ofs + K * kk].reshape(kk, K) ofs += kk * K - np.testing.assert_allclose(py_table, cpp_table, rtol=2e-4) + np.testing.assert_allclose(py_table, cpp_table, atol=1e-5) cent_norms = faiss.vector_to_array(rq.cent_norms) np.testing.assert_array_almost_equal( diff --git a/thirdparty/faiss/tests/test_standalone_codec.py b/thirdparty/faiss/tests/test_standalone_codec.py index 1e1993bb4..7fdcf6849 100644 --- a/thirdparty/faiss/tests/test_standalone_codec.py +++ b/thirdparty/faiss/tests/test_standalone_codec.py @@ -266,9 +266,9 @@ def test_ZnSphereCodecAlt24(self): class TestBitstring(unittest.TestCase): - """ Low-level bit string tests """ def test_rw(self): + """ Low-level bit string tests """ rs = np.random.RandomState(1234) nbyte = 1000 sz = 0 @@ -311,6 +311,26 @@ def test_rw(self): # print('nbit %d xref %x xnew %x' % (nbit, xref, xnew)) self.assertTrue(xnew == xref) + def test_arrays(self): + nbit = 5 + M = 10 + n = 20 + rs = np.random.RandomState(123) + a = rs.randint(1<