Faiss
/data/users/hoss/faiss/OnDiskInvertedLists.cpp
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// -*- c++ -*-

#include "OnDiskInvertedLists.h"

#include <pthread.h>

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <unordered_set>

#include <sys/mman.h>
#include <unistd.h>
#include <sys/types.h>

#include "FaissAssert.h"
#include "utils.h"


namespace faiss {


/**********************************************
 * LockLevels
 **********************************************/

struct LockLevels {
    /* There are n level-1 locks (one lock1(n) per inverted list),
     * one lock2 and one lock3.
     * Invariants:
     * a single thread can hold one lock1(n) for some n
     * a single thread can hold lock2, if it holds lock1(n) for some n
     * a single thread can hold lock3, if it holds lock1(n) for some n
     *   AND lock2 AND no other thread holds lock1(m) for m != n
     */
    pthread_mutex_t mutex1;
    pthread_cond_t level1_cv;
    pthread_cond_t level2_cv;
    pthread_cond_t level3_cv;

    std::unordered_set<int> level1_holders; // which level1 locks are held
    int n_level2; // nb threads that wait on level2
    bool level3_in_use; // a thread waits on level3
    bool level2_in_use;

    LockLevels() {
        pthread_mutex_init(&mutex1, nullptr);
        pthread_cond_init(&level1_cv, nullptr);
        pthread_cond_init(&level2_cv, nullptr);
        pthread_cond_init(&level3_cv, nullptr);
        n_level2 = 0;
        level2_in_use = false;
        level3_in_use = false;
    }

    ~LockLevels() {
        pthread_cond_destroy(&level1_cv);
        pthread_cond_destroy(&level2_cv);
        pthread_cond_destroy(&level3_cv);
        pthread_mutex_destroy(&mutex1);
    }

    void lock_1(int no) {
        pthread_mutex_lock(&mutex1);
        while (level3_in_use || level1_holders.count(no) > 0) {
            pthread_cond_wait(&level1_cv, &mutex1);
        }
        level1_holders.insert(no);
        pthread_mutex_unlock(&mutex1);
    }

    void unlock_1(int no) {
        pthread_mutex_lock(&mutex1);
        assert(level1_holders.count(no) == 1);
        level1_holders.erase(no);
        if (level3_in_use) { // a writer is waiting
            pthread_cond_signal(&level3_cv);
        } else {
            pthread_cond_broadcast(&level1_cv);
        }
        pthread_mutex_unlock(&mutex1);
    }

    void lock_2() {
        pthread_mutex_lock(&mutex1);
        n_level2++;
        if (level3_in_use) { // tell waiting level3 that we are blocked
            pthread_cond_signal(&level3_cv);
        }
        while (level2_in_use) {
            pthread_cond_wait(&level2_cv, &mutex1);
        }
        level2_in_use = true;
        pthread_mutex_unlock(&mutex1);
    }

    void unlock_2() {
        pthread_mutex_lock(&mutex1);
        level2_in_use = false;
        n_level2--;
        pthread_cond_signal(&level2_cv);
        pthread_mutex_unlock(&mutex1);
    }

    void lock_3() {
        pthread_mutex_lock(&mutex1);
        level3_in_use = true;
        // wait until there are no level1 holders anymore except the
        // ones that are waiting on level2 (we are holding lock2)
        while (level1_holders.size() > (size_t)n_level2) {
            pthread_cond_wait(&level3_cv, &mutex1);
        }
        // lock_3 returns with mutex1 held; unlock_3 releases it
    }

    void unlock_3() {
        level3_in_use = false;
        // wake up all level1_holders
        pthread_cond_broadcast(&level1_cv);
        pthread_mutex_unlock(&mutex1);
    }

    void print() {
        pthread_mutex_lock(&mutex1);
        printf("State: level3_in_use=%d n_level2=%d level1_holders: [",
               level3_in_use ? 1 : 0, n_level2);
        for (int k : level1_holders) {
            printf("%d ", k);
        }
        printf("]\n");
        pthread_mutex_unlock(&mutex1);
    }

};
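
// Illustrative note (not part of the original source): the code below
// acquires these locks in a fixed order. A thread that reads or resizes
// list `no` takes lock_1(no); a thread that touches the shared free-slot
// list additionally takes lock_2(); a thread that must remap the whole
// file additionally takes lock_3(), which waits until every other lock_1
// holder is itself blocked on lock_2:
//
//     locks->lock_1 (list_no);    // pin one inverted list
//     locks->lock_2 ();           // exclusive access to the slot list
//     locks->lock_3 ();           // quiesce all other list readers
//     ... grow and remap the file ...
//     locks->unlock_3 ();
//     locks->unlock_2 ();
//     locks->unlock_1 (list_no);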

/**********************************************
 * OngoingPrefetch
 **********************************************/

struct OnDiskInvertedLists::OngoingPrefetch {

    struct Thread {
        pthread_t pth;
        OngoingPrefetch *pf;

        bool one_list () {
            idx_t list_no = pf->get_next_list();
            if (list_no == -1) return false;
            const OnDiskInvertedLists *od = pf->od;
            od->locks->lock_1 (list_no);
            size_t n = od->list_size (list_no);
            const Index::idx_t *idx = od->get_ids (list_no);
            const uint8_t *codes = od->get_codes (list_no);
            int cs = 0;
            for (size_t i = 0; i < n; i++) {
                cs += idx[i];
            }
            const idx_t *codes8 = (const idx_t*)codes;
            idx_t n8 = n * od->code_size / 8;

            for (idx_t i = 0; i < n8; i++) {
                cs += codes8[i];
            }
            od->locks->unlock_1 (list_no);

            global_cs += cs & 1;
            return true;
        }

    };

    std::vector<Thread> threads;

    pthread_mutex_t list_ids_mutex;
    std::vector<idx_t> list_ids;
    int cur_list;

    // mutex for the list of tasks
    pthread_mutex_t mutex;

    // dummy checksum that prevents the reads above from being optimized out
    static int global_cs;

    const OnDiskInvertedLists *od;

    explicit OngoingPrefetch (const OnDiskInvertedLists *od): od (od)
    {
        pthread_mutex_init (&mutex, nullptr);
        pthread_mutex_init (&list_ids_mutex, nullptr);
        cur_list = 0;
    }

    static void* prefetch_list (void *arg) {
        Thread *th = static_cast<Thread*>(arg);

        while (th->one_list()) ;

        return nullptr;
    }

    idx_t get_next_list () {
        idx_t list_no = -1;
        pthread_mutex_lock (&list_ids_mutex);
        if (cur_list >= 0 && (size_t)cur_list < list_ids.size()) {
            list_no = list_ids[cur_list++];
        }
        pthread_mutex_unlock (&list_ids_mutex);
        return list_no;
    }

    void prefetch_lists (const idx_t *list_nos, int n) {
        pthread_mutex_lock (&mutex);
        pthread_mutex_lock (&list_ids_mutex);
        list_ids.clear ();
        pthread_mutex_unlock (&list_ids_mutex);
        // wait for the threads of the previous prefetch to finish
        for (auto &th : threads) {
            pthread_join (th.pth, nullptr);
        }

        threads.resize (0);
        cur_list = 0;
        int nt = std::min (n, od->prefetch_nthread);

        if (nt > 0) {
            // prepare tasks
            for (int i = 0; i < n; i++) {
                idx_t list_no = list_nos[i];
                if (list_no >= 0 && od->list_size(list_no) > 0) {
                    list_ids.push_back (list_no);
                }
            }
            // prepare threads
            threads.resize (nt);
            for (Thread &th : threads) {
                th.pf = this;
                pthread_create (&th.pth, nullptr, prefetch_list, &th);
            }
        }
        pthread_mutex_unlock (&mutex);
    }

    ~OngoingPrefetch () {
        pthread_mutex_lock (&mutex);
        for (auto &th : threads) {
            pthread_join (th.pth, nullptr);
        }
        pthread_mutex_unlock (&mutex);
        pthread_mutex_destroy (&mutex);
        pthread_mutex_destroy (&list_ids_mutex);
    }

};

int OnDiskInvertedLists::OngoingPrefetch::global_cs = 0;


void OnDiskInvertedLists::prefetch_lists (const idx_t *list_nos, int n) const
{
    pf->prefetch_lists (list_nos, n);
}
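
// Illustrative usage sketch (not part of the original source): a caller
// that knows which inverted lists an upcoming search will visit can warm
// the OS page cache ahead of time, e.g.
//
//     Index::idx_t probes[] = {3, 17, 42};   // hypothetical list numbers
//     invlists->prefetch_lists (probes, 3);
//
// The worker threads simply read every id and code of each list and fold
// the bytes into global_cs so the reads cannot be optimized away; the only
// lasting effect is that the touched pages are resident in memory.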



/**********************************************
 * OnDiskInvertedLists: mmapping
 **********************************************/


void OnDiskInvertedLists::do_mmap ()
{
    const char *rw_flags = read_only ? "r" : "r+";
    int prot = read_only ? PROT_READ : PROT_WRITE | PROT_READ;
    FILE *f = fopen (filename.c_str(), rw_flags);
    FAISS_THROW_IF_NOT_FMT (f, "could not open %s in mode %s: %s",
                            filename.c_str(), rw_flags, strerror(errno));

    uint8_t *ptro = (uint8_t*)mmap (nullptr, totsize,
                                    prot, MAP_SHARED, fileno (f), 0);

    FAISS_THROW_IF_NOT_FMT (ptro != MAP_FAILED,
                            "could not mmap %s: %s",
                            filename.c_str(),
                            strerror(errno));
    ptr = ptro;
    // closing the FILE is fine: the mapping keeps its own reference
    // to the underlying file
    fclose (f);

}

void OnDiskInvertedLists::update_totsize (size_t new_size)
{

    // unmap file
    if (ptr != nullptr) {
        int err = munmap (ptr, totsize);
        FAISS_THROW_IF_NOT_FMT (err == 0, "munmap error: %s",
                                strerror(errno));
    }
    if (totsize == 0) {
        // must create the file before truncating it
        FILE *f = fopen (filename.c_str(), "w");
        FAISS_THROW_IF_NOT_FMT (f, "could not open %s in mode w: %s",
                                filename.c_str(), strerror(errno));
        fclose (f);
    }

    if (new_size > totsize) {
        if (!slots.empty() &&
            slots.back().offset + slots.back().capacity == totsize) {
            // extend the trailing free slot
            slots.back().capacity += new_size - totsize;
        } else {
            // append a new free slot covering the added bytes
            slots.push_back (Slot(totsize, new_size - totsize));
        }
    } else {
        assert(!"shrinking the file is not implemented");
    }

    totsize = new_size;

    // resize the file
    printf ("resizing %s to %zu bytes\n", filename.c_str(), totsize);

    int err = truncate (filename.c_str(), totsize);

    FAISS_THROW_IF_NOT_FMT (err == 0, "truncate %s to %zu: %s",
                            filename.c_str(), totsize,
                            strerror(errno));
    do_mmap ();
}
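
// Worked example (illustrative, not part of the original source): if
// totsize is currently 1024 bytes and the free list ends with
// Slot(768, 256), then update_totsize(4096) extends that trailing slot in
// place to Slot(768, 3328); if the last slot did not reach the end of the
// file, a fresh Slot(1024, 3072) would be appended instead. Either way the
// file is truncated to 4096 bytes and remapped.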


/**********************************************
 * OnDiskInvertedLists
 **********************************************/

#define INVALID_OFFSET (size_t)(-1)

OnDiskInvertedLists::List::List ():
    size (0), capacity (0), offset (INVALID_OFFSET)
{}

OnDiskInvertedLists::Slot::Slot (size_t offset, size_t capacity):
    offset (offset), capacity (capacity)
{}

OnDiskInvertedLists::Slot::Slot ():
    offset (0), capacity (0)
{}



OnDiskInvertedLists::OnDiskInvertedLists (
        size_t nlist, size_t code_size,
        const char *filename):
    InvertedLists (nlist, code_size),
    filename (filename),
    totsize (0),
    ptr (nullptr),
    read_only (false),
    locks (new LockLevels ()),
    pf (new OngoingPrefetch (this)),
    prefetch_nthread (32)
{
    lists.resize (nlist);

    // the free-slot list starts empty
}

OnDiskInvertedLists::OnDiskInvertedLists ():
    OnDiskInvertedLists (0, 0, "")
{
}

OnDiskInvertedLists::~OnDiskInvertedLists ()
{
    delete pf;

    // unmap all lists
    if (ptr != nullptr) {
        int err = munmap (ptr, totsize);
        if (err != 0) {
            fprintf(stderr, "munmap error: %s\n",
                    strerror(errno));
        }
    }
    delete locks;
}



size_t OnDiskInvertedLists::list_size(size_t list_no) const
{
    return lists[list_no].size;
}


const uint8_t * OnDiskInvertedLists::get_codes (size_t list_no) const
{
    if (lists[list_no].offset == INVALID_OFFSET) {
        return nullptr;
    }

    return ptr + lists[list_no].offset;
}

const Index::idx_t * OnDiskInvertedLists::get_ids (size_t list_no) const
{
    if (lists[list_no].offset == INVALID_OFFSET) {
        return nullptr;
    }

    // the ids are stored after the codes in each list's slot
    return (const idx_t*)(ptr + lists[list_no].offset +
                          code_size * lists[list_no].capacity);
}

void OnDiskInvertedLists::update_entries (
      size_t list_no, size_t offset, size_t n_entry,
      const idx_t *ids_in, const uint8_t *codes_in)
{
    FAISS_THROW_IF_NOT (!read_only);
    if (n_entry == 0) return;
    const List & l = lists[list_no];
    assert (n_entry + offset <= l.size);
    idx_t *ids = const_cast<idx_t*>(get_ids (list_no));
    memcpy (ids + offset, ids_in, sizeof(ids_in[0]) * n_entry);
    uint8_t *codes = const_cast<uint8_t*>(get_codes (list_no));
    memcpy (codes + offset * code_size, codes_in, code_size * n_entry);
}

size_t OnDiskInvertedLists::add_entries (
       size_t list_no, size_t n_entry,
       const idx_t* ids, const uint8_t *code)
{
    FAISS_THROW_IF_NOT (!read_only);
    locks->lock_1 (list_no);
    size_t o = list_size (list_no);
    resize_locked (list_no, n_entry + o);
    update_entries (list_no, o, n_entry, ids, code);
    locks->unlock_1 (list_no);
    return o;
}

void OnDiskInvertedLists::resize (size_t list_no, size_t new_size)
{
    FAISS_THROW_IF_NOT (!read_only);
    locks->lock_1 (list_no);
    resize_locked (list_no, new_size);
    locks->unlock_1 (list_no);
}



void OnDiskInvertedLists::resize_locked (size_t list_no, size_t new_size)
{
    List & l = lists[list_no];

    // shrink or grow in place while the slot stays at most half empty
    if (new_size <= l.capacity &&
        new_size > l.capacity / 2) {
        l.size = new_size;
        return;
    }

    // otherwise we release the current slot, and find a new one

    locks->lock_2 ();
    free_slot (l.offset, l.capacity);

    List new_l;

    if (new_size == 0) {
        new_l = List();
    } else {
        new_l.size = new_size;
        // round the capacity up to the next power of 2
        new_l.capacity = 1;
        while (new_l.capacity < new_size) {
            new_l.capacity *= 2;
        }
        new_l.offset = allocate_slot (
            new_l.capacity * (sizeof(idx_t) + code_size));
    }

    // copy common data
    if (l.offset != new_l.offset) {
        size_t n = std::min (new_size, l.size);
        if (n > 0) {
            memcpy (ptr + new_l.offset, get_codes(list_no), n * code_size);
            memcpy (ptr + new_l.offset + new_l.capacity * code_size,
                    get_ids (list_no), n * sizeof(idx_t));
        }
    }

    lists[list_no] = new_l;
    locks->unlock_2 ();
}
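
// Worked example (illustrative, not part of the original source): growing
// a list from size 5 to size 9 with code_size = 16 and 8-byte ids:
//   - 9 exceeds the current capacity of 8, so a new slot is needed;
//   - the capacity doubles 1 -> 2 -> 4 -> 8 -> 16;
//   - allocate_slot (16 * (8 + 16)) reserves 384 bytes;
//   - the 5 existing codes, then ids, are copied into the new slot.
// The `new_size > capacity / 2` test above bounds the wasted space per
// list to at most half of its slot before a reallocation is triggered.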

size_t OnDiskInvertedLists::allocate_slot (size_t capacity) {
    // should hold lock2

    // first-fit scan of the free-slot list
    auto it = slots.begin();
    while (it != slots.end() && it->capacity < capacity) {
        it++;
    }

    if (it == slots.end()) {
        // not enough capacity: grow the file (at least double it)
        size_t new_size = totsize == 0 ? 32 : totsize * 2;
        while (new_size - totsize < capacity)
            new_size *= 2;
        locks->lock_3 ();
        update_totsize(new_size);
        locks->unlock_3 ();
        it = slots.begin();
        while (it != slots.end() && it->capacity < capacity) {
            it++;
        }
        assert (it != slots.end());
    }

    size_t o = it->offset;
    if (it->capacity == capacity) {
        slots.erase (it);
    } else {
        // take from beginning of slot
        it->capacity -= capacity;
        it->offset += capacity;
    }

    return o;
}
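
// Illustrative trace (not part of the original source): with free slots
// [Slot(0, 64), Slot(256, 512)], allocate_slot(100) skips the first slot
// (64 < 100), carves 100 bytes off the front of the second and leaves
// Slot(356, 412) behind: a first-fit allocator over a list that free_slot
// keeps sorted by offset.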



void OnDiskInvertedLists::free_slot (size_t offset, size_t capacity) {

    // should hold lock2
    if (capacity == 0) return;

    // find the first slot that starts after the freed region
    auto it = slots.begin();
    while (it != slots.end() && it->offset <= offset) {
        it++;
    }

    size_t inf = 1UL << 60;

    size_t end_prev = inf;
    if (it != slots.begin()) {
        auto prev = it;
        prev--;
        end_prev = prev->offset + prev->capacity;
    }

    size_t begin_next = inf;
    if (it != slots.end()) {
        begin_next = it->offset;
    }

    assert (end_prev == inf || offset >= end_prev);
    assert (offset + capacity <= begin_next);

    if (offset == end_prev) {
        // merge with the previous slot
        auto prev = it;
        prev--;
        if (offset + capacity == begin_next) {
            // the freed region also touches the next slot: merge all three
            prev->capacity += capacity + it->capacity;
            slots.erase (it);
        } else {
            prev->capacity += capacity;
        }
    } else {
        if (offset + capacity == begin_next) {
            // merge with the next slot
            it->offset -= capacity;
            it->capacity += capacity;
        } else {
            // isolated region: insert a new slot
            slots.insert (it, Slot (offset, capacity));
        }
    }

    // TODO shrink global storage if needed
}
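
// Illustrative summary (not part of the original source) of the four cases
// above: freeing (offset, capacity) may merge with the slot ending at
// `offset`, merge with the slot starting at `offset + capacity`, do both
// (collapsing three regions into one), or insert a fresh Slot. For example,
// with free slots [Slot(0, 10), Slot(20, 10)], calling free_slot(10, 10)
// leaves the single slot Slot(0, 30).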


/*****************************************
 * Compact form
 *****************************************/

size_t OnDiskInvertedLists::merge_from (const InvertedLists **ils, int n_il,
                                        bool verbose)
{
    FAISS_THROW_IF_NOT_MSG (totsize == 0,
                            "works only on an empty InvertedLists");

    // accumulate the size of each list over all inputs
    std::vector<size_t> sizes (nlist);
    for (int i = 0; i < n_il; i++) {
        const InvertedLists *il = ils[i];
        FAISS_THROW_IF_NOT (il->nlist == nlist && il->code_size == code_size);

        for (size_t j = 0; j < nlist; j++) {
            sizes [j] += il->list_size(j);
        }
    }

    // lay out all lists contiguously at their final capacity
    size_t cums = 0;
    size_t ntotal = 0;
    for (size_t j = 0; j < nlist; j++) {
        ntotal += sizes[j];
        lists[j].size = 0;
        lists[j].capacity = sizes[j];
        lists[j].offset = cums;
        cums += lists[j].capacity * (sizeof(idx_t) + code_size);
    }

    update_totsize (cums);


    size_t nmerged = 0;
    double t0 = getmillisecs(), last_t = t0;

#pragma omp parallel for
    for (size_t j = 0; j < nlist; j++) {
        List & l = lists[j];
        for (int i = 0; i < n_il; i++) {
            const InvertedLists *il = ils[i];
            size_t n_entry = il->list_size(j);
            l.size += n_entry;
            update_entries (j, l.size - n_entry, n_entry,
                            ScopedIds(il, j).get(),
                            ScopedCodes(il, j).get());
        }
        assert (l.size == l.capacity);
        if (verbose) {
#pragma omp critical
            {
                nmerged++;
                double t1 = getmillisecs();
                if (t1 - last_t > 500) {
                    printf("merged %zu lists in %.3f s\r",
                           nmerged, (t1 - t0) / 1000.0);
                    fflush(stdout);
                    last_t = t1;
                }
            }
        }
    }
    if (verbose) {
        printf("\n");
    }

    return ntotal;
}
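
// Illustrative usage sketch (not part of the original source; the index
// variables are hypothetical). A common pattern is to train one IndexIVF,
// fill several clones with different data shards, then merge all their
// inverted lists into a single on-disk file:
//
//     std::vector<const faiss::InvertedLists*> ils;
//     for (faiss::IndexIVF *shard : shards) {
//         ils.push_back (shard->invlists);
//     }
//     auto *od = new faiss::OnDiskInvertedLists (
//         ivf->nlist, ivf->code_size, "/path/to/merged.ivfdata");
//     od->merge_from (ils.data(), (int)ils.size(), true);
//     ivf->replace_invlists (od, true);   // the index now owns `od`
//
// Because merge_from lays every list out at its final capacity before the
// parallel loop, each update_entries call writes into a preallocated,
// non-overlapping region of the mmapped file.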


void OnDiskInvertedLists::crop_invlists(size_t l0, size_t l1)
{
    FAISS_THROW_IF_NOT(l0 <= l1 && l1 <= nlist);

    std::vector<List> new_lists (l1 - l0);
    memcpy (new_lists.data(), &lists[l0], (l1 - l0) * sizeof(List));

    lists.swap(new_lists);

    nlist = l1 - l0;
}




} // namespace faiss