Add files via upload
parent
4e0e6b2193
commit
16518c4157
|
@ -0,0 +1,6 @@
|
|||
# Compiler used to build the shared library.
CXX=/usr/bin/g++-5

# `all` is a command name, not a file, so mark it phony.
.PHONY: all

# Default goal: build the shared library. The prerequisite must name the real
# target (index.so); there is no rule that produces a file called `index`.
all : index.so

# Rebuild the library whenever the interface source or one of the listed
# project headers changes.
index.so : src/config.h src/graph.h src/data.h interface.cc
	$(CXX) -shared -fPIC interface.cc -o index.so -std=c++11 -Ofast -march=native -g -flto -funroll-loops -DOMP -fopenmp
|
Binary file not shown.
|
@ -0,0 +1,242 @@
|
|||
#include<stdio.h>
|
||||
#include<string.h>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <queue>
|
||||
#include <chrono>
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
#include<stdlib.h>
|
||||
#include<memory>
|
||||
#include<vector>
|
||||
#include<functional>
|
||||
|
||||
#include"src/data.h"
|
||||
#include"src/graph.h"
|
||||
|
||||
// Opaque handle pair passed across the extern "C" boundary.
// Both members are type-erased pointers; the functions in this file cast
// `graph` to GraphWrapper* and `data` to Data*.
struct IndexContext {
    void* graph;  // type-erased GraphWrapper*
    void* data;   // type-erased Data*
};
|
||||
|
||||
|
||||
// File-scope settings with external linkage. No code in this file reads
// these values.
int topk{0};
int display_topk{1};
int build_idx_offset{0};
int query_idx_offset{0};
|
||||
|
||||
void flush_add_buffer(
|
||||
std::vector<std::pair<idx_t,std::vector<std::pair<int,value_t>>>>& add_buffer,
|
||||
GraphWrapper* graph){
|
||||
#pragma omp parallel for
|
||||
for(int i = 0;i < add_buffer.size();++i){
|
||||
auto& idx = add_buffer[i].first;
|
||||
auto& point = add_buffer[i].second;
|
||||
graph->add_vertex_lock(idx,point);
|
||||
}
|
||||
add_buffer.clear();
|
||||
}
|
||||
|
||||
|
||||
extern "C"{
|
||||
// for mobius IP index
|
||||
void build_mobius_index(float* dense_mat,int row,int dim, int pq_size, double mobius_pow , const char* prefix){
|
||||
std::unique_ptr<Data> data;
|
||||
std::unique_ptr<Data> data_original;
|
||||
std::unique_ptr<GraphWrapper> graph;
|
||||
int topk = 0;
|
||||
int display_topk = 1;
|
||||
int build_idx_offset = 0;
|
||||
int query_idx_offset = 0;
|
||||
|
||||
++row;
|
||||
data = std::unique_ptr<Data>(new Data(row,dim));
|
||||
graph = std::unique_ptr<GraphWrapper>(new FixedDegreeGraph<3>(data.get()));
|
||||
graph->set_construct_pq_size(pq_size);
|
||||
|
||||
std::vector<std::pair<idx_t,std::vector<std::pair<int,value_t>>>> add_buffer;
|
||||
|
||||
((FixedDegreeGraph<3>*)graph.get())->get_data()->mobius_pow = mobius_pow;
|
||||
data_original = std::unique_ptr<Data>(new Data(row,dim));
|
||||
|
||||
std::vector<std::pair<int,value_t>> dummy_mobius_point;
|
||||
for(int i = 0;i < dim;++i)
|
||||
dummy_mobius_point.push_back(std::make_pair(i,0));
|
||||
|
||||
//idx += build_idx_offset;
|
||||
|
||||
for(int i = 0;i < row - 1;++i){
|
||||
|
||||
std::vector<std::pair<int,value_t>> point;
|
||||
point.reserve(dim);
|
||||
for(int j = 0;j < dim;++j)
|
||||
point.push_back(std::make_pair(j,dense_mat[i * dim + j]));
|
||||
|
||||
data_original->add(i,point);
|
||||
data->add_mobius(i,point);
|
||||
if(i < 1000){
|
||||
graph->add_vertex(i,point);
|
||||
}else{
|
||||
add_buffer.push_back(std::make_pair(i,point));
|
||||
}
|
||||
if(add_buffer.size() >= 1000000)
|
||||
flush_add_buffer(add_buffer,graph.get());
|
||||
}
|
||||
flush_add_buffer(add_buffer,graph.get());
|
||||
graph->add_vertex(row - 1,dummy_mobius_point);
|
||||
data.swap(data_original);
|
||||
|
||||
std::string str = std::string(prefix);
|
||||
data->dump(str + ".data");
|
||||
graph->dump(str + ".graph");
|
||||
|
||||
}
|
||||
|
||||
void load_mobius_index_prefix(int row,int dim,IndexContext* index_context,const char* prefix){
|
||||
std::string str = std::string(prefix);
|
||||
|
||||
++row;
|
||||
Data* data = new Data(row,dim);
|
||||
GraphWrapper* graph = new FixedDegreeGraph<1>(data);
|
||||
|
||||
//idx += build_idx_offset;
|
||||
data->load(str + ".data");
|
||||
graph->load(str + ".graph");
|
||||
|
||||
((FixedDegreeGraph<1>*)graph)->search_start_point = row - 1;
|
||||
((FixedDegreeGraph<1>*)graph)->ignore_startpoint = true;
|
||||
|
||||
index_context->graph = graph;
|
||||
index_context->data = data;
|
||||
}
|
||||
|
||||
// Writes the data and graph held by *index_context to "<prefix>.data" and
// "<prefix>.graph".
void save_mobius_index_prefix(IndexContext* index_context,const char* prefix){
    const std::string base(prefix);

    Data* const stored_data = static_cast<Data*>(index_context->data);
    GraphWrapper* const stored_graph = static_cast<GraphWrapper*>(index_context->graph);

    stored_data->dump(base + ".data");
    stored_graph->dump(base + ".graph");
}
|
||||
|
||||
void search_mobius_index(float* dense_vec,int dim,int search_budget,int return_k, IndexContext* index_context,idx_t* ret_id,double* ret_score){
|
||||
int topk = 0;
|
||||
int display_topk = 1;
|
||||
int build_idx_offset = 0;
|
||||
int query_idx_offset = 0;
|
||||
|
||||
Data* data = reinterpret_cast<Data*>(index_context->data);
|
||||
GraphWrapper* graph = reinterpret_cast<GraphWrapper*>(index_context->graph);
|
||||
|
||||
|
||||
//auto flag = (data==NULL);
|
||||
//std::cout<<flag<<std::endl;
|
||||
|
||||
std::vector<std::pair<int,value_t>> point;
|
||||
point.reserve(dim);
|
||||
for(int j = 0;j < dim;++j)
|
||||
point.push_back(std::make_pair(j,dense_vec[j]));
|
||||
std::vector<idx_t> topN;
|
||||
std::vector<double> score;
|
||||
graph->search_top_k_with_score(point,search_budget,topN,score);
|
||||
for(int i = 0;i < topN.size() && i < return_k;++i){
|
||||
ret_id[i] = topN[i];
|
||||
ret_score[i] = score[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// For L2 index
|
||||
void build_l2_index(float* dense_mat,int row,int dim, int pq_size, const char* prefix){
|
||||
std::unique_ptr<Data> data;
|
||||
std::unique_ptr<GraphWrapper> graph;
|
||||
int topk = 0;
|
||||
int display_topk = 1;
|
||||
int build_idx_offset = 0;
|
||||
int query_idx_offset = 0;
|
||||
|
||||
data = std::unique_ptr<Data>(new Data(row,dim));
|
||||
graph = std::unique_ptr<GraphWrapper>(new FixedDegreeGraph<3>(data.get()));
|
||||
graph->set_construct_pq_size(pq_size);
|
||||
|
||||
std::vector<std::pair<idx_t,std::vector<std::pair<int,value_t>>>> add_buffer;
|
||||
|
||||
for(int i = 0;i < row;++i){
|
||||
std::vector<std::pair<int,value_t>> point;
|
||||
point.reserve(dim);
|
||||
for(int j = 0;j < dim;++j)
|
||||
point.push_back(std::make_pair(j,dense_mat[i * dim + j]));
|
||||
data->add(i,point);
|
||||
if(i < 1000){
|
||||
graph->add_vertex(i,point);
|
||||
}else{
|
||||
add_buffer.push_back(std::make_pair(i,point));
|
||||
}
|
||||
if(add_buffer.size() >= 1000000)
|
||||
flush_add_buffer(add_buffer,graph.get());
|
||||
}
|
||||
flush_add_buffer(add_buffer,graph.get());
|
||||
|
||||
std::string str = std::string(prefix);
|
||||
data->dump(str + ".data");
|
||||
graph->dump(str + ".graph");
|
||||
|
||||
}
|
||||
|
||||
void load_l2_index_prefix(int row,int dim,IndexContext* index_context,const char* prefix){
|
||||
std::string str = std::string(prefix);
|
||||
|
||||
Data* data = new Data(row,dim);
|
||||
GraphWrapper* graph = new FixedDegreeGraph<3>(data);
|
||||
|
||||
//idx += build_idx_offset;
|
||||
|
||||
data->load(str + ".data");
|
||||
graph->load(str + ".graph");
|
||||
|
||||
index_context->graph = graph;
|
||||
index_context->data = data;
|
||||
}
|
||||
|
||||
// Writes the data and graph held by *index_context to "<prefix>.data" and
// "<prefix>.graph".
void save_l2_index_prefix(IndexContext* index_context,const char* prefix){
    const std::string base(prefix);

    Data* const stored_data = static_cast<Data*>(index_context->data);
    GraphWrapper* const stored_graph = static_cast<GraphWrapper*>(index_context->graph);

    stored_data->dump(base + ".data");
    stored_graph->dump(base + ".graph");
}
|
||||
|
||||
|
||||
|
||||
void search_l2_index(float* dense_vec,int dim,int search_budget,int return_k, IndexContext* index_context,idx_t* ret_id,double* ret_score){
|
||||
int topk = 0;
|
||||
int display_topk = 1;
|
||||
int build_idx_offset = 0;
|
||||
int query_idx_offset = 0;
|
||||
|
||||
Data* data = reinterpret_cast<Data*>(index_context->data);
|
||||
GraphWrapper* graph = reinterpret_cast<GraphWrapper*>(index_context->graph);
|
||||
|
||||
std::vector<std::pair<int,value_t>> point;
|
||||
point.reserve(dim);
|
||||
for(int j = 0;j < dim;++j)
|
||||
point.push_back(std::make_pair(j,dense_vec[j]));
|
||||
std::vector<idx_t> topN;
|
||||
std::vector<double> score;
|
||||
graph->search_top_k_with_score(point,search_budget,topN,score);
|
||||
for(int i = 0;i < topN.size() && i < return_k;++i){
|
||||
// printf("%d: (%zu, %f)\n",i,topN[i],score[i]);
|
||||
ret_id[i] = topN[i];
|
||||
ret_score[i] = score[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Destroys the graph and data objects owned by *index_context and resets
// both handles to null, so releasing the same context again is harmless.
// A null index_context is ignored.
void release_context(IndexContext* index_context){
    if(index_context == nullptr)
        return;

    // The graph is constructed with a pointer to the Data object (see the
    // load_* functions in this file), so destroy the graph first.
    // NOTE(review): deleting through GraphWrapper* relies on GraphWrapper
    // having a virtual destructor -- confirm in src/graph.h.
    delete static_cast<GraphWrapper*>(index_context->graph);
    delete static_cast<Data*>(index_context->data);

    index_context->graph = nullptr;
    index_context->data = nullptr;
}
|
||||
|
||||
} // extern "C"
|
||||
|
|
@ -0,0 +1,166 @@
|
|||
import ctypes
|
||||
import paddle
|
||||
import numpy.ctypeslib as ctl
|
||||
import numpy as np
|
||||
import os
|
||||
import json
|
||||
|
||||
from ctypes import *
|
||||
from numpy.ctypeslib import ndpointer
|
||||
|
||||
# Prefer the index.so that sits next to this module so the import also works
# when the process runs from another working directory; fall back to the
# original cwd-relative location otherwise.
_lib_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.so")
if not os.path.exists(_lib_path):
    _lib_path = "./index.so"
lib = ctypes.cdll.LoadLibrary(_lib_path)
|
||||
|
||||
class IndexContext(Structure):
    """ctypes structure holding two opaque pointers: ``graph`` and ``data``."""

    _fields_ = [
        ("graph", c_void_p),
        ("data", c_void_p),
    ]
|
||||
|
||||
# for mobius IP index
|
||||
# Argument types shared by several native entry points.
_F32_ARRAY = ctl.ndpointer(np.float32, flags='aligned, c_contiguous')
_U64_ARRAY = ctl.ndpointer(np.uint64, flags='aligned, c_contiguous')
_F64_ARRAY = ctl.ndpointer(np.float64, flags='aligned, c_contiguous')
_CONTEXT_PTR = POINTER(IndexContext)


def _bind(symbol, argtypes):
    """Look up ``symbol`` in the shared library and declare its signature.

    Every exported function returns nothing, so ``restype`` is always None.
    """
    native_fn = getattr(lib, symbol)
    native_fn.restype = None
    native_fn.argtypes = argtypes
    return native_fn


# for mobius IP index
build_mobius_index = _bind(
    "build_mobius_index",
    [_F32_ARRAY, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_double, ctypes.c_char_p])
search_mobius_index = _bind(
    "search_mobius_index",
    [_F32_ARRAY, ctypes.c_int, ctypes.c_int, ctypes.c_int, _CONTEXT_PTR, _U64_ARRAY, _F64_ARRAY])
load_mobius_index_prefix = _bind(
    "load_mobius_index_prefix",
    [ctypes.c_int, ctypes.c_int, _CONTEXT_PTR, ctypes.c_char_p])
save_mobius_index_prefix = _bind(
    "save_mobius_index_prefix",
    [_CONTEXT_PTR, ctypes.c_char_p])

# for L2 index
build_l2_index = _bind(
    "build_l2_index",
    [_F32_ARRAY, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_char_p])
search_l2_index = _bind(
    "search_l2_index",
    [_F32_ARRAY, ctypes.c_int, ctypes.c_int, ctypes.c_int, _CONTEXT_PTR, _U64_ARRAY, _F64_ARRAY])
load_l2_index_prefix = _bind(
    "load_l2_index_prefix",
    [ctypes.c_int, ctypes.c_int, _CONTEXT_PTR, ctypes.c_char_p])
save_l2_index_prefix = _bind(
    "save_l2_index_prefix",
    [_CONTEXT_PTR, ctypes.c_char_p])

# Frees the native objects referenced by an IndexContext.
release_context = _bind("release_context", [_CONTEXT_PTR])
|
||||
|
||||
|
||||
|
||||
class Graph_Index(object):
    """
    Graph index backed by the native ``index.so`` library.

    ``dist_type`` selects the native entry points: "IP" uses the
    ``*_mobius_*`` functions, "L2" uses the ``*_l2_*`` functions.
    """

    def __init__(self, dist_type="IP"):
        # Validate first so an unsupported type fails before any state exists.
        assert dist_type in ["IP", "L2"], "Only support IP and L2 distance ..."
        self.dim = 0
        self.total_num = 0
        self.dist_type = dist_type
        self.mobius_pow = 2.0
        self.index_context = IndexContext(0,0)
        self.gallery_doc_dict = {}
        self.with_attr = False

    def _index_prefix(self, index_path):
        """Return ``<index_path>/index`` as a C string buffer for the native calls."""
        return ctypes.create_string_buffer((index_path + "/index").encode('utf-8'))

    def _require_index(self):
        """Raise instead of letting native code dereference a null handle."""
        if not self.index_context.graph:
            raise RuntimeError("index is empty: call build() or load() first")

    def release(self):
        """
        free the native graph/data held by this object, if any
        """
        if self.index_context.graph or self.index_context.data:
            release_context(ctypes.byref(self.index_context))
            # Clear the handles on the Python side as well so that a later
            # call never sees pointers to freed objects.
            self.index_context.graph = None
            self.index_context.data = None

    def build(self, gallery_vectors, gallery_docs=None, pq_size=100, index_path='graph_index/'):
        """
        build index

        gallery_vectors: 2D array (or paddle tensor), one vector per row;
            converted to a C-contiguous float32 array for the native call.
        gallery_docs: optional sequence with one entry per vector; when given,
            search() returns these entries instead of numeric ids.
        pq_size: forwarded to the native build function.
        index_path: directory that receives the index files and info.json.
        """
        if gallery_docs is None:
            gallery_docs = []
        if paddle.is_tensor(gallery_vectors):
            gallery_vectors = gallery_vectors.numpy()
        # The native side reads raw float32 memory, so normalise dtype/layout.
        gallery_vectors = np.ascontiguousarray(gallery_vectors, dtype=np.float32)
        assert gallery_vectors.ndim == 2, "Input vector must be 2D ..."

        total_num = gallery_vectors.shape[0]
        assert len(gallery_docs) in (0, total_num), "gallery_docs must be empty or have one entry per vector ..."

        self.total_num = total_num
        self.dim = gallery_vectors.shape[1]

        print("training index -> num: {}, dim: {}, dist_type: {}".format(self.total_num, self.dim, self.dist_type))

        os.makedirs(index_path, exist_ok=True)

        # Drop a previously built/loaded index before the handles are overwritten.
        self.release()

        if self.dist_type == "IP":
            build_mobius_index(gallery_vectors,self.total_num,self.dim, pq_size, self.mobius_pow, self._index_prefix(index_path))
            load_mobius_index_prefix(self.total_num, self.dim, ctypes.byref(self.index_context), self._index_prefix(index_path))
        else:
            build_l2_index(gallery_vectors,self.total_num,self.dim, pq_size, self._index_prefix(index_path))
            load_l2_index_prefix(self.total_num, self.dim, ctypes.byref(self.index_context), self._index_prefix(index_path))

        # Recomputed on every build so a rebuild without docs does not keep
        # a stale True from an earlier build.
        self.with_attr = len(gallery_docs) > 0
        self.gallery_doc_dict = {str(i): doc for i, doc in enumerate(gallery_docs)}

        self.gallery_doc_dict["total_num"] = self.total_num
        self.gallery_doc_dict["dim"] = self.dim
        self.gallery_doc_dict["dist_type"] = self.dist_type
        self.gallery_doc_dict["with_attr"] = self.with_attr

        with open(index_path + "/info.json", "w") as f:
            json.dump(self.gallery_doc_dict, f)

        print("finished creating index ...")

    def search(self, query, return_k=10, search_budget=100):
        """
        search

        Returns ``(ret_score, ret_doc)`` when the index was built with docs,
        otherwise ``(ret_score, ret_id)``. ``ret_score`` is a float64 numpy
        array of length ``return_k``; ids/docs are Python lists. Slots the
        native search does not fill keep their initial zero value.
        """
        self._require_index()

        if paddle.is_tensor(query):
            query = query.numpy()
        # The native side reads `dim` raw float32 values from the query.
        query = np.ascontiguousarray(query, dtype=np.float32)
        assert query.size >= self.dim, "Query must contain at least `dim` values ..."

        ret_id = np.zeros(return_k, dtype=np.uint64)
        ret_score = np.zeros(return_k, dtype=np.float64)

        if self.dist_type == "IP":
            search_mobius_index(query,self.dim,search_budget,return_k,ctypes.byref(self.index_context),ret_id,ret_score)
        else:
            search_l2_index(query,self.dim,search_budget,return_k,ctypes.byref(self.index_context),ret_id,ret_score)

        ret_id = ret_id.tolist()
        if self.with_attr:
            ret_doc = [self.gallery_doc_dict[str(doc_id)] for doc_id in ret_id]
            return ret_score, ret_doc
        return ret_score, ret_id

    def dump(self, index_path):
        """
        write the native index files and info.json into index_path
        """
        self._require_index()

        os.makedirs(index_path, exist_ok=True)

        if self.dist_type == "IP":
            save_mobius_index_prefix(ctypes.byref(self.index_context), self._index_prefix(index_path))
        else:
            save_l2_index_prefix(ctypes.byref(self.index_context), self._index_prefix(index_path))

        with open(index_path + "/info.json", "w") as f:
            json.dump(self.gallery_doc_dict, f)

    def load(self, index_path):
        """
        load an index previously written by build() or dump()
        """
        # Read the metadata first so a missing file leaves this object untouched.
        with open(index_path + "/info.json", "r") as f:
            info = json.load(f)

        self.gallery_doc_dict = info
        self.total_num = info["total_num"]
        self.dim = info["dim"]
        self.dist_type = info["dist_type"]
        self.with_attr = info["with_attr"]

        # Drop a previously built/loaded index before the handles are overwritten.
        self.release()

        if self.dist_type == "IP":
            load_mobius_index_prefix(self.total_num,self.dim,ctypes.byref(self.index_context), self._index_prefix(index_path))
        else:
            load_l2_index_prefix(self.total_num,self.dim,ctypes.byref(self.index_context), self._index_prefix(index_path))
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
import numpy as np
|
||||
from interface import *
|
||||
|
||||
# Generate random samples
NUM_VECTORS = 100000
DIM = 128
index_vectors = np.random.rand(NUM_VECTORS, DIM).astype(np.float32)
query_vector = np.random.rand(DIM).astype(np.float32)
index_docs = ["ID_" + str(doc_no) for doc_no in range(NUM_VECTORS)]

# Initialise the index structure ("IP" and "L2" are supported)
indexer = Graph_Index(dist_type="IP")
indexer.build(
    gallery_vectors=index_vectors,
    gallery_docs=index_docs,
    pq_size=100,
    index_path='test',
)

# Query
scores, docs = indexer.search(query=query_vector, return_k=10, search_budget=100)
print(scores)
print(docs)

# Save and load
indexer.dump(index_path="test")
indexer.load(index_path="test")
|
Loading…
Reference in New Issue