|
import os |
|
import numpy as np |
|
|
|
def load_aligned_bin(file_path, dtype=np.int8, alignment=8):
    """
    Read a DiskANN-style binary vector file (e.g. bigann_query.bin) into a
    zero-padded array whose row width is a multiple of ``alignment``.

    The file is expected to hold two int32 header values (number of vectors,
    vector dimension) followed by ``npts * dim`` elements of ``dtype`` stored
    row by row.

    Args:
        file_path (str): Path of the binary file.
        dtype (np.dtype): Element type of the stored vectors. Defaults to
            np.int8.
        alignment (int): The returned row width is ``dim`` rounded up to a
            multiple of this value. Defaults to 8. Datasets are read with
            alignment; result files are saved without any alignment
            requirement, so they do not need this function.

    Returns:
        tuple: ``(data, npts, dim, rounded_dim)`` where ``data`` is an
        ``(npts, rounded_dim)`` array of ``dtype`` whose columns past ``dim``
        are zero, ``npts`` is the number of vectors, ``dim`` the dimension
        stored in the file and ``rounded_dim`` the padded dimension.

    Raises:
        ValueError: If ``alignment`` is not positive, the header is truncated
            or negative, the file size does not match the header, or the
            payload ends early.
    """
    if alignment < 1:
        raise ValueError(f"alignment 必须为正整数, 实际为 {alignment}")

    item_size = np.dtype(dtype).itemsize
    actual_file_size = os.path.getsize(file_path)

    with open(file_path, 'rb') as f:
        # Read both header fields at once so a truncated file is reported
        # explicitly instead of failing on an index into an empty array.
        header = np.fromfile(f, dtype=np.int32, count=2)
        if header.size != 2:
            raise ValueError("文件头不完整: 需要 2 个 int32 (向量数量和维度)")

        # Plain Python ints so the size arithmetic below cannot wrap.
        npts, dim = int(header[0]), int(header[1])
        if npts < 0 or dim < 0:
            raise ValueError(f"文件头无效: #向量 = {npts}, #维度 = {dim}")

        expected_file_size = 2 * 4 + npts * dim * item_size
        if actual_file_size != expected_file_size:
            raise ValueError(f"文件大小不匹配。实际大小: {actual_file_size} 字节,"
                             f"预期大小: {expected_file_size} 字节")

        # Round the dimension up to the next multiple of `alignment`.
        rounded_dim = ((dim + alignment - 1) // alignment) * alignment

        print(f"元数据: #向量 = {npts}, #维度 = {dim}, 对齐维度 = {rounded_dim}")
        print(f"分配内存: {npts * rounded_dim * item_size} 字节")

        # Zero-initialised so the padding columns stay 0.
        data = np.zeros((npts, rounded_dim), dtype=dtype)

        if dim > 0:
            # Read many rows per call instead of one: far fewer Python-level
            # iterations, while the temporary buffer stays bounded (~64 MiB)
            # so peak memory is not doubled for very large files.
            rows_per_chunk = max(1, (64 * 1024 * 1024) // max(1, dim * item_size))
            for start in range(0, npts, rows_per_chunk):
                rows = min(rows_per_chunk, npts - start)
                chunk = np.fromfile(f, dtype=dtype, count=rows * dim)
                if chunk.size != rows * dim:
                    raise ValueError(
                        f"数据读取不完整: 第 {start} 行起预期 {rows * dim} 个元素, "
                        f"实际读取 {chunk.size} 个")
                data[start:start + rows, :dim] = chunk.reshape(rows, dim)

    print("数据读取完成")

    return data, npts, dim, rounded_dim
|
|
|
|
|
|
|
def load_bin(filename, dtype=np.float32):
    """
    Read a binary file stored in the save_bin layout: two int32 header values
    (npts, ndims) followed by ``npts * ndims`` elements of ``dtype``.

    Args:
        filename (str): Path of the binary file.
        dtype (np.dtype): Element type of the payload; choose np.uint32 or
            np.float32 according to the file contents. Defaults to
            np.float32.

    Returns:
        tuple: ``(data, npts, ndims)`` where ``data`` is an ``(npts, ndims)``
        array of ``dtype`` and ``npts`` / ``ndims`` are Python ints taken
        from the header.

    Raises:
        ValueError: If the header is truncated or negative, or the payload
            holds fewer than ``npts * ndims`` elements.
    """
    with open(filename, 'rb') as f:
        # Read both header fields at once so a truncated file is reported
        # explicitly instead of failing on an index into an empty array.
        header = np.fromfile(f, dtype=np.int32, count=2)
        if header.size != 2:
            raise ValueError("文件头不完整: 需要 2 个 int32 (向量数量和维度)")

        # Convert to Python ints: multiplying two np.int32 scalars wraps
        # around once npts * ndims exceeds the int32 range, which would hand
        # np.fromfile a wrong element count for large files.
        npts, ndims = int(header[0]), int(header[1])
        if npts < 0 or ndims < 0:
            raise ValueError(f"文件头无效: #向量 = {npts}, #维度 = {ndims}")

        print(f"读取元数据: #向量 = {npts}, #维度 = {ndims}")

        total = npts * ndims
        data = np.fromfile(f, dtype=dtype, count=total)

    # np.fromfile returns fewer items on a short file; fail with a clear
    # message rather than an opaque reshape error.
    if data.size != total:
        raise ValueError(f"数据读取不完整: 预期 {total} 个元素, 实际读取 {data.size} 个")

    data = data.reshape(npts, ndims)

    print(f"成功读取 {data.shape[0]} 个向量,每个向量维度为 {data.shape[1]}")
    print(f"数据类型: {data.dtype}")

    return data, npts, ndims
|
|
|
|
|
if __name__ == "__main__":
    # Ad-hoc inspection script: load the search results and the query set,
    # print a short preview, then drop into the debugger.

    # Result files: neighbour indices (uint32) and distances (float32).
    idx_filename = "/home/myw/wuchangli/yk/diskann_demo/DiskANN/build/data_backup_clean_test/res_20_idx_uint32.bin"
    idx_data, npts, ndims = load_bin(idx_filename, dtype=np.uint32)

    dist_filename = "/home/myw/wuchangli/yk/diskann_demo/DiskANN/build/data_backup_clean_test/res_20_dists_float.bin"
    dist_data, _, _ = load_bin(dist_filename, dtype=np.float32)

    # Query vectors, read with dimension alignment.
    file_path = "/home/myw/wuchangli/yk/diskann_demo/DiskANN/build/data_backup_clean_test/bigann_query.bin"
    query_data, query_npts, query_dim, query_rounded_dim = load_aligned_bin(file_path)

    # Preview at most the first five queries.
    print("\n示例结果:")
    preview_count = min(5, npts)
    for i in range(preview_count):
        print(f"查询 {i}:")
        print(f" 最近邻索引: {idx_data[i, :]}")
        print(f" 最近邻距离: {dist_data[i, :]}")

    # NOTE(review): breakpoint appears intentional, for interactive inspection
    # of the arrays loaded above - confirm before removing.
    import pdb
    pdb.set_trace()

    print('load learn vectors')
    base_path = "/home/myw/wuchangli/yk/diskann_demo/DiskANN/build/data_backup_clean_test/bigann_learn.bin"
    base_vectors, base_npts, base_dim, base_rounded_dim = load_aligned_bin(base_path)

    # NOTE(review): second inspection point, after the learn vectors load.
    import pdb
    pdb.set_trace()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|