Similarity vector computation
Limit the PyTorch thread count to 4:
from transformers import AutoTokenizer, AutoModel
import torch
torch.set_num_threads(4)
model_id = "models/bge-small-zh-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_id)
bge_model = AutoModel.from_pretrained(model_id)
def compute_vectors(model, texts, batchsize=500):
    # NOTE: uses the module-level `tokenizer` defined above
    n_batch = (len(texts) + batchsize - 1) // batchsize
    vectors = []
    for i in range(n_batch):
        encoded_input = tokenizer(texts[i*batchsize:(i+1)*batchsize], padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**encoded_input)
        # take the [CLS] token embedding as the sentence embedding
        sentence_embeddings = model_output[0][:, 0]
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)  # normalize embeddings
        vectors += sentence_embeddings.numpy().astype('float32').tolist()
    return vectors
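Because the embeddings are L2-normalized, cosine similarity reduces to a plain dot product. A minimal sketch (the two example sentences are made up for illustration):
import numpy as np

# hypothetical sentence pair for illustration
vecs = compute_vectors(bge_model, ["如何更换手机卡", "怎么换SIM卡"])
v1, v2 = np.array(vecs[0]), np.array(vecs[1])
print(float(np.dot(v1, v2)))  # normalized vectors: dot product == cosine similarity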
Test the vector computation speed
text = ["全球领先的信息与通信技术(ICT)解决方案供应商,专注于ICT领域,坚持稳健经营、持续创新、开放合作,在电信运营商、企业、终端和云计算等领域构筑了端到端的解决方案优势"] * 100
%timeit compute_vectors(bge_model, text)
# 1.25 s ± 16.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Convert to ONNX format
Install the optimum[exporters] package, using the Aliyun mirror to speed up the download.
!pip install optimum[exporters] -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
import torch
import os
onnx_path = os.path.join(model_id, 'onnx')
# load the vanilla transformers model and convert it to ONNX
# (newer optimum releases use `export=True` instead of the deprecated `from_transformers=True`)
model = ORTModelForFeatureExtraction.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# save onnx checkpoint and tokenizer
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)
"""
('models/bge-small-zh-v1.5/onnx/tokenizer_config.json',
'models/bge-small-zh-v1.5/onnx/special_tokens_map.json',
'models/bge-small-zh-v1.5/onnx/vocab.txt',
'models/bge-small-zh-v1.5/onnx/added_tokens.json',
'models/bge-small-zh-v1.5/onnx/tokenizer.json')
"""
Test the ONNX model speed:
from onnxruntime import SessionOptions
options = SessionOptions() # initialize session options
options.intra_op_num_threads = 4 # set the number of threads
# load onnx model
onnx_model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model.onnx", session_options=options)
%timeit compute_vectors(onnx_model, text)
# 1.23 s ± 37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
As you can see, the ONNX model's speed is virtually unchanged from the original model.
ONNX optimized
Note the optimization level: levels above 1 make the optimized model hardware-dependent.
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
# create ORTOptimizer and define optimization configuration
optimizer = ORTOptimizer.from_pretrained(model)
optimization_config = OptimizationConfig(optimization_level=1) # enable basic optimizations
# apply the optimization configuration to the model
optimizer.optimize(
    save_dir=onnx_path,
    optimization_config=optimization_config,
)
"""
optimization_level (`int`, defaults to 1):
| Optimization level performed by ONNX Runtime of the loaded graph.
| Supported optimization level are 0, 1, 2 and 99.
| - 0: will disable all optimizations
| - 1: will enable basic optimizations
| - 2: will enable basic and extended optimizations, including complex node fusions applied to the nodes
| assigned to the CPU or CUDA execution provider, making the resulting optimized graph hardware dependent
| - 99: will enable all available optimizations including layout optimizations
"""
Test the optimized model
from onnxruntime import SessionOptions
options = SessionOptions() # initialize session options
options.intra_op_num_threads = 4 # set the number of threads
# load optimized model
optimized_model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model_optimized.onnx", session_options=options)
%timeit compute_vectors(optimized_model, text)
# 1.19 s ± 31.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
The graph optimization doesn't help much either.
ONNX optimized + quantized
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
# create ORTQuantizer and define quantization configuration
dynamic_quantizer = ORTQuantizer.from_pretrained(optimized_model)
dqconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=False) # the AVX2 instruction set is more widely available than AVX-512
# apply the quantization configuration to the model
model_quantized_path = dynamic_quantizer.quantize(
    save_dir=onnx_path,
    quantization_config=dqconfig,
)
AutoQuantizationConfig offers these presets:
- arm64
- avx2
- avx512
- avx512_vnni
- tensorrt
As you'd expect, each preset targets a different hardware instruction set.
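To pick the right preset you need to know which instruction sets the deployment CPU supports. One way to check is with the third-party py-cpuinfo package (an assumption here, not used in the original post):
# pip install py-cpuinfo
import cpuinfo

flags = set(cpuinfo.get_cpu_info()['flags'])
# flag names are lowercase, e.g. 'avx2', 'avx512f', 'avx512_vnni'
print('avx2' in flags, 'avx512f' in flags, 'avx512_vnni' in flags)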
Test the quantized model
from onnxruntime import SessionOptions
options = SessionOptions() # initialize session options
options.intra_op_num_threads = 4 # set the number of threads
# load the quantized model
quantized_model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model_optimized_quantized.onnx", session_options=options)
%timeit compute_vectors(quantized_model, text)
# 590 ms ± 33.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Speed roughly doubles.
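Dynamic quantization trades some accuracy for that speed, so it's worth checking how far the quantized embeddings drift from the originals. A minimal sketch (the acceptable similarity threshold depends on your application):
import numpy as np

ref = np.array(compute_vectors(bge_model, text[:10]))
quant = np.array(compute_vectors(quantized_model, text[:10]))
# both sets are L2-normalized, so row-wise dot products are cosine similarities
cos = (ref * quant).sum(axis=1)
print(cos.min())  # values close to 1.0 indicate small quantization loss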