# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import ctypes
# Preload the TensorRT 7.2.3.4 and CUDA 11.1 shared libraries with RTLD_GLOBAL
# before the `cuda` and `tensorrt` Python packages are imported below.
# NOTE(review): every path is an absolute, machine-specific install location;
# the script fails right here on any host that is laid out differently.
# NOTE(review): these paths pin TensorRT 7.2.3.4, while the version of the
# `tensorrt` Python package imported below is not visible from this file --
# confirm that the two actually match.
ctypes.CDLL("/usr/local/TensorRT-7.2.3.4/lib/libnvinfer.so.7", mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL("/usr/local/TensorRT-7.2.3.4/targets/x86_64-linux-gnu/lib/libnvonnxparser.so.7", mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL("/usr/local/TensorRT-7.2.3.4/targets/x86_64-linux-gnu/lib/libnvparsers.so.7", mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL("/usr/local/cuda-11.1/targets/x86_64-linux/lib/libcudart.so", mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL("/usr/local/cuda-11.1/targets/x86_64-linux/lib/libaccinj64.so.11.1", mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL("/usr/local/cuda-11.1/targets/x86_64-linux/lib/libcublas.so", mode=ctypes.RTLD_GLOBAL)
from cuda import cudart
import numpy as np
import os
import tensorrt as trt
# Shared library of the custom plugin; run() loads it with ctypes.
# Other plugin builds that were previously tried in this slot:
#   /home/pdd/MPI/AddScalarPlugin/cmake-build-debug/libAddScalarPlugin.so
#   /home/pdd/MPI/trt-custom-plugin-master/geluPluginv2/cmake-build-debug/libGeluPlugin.so
soFile = "/home/pdd/MPI/Swish-Plugin-TensorRT--master/Swish/cmake-build-debug/libswish.so"
# Compact array printing for the diagnostic helpers below.
np.set_printoptions(precision=3, linewidth=100, suppress=True)
# Fixed NumPy seed (no np.random call is visible in this file).
np.random.seed(31193)
# NOTE(review): presumably here to force CUDA initialisation before any
# TensorRT call -- confirm; the returned status is ignored.
cudart.cudaDeviceSynchronize()
def printArrayInfomation(x, info="", n=5):
    """Print a one-line statistical summary of ``x`` and a peek at its ends.

    The first printed line contains ``info``, the shape, the sum of absolute
    values, the variance, the maximum, the minimum and the sum of absolute
    differences between consecutive elements of the flattened array.  The
    second line shows the first ``n`` and the last ``n`` flattened elements.

    Args:
        x: NumPy array to describe.
        info: Label placed at the start of the summary line.
        n: Number of leading and trailing elements to display.
    """
    shapeText = str(x.shape)
    sumAbs = np.sum(abs(x))
    variance = np.var(x)
    largest = np.max(x)
    smallest = np.min(x)
    flat = x.reshape(-1)
    # Sum of absolute differences between neighbouring flattened elements.
    sad = np.sum(np.abs(np.diff(flat)))
    summary = '%s:%s,SumAbs=%.5e,Var=%.5f,Max=%.5f,Min=%.5f,SAD=%.5f' % (
        info, shapeText, sumAbs, variance, largest, smallest, sad)
    print(summary)
    print('\t', flat[:n], flat[-n:])
def check(a, b, weak=False, checkEpsilon=1e-5):
    """Compare two arrays and print the verdict with the largest differences.

    Args:
        a: Array under test.
        b: Reference array; also the denominator of the relative difference.
        weak: When true, the arrays pass if every absolute difference is below
            ``checkEpsilon``; otherwise they must be element-wise equal.
        checkEpsilon: Tolerance for the weak comparison; it is also added to
            ``|b|`` so the relative difference never divides by zero.

    Nothing is returned; the result is only printed.
    """
    absErr = np.abs(a - b)
    passed = np.all(absErr < checkEpsilon) if weak else np.all(a == b)
    maxAbs = np.max(absErr)
    maxRel = np.max(absErr / (np.abs(b) + checkEpsilon))
    print("check:%s, absDiff=%f, relDiff=%f" % (passed, maxAbs, maxRel))
def addScalarCPU(inputH, scalar):
    """CPU reference: add ``scalar`` to the first entry of ``inputH``.

    Args:
        inputH: Indexable collection of host inputs; only element 0 is used.
        scalar: Value added to that first input.

    Returns:
        A one-element list holding ``inputH[0] + scalar``.
    """
    shifted = inputH[0] + scalar
    return [shifted]
def getAddScalarPlugin(scalar):
    """Look up the "Swish_TRT" creator in the TensorRT plugin registry and build a plugin.

    The name of every creator visited is printed until the match is found.
    The plugin is created with an empty field collection.

    Args:
        scalar: Currently unused -- the plugin field that would carry it is
            disabled, so it has no effect on the created plugin.

    Returns:
        The object returned by the creator's ``create_plugin``, or ``None``
        when no creator named "Swish_TRT" is registered.
    """
    registry = trt.get_plugin_registry()
    for creator in registry.plugin_creator_list:
        print(creator.name)
        if creator.name != "Swish_TRT":  # alternative previously tried: "LReLU_TRT"
            continue
        print("*-"*1000)
        # Passing None instead of a field collection to create_plugin ended in
        # a segmentation fault (core dumped), hence the explicit empty one.
        print("*"*1000)
        emptyFields = trt.PluginFieldCollection([])
        return creator.create_plugin(creator.name, emptyFields)
    return None
def run(shape, scalar):
    """Build or load a single-plugin engine, execute it once and compare with the CPU reference.

    If ``./model-Dim<rank>.plan`` exists it is deserialized; otherwise a
    network consisting of one plugin layer (obtained from
    ``getAddScalarPlugin``) is built, serialized to that file and then
    deserialized.  The engine is executed once on an ``arange`` input and the
    first output is compared with ``addScalarCPU`` through ``check``.

    Args:
        shape: Sequence of ints.  Its length selects the plan file name; the
            full shape is set on the execution context and used for the host
            input buffer.
        scalar: Forwarded to ``getAddScalarPlugin`` and ``addScalarCPU``.

    Returns:
        None.  Returns early, after printing a message, when the engine cannot
        be loaded or built, or when the plugin cannot be created.
    """
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Dim%s.plan" % str(len(shape))
    print("Test %s" % testCase)

    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    # Loading the plugin library is what makes its creator visible to the registry lookup below.
    ctypes.cdll.LoadLibrary(soFile)

    if os.path.isfile(trtFile):
        with open(trtFile, "rb") as f:
            engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        if engine is None:
            print("Failed loading engine!")
            return
        print("Succeeded loading engine!")
    else:
        BATCH_SIZE = 1
        builder = trt.Builder(logger)
        # Flag bit 0; presumably NetworkDefinitionCreationFlag.EXPLICIT_BATCH -- confirm.
        network = builder.create_network(1 << 0)
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        # NOTE(review): the network input is hard-coded to (BATCH_SIZE, 3) no
        # matter what `shape` is, while the host input buffer further down is
        # built from `shape` -- confirm which of the two is intended.
        inputT0 = network.add_input("inputT0", trt.float32, (BATCH_SIZE, 3))
        profile.set_shape(inputT0.name, (BATCH_SIZE, 3), (BATCH_SIZE, 3), (BATCH_SIZE, 3))
        config.add_optimization_profile(profile)
        print(inputT0)
        print("--"*30)

        # Create the plugin once (it used to be created twice, once only for
        # printing) and stop cleanly when the creator is not registered.
        plugin = getAddScalarPlugin(scalar)
        print(plugin)
        if plugin is None:
            print("Failed creating plugin!")
            return
        pluginLayer = network.add_plugin_v2([inputT0], plugin)
        network.mark_output(pluginLayer.get_output(0))

        # NOTE(review): profile and config are re-created here, so the config
        # handed to build_engine carries an empty profile and the one set up
        # above is discarded -- kept as is, confirm it is intentional.
        # Newer TensorRT releases may need builder.build_serialized_network
        # instead of build_engine -- verify against the installed version.
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.add_optimization_profile(profile)
        builtEngine = builder.build_engine(network, config)
        # Test for a failed build before calling serialize(); otherwise a
        # failure surfaces as AttributeError on None and never reaches the
        # message below.
        engineString = None if builtEngine is None else builtEngine.serialize()
        if engineString is None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, "wb") as f:
            # write() needs the serialized bytes-like object, not the ICudaEngine itself.
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    # num_bindings is used instead of num_io_tensors, see
    # https://github.com/NVIDIA/trt-samples-for-hackathon-cn/issues/63
    nIO = engine.num_bindings
    lTensorName = [engine.get_tensor_name(i) for i in range(nIO)]
    nInput = [engine.get_tensor_mode(name) for name in lTensorName].count(trt.TensorIOMode.INPUT)

    context = engine.create_execution_context()
    context.set_input_shape(lTensorName[0], shape)

    # Host buffers: one generated input followed by one empty array per output.
    bufferH = [np.arange(np.prod(shape), dtype=np.float32).reshape(shape)]
    for i in range(nInput, nIO):
        bufferH.append(np.empty(context.get_tensor_shape(lTensorName[i]), dtype=trt.nptype(engine.get_tensor_dtype(lTensorName[i]))))

    bufferD = []
    try:
        for i in range(nIO):
            bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

        for i in range(nInput):
            cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

        for i in range(nIO):
            context.set_tensor_address(lTensorName[i], int(bufferD[i]))

        context.execute_async_v3(0)

        for i in range(nInput, nIO):
            cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

        # NOTE(review): the reference is still the add-scalar computation
        # although getAddScalarPlugin looks up "Swish_TRT" -- confirm that the
        # expected output matches the plugin under test.
        outputCPU = addScalarCPU(bufferH[:nInput], scalar)

        # For debugging, printArrayInfomation can be called here on the
        # entries of bufferH and outputCPU.
        check(bufferH[nInput:][0], outputCPU[0], True)
    finally:
        # Release every device buffer allocated so far, even if a step above raised.
        for b in bufferD:
            cudart.cudaFree(b)
    print("Test %s finish!\n" % testCase)
if __name__ == "__main__":
    import glob

    # Delete cached engine plans in the working directory so the engine is
    # rebuilt on every run.  Done with glob/os.remove rather than shelling out
    # to `rm -rf`, which is not portable and would also delete directories.
    for stalePlan in glob.glob("./*.plan"):
        if os.path.isfile(stalePlan) or os.path.islink(stalePlan):
            os.remove(stalePlan)

    run([32, 32], 1)
    # Other shapes that have been exercised with this script:
    # run([32], 1)
    # run([16, 16, 16], 1)
    # run([8, 8, 8, 8], 1)

    print("Test all finish!")