# Embeds pre-/post-processing steps (normalization, layout transpose, output
# casting and merging) directly into exported RTMDet/RTMPose ONNX models.
import re
|
|
|
|
import numpy as np
|
|
import onnx
|
|
from onnx import TensorProto, helper, numpy_helper
|
|
|
|
# ==================================================================================================

# Location of the mmdeploy-exported ONNX models.
base_path = "/RapidPoseTriangulation/extras/mmdeploy/exports/"

det_model_path1 = f"{base_path}rtmdet-nano_1x3x320x320.onnx"
det_model_path2 = f"{base_path}rtmdet-m_1x3x320x320.onnx"
pose_model_path1 = f"{base_path}rtmpose-m_Bx3x384x288.onnx"
pose_model_path2 = f"{base_path}rtmpose-m_1x3x384x288.onnx"
pose_model_path3 = f"{base_path}rtmpose-l_wb_Bx3x384x288.onnx"
pose_model_path4 = f"{base_path}rtmpose-l_wb_1x3x384x288.onnx"

# ImageNet normalization constants rewritten for an Add/Mul graph:
# normalized = (x + norm_mean) * norm_std  ==  (x - mean*255) / (std*255)
norm_mean = -(np.array([0.485, 0.456, 0.406]) * 255)
norm_std = 1.0 / (np.array([0.229, 0.224, 0.225]) * 255)
|
|
|
|
|
|
# ==================================================================================================


def add_steps_to_onnx(model_path):
    """Bake pre- and post-processing steps into an exported ONNX model.

    Prepends Cast -> Transpose -> Add(mean) -> Mul(1/std) nodes so the model
    accepts raw uint8 images in NHWC layout, renames the input to
    "image_input", casts every graph output to fp32, and — for paths
    containing "det" or "pose" — merges the model's two outputs into one
    tensor. The result is saved next to the source file with the filename
    dims rotated to NHWC and an "_extra-steps" suffix.

    Args:
        model_path: Path to the source .onnx file. Filename substrings
            select behavior: "fp16" -> internal fp16 dtype,
            "det"/"pose" -> which pair of outputs gets merged.
    """
    # Load existing model
    model = onnx.load(model_path)
    graph = model.graph

    mean = norm_mean.astype(np.float32)
    std = norm_std.astype(np.float32)

    # Reshape to (1, C, 1, 1) so Add/Mul broadcast over NCHW image tensors.
    mean = np.reshape(mean, (1, 3, 1, 1)).astype(np.float32)
    std = np.reshape(std, (1, 3, 1, 1)).astype(np.float32)

    # fp16 variants are detected purely by filename convention.
    use_fp16 = bool("fp16" in model_path)
    if use_fp16:
        mean = mean.astype(np.float16)
        std = std.astype(np.float16)

    # Add the initializers to the graph
    mean_initializer = numpy_helper.from_array(mean, name="norm_mean")
    std_initializer = numpy_helper.from_array(std, name="norm_std")
    graph.initializer.extend([mean_initializer, std_initializer])

    # Define layer names, assuming the first input is the image tensor
    input_name = graph.input[0].name

    # Cast to internal type
    # This has to be the first node, because tensorrt does not support uint8 layers
    # TensorProto enum values: 10 = FLOAT16, 1 = FLOAT.
    cast_type = 10 if use_fp16 else 1
    casted_output = "casted_output"
    cast_node = helper.make_node(
        "Cast",
        inputs=[input_name],
        outputs=[casted_output],
        to=cast_type,
        name="Cast_Input",
    )

    # Node to transpose NHWC (camera image layout) -> NCHW (model layout).
    transpose_output = "transpose_output"
    transpose_node = helper.make_node(
        "Transpose",
        inputs=[casted_output],
        outputs=[transpose_output],
        perm=[0, 3, 1, 2],
        name="Transpose",
    )

    # Node to add mean (norm_mean is already negated, so Add subtracts).
    mean_added_output = "mean_added_output"
    mean_add_node = helper.make_node(
        "Add",
        inputs=[transpose_output, "norm_mean"],
        outputs=[mean_added_output],
        name="Mean_Addition",
    )

    # Node to multiply by std (norm_std is already inverted, so Mul divides).
    std_mult_output = "std_mult_output"
    std_mul_node = helper.make_node(
        "Mul",
        inputs=[mean_added_output, "norm_std"],
        outputs=[std_mult_output],
        name="Std_Multiplication",
    )

    # Replace original input of the model with the output of normalization.
    # NOTE: this runs BEFORE the new nodes are inserted, so the Cast node's
    # own reference to input_name is intentionally left untouched.
    for node in graph.node:
        for idx, input_name_in_node in enumerate(node.input):
            if input_name_in_node == input_name:
                node.input[idx] = std_mult_output

    # Add the new nodes to the graph (order matters: Cast must come first).
    graph.node.insert(0, cast_node)
    graph.node.insert(1, transpose_node)
    graph.node.insert(2, mean_add_node)
    graph.node.insert(3, std_mul_node)

    # Transpose the input shape: declared dims move from NCHW to NHWC to
    # match the layout the new Transpose node expects. dims is snapshotted
    # first so the in-place writes don't read already-overwritten values.
    input_shape = graph.input[0].type.tensor_type.shape.dim
    dims = [dim.dim_value for dim in input_shape]
    for i, j in enumerate([0, 3, 1, 2]):
        input_shape[j].dim_value = dims[i]

    # Set the batch size to a defined string (dim_value == 0 means the
    # exported dim was dynamic, e.g. the "Bx3x..." variants).
    input_shape = graph.input[0].type.tensor_type.shape.dim
    if input_shape[0].dim_value == 0:
        input_shape[0].dim_param = "batch_size"

    # Rename the input tensor (now also updates the inserted Cast node).
    main_input_image_name = model.graph.input[0].name
    for node in model.graph.node:
        for idx, name in enumerate(node.input):
            if name == main_input_image_name:
                node.input[idx] = "image_input"
    model.graph.input[0].name = "image_input"

    # Set input image type to uint8 (raw camera bytes).
    model.graph.input[0].type.tensor_type.elem_type = TensorProto.UINT8

    # Cast all outputs to fp32 to avoid half precision issues in cpp code.
    # Each original producer is rerouted to "<name>_internal" and a Cast
    # node re-emits the original name in fp32.
    for output in graph.output:
        orig_output_name = output.name
        internal_output_name = orig_output_name + "_internal"

        # Rename the output tensor
        for node in model.graph.node:
            for idx, name in enumerate(node.output):
                if name == orig_output_name:
                    node.output[idx] = internal_output_name

        # Insert a Cast node that casts the internal output to fp32
        cast_fp32_name = orig_output_name
        cast_node_output = helper.make_node(
            "Cast",
            inputs=[internal_output_name],
            outputs=[cast_fp32_name],
            to=1,
            name="Cast_Output_" + orig_output_name,
        )
        # Append the cast node to the graph
        graph.node.append(cast_node_output)

        # Update the output's data type info
        output.type.tensor_type.elem_type = TensorProto.FLOAT

    # Merge the two outputs into one tensor: unsqueeze the per-item scalar
    # output (labels/scores) and concatenate it onto the last axis of the
    # main output (dets/kpts).
    if "det" in model_path:
        r1_output = "dets"
        r2_output = "labels"
        out_name = "bboxes"
        out_dim = 6
    if "pose" in model_path:
        r1_output = "kpts"
        r2_output = "scores"
        out_name = "keypoints"
        out_dim = 3
    if "det" in model_path or "pose" in model_path:
        # Node to expand: (B, N) -> (B, N, 1).
        # NOTE(review): "axes" as an attribute requires Unsqueeze opset < 13;
        # opset 13+ takes axes as a second input — confirm the export opset.
        r2_expanded = r2_output + "_expanded"
        unsqueeze_node = helper.make_node(
            "Unsqueeze",
            inputs=[r2_output],
            outputs=[r2_expanded],
            axes=[2],
            name="Unsqueeze",
        )

        # Node to concatenate along the last (feature) axis.
        r12_merged = out_name
        concat_node = helper.make_node(
            "Concat",
            inputs=[r1_output, r2_expanded],
            outputs=[r12_merged],
            axis=2,
            name="Merged",
        )

        # Define the new concatenated output. Dynamic dims (value 0) are
        # declared as None so ONNX keeps them symbolic.
        merged_output = helper.make_tensor_value_info(
            r12_merged,
            TensorProto.FLOAT,
            [
                (
                    graph.input[0].type.tensor_type.shape.dim[0].dim_value
                    if graph.input[0].type.tensor_type.shape.dim[0].dim_value > 0
                    else None
                ),
                (
                    graph.output[0].type.tensor_type.shape.dim[1].dim_value
                    if graph.output[0].type.tensor_type.shape.dim[1].dim_value > 0
                    else None
                ),
                out_dim,
            ],
        )

        # Update the graph. The two pops assume the model has exactly two
        # outputs (true for these det/pose exports).
        graph.node.append(unsqueeze_node)
        graph.node.append(concat_node)
        graph.output.pop()
        graph.output.pop()
        graph.output.append(merged_output)

    # Rotate the dims in the filename (e.g. 1x3x320x320 -> 1x320x320x3) to
    # reflect the new NHWC input, and mark the file as post-processed.
    path = re.sub(r"(x)(\d+)x(\d+)x(\d+)", r"\1\3x\4x\2", model_path)
    path = path.replace(".onnx", "_extra-steps.onnx")
    onnx.save(model, path)
|
|
|
|
|
|
# ==================================================================================================


def main():
    """Post-process every exported model: fp32 variants first, then fp16."""
    model_paths = (
        det_model_path1,
        det_model_path2,
        pose_model_path1,
        pose_model_path2,
        pose_model_path3,
        pose_model_path4,
    )
    for model_path in model_paths:
        add_steps_to_onnx(model_path)
    for model_path in model_paths:
        add_steps_to_onnx(model_path.replace(".onnx", "_fp16.onnx"))
|
|
|
|
|
|
# ==================================================================================================

# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|