diff --git a/extras/mmdeploy/add_extra_steps.py b/extras/mmdeploy/add_extra_steps.py
index 4eb822b..f80ea77 100644
--- a/extras/mmdeploy/add_extra_steps.py
+++ b/extras/mmdeploy/add_extra_steps.py
@@ -54,6 +54,7 @@ def add_steps_to_onnx(model_path):
         inputs=[input_name],
         outputs=[casted_output],
         to=cast_type,
+        name="Cast_Input",
     )
 
     # Node to transpose
@@ -118,6 +119,32 @@ def add_steps_to_onnx(model_path):
     # Set input image type to int8
     model.graph.input[0].type.tensor_type.elem_type = TensorProto.UINT8
 
+    # Cast all outputs to fp32 to avoid half precision issues in cpp code
+    for output in graph.output:
+        orig_output_name = output.name
+        internal_output_name = orig_output_name + "_internal"
+
+        # Rename the output tensor
+        for node in model.graph.node:
+            for idx, name in enumerate(node.output):
+                if name == orig_output_name:
+                    node.output[idx] = internal_output_name
+
+        # Insert a Cast node that casts the internal output to fp32
+        cast_fp32_name = orig_output_name
+        cast_node_output = helper.make_node(
+            "Cast",
+            inputs=[internal_output_name],
+            outputs=[cast_fp32_name],
+            to=1,
+            name="Cast_Output_" + orig_output_name,
+        )
+        # Append the cast node to the graph
+        graph.node.append(cast_node_output)
+
+        # Update the output's data type info
+        output.type.tensor_type.elem_type = TensorProto.FLOAT
+
     path = re.sub(r"(x)(\d+)x(\d+)x(\d+)", r"\1\3x\4x\2", model_path)
     path = path.replace(".onnx", "_extra-steps.onnx")
     onnx.save(model, path)
diff --git a/scripts/test_triangulate.py b/scripts/test_triangulate.py
index 481e4b0..5c25d44 100644
--- a/scripts/test_triangulate.py
+++ b/scripts/test_triangulate.py
@@ -253,11 +253,11 @@ def update_keypoints(poses_2d: list, joint_names: List[str]) -> list:
         new_body = body[:17]
 
         if whole_body["foots"]:
-            new_body.extend(body[17:22])
+            new_body.extend(body[17:23])
         if whole_body["face"]:
-            new_body.extend(body[22:90])
+            new_body.extend(body[23:91])
         if whole_body["hands"]:
-            new_body.extend(body[90:])
+            new_body.extend(body[91:])
         body = new_body
 
         hlid = joint_names.index("hip_left")
diff --git a/scripts/utils_2d_pose.py b/scripts/utils_2d_pose.py
index 0117870..63eb89e 100644
--- a/scripts/utils_2d_pose.py
+++ b/scripts/utils_2d_pose.py
@@ -189,9 +189,15 @@ class BoxCrop:
         self.fill_value = fill_value
 
     def calc_params(self, ishape, bbox):
-        start_x, start_y, end_x, end_y = bbox[0], bbox[1], bbox[2], bbox[3]
+        img_h, img_w = ishape[:2]
         target_h, target_w = self.target_size
 
+        # Round the bounding box coordinates
+        start_x = math.floor(bbox[0])
+        start_y = math.floor(bbox[1])
+        end_x = math.ceil(bbox[2])
+        end_y = math.ceil(bbox[3])
+
         # Calculate original bounding box center
         center_x = (start_x + end_x) / 2.0
         center_y = (start_y + end_y) / 2.0
@@ -231,8 +237,8 @@ class BoxCrop:
         # Define the new box coordinates
         new_start_x = max(0, start_x)
         new_start_y = max(0, start_y)
-        new_end_x = min(ishape[1] - 1, end_x)
-        new_end_y = min(ishape[0] - 1, end_y)
+        new_end_x = min(img_w - 1, end_x)
+        new_end_y = min(img_h - 1, end_y)
         new_box = [new_start_x, new_start_y, new_end_x, new_end_y]
 
         # Calculate resized crop size
@@ -344,7 +350,6 @@ class RTMDet(BaseModel):
             return np.array([])
 
         # Drop boxes with too small area
-        boxes = boxes.astype(np.float32)
         areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
         keep = areas >= self.min_area
         boxes = boxes[keep]
@@ -386,10 +391,7 @@ class RTMPose(BaseModel):
     def preprocess(self, image: np.ndarray, bboxes: np.ndarray):
         cutouts = []
         for i in range(len(bboxes)):
-            bbox = np.asarray(bboxes[i])[0:4]
-            bbox += np.array([-0.5, -0.5, 0.5 - 1e-8, 0.5 - 1e-8])
-            bbox = bbox.round().astype(np.int32)
-            region = self.boxcrop.crop_resize_box(image, bbox)
+            region = self.boxcrop.crop_resize_box(image, bboxes[i])
             tensor = np.asarray(region).astype(self.input_types[0], copy=False)
             cutouts.append(tensor)
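
# Illustrative sketch (not part of the patch): the output-cast pattern added
# in add_extra_steps.py, applied to a hypothetical toy fp16 model. The model,
# tensor names, and shapes below are assumptions for demonstration only.
import onnx
from onnx import TensorProto, helper

# Toy model with a single fp16 output: Y = Identity(X).
X = helper.make_tensor_value_info("X", TensorProto.FLOAT16, [1, 3])
Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT16, [1, 3])
identity = helper.make_node("Identity", inputs=["X"], outputs=["Y"])
model = helper.make_model(helper.make_graph([identity], "demo", [X], [Y]))

# Same steps as the patch: rename the producer's output tensor, append a
# Cast node so the graph output keeps its original name but becomes fp32,
# then update the declared output dtype.
for output in model.graph.output:
    orig_name = output.name
    internal_name = orig_name + "_internal"
    for node in model.graph.node:
        for idx, name in enumerate(node.output):
            if name == orig_name:
                node.output[idx] = internal_name
    model.graph.node.append(
        helper.make_node(
            "Cast",
            inputs=[internal_name],
            outputs=[orig_name],
            to=TensorProto.FLOAT,  # same enum value as to=1 in the patch
            name="Cast_Output_" + orig_name,
        )
    )
    output.type.tensor_type.elem_type = TensorProto.FLOAT

onnx.checker.check_model(model)  # output "Y" is now declared float32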