from typing import Any, Union
import numpy as np
from jaxtyping import Float, Int, Shaped, Num, jaxtyped
from typing import (
    Literal,
    List,
    Dict,
    TypedDict,
    Any,
    cast,
    Tuple,
    Optional,
    Sequence,
    Deque,
)

# Use OpenCV's own image type alias when the installed cv2 exposes
# `cv2.typing`; otherwise fall back to a plain NumPy array type so this
# module can still be imported.
try:
    from cv2.typing import MatLike
except ImportError:
    MatLike = np.ndarray

# Short alias used as the array type inside the jaxtyping annotations below.
NDArray = np.ndarray

# Names of the supported bounding-box layouts.
# NOTE(review): presumably "xyxy" = (x_min, y_min, x_max, y_max) and
# "xywh" = (x, y, width, height) -- confirm against the code that consumes it.
BoundingBoxFormat = Literal["xyxy", "xywh"]


class DetectionResult(TypedDict):
    """
    Detection result per frame

    N is the number of detected objects
    """

    boxes_num: Num[NDArray, "1"]
    """
    Single-element array.

    NOTE(review): presumably holds the number of detected boxes (N) -- confirm with the producer.
    """
    boxes: Num[NDArray, "N 4"]
    """
    Four values per detected object.

    NOTE(review): the coordinate layout (e.g. "xyxy" vs "xywh") is not stated here -- confirm with the producer.
    """
    scores: Num[NDArray, "N"]
    """
    One score per detected object.
    """
    reference_frame_size: tuple[int, int]
    """
    Height and width of reference frame.

    The bounding box coordinates are relative to this frame.

    If one resizes the reference frame, the bounding box coordinates should be scaled accordingly.
    """


# Shape-annotated array aliases (jaxtyping axis strings).
# NOTE(review): the axis names are not documented in this file; presumably
# B = batch, F = frames, N = keypoints per person, and the trailing 2 is an
# (x, y) pair -- confirm with the callers.
# NOTE(review): "H36" presumably refers to a 17-joint Human3.6M-style
# skeleton -- confirm.
H36KeyPoints = Float[NDArray, "B F 17 2"]
PersonBasedKeypointLike = Float[NDArray, "B F N 2"]
# Four values per (B, F) entry; the box layout is not specified here.
BoundingBoxes = Float[NDArray, "B F 4"]


class KeyPointDetectionResult(TypedDict):
    """
    Keypoints, bounding boxes and their scores for one frame.

    N is the size of the leading axis shared by every array field.
    NOTE(review): presumably N is the number of people detected in the frame -- confirm.
    """

    skeleton_keypoints: Float[NDArray, "N 17 2"]
    """
    17 two-component keypoints per entry, relative to the reference frame.
    """
    skeleton_keypoints_scores: Float[NDArray, "N 17 1"]
    """
    One score per keypoint, kept with a trailing axis of length 1.
    """
    bboxes: Float[NDArray, "N 4"]
    """
    Four values per entry, relative to the reference frame.

    NOTE(review): the coordinate layout (e.g. "xyxy" vs "xywh") is not stated here -- confirm.
    """
    bboxes_scores: Optional[Float[NDArray, "N"]]
    """
    One score per bounding box, or None.
    """
    frame_number: int
    """
    Position of this frame within the video sequence.
    The value is -1 when the frame number is not available.
    """
    reference_frame_size: tuple[int, int]
    """
    (height, width) of the reference frame.

    Both bounding box and keypoint coordinates are expressed relative to this frame,
    so they should be rescaled whenever the reference frame is resized.
    """


# Ordered collection of per-frame results. Declared as `Sequence`, so any
# sequence type (list, tuple, ...) satisfies the alias.
KeyPointDetectionTimeSeries = Sequence[KeyPointDetectionResult]
"""
Each item contains keypoint detection result for all people detected in a single frame.

Intervals between frames are not guaranteed to be consistent
"""


class MixKeypoints(TypedDict):
    """
    Three-component keypoints bundled with channel and header data.

    NOTE(review): the key names do not follow snake_case; presumably they mirror an
    external data format. They are part of the runtime dict layout, so confirm with
    the producers and consumers before renaming.
    """

    MixThreeDkeypoints: Num[NDArray, "... N 3"]
    """
    Array whose last two axes are ``N 3``; any number of leading axes is accepted.
    """
    Channels: Any
    """
    Untyped; the expected contents are not specified in this file.
    """
    Header: Any
    """
    Untyped; the expected contents are not specified in this file.
    """


class ReferenceFrameSize(TypedDict):
    """
    Size of a reference frame addressed by key instead of by tuple position.

    Both keys are required.
    """

    width: int
    """
    Frame width.
    """
    height: int
    """
    Frame height.
    """