File size: 4,015 Bytes
a6dbd37
 
2fb4cf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d209694
 
2fb4cf4
 
 
 
 
 
 
 
 
 
 
 
 
 
d209694
 
 
 
2fb4cf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d209694
 
 
 
2fb4cf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from __future__ import annotations

import math

import cv2
import numpy as np

_LANDMARK_INDICES = [1, 152, 33, 263, 61, 291]

_MODEL_POINTS = np.array(
    [
        [0.0, 0.0, 0.0],
        [0.0, -330.0, -65.0],
        [-225.0, 170.0, -135.0],
        [225.0, 170.0, -135.0],
        [-150.0, -150.0, -125.0],
        [150.0, -150.0, -125.0],
    ],
    dtype=np.float64,
)


class HeadPoseEstimator:
    def __init__(self, max_angle: float = 30.0, roll_weight: float = 0.5):
        self.max_angle = max_angle
        self.roll_weight = roll_weight
        self._camera_matrix = None
        self._frame_size = None
        self._dist_coeffs = np.zeros((4, 1), dtype=np.float64)
        self._cache_key = None
        self._cache_result = None

    def _get_camera_matrix(self, frame_w: int, frame_h: int) -> np.ndarray:
        if self._camera_matrix is not None and self._frame_size == (frame_w, frame_h):
            return self._camera_matrix
        focal_length = float(frame_w)
        cx, cy = frame_w / 2.0, frame_h / 2.0
        self._camera_matrix = np.array(
            [[focal_length, 0, cx], [0, focal_length, cy], [0, 0, 1]],
            dtype=np.float64,
        )
        self._frame_size = (frame_w, frame_h)
        return self._camera_matrix

    def _solve(self, landmarks: np.ndarray, frame_w: int, frame_h: int):
        key = (landmarks.data.tobytes(), frame_w, frame_h)
        if self._cache_key == key:
            return self._cache_result

        image_points = np.array(
            [
                [landmarks[i, 0] * frame_w, landmarks[i, 1] * frame_h]
                for i in _LANDMARK_INDICES
            ],
            dtype=np.float64,
        )
        camera_matrix = self._get_camera_matrix(frame_w, frame_h)
        success, rvec, tvec = cv2.solvePnP(
            _MODEL_POINTS,
            image_points,
            camera_matrix,
            self._dist_coeffs,
            flags=cv2.SOLVEPNP_ITERATIVE,
        )
        result = (success, rvec, tvec, image_points)
        self._cache_key = key
        self._cache_result = result
        return result

    def estimate(
        self, landmarks: np.ndarray, frame_w: int, frame_h: int
    ) -> tuple[float, float, float] | None:
        success, rvec, tvec, _ = self._solve(landmarks, frame_w, frame_h)
        if not success:
            return None

        rmat, _ = cv2.Rodrigues(rvec)
        nose_dir = rmat @ np.array([0.0, 0.0, 1.0])
        face_up = rmat @ np.array([0.0, 1.0, 0.0])

        yaw = math.degrees(math.atan2(nose_dir[0], -nose_dir[2]))
        pitch = math.degrees(math.asin(np.clip(-nose_dir[1], -1.0, 1.0)))
        roll = math.degrees(math.atan2(face_up[0], -face_up[1]))

        return (yaw, pitch, roll)

    def score(self, landmarks: np.ndarray, frame_w: int, frame_h: int) -> float:
        angles = self.estimate(landmarks, frame_w, frame_h)
        if angles is None:
            return 0.0

        yaw, pitch, roll = angles
        deviation = math.sqrt(yaw**2 + pitch**2 + (self.roll_weight * roll) ** 2)
        t = min(deviation / self.max_angle, 1.0)
        return 0.5 * (1.0 + math.cos(math.pi * t))

    def draw_axes(
        self,
        frame: np.ndarray,
        landmarks: np.ndarray,
        axis_length: float = 50.0,
    ) -> np.ndarray:
        h, w = frame.shape[:2]
        success, rvec, tvec, image_points = self._solve(landmarks, w, h)
        if not success:
            return frame

        camera_matrix = self._get_camera_matrix(w, h)
        nose = tuple(image_points[0].astype(int))

        axes_3d = np.float64(
            [[axis_length, 0, 0], [0, axis_length, 0], [0, 0, axis_length]]
        )
        projected, _ = cv2.projectPoints(
            axes_3d, rvec, tvec, camera_matrix, self._dist_coeffs
        )

        colors = [(0, 0, 255), (0, 255, 0), (255, 0, 0)]
        for i, color in enumerate(colors):
            pt = tuple(projected[i].ravel().astype(int))
            cv2.line(frame, nose, pt, color, 2)

        return frame