| |
| |
|
|
| |
| |
|
|
| import numpy as np |
|
|
| from typing import Iterable, Mapping, Tuple, Union |
|
|
|
|
def compute_tapvid_metrics(
    query_points: np.ndarray,
    gt_occluded: np.ndarray,
    gt_tracks: np.ndarray,
    pred_occluded: np.ndarray,
    pred_tracks: np.ndarray,
    query_mode: str,
) -> Mapping[str, np.ndarray]:
    """Computes TAP-Vid metrics (Jaccard, Pts. Within Thresh, Occ. Acc.)

    See the TAP-Vid paper for details on the metric computation. All inputs are
    given in raster coordinates. The first three arguments should be the direct
    outputs of the reader: the 'query_points', 'occluded', and 'target_points'.
    The paper metrics assume these are scaled relative to 256x256 images.
    pred_occluded and pred_tracks are your algorithm's predictions.

    This function takes a batch of inputs, and computes metrics separately for
    each video. The metrics for the full benchmark are a simple mean of the
    metrics across the full set of videos. These numbers are between 0 and 1,
    but the paper multiplies them by 100 to ease reading.

    Args:
      query_points: The query points, in the format [t, y, x]. Its size is
        [b, n, 3], where b is the batch size and n is the number of queries.
      gt_occluded: A boolean array of shape [b, n, t], where t is the number
        of frames. True indicates that the point is occluded.
      gt_tracks: The target points, of shape [b, n, t, 2]. Each point is
        in the format [x, y].
      pred_occluded: A boolean array of predicted occlusions, in the same
        format as gt_occluded.
      pred_tracks: An array of track predictions from your algorithm, in the
        same format as gt_tracks.
      query_mode: Either 'first' or 'strided', depending on how queries are
        sampled. If 'first', we assume the prior knowledge that all points
        before the query point are occluded, and these are removed from the
        evaluation.

    Returns:
      A dict with the following keys:
        occlusion_accuracy: Accuracy at predicting occlusion.
        pts_within_{x} for x in [1, 2, 4, 8, 16]: Fraction of points
          predicted to be within the given pixel threshold, ignoring occlusion
          prediction.
        jaccard_{x} for x in [1, 2, 4, 8, 16]: Jaccard metric for the given
          threshold.
        average_pts_within_thresh: average across pts_within_{x}
        average_jaccard: average across jaccard_{x}

    Raises:
      ValueError: If query_mode is neither 'first' nor 'strided'.
    """
    metrics = {}

    # One-hot identity over frames; row q marks the query frame q itself.
    eye = np.eye(gt_tracks.shape[2], dtype=np.int32)
    if query_mode == "first":
        # Evaluate only frames strictly AFTER the query frame: cumsum of the
        # one-hot row marks the query frame and everything after it, and
        # subtracting the one-hot removes the query frame itself.
        query_frame_to_eval_frames = np.cumsum(eye, axis=1) - eye
    elif query_mode == "strided":
        # Evaluate every frame except the query frame itself.
        query_frame_to_eval_frames = 1 - eye
    else:
        raise ValueError("Unknown query mode " + query_mode)

    # Boolean mask [b, n, t]: which (query, frame) pairs count toward metrics.
    # query_points stores t as a float; round before using it as an index.
    query_frame = query_points[..., 0]
    query_frame = np.round(query_frame).astype(np.int32)
    evaluation_points = query_frame_to_eval_frames[query_frame] > 0

    # Occlusion accuracy: fraction of evaluated (point, frame) pairs where the
    # predicted occlusion flag matches ground truth, per video.
    # BUG FIX: the denominator previously summed evaluation_points over the
    # entire batch (no axis argument), so for batch sizes > 1 each video's
    # accuracy was normalized by the batch-wide count. Summing over
    # axis=(1, 2) normalizes per video, matching the numerator and the
    # documented per-video contract; results are unchanged for b == 1.
    occ_acc = np.sum(
        np.equal(pred_occluded, gt_occluded) & evaluation_points,
        axis=(1, 2),
    ) / np.sum(evaluation_points, axis=(1, 2))
    metrics["occlusion_accuracy"] = occ_acc

    # Position accuracy and Jaccard, swept over the paper's pixel thresholds.
    visible = np.logical_not(gt_occluded)
    pred_visible = np.logical_not(pred_occluded)
    all_frac_within = []
    all_jaccard = []
    for thresh in [1, 2, 4, 8, 16]:
        # Compare squared distances to the squared threshold to avoid a sqrt.
        within_dist = np.sum(
            np.square(pred_tracks - gt_tracks),
            axis=-1,
        ) < np.square(thresh)
        # A prediction can only be "correct" where the ground truth is visible.
        is_correct = np.logical_and(within_dist, visible)

        # pts_within_{thresh}: fraction of visible, evaluated points that are
        # within the threshold, ignoring occlusion predictions. Division by a
        # zero count (no visible evaluated points) intentionally yields NaN,
        # as in the reference implementation.
        count_correct = np.sum(is_correct & evaluation_points, axis=(1, 2))
        count_visible_points = np.sum(visible & evaluation_points, axis=(1, 2))
        frac_correct = count_correct / count_visible_points
        metrics["pts_within_" + str(thresh)] = frac_correct
        all_frac_within.append(frac_correct)

        # Jaccard = TP / (TP + FN + FP). A true positive is visible in the
        # ground truth, predicted visible, and within the threshold.
        true_positives = np.sum(
            is_correct & pred_visible & evaluation_points, axis=(1, 2)
        )

        # gt_positives = TP + FN: every visible ground-truth point is either
        # matched (TP) or missed (FN), so their sum is just the visible count.
        gt_positives = np.sum(visible & evaluation_points, axis=(1, 2))
        # A false positive is predicted visible where the ground truth is
        # occluded, OR predicted visible but farther than the threshold.
        false_positives = (~visible) & pred_visible
        false_positives = false_positives | ((~within_dist) & pred_visible)
        false_positives = np.sum(false_positives & evaluation_points, axis=(1, 2))
        jaccard = true_positives / (gt_positives + false_positives)
        metrics["jaccard_" + str(thresh)] = jaccard
        all_jaccard.append(jaccard)

    # Per-video means across the five thresholds.
    metrics["average_jaccard"] = np.mean(
        np.stack(all_jaccard, axis=1),
        axis=1,
    )
    metrics["average_pts_within_thresh"] = np.mean(
        np.stack(all_frac_within, axis=1),
        axis=1,
    )
    return metrics
|
|