Skip to content

COCOeval

Run COCO evaluation to compute AP/AR metrics.

from hotcoco import COCO, COCOeval

coco_gt = COCO("instances_val2017.json")
coco_dt = coco_gt.load_res("detections.json")

ev = COCOeval(coco_gt, coco_dt, "bbox")
ev.evaluate()
ev.accumulate()
ev.summarize()
use hotcoco::{COCO, COCOeval};
use hotcoco::params::IouType;
use std::path::Path;

let coco_gt = COCO::new(Path::new("instances_val2017.json"))?;
let coco_dt = coco_gt.load_res(Path::new("detections.json"))?;

let mut ev = COCOeval::new(coco_gt, coco_dt, IouType::Bbox);
ev.evaluate();
ev.accumulate();
ev.summarize();

Constructor

COCOeval(
    coco_gt: COCO,
    coco_dt: COCO,
    iou_type: str,
    *,
    lvis_style: bool = False,
    oid_style: bool = False,
    hierarchy: Hierarchy | None = None,
)
Parameter Type Default Description
coco_gt COCO Ground truth COCO object
coco_dt COCO Detections COCO object (from load_res)
iou_type str "bbox", "segm", or "keypoints"
lvis_style bool False Enable LVIS federated evaluation mode
oid_style bool False Enable Open Images evaluation mode (IoU=0.5, group-of matching)
hierarchy Hierarchy | None None Category hierarchy for GT expansion in OID mode
// Standard COCO
COCOeval::new(coco_gt: COCO, coco_dt: COCO, iou_type: IouType) -> Self

// LVIS federated
COCOeval::new_lvis(coco_gt: COCO, coco_dt: COCO, iou_type: IouType) -> Self

// Open Images
COCOeval::new_oid(coco_gt: COCO, coco_dt: COCO, hierarchy: Option<Hierarchy>) -> Self
Parameter Type Description
coco_gt COCO Ground truth COCO object
coco_dt COCO Detections COCO object (from load_res)
iou_type IouType IouType::Bbox, IouType::Segm, or IouType::Keypoints
hierarchy Option<Hierarchy> Category hierarchy for GT expansion; None to skip expansion

Properties

params

params: Params

Evaluation parameters. Modify before calling evaluate().

ev = COCOeval(coco_gt, coco_dt, "bbox")
ev.params.cat_ids = [1, 2, 3]
ev.params.max_dets = [1, 10, 100]
pub params: Params
let mut ev = COCOeval::new(coco_gt, coco_dt, IouType::Bbox);
ev.params.cat_ids = vec![1, 2, 3];
ev.params.max_dets = vec![1, 10, 100];

See Params for all configurable fields.


stats

stats: list[float] | None

The 12 summary metrics (10 for keypoints), populated after summarize(). None before summarize() is called.

ev.summarize()
print(f"AP: {ev.stats[0]:.3f}")
print(f"AP50: {ev.stats[1]:.3f}")
fn stats(&self) -> Option<&[f64]>
ev.summarize();
if let Some(stats) = ev.stats() {
    println!("AP: {:.3}", stats[0]);
    println!("AP50: {:.3}", stats[1]);
}

eval_imgs

Per-image evaluation results, populated after evaluate(). See Working with Results for details.

eval_imgs: list[dict | None]
fn eval_imgs(&self) -> &[Option<EvalImg>]

eval

Accumulated precision/recall arrays, populated after accumulate(). See Working with Results for details.

eval: dict | None

Contains "precision", "recall", and "scores" arrays.

fn accumulated(&self) -> Option<&AccumulatedEval>

Access elements with precision_idx(t, r, k, a, m) and recall_idx(t, k, a, m).


Methods

evaluate

evaluate() -> None

Run per-image evaluation. Matches detections to ground truth annotations using greedy matching sorted by confidence. Must be called before accumulate().

Populates eval_imgs.


accumulate

accumulate() -> None

Accumulate per-image results into precision/recall curves using interpolated precision at 101 recall thresholds.

Populates eval.


summarize

summarize() -> None

Compute and print the standard COCO metrics. Populates stats.

Non-default parameters

summarize() uses a fixed display format that assumes default iou_thrs, max_dets, and area_rng_lbl. If you've changed any of these, a warning is printed to stderr and some metrics may show -1.000 (e.g. AP50 when iou_thrs doesn't include 0.50). The stats array always has 12 entries (10 for keypoints) regardless of your parameters.

Prints 12 lines for bbox/segm (10 for keypoints):

 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.382
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.584
 ...

run

run() -> None

Run the full pipeline in one call: evaluate() → accumulate() → summarize(). Primarily used with LVIS pipelines (Detectron2, MMDetection) that expect a single run() call.


metric_keys

metric_keys() -> list[str]

Return metric names in canonical display order for the current evaluation mode. This is the authoritative ordering — the same list that drives summarize() and get_results().

ev = COCOeval(gt, dt, "bbox")
ev.metric_keys()
# ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'AR1', 'AR10', 'AR100', 'ARs', 'ARm', 'ARl']

Does not require evaluate() or run() — only depends on the evaluation mode and IoU type.


get_results

get_results(prefix: str | None = None, per_class: bool = False) -> dict[str, float]

Return the summary metrics as a dict. Must be called after summarize() (or run()). Returns an empty dict if summarize() has not been called.

Parameter Type Default Description
prefix str | None None If given, each key is prefixed as "{prefix}/{metric}".
per_class bool False If True, include per-category AP values keyed as "AP/{cat_name}" (or "{prefix}/AP/{cat_name}" with a prefix).

Standard bbox/segm keys: AP, AP50, AP75, APs, APm, APl, AR1, AR10, AR100, ARs, ARm, ARl.

Keypoint keys: AP, AP50, AP75, APm, APl, AR, AR50, AR75, ARm, ARl.

LVIS keys: AP, AP50, AP75, APs, APm, APl, APr, APc, APf, AR@300, ARs@300, ARm@300, ARl@300.

ev.run()

# Basic usage (unchanged)
results = ev.get_results()
print(f"AP: {results['AP']:.3f}, AP50: {results['AP50']:.3f}")

# Prefixed keys — ready for any logger
results = ev.get_results(prefix="val/bbox")
# {"val/bbox/AP": 0.578, "val/bbox/AP50": 0.861, ...}

# With per-class AP
results = ev.get_results(prefix="val/bbox", per_class=True)
# {"val/bbox/AP": 0.578, ..., "val/bbox/AP/person": 0.82, ...}

print_results

print_results() -> None

Print a formatted results table to stdout. For LVIS, matches the lvis-api print_results() style. Must be called after summarize() (or run()).


results

results(per_class: bool = False) -> dict

Return evaluation results as a serializable dict. Must be called after summarize() (or run()). Raises RuntimeError if summarize() has not been called.

Parameter Type Default Description
per_class bool False If True, include per-category AP values under the "per_class" key.

Returns a dict with:

Key Type Description
"hotcoco_version" str hotcoco version that produced these results.
"params" dict Evaluation parameters: iou_type, iou_thresholds, area_ranges, max_dets, is_lvis.
"metrics" dict[str, float] Summary metrics keyed by name (same keys as get_results()).
"per_class" dict[str, float] | absent Per-category AP values keyed by category name. Only present if per_class=True.
ev.run()
r = ev.results()
print(r["metrics"]["AP"])

# With per-category breakdown
r = ev.results(per_class=True)
print(r["per_class"]["person"])

save_results

save_results(path: str, per_class: bool = False) -> None

Save evaluation results to a JSON file. Must be called after summarize() (or run()). Raises RuntimeError if summarize() has not been called, or IOError if the file cannot be written.

Parameter Type Default Description
path str Output file path.
per_class bool False If True, include per-category AP values.
ev = COCOeval(coco_gt, coco_dt, "bbox")
ev.run()
ev.save_results("results.json")

# With per-category AP
ev.save_results("results_per_class.json", per_class=True)

The JSON structure matches the dict returned by results().


confusion_matrix

confusion_matrix(
    iou_thr: float = 0.5,
    max_det: int | None = None,
    min_score: float | None = None,
) -> dict

Compute a per-category confusion matrix. Unlike evaluate(), this method compares all detections in an image against all ground truth boxes regardless of category, enabling cross-category confusion analysis.

This method is standalone — no evaluate() call is needed first.

Parameters:

Parameter Type Default Description
iou_thr float 0.5 IoU threshold for a DT↔GT match
max_det int | None last params.max_dets value Max detections per image by score
min_score float | None None Discard detections below this confidence before max_det truncation

Returns a dict with:

Key Type Description
"matrix" np.ndarray[int64] shape (K+1, K+1) Raw confusion counts. Rows = GT category, cols = predicted. Index K is background.
"normalized" np.ndarray[float64] shape (K+1, K+1) Row-normalized version (rows sum to 1.0; zero rows stay zero).
"cat_ids" list[int] Category IDs for rows/cols 0..K-1.
"cat_names" list[str] Category names for rows/cols 0..K-1, in the same order as cat_ids.
"num_cats" int Number of categories K.
"iou_thr" float IoU threshold used.

Matrix layout (rows = GT, cols = predicted):

  • matrix[i][j] where i ≠ K, j ≠ K — GT category i matched to predicted category j. On-diagonal = TP; off-diagonal = class confusion.
  • matrix[i][K] — GT category i unmatched (false negative).
  • matrix[K][j] — Predicted category j unmatched (false positive).
ev = COCOeval(coco_gt, coco_dt, "bbox")
cm = ev.confusion_matrix(iou_thr=0.5, max_det=100)

matrix = cm["matrix"]
cat_ids = cm["cat_ids"]

# True positives per category
tp = matrix.diagonal()[:-1]

# False negatives per category
fn = matrix[:-1, -1]

# False positives per category
fp = matrix[-1, :-1]

# Normalized view
print(cm["normalized"])

See Confusion Matrix in the evaluation guide for a full walkthrough.


tide_errors

tide_errors(
    pos_thr: float = 0.5,
    bg_thr: float = 0.1,
) -> dict

Decompose detection errors into six TIDE error types (Bolya et al., ECCV 2020) and compute ΔAP — the AP gain from eliminating each error type.

Requires evaluate() to have been called first.

Parameters:

Parameter Type Default Description
pos_thr float 0.5 IoU threshold for TP/FP classification
bg_thr float 0.1 Background IoU threshold for Loc/Both/Bkg discrimination

Returns a dict with:

Key Type Description
"delta_ap" dict[str, float] ΔAP for each error type. Keys: "Cls", "Loc", "Both", "Dupe", "Bkg", "Miss", "FP", "FN".
"counts" dict[str, int] Count of each error type. Keys: "Cls", "Loc", "Both", "Dupe", "Bkg", "Miss".
"ap_base" float Baseline mean AP at pos_thr.
"pos_thr" float IoU threshold used.
"bg_thr" float Background threshold used.
ev = COCOeval(coco_gt, coco_dt, "bbox")
ev.evaluate()
result = ev.tide_errors(pos_thr=0.5, bg_thr=0.1)

print(f"ap_base: {result['ap_base']:.3f}")
for k, v in sorted(result["delta_ap"].items(), key=lambda x: -x[1]):
    if k not in ("FP", "FN"):
        print(f"  {k}: ΔAP={v:.4f}  n={result['counts'].get(k, '—')}")

See TIDE Error Analysis in the evaluation guide for a detailed walkthrough.


calibration

calibration(
    n_bins: int = 10,
    iou_threshold: float = 0.5,
) -> dict

Compute confidence calibration metrics — how well confidence scores predict actual detection accuracy.

Requires evaluate() to have been called first. Bins all non-ignored detections by confidence score and compares the mean confidence in each bin to the fraction of true positives.

Parameters:

Parameter Type Default Description
n_bins int 10 Number of equal-width confidence bins in [0, 1].
iou_threshold float 0.5 IoU threshold for TP/FP classification. Must match one of params.iou_thrs.

Returns a dict with:

Key Type Description
"ece" float Expected Calibration Error — weighted mean of per-bin |accuracy - confidence|.
"mce" float Maximum Calibration Error — worst per-bin gap.
"bins" list[dict] Per-bin breakdown. Each dict has bin_lower, bin_upper, avg_confidence, avg_accuracy, count.
"per_category" dict[str, float] Per-category ECE, keyed by category name.
"iou_threshold" float IoU threshold used.
"n_bins" int Number of bins.
"num_detections" int Total non-ignored detections analyzed.
ev = COCOeval(coco_gt, coco_dt, "bbox")
ev.evaluate()

cal = ev.calibration(n_bins=10, iou_threshold=0.5)
print(f"ECE: {cal['ece']:.4f}, MCE: {cal['mce']:.4f}")

# Per-category breakdown
for name, ece in sorted(cal["per_category"].items(), key=lambda x: -x[1])[:5]:
    print(f"  {name}: ECE={ece:.4f}")

See Confidence Calibration in the evaluation guide for a full walkthrough.


f_scores

f_scores(beta: float = 1.0) -> dict[str, float]

Compute F-beta scores after accumulate() (or run()).

For each (IoU threshold, category), finds the confidence operating point that maximizes F-beta, then averages across categories — analogous to how mAP averages precision. Returns three metrics mirroring AP/AP50/AP75.

Parameter Type Default Description
beta float 1.0 Trade-off weight. beta=1 → F1 (equal weight). beta<1 → weights precision. beta>1 → weights recall.

Returns a dict with three keys:

Key Description
"F1" Mean max-F1 across IoU 0.50:0.05:0.95, all categories
"F150" Max-F1 at IoU=0.50
"F175" Max-F1 at IoU=0.75

Key names reflect beta: "F0.5", "F0.550", "F0.575" for beta=0.5, etc.

Returns an empty dict if accumulate() has not been called.

ev = COCOeval(coco_gt, coco_dt, "bbox")
ev.run()

# F1 (default)
scores = ev.f_scores()
print(f"F1: {scores['F1']:.3f}, F1@50: {scores['F150']:.3f}")

# Precision-weighted
print(ev.f_scores(beta=0.5))   # {"F0.5": ..., "F0.550": ..., "F0.575": ...}

# Recall-weighted
print(ev.f_scores(beta=2.0))   # {"F2.0": ..., "F2.050": ..., "F2.075": ...}

image_diagnostics

image_diagnostics(
    iou_thr: float = 0.5,
    score_thr: float = 0.5,
) -> dict

Per-image diagnostics: annotation TP/FP/FN index, per-image F1 and AP scores, error profiles, and label error candidates.

Requires evaluate() to have been called first.

Parameters:

Parameter Type Default Description
iou_thr float 0.5 IoU threshold for TP/FP classification (snapped to nearest in params.iou_thrs).
score_thr float 0.5 Minimum detection confidence to consider for label error detection.

Returns a dict with:

Key Type Description
"dt_status" dict[int, str] Detection annotation ID → "tp" or "fp".
"gt_status" dict[int, str] GT annotation ID → "matched" or "fn".
"dt_match" dict[int, int] TP detection → matched GT annotation ID.
"gt_match" dict[int, int] Matched GT → the detection that matched it.
"img_summary" dict[int, dict] Per-image summary (see below).
"label_errors" list[dict] Suspected label errors, sorted by detection score descending (see below).
"iou_thr" float Actual IoU threshold used (snapped).
"score_thr" float Score threshold used for label error detection.

Each image summary dict contains:

Key Type Description
"tp" int True positive count.
"fp" int False positive count.
"fn" int False negative count.
"f1" float F1 score: 2*tp / (2*tp + fp + fn). 1.0 for empty images.
"ap" float AP at the selected IoU threshold (101-point interpolation).
"error_profile" str One of "perfect", "fp_heavy", "fn_heavy", "mixed".

Each label error dict contains:

Key Type Description
"image_id" int Image containing the suspected error.
"dt_id" int Detection annotation ID.
"dt_score" float Detection confidence.
"dt_category" str Detection category name.
"dt_category_id" int Detection category ID.
"gt_id" int \| None Overlapping GT annotation ID (None for missing_annotation).
"gt_category" str \| None GT category name.
"gt_category_id" int \| None GT category ID.
"iou" float Bbox IoU between detection and GT (0.0 for missing_annotation).
"type" str "wrong_label" or "missing_annotation".
ev = COCOeval(coco_gt, coco_dt, "bbox")
ev.evaluate()

diag = ev.image_diagnostics(iou_thr=0.5, score_thr=0.5)

# Worst images by F1
worst = sorted(diag["img_summary"].items(), key=lambda x: x[1]["f1"])[:5]

# Label errors
for le in diag["label_errors"]:
    print(f"{le['type']}: {le['dt_category']} → {le.get('gt_category', 'N/A')}")

See Per-image diagnostics in the evaluation guide.


Module-level functions

compare

hotcoco.compare(
    eval_a: COCOeval,
    eval_b: COCOeval,
    n_bootstrap: int = 0,
    seed: int = 42,
    confidence: float = 0.95,
) -> dict

Pairwise model comparison. Both evaluators must have had evaluate() called and use the same eval_mode and iou_type. Accumulation and summarization are performed internally on the shared image set.

Parameter Type Default Description
eval_a COCOeval Baseline model evaluation.
eval_b COCOeval Improved model evaluation.
n_bootstrap int 0 Bootstrap samples for CIs (0 = disabled).
seed int 42 Random seed for reproducibility.
confidence float 0.95 Confidence level (e.g. 0.95 for 95% CI).

Returns a dict with:

Key Type Description
metric_keys list[str] Metric names in canonical display order.
metrics_a dict[str, float] Summary metrics for model A.
metrics_b dict[str, float] Summary metrics for model B.
deltas dict[str, float] Per-metric delta (B − A).
ci dict or None Bootstrap CIs per metric (lower, upper, confidence, prob_positive, std_err). None if n_bootstrap=0.
per_category list[dict] Per-category AP comparison, sorted by delta ascending. Each entry has cat_id, cat_name, ap_a, ap_b, delta.
n_bootstrap int Number of bootstrap samples used.
num_images int Number of shared images.
import hotcoco

gt = hotcoco.COCO("annotations.json")
ev_a = hotcoco.COCOeval(gt, gt.load_res("baseline.json"), "bbox")
ev_a.evaluate()
ev_b = hotcoco.COCOeval(gt, gt.load_res("improved.json"), "bbox")
ev_b.evaluate()

# Without bootstrap
result = hotcoco.compare(ev_a, ev_b)
print(result["deltas"]["AP"])  # e.g. +0.033

# With bootstrap CIs
result = hotcoco.compare(ev_a, ev_b, n_bootstrap=1000)
ci = result["ci"]["AP"]
print(f"[{ci['lower']:+.3f}, {ci['upper']:+.3f}]")  # e.g. [+0.01, +0.05]