CUDA out of memory when training a DETR object detection model with compute_metrics

I’m training a DETR object detection model using the Trainer API, and I have properly created the COCO-format dataset.

But when I run the Trainer with compute_metrics, I get a “CUDA out of memory” error. I have reduced batch_size from 16 all the way down to 1, but I still get the same error. Here’s how I’m creating the compute_metrics function:

from dataclasses import dataclass

import numpy as np
import supervision as sv
import torch
from torchmetrics.detection import MeanAveragePrecision

id2label = {id: label for id, label in enumerate(train_ds.classes)}
label2id = {label: id for id, label in enumerate(train_ds.classes)}


@dataclass
class ModelOutput:
    logits: torch.Tensor
    pred_boxes: torch.Tensor

class MAPEvaluator:

    def __init__(self, image_processor, threshold=0.00, id2label=None):
        self.image_processor = image_processor
        self.threshold = threshold
        self.id2label = id2label

    def collect_image_sizes(self, targets):
        """Collect image sizes across the dataset as list of tensors with shape [batch_size, 2]."""
        image_sizes = []
        for batch in targets:
            batch_image_sizes = torch.tensor(np.array([x["size"] for x in batch]))
            image_sizes.append(batch_image_sizes)
        return image_sizes

    def collect_targets(self, targets, image_sizes):
        post_processed_targets = []
        for target_batch, image_size_batch in zip(targets, image_sizes):
            for target, (height, width) in zip(target_batch, image_size_batch):
                boxes = target["boxes"]
                boxes = sv.xcycwh_to_xyxy(boxes)
                boxes = boxes * np.array([width, height, width, height])
                boxes = torch.tensor(boxes)
                labels = torch.tensor(target["class_labels"])
                post_processed_targets.append({"boxes": boxes, "labels": labels})
        return post_processed_targets

    def collect_predictions(self, predictions, image_sizes):
        post_processed_predictions = []
        for batch, target_sizes in zip(predictions, image_sizes):
            batch_logits, batch_boxes = batch[1], batch[2]
            output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
            post_processed_output = self.image_processor.post_process_object_detection(
                output, threshold=self.threshold, target_sizes=target_sizes
            )
            post_processed_predictions.extend(post_processed_output)
        return post_processed_predictions

    @torch.no_grad()
    def __call__(self, evaluation_results):

        predictions, targets = evaluation_results.predictions, evaluation_results.label_ids

        image_sizes = self.collect_image_sizes(targets)
        post_processed_targets = self.collect_targets(targets, image_sizes)
        post_processed_predictions = self.collect_predictions(predictions, image_sizes)

        evaluator = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
        evaluator.warn_on_many_detections = False
        evaluator.update(post_processed_predictions, post_processed_targets)

        metrics = evaluator.compute()

        # Replace list of per class metrics with separate metric for each class
        classes = metrics.pop("classes")
        map_per_class = metrics.pop("map_per_class")
        mar_100_per_class = metrics.pop("mar_100_per_class")
        for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
            class_name = id2label[class_id.item()] if id2label is not None else class_id.item()
            metrics[f"map_{class_name}"] = class_map
            metrics[f"mar_100_{class_name}"] = class_mar

        metrics = {k: round(v.item(), 4) for k, v in metrics.items()}

        return metrics

eval_compute_metrics_fn = MAPEvaluator(image_processor=processor, threshold=0.01, id2label=id2label)

Below is the training setup for the custom dataset:

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="Malaria-finetune",
    report_to="none",
    num_train_epochs=10,
    max_grad_norm=0.1,
    learning_rate=5e-5,
    warmup_steps=300,
    per_device_train_batch_size=1,
    dataloader_num_workers=2,
    metric_for_best_model="eval_map",
    greater_is_better=True,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    remove_unused_columns=False,
    eval_do_concat_batches=False,
)
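
Note: eval_do_concat_batches=False makes the Trainer hand compute_metrics the predictions and labels as per-batch lists instead of concatenating them into single tensors, which is what the per-batch loops in MAPEvaluator above assume. The full list is still kept for the whole evaluation pass, though, so it doesn’t by itself bound memory.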

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=pytorch_dataset_train,
    eval_dataset=pytorch_dataset_valid,
    processing_class=processor,
    data_collator=collate_fn,
    compute_metrics=eval_compute_metrics_fn
)
trainer.train()

I have looked at a few discussions about a similar issue and tried all of their suggestions, but I wasn’t able to fix this DETR model.

From my research, it looks like using preprocess_logits_for_metrics in the Trainer might work. I implemented it like this:

def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak. 
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

Then I added it to the Trainer:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=pytorch_dataset_train,
    eval_dataset=pytorch_dataset_valid,
    processing_class=processor,
    data_collator=collate_fn,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=eval_compute_metrics_fn
)
trainer.train()

This didn’t work either. Please help
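
For reference, the argmax above collapses the per-query class logits that post_process_object_detection later needs, and it drops the predicted boxes entirely, so compute_metrics can no longer consume its output. A detection-friendly variant would keep both tensors and save memory only by trimming the rest of the model output. A rough, untested sketch, assuming the class logits and boxes sit at positions 1 and 2 of the output tuple as in MAPEvaluator.collect_predictions:

def preprocess_logits_for_metrics(logits, labels):
    # Untested sketch: keep only what mAP post-processing needs (per-query
    # class logits and predicted boxes) and drop auxiliary outputs such as
    # hidden states, so less is accumulated per evaluation step.
    # The index positions are an assumption based on the tuple layout used
    # in MAPEvaluator.collect_predictions; compute_metrics would then read
    # them from positions 0 and 1 instead of 1 and 2.
    return logits[1], logits[2]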


I am having a similar issue with RT-DETR. I found your post when searching online. Did you ever solve this?


Seems resolved in GitHub…?


This was helpful. I implemented my own solution, similar to the one on GitHub. So far, no errors. I’ll add it here in case someone else finds it useful, or in case someone finds bugs I missed. Either way, please let me know. Thanks!

@dataclass
class ModelOutput:
    logits: torch.Tensor
    pred_boxes: torch.Tensor

class BatchMAPEvaluator:

    def __init__(self, image_processor, threshold=0.00, id2label=None):
        self.image_processor = image_processor
        self.threshold = threshold
        self.id2label = id2label
        self.evaluator = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
        self.evaluator.warn_on_many_detections = False

    def reset(self):
        """Reset the evaluator state for a new evaluation run."""
        self.evaluator.reset()

    def process_batch(self, predictions, targets):
        """
        Process a single batch and update the evaluator.

        Args:
            predictions: tuple of (loss, logits, pred_boxes)
            targets: list of dicts, each with "size", "boxes", "class_labels" as tensors
        """
        # Get image sizes - targets is a list of dicts with tensor values
        image_sizes = torch.stack([x["size"] for x in targets]).cpu()

        # Process predictions
        batch_logits = predictions[1]
        batch_boxes = predictions[2]

        # Ensure tensors are on CPU
        if isinstance(batch_logits, torch.Tensor):
            batch_logits = batch_logits.cpu()
        else:
            batch_logits = torch.tensor(batch_logits)

        if isinstance(batch_boxes, torch.Tensor):
            batch_boxes = batch_boxes.cpu()
        else:
            batch_boxes = torch.tensor(batch_boxes)

        output = ModelOutput(logits=batch_logits, pred_boxes=batch_boxes)
        post_processed_predictions = self.image_processor.post_process_object_detection(
            output, threshold=self.threshold, target_sizes=image_sizes
        )

        # Process targets
        post_processed_targets = []
        for target, (height, width) in zip(targets, image_sizes):
            # Move tensors to CPU and convert to numpy
            boxes = target["boxes"].cpu().numpy()
            labels = target["class_labels"].cpu()  # Already a torch tensor, just moved to CPU

            # Convert xcycwh to xyxy and scale to image size
            boxes = sv.xcycwh_to_xyxy(boxes)
            boxes = boxes * np.array([width.item(), height.item(), width.item(), height.item()])

            post_processed_targets.append({
                "boxes": torch.tensor(boxes),
                "labels": labels
            })

        # Update evaluator with this batch
        self.evaluator.update(post_processed_predictions, post_processed_targets)

    def compute(self):
        """Compute final metrics after all batches have been processed."""
        metrics = self.evaluator.compute()

        # Replace list of per class metrics with separate metric for each class
        classes = metrics.pop("classes")
        map_per_class = metrics.pop("map_per_class")
        mar_100_per_class = metrics.pop("mar_100_per_class")
        for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
            class_name = self.id2label[class_id.item()] if self.id2label is not None else class_id.item()
            metrics[f"map_{class_name}"] = class_map
            metrics[f"mar_100_{class_name}"] = class_mar

        metrics = {k: round(v.item(), 4) for k, v in metrics.items()}

        return metrics

    @torch.no_grad()
    def __call__(self, evaluation_results, compute_result: bool):
        if not compute_result:
            predictions = evaluation_results.predictions
            targets = evaluation_results.label_ids
            self.process_batch(predictions, targets)
            return {}
        else:
            metrics = self.compute()
            self.reset()
            return metrics

# Create batched map evaluator instance
batch_eval_compute_metrics_fn = BatchMAPEvaluator(image_processor=processor, threshold=0.30, id2label=id2label)

# Set batch_eval_metrics in TrainingArguments
training_args = TrainingArguments(
    ...
    batch_eval_metrics=True,
)

# Add batched map evaluator instance to Trainer
trainer = Trainer(
    ...
    compute_metrics=batch_eval_compute_metrics_fn,
)
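
For anyone adapting this: with batch_eval_metrics=True the Trainer calls compute_metrics after every evaluation batch with compute_result=False, and once more on the final batch with compute_result=True, which is why __call__ takes that extra argument. Between calls only the MeanAveragePrecision state accumulates, so each batch’s logits and boxes can be freed instead of piling up for the whole evaluation pass.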