This error typically occurs when num_labels is not passed during model loading: the classifier head then keeps its default size (2 outputs for roberta-base), so any target label outside that range makes the loss computation fail with an out-of-bounds IndexError. The script below loads the model with the correct class count instead.
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import pandas as pd
import torch
import math
# 0) Example dataframe (replace with your df)
# df = pd.read_csv("your_data.csv") # must contain 'text' and integer 'label'
df = pd.DataFrame({
    "text": [f"example {i}" for i in range(3000)],
    "label": np.repeat(np.arange(252), repeats=math.ceil(3000 / 252))[:3000],
})
# 1) Ensure labels are 0..C-1
m = int(df["label"].min())
if m != 0:
    df["label"] = df["label"] - m  # shift so the smallest label is 0
C = int(df["label"].max() + 1)     # number of classes, computed after the shift
assert df["label"].between(0, C - 1).all(), "labels must be in [0, C-1]"
# 2) Build small train/test datasets
ds = Dataset.from_pandas(df[["text", "label"]], split="train").train_test_split(test_size=0.1, seed=42)
# 3) Tokenize
tok = AutoTokenizer.from_pretrained("roberta-base")
def preprocess(ex):
    return tok(ex["text"], truncation=True, padding="max_length", max_length=64)
ds_tok = ds.map(preprocess, batched=True).remove_columns(["text"]).with_format("torch")
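# Optional sketch: padding everything to max_length is simple but wasteful; you can instead
# pad per batch by dropping padding="max_length" above and giving the Trainer a collator:
# from transformers import DataCollatorWithPadding
# collator = DataCollatorWithPadding(tokenizer=tok)   # then Trainer(..., data_collator=collator)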
# 4) Create model with the correct class count; let Transformers swap the head
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=C,                  # tells the new classifier size
    ignore_mismatched_sizes=True,  # skip loading the old head
)
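# Sanity check: the freshly initialized classification head now has C outputs
assert model.config.num_labels == C
# print(model.classifier.out_proj.out_features)  # should also print C (standard RoBERTa head layout)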
# optional but recommended: explicit label maps
model.config.id2label = {i: str(i) for i in range(C)}
model.config.label2id = {v: k for k, v in model.config.id2label.items()}
# 5) Train briefly
args = TrainingArguments(
    output_dir="out_fix",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=1,
    logging_steps=10,
    eval_strategy="no",
    report_to="none",
)
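# Optional sketch: to evaluate during training, set eval_strategy="epoch" above and pass
# eval_dataset=ds_tok["test"] plus a metric function like this to the Trainer:
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     preds = np.argmax(logits, axis=-1)
#     return {"accuracy": float((preds == labels).mean())}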
trainer = Trainer(model=model, args=args, train_dataset=ds_tok["train"])
trainer.train()  # Without num_labels and ignore_mismatched_sizes above, this raises: IndexError: Target ** is out of bounds.
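# 6) Optional sanity check (minimal sketch): predicted class ids stay within [0, C-1]
pred = trainer.predict(ds_tok["test"])
pred_ids = np.argmax(pred.predictions, axis=-1)
assert pred_ids.min() >= 0 and pred_ids.max() <= C - 1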