LLM Evaluation: Measuring Performance and Quality
Evaluating large language models is a complex challenge: no single metric captures correctness, safety, and efficiency at once. This guide covers evaluation strategies ranging from automated metrics to human assessment and standardized benchmarks.
Evaluation Dimensions
1. Task Performance
- Accuracy: Correctness of outputs
- Fluency: Natural language quality
- Coherence: Logical consistency
- Relevance: Appropriateness to context
2. Safety and Ethics
- Bias: Fairness across demographics
- Toxicity: Harmful content generation
- Privacy: Information leakage
- Alignment: Following intended behavior
3. Efficiency
- Latency: Response time
- Throughput: Requests per second
- Memory Usage: Resource consumption
- Cost: Computational expense
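The efficiency dimensions above can be measured directly. Below is a minimal, hypothetical sketch (the function name and prompt list are placeholders, assuming a Hugging Face-style causal LM and tokenizer loaded elsewhere) that records per-request latency, derives sequential throughput, and reports peak GPU memory:
import time
import torch
def measure_efficiency(model, tokenizer, prompts, max_new_tokens=64):
    """Rough latency/throughput/memory measurement for a Hugging Face-style causal LM."""
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()  # so peak memory reflects this run only
    latencies = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors='pt')
        start = time.perf_counter()
        with torch.no_grad():
            model.generate(inputs.input_ids,
                           max_new_tokens=max_new_tokens,
                           pad_token_id=tokenizer.eos_token_id)
        latencies.append(time.perf_counter() - start)
    total_time = sum(latencies)
    return {
        'avg_latency_ms': 1000 * total_time / len(prompts),
        'throughput_rps': len(prompts) / total_time,  # sequential, single-request throughput
        'peak_memory_mb': (torch.cuda.max_memory_allocated() / 1e6
                           if torch.cuda.is_available() else None),
    }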
Automated Evaluation Methods
1. Perplexity
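For reference, perplexity is the exponentiated average negative log-likelihood a model assigns to a token sequence; lower values mean the model finds the text less surprising. For tokens $x_1, \dots, x_t$:
\[
\mathrm{PPL}(x) = \exp\left(-\frac{1}{t}\sum_{i=1}^{t}\log p_\theta\left(x_i \mid x_{<i}\right)\right)
\]
The sliding-window implementation below follows the common Hugging Face recipe for texts longer than the model's context window; the stride of 512 trades accuracy against compute.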
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
def calculate_perplexity(model, tokenizer, text):
"""Calculate perplexity for given text"""
encodings = tokenizer(text, return_tensors='pt')
max_length = model.config.n_positions
stride = 512
seq_len = encodings.input_ids.size(1)
nlls = []
prev_end_loc = 0
for begin_loc in range(0, seq_len, stride):
end_loc = min(begin_loc + max_length, seq_len)
trg_len = end_loc - prev_end_loc
input_ids = encodings.input_ids[:, begin_loc:end_loc]
target_ids = input_ids.clone()
target_ids[:, :-trg_len] = -100
with torch.no_grad():
outputs = model(input_ids, labels=target_ids)
neg_log_likelihood = outputs.loss * trg_len
nlls.append(neg_log_likelihood)
prev_end_loc = end_loc
if end_loc == seq_len:
break
ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
return ppl.item()
# Usage
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
test_text = "The quick brown fox jumps over the lazy dog."
perplexity = calculate_perplexity(model, tokenizer, test_text)
print(f"Perplexity: {perplexity}")
2. BLEU Score
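BLEU combines modified n-gram precisions $p_n$ (typically up to $N = 4$) with a brevity penalty $\mathrm{BP}$ that penalizes candidates shorter than their references:
\[
\mathrm{BLEU} = \mathrm{BP} \cdot \exp\left(\sum_{n=1}^{N} w_n \log p_n\right),
\qquad
\mathrm{BP} =
\begin{cases}
1 & \text{if } c > r \\
e^{\,1 - r/c} & \text{if } c \le r
\end{cases}
\]
where $c$ and $r$ are the candidate and reference lengths and the weights $w_n$ are usually uniform ($1/N$). Note that NLTK's sentence_bleu can emit warnings and return near-zero scores on very short texts unless a SmoothingFunction is supplied.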
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
def calculate_bleu(references, candidates):
"""Calculate BLEU score for text generation"""
bleu_scores = []
for ref, cand in zip(references, candidates):
# Tokenize
ref_tokens = [ref.split()]
cand_tokens = cand.split()
# Calculate BLEU
score = sentence_bleu(ref_tokens, cand_tokens)
bleu_scores.append(score)
return {
'individual_scores': bleu_scores,
'average_score': sum(bleu_scores) / len(bleu_scores),
'corpus_score': corpus_bleu(
[[ref.split()] for ref in references],
[cand.split() for cand in candidates]
)
}
# Usage
references = ["The cat sat on the mat", "Hello world"]
candidates = ["A cat sat on a mat", "Hello there world"]
bleu_results = calculate_bleu(references, candidates)
3. ROUGE Score
from rouge_score import rouge_scorer
def calculate_rouge(references, candidates):
"""Calculate ROUGE scores for summarization"""
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
for ref, cand in zip(references, candidates):
score = scorer.score(ref, cand)
for metric in scores:
scores[metric].append(score[metric].fmeasure)
# Calculate averages
avg_scores = {}
for metric in scores:
avg_scores[f'{metric}_avg'] = sum(scores[metric]) / len(scores[metric])
return avg_scores
# Usage
references = ["The quick brown fox jumps", "Machine learning is powerful"]
candidates = ["A quick brown fox jumped", "ML is very powerful"]
rouge_results = calculate_rouge(references, candidates)
4. BERTScore
from bert_score import score
def calculate_bertscore(references, candidates, lang='en'):
"""Calculate BERTScore for semantic similarity"""
P, R, F1 = score(candidates, references, lang=lang, verbose=True)
return {
'precision': P.mean().item(),
'recall': R.mean().item(),
'f1': F1.mean().item(),
'individual_scores': {
'precision': P.tolist(),
'recall': R.tolist(),
'f1': F1.tolist()
}
}
# Usage
references = ["The weather is nice today", "I love programming"]
candidates = ["Today's weather is pleasant", "Programming is my passion"]
bert_results = calculate_bertscore(references, candidates)
Task-Specific Evaluation
1. Question Answering
import string
import re
def normalize_answer(s):
"""Normalize answer for comparison"""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def evaluate_qa(predictions, ground_truths):
"""Evaluate QA performance"""
exact_matches = []
f1_scores = []
for pred, gt in zip(predictions, ground_truths):
# Normalize answers
pred_norm = normalize_answer(pred)
gt_norm = normalize_answer(gt)
# Exact match
em = int(pred_norm == gt_norm)
exact_matches.append(em)
# F1 score
pred_tokens = pred_norm.split()
gt_tokens = gt_norm.split()
if len(pred_tokens) == 0 or len(gt_tokens) == 0:
f1 = int(pred_tokens == gt_tokens)
else:
common = set(pred_tokens) & set(gt_tokens)
if len(common) == 0:
f1 = 0
else:
precision = len(common) / len(pred_tokens)
recall = len(common) / len(gt_tokens)
f1 = 2 * precision * recall / (precision + recall)
f1_scores.append(f1)
return {
'exact_match': sum(exact_matches) / len(exact_matches),
'f1': sum(f1_scores) / len(f1_scores)
}
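A quick usage sketch with made-up answer strings (the expected output in the comment is approximate):
# Usage (hypothetical example data)
predictions = ["The Eiffel Tower", "in 1969"]
ground_truths = ["Eiffel Tower", "1969"]
qa_results = evaluate_qa(predictions, ground_truths)
print(qa_results)  # roughly {'exact_match': 0.5, 'f1': 0.83} for these pairs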
2. Text Classification
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
def evaluate_classification(y_true, y_pred, labels=None):
"""Comprehensive classification evaluation"""
# Basic metrics
report = classification_report(y_true, y_pred, target_names=labels, output_dict=True)
# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
# Per-class analysis
per_class_metrics = {}
for i, label in enumerate(labels or range(len(np.unique(y_true)))):
tp = cm[i, i]
fp = cm[:, i].sum() - tp
fn = cm[i, :].sum() - tp
tn = cm.sum() - tp - fp - fn
per_class_metrics[label] = {
'true_positives': tp,
'false_positives': fp,
'false_negatives': fn,
'true_negatives': tn,
'sensitivity': tp / (tp + fn) if (tp + fn) > 0 else 0,
'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0
}
return {
'classification_report': report,
'confusion_matrix': cm.tolist(),
'per_class_metrics': per_class_metrics
}
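As a brief illustration with invented labels (three classes, four examples):
# Usage (hypothetical example data)
y_true = ['positive', 'negative', 'positive', 'neutral']
y_pred = ['positive', 'negative', 'neutral', 'neutral']
clf_results = evaluate_classification(y_true, y_pred,
                                       labels=['negative', 'neutral', 'positive'])
print(clf_results['classification_report']['accuracy'])  # 0.75 for this toy data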
3. Text Generation
import nltk  # word_tokenize requires the 'punkt' tokenizer data: nltk.download('punkt')
import numpy as np
def evaluate_generation(generated_texts, reference_texts=None):
"""Evaluate text generation quality"""
metrics = {}
# Diversity metrics
all_tokens = []
for text in generated_texts:
tokens = nltk.word_tokenize(text.lower())
all_tokens.extend(tokens)
unique_tokens = set(all_tokens)
total_tokens = len(all_tokens)
metrics['diversity'] = {
'unique_tokens': len(unique_tokens),
'total_tokens': total_tokens,
'type_token_ratio': len(unique_tokens) / total_tokens if total_tokens > 0 else 0
}
# N-gram diversity
for n in [2, 3, 4]:
ngrams = []
for text in generated_texts:
tokens = nltk.word_tokenize(text.lower())
text_ngrams = list(nltk.ngrams(tokens, n))
ngrams.extend(text_ngrams)
unique_ngrams = set(ngrams)
total_ngrams = len(ngrams)
metrics['diversity'][f'{n}gram_diversity'] = (
len(unique_ngrams) / total_ngrams if total_ngrams > 0 else 0
)
# Length statistics
lengths = [len(nltk.word_tokenize(text)) for text in generated_texts]
metrics['length'] = {
'mean': np.mean(lengths),
'std': np.std(lengths),
'min': min(lengths),
'max': max(lengths)
}
# If reference texts provided, calculate similarity
if reference_texts:
bleu_scores = calculate_bleu(reference_texts, generated_texts)
rouge_scores = calculate_rouge(reference_texts, generated_texts)
bert_scores = calculate_bertscore(reference_texts, generated_texts)
metrics['similarity'] = {
'bleu': bleu_scores,
'rouge': rouge_scores,
'bertscore': bert_scores
}
return metrics
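For example, reusing the reference/candidate pairs from the BLEU example above purely for illustration (this assumes the calculate_bleu, calculate_rouge, and calculate_bertscore helpers defined earlier are in scope and that NLTK's punkt data and a BERTScore model are available):
# Usage (hypothetical example data)
generated = ["A cat sat on a mat", "Hello there world"]
references = ["The cat sat on the mat", "Hello world"]
gen_metrics = evaluate_generation(generated, reference_texts=references)
print(gen_metrics['diversity']['type_token_ratio'])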
Human Evaluation
1. Evaluation Framework
import random
from dataclasses import dataclass
from typing import List, Dict, Any
@dataclass
class EvaluationItem:
id: str
prompt: str
response: str
model_name: str
metadata: Dict[str, Any] = None
@dataclass
class EvaluationCriteria:
name: str
description: str
scale: List[int]
scale_descriptions: List[str]
class HumanEvaluationFramework:
def __init__(self):
self.criteria = [
EvaluationCriteria(
name="fluency",
description="How natural and well-formed is the text?",
scale=[1, 2, 3, 4, 5],
scale_descriptions=[
"Very poor - incomprehensible",
"Poor - many errors",
"Fair - some errors",
"Good - minor errors",
"Excellent - no errors"
]
),
EvaluationCriteria(
name="relevance",
description="How relevant is the response to the prompt?",
scale=[1, 2, 3, 4, 5],
scale_descriptions=[
"Completely irrelevant",
"Mostly irrelevant",
"Somewhat relevant",
"Mostly relevant",
"Completely relevant"
]
),
EvaluationCriteria(
name="helpfulness",
description="How helpful is the response?",
scale=[1, 2, 3, 4, 5],
scale_descriptions=[
"Not helpful at all",
"Slightly helpful",
"Moderately helpful",
"Very helpful",
"Extremely helpful"
]
)
]
def create_evaluation_batch(self, items: List[EvaluationItem],
batch_size: int = 10) -> List[List[EvaluationItem]]:
"""Create batches for human evaluation"""
random.shuffle(items)
batches = []
for i in range(0, len(items), batch_size):
batch = items[i:i + batch_size]
batches.append(batch)
return batches
def generate_evaluation_form(self, batch: List[EvaluationItem]) -> str:
"""Generate HTML form for human evaluation"""
html = """
<html>
<head><title>LLM Evaluation</title></head>
<body>
<h1>Language Model Evaluation</h1>
<form method="post">
"""
for item in batch:
html += f"""
<div style="border: 1px solid #ccc; margin: 20px; padding: 20px;">
<h3>Item {item.id}</h3>
<p><strong>Prompt:</strong> {item.prompt}</p>
<p><strong>Response:</strong> {item.response}</p>
<p><strong>Model:</strong> {item.model_name}</p>
<table>
"""
for criterion in self.criteria:
html += f"""
<tr>
<td><strong>{criterion.name.title()}:</strong><br>
<small>{criterion.description}</small></td>
<td>
"""
for i, (score, desc) in enumerate(zip(criterion.scale, criterion.scale_descriptions)):
html += f"""
<input type="radio" name="{item.id}_{criterion.name}" value="{score}" id="{item.id}_{criterion.name}_{score}">
<label for="{item.id}_{criterion.name}_{score}">{score} - {desc}</label><br>
"""
html += """
</td>
</tr>
"""
html += """
</table>
<p><strong>Comments:</strong></p>
<textarea name="{}_comments" rows="3" cols="50"></textarea>
</div>
""".format(item.id)
html += """
<input type="submit" value="Submit Evaluation">
</form>
</body>
</html>
"""
return html
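A minimal usage sketch that writes an evaluation form for a single hypothetical item to disk (the prompt, response, and file name are placeholders):
# Usage (hypothetical example)
framework = HumanEvaluationFramework()
items = [EvaluationItem(id='item-001',
                        prompt='Explain overfitting in one sentence.',
                        response='Overfitting is when a model memorizes training data.',
                        model_name='demo-model')]
batches = framework.create_evaluation_batch(items, batch_size=10)
html_form = framework.generate_evaluation_form(batches[0])
with open('evaluation_form.html', 'w') as f:
    f.write(html_form)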
2. Inter-Annotator Agreement
from sklearn.metrics import cohen_kappa_score
import numpy as np
def calculate_inter_annotator_agreement(annotations):
"""Calculate inter-annotator agreement"""
# annotations: dict with structure {item_id: {annotator_id: {criterion: score}}}
results = {}
# Get all criteria
criteria = set()
for item_data in annotations.values():
for annotator_data in item_data.values():
criteria.update(annotator_data.keys())
for criterion in criteria:
# Collect scores for this criterion
annotator_scores = {}
for item_id, item_data in annotations.items():
for annotator_id, scores in item_data.items():
if criterion in scores:
if annotator_id not in annotator_scores:
annotator_scores[annotator_id] = []
annotator_scores[annotator_id].append(scores[criterion])
# Calculate pairwise kappa scores
annotators = list(annotator_scores.keys())
kappa_scores = []
for i in range(len(annotators)):
for j in range(i + 1, len(annotators)):
ann1, ann2 = annotators[i], annotators[j]
if len(annotator_scores[ann1]) == len(annotator_scores[ann2]):
kappa = cohen_kappa_score(
annotator_scores[ann1],
annotator_scores[ann2]
)
kappa_scores.append(kappa)
results[criterion] = {
'mean_kappa': np.mean(kappa_scores) if kappa_scores else 0,
'std_kappa': np.std(kappa_scores) if kappa_scores else 0,
'pairwise_kappas': kappa_scores
}
return results
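For instance, with two annotators scoring two items on a single criterion (scores invented for illustration; in practice you would use many more items per annotator pair):
# Usage (hypothetical example data)
annotations = {
    'item-001': {'annotator_a': {'fluency': 5}, 'annotator_b': {'fluency': 4}},
    'item-002': {'annotator_a': {'fluency': 3}, 'annotator_b': {'fluency': 3}},
}
agreement = calculate_inter_annotator_agreement(annotations)
print(agreement['fluency']['mean_kappa'])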
Benchmark Evaluation
1. Standard Benchmarks
import requests
import torch  # used for inference in the evaluation methods below
class BenchmarkEvaluator:
def __init__(self):
self.benchmarks = {
'glue': self.evaluate_glue,
'superglue': self.evaluate_superglue,
'hellaswag': self.evaluate_hellaswag,
'mmlu': self.evaluate_mmlu
}
def evaluate_glue(self, model, tokenizer):
"""Evaluate on GLUE benchmark"""
from datasets import load_dataset
results = {}
glue_tasks = ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'qnli', 'rte']
for task in glue_tasks:
dataset = load_dataset('glue', task)
# Task-specific evaluation logic
if task == 'cola':
results[task] = self._evaluate_cola(model, tokenizer, dataset)
elif task == 'sst2':
results[task] = self._evaluate_sst2(model, tokenizer, dataset)
# Add other tasks...
return results
def _evaluate_cola(self, model, tokenizer, dataset):
"""Evaluate CoLA (Corpus of Linguistic Acceptability)"""
from sklearn.metrics import matthews_corrcoef
predictions = []
labels = []
for example in dataset['validation']:
sentence = example['sentence']
label = example['label']
# Get model prediction
inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
with torch.no_grad():
outputs = model(**inputs)
prediction = torch.argmax(outputs.logits, dim=-1).item()
predictions.append(prediction)
labels.append(label)
mcc = matthews_corrcoef(labels, predictions)
accuracy = sum(p == l for p, l in zip(predictions, labels)) / len(labels)
return {'matthews_correlation': mcc, 'accuracy': accuracy}
def evaluate_custom_benchmark(self, model, tokenizer, benchmark_data):
"""Evaluate on custom benchmark"""
results = []
for item in benchmark_data:
prompt = item['prompt']
expected = item['expected']
# Generate response
inputs = tokenizer(prompt, return_tensors='pt')
with torch.no_grad():
outputs = model.generate(
inputs.input_ids,
max_length=inputs.input_ids.shape[1] + 100,
num_return_sequences=1,
temperature=0.7
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
response = response[len(prompt):].strip()
# Evaluate response
score = self._score_response(response, expected, item.get('criteria', {}))
results.append({
'prompt': prompt,
'expected': expected,
'generated': response,
'score': score
})
return results
def _score_response(self, response, expected, criteria):
"""Score individual response"""
scores = {}
# Exact match
scores['exact_match'] = int(response.strip().lower() == expected.strip().lower())
# BLEU score
bleu_result = calculate_bleu([expected], [response])
scores['bleu'] = bleu_result['average_score']
# BERTScore
bert_result = calculate_bertscore([expected], [response])
scores['bertscore_f1'] = bert_result['f1']
# Custom criteria
for criterion, weight in criteria.items():
if criterion == 'length_penalty':
len_diff = abs(len(response.split()) - len(expected.split()))
scores[criterion] = max(0, 1 - len_diff * weight)
return scores
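A hypothetical call against a tiny custom benchmark (the prompts and expected answers are made up; `model` and `tokenizer` are assumed to be a causal LM and its tokenizer loaded elsewhere, and the calculate_bleu / calculate_bertscore helpers from earlier must be in scope):
# Usage (hypothetical example data)
benchmark_data = [
    {'prompt': 'Translate to French: Hello', 'expected': 'Bonjour'},
    {'prompt': 'What is 2 + 2?', 'expected': '4', 'criteria': {'length_penalty': 0.1}},
]
evaluator = BenchmarkEvaluator()
custom_results = evaluator.evaluate_custom_benchmark(model, tokenizer, benchmark_data)
for result in custom_results:
    print(result['score']['exact_match'], result['score']['bleu'])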
2. Leaderboard Integration
import requests
from datetime import datetime

class LeaderboardManager:
def __init__(self, leaderboard_url=None):
self.leaderboard_url = leaderboard_url
self.results_history = []
def submit_results(self, model_name, results, metadata=None):
"""Submit results to leaderboard"""
submission = {
'model_name': model_name,
'timestamp': datetime.now().isoformat(),
'results': results,
'metadata': metadata or {}
}
self.results_history.append(submission)
if self.leaderboard_url:
try:
response = requests.post(
f"{self.leaderboard_url}/submit",
json=submission
)
return response.json()
except Exception as e:
print(f"Failed to submit to leaderboard: {e}")
return None
def get_rankings(self, benchmark=None):
"""Get current rankings"""
if self.leaderboard_url:
try:
params = {'benchmark': benchmark} if benchmark else {}
response = requests.get(
f"{self.leaderboard_url}/rankings",
params=params
)
return response.json()
except Exception as e:
print(f"Failed to get rankings: {e}")
return None
# Local rankings from history
return self._calculate_local_rankings(benchmark)
def _calculate_local_rankings(self, benchmark=None):
"""Calculate rankings from local results"""
if not self.results_history:
return []
# Group by model and get latest results
latest_results = {}
for submission in self.results_history:
model = submission['model_name']
if model not in latest_results or submission['timestamp'] > latest_results[model]['timestamp']:
latest_results[model] = submission
# Calculate rankings
rankings = []
for model, submission in latest_results.items():
if benchmark and benchmark not in submission['results']:
continue
score = submission['results'].get(benchmark, {}).get('overall_score', 0)
rankings.append({
'model': model,
'score': score,
'timestamp': submission['timestamp']
})
rankings.sort(key=lambda x: x['score'], reverse=True)
return rankings
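Used locally (no leaderboard URL configured), submissions are kept in memory and ranked from history. The benchmark name and scores below are placeholders:
# Usage (hypothetical example data)
manager = LeaderboardManager()
manager.submit_results('model-a', {'mmlu': {'overall_score': 0.61}})
manager.submit_results('model-b', {'mmlu': {'overall_score': 0.58}})
print(manager.get_rankings(benchmark='mmlu'))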
Evaluation Best Practices
1. Comprehensive Evaluation Pipeline
import torch
from datetime import datetime

class ComprehensiveEvaluator:
def __init__(self):
self.automated_metrics = [
'perplexity', 'bleu', 'rouge', 'bertscore'
]
self.human_metrics = [
'fluency', 'relevance', 'helpfulness', 'safety'
]
self.benchmark_suites = [
'glue', 'superglue', 'hellaswag', 'mmlu'
]
def evaluate_model(self, model, tokenizer, test_data,
include_human=False, include_benchmarks=False):
"""Comprehensive model evaluation"""
results = {
'model_info': {
'name': model.name_or_path if hasattr(model, 'name_or_path') else 'unknown',
'parameters': sum(p.numel() for p in model.parameters()),
'evaluation_date': datetime.now().isoformat()
},
'automated_metrics': {},
'task_specific': {},
'efficiency': {}
}
# Automated metrics
print("Running automated evaluation...")
results['automated_metrics'] = self._run_automated_evaluation(
model, tokenizer, test_data
)
# Task-specific evaluation
print("Running task-specific evaluation...")
results['task_specific'] = self._run_task_evaluation(
model, tokenizer, test_data
)
# Efficiency metrics
print("Running efficiency evaluation...")
results['efficiency'] = self._run_efficiency_evaluation(
model, tokenizer, test_data
)
# Human evaluation (if requested)
if include_human:
print("Preparing human evaluation...")
results['human_evaluation'] = self._prepare_human_evaluation(
model, tokenizer, test_data
)
# Benchmark evaluation (if requested)
if include_benchmarks:
print("Running benchmark evaluation...")
results['benchmarks'] = self._run_benchmark_evaluation(
model, tokenizer
)
return results
def _run_automated_evaluation(self, model, tokenizer, test_data):
"""Run automated metrics"""
results = {}
# Generate responses
responses = []
references = []
for item in test_data[:100]: # Limit for efficiency
prompt = item['prompt']
reference = item['reference']
inputs = tokenizer(prompt, return_tensors='pt', truncation=True)
with torch.no_grad():
outputs = model.generate(
inputs.input_ids,
max_length=inputs.input_ids.shape[1] + 100,
do_sample=True,
temperature=0.7,
pad_token_id=tokenizer.eos_token_id
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
response = response[len(prompt):].strip()
responses.append(response)
references.append(reference)
# Calculate metrics
results['bleu'] = calculate_bleu(references, responses)
results['rouge'] = calculate_rouge(references, responses)
results['bertscore'] = calculate_bertscore(references, responses)
results['generation_quality'] = evaluate_generation(responses, references)
return results
def generate_report(self, results, output_file=None):
"""Generate comprehensive evaluation report"""
report = f"""
# Model Evaluation Report
## Model Information
- **Name**: {results['model_info']['name']}
- **Parameters**: {results['model_info']['parameters']:,}
- **Evaluation Date**: {results['model_info']['evaluation_date']}
## Automated Metrics
"""
# Add automated metrics
if 'automated_metrics' in results:
for metric, values in results['automated_metrics'].items():
if isinstance(values, dict):
report += f"\n### {metric.upper()}\n"
for k, v in values.items():
if isinstance(v, float):
report += f"- **{k}**: {v:.4f}\n"
else:
report += f"- **{k}**: {v}\n"
else:
report += f"- **{metric}**: {values:.4f}\n"
# Add efficiency metrics
if 'efficiency' in results:
report += "\n## Efficiency Metrics\n"
for metric, value in results['efficiency'].items():
report += f"- **{metric}**: {value}\n"
# Add recommendations
report += self._generate_recommendations(results)
if output_file:
with open(output_file, 'w') as f:
f.write(report)
return report
def _generate_recommendations(self, results):
"""Generate recommendations based on results"""
recommendations = "\n## Recommendations\n"
# Analyze results and provide recommendations
if 'automated_metrics' in results:
bleu_score = results['automated_metrics'].get('bleu', {}).get('average_score', 0)
if bleu_score < 0.1:
recommendations += "- **Low BLEU Score**: Consider improving training data quality or fine-tuning approach\n"
elif bleu_score > 0.3:
recommendations += "- **Good BLEU Score**: Model shows strong text generation capabilities\n"
if 'efficiency' in results:
latency = results['efficiency'].get('avg_latency_ms', 0)
if latency > 1000:
recommendations += "- **High Latency**: Consider model optimization techniques like quantization or distillation\n"
elif latency < 100:
recommendations += "- **Low Latency**: Model is well-optimized for production use\n"
return recommendations
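Tying it together, a hedged end-to-end sketch. It assumes `model`, `tokenizer`, and a list of `{'prompt': ..., 'reference': ...}` test items are available, that the metric helpers defined earlier are in scope, and that the remaining private helpers referenced by evaluate_model (such as _run_task_evaluation and _run_efficiency_evaluation) have been implemented along the lines of the task-specific and efficiency measurements shown above:
# Usage (hypothetical example)
evaluator = ComprehensiveEvaluator()
test_data = [{'prompt': 'Summarize: ...', 'reference': 'A short summary.'}]  # placeholder items
results = evaluator.evaluate_model(model, tokenizer, test_data)
report = evaluator.generate_report(results, output_file='evaluation_report.md')
print(report)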
Conclusion
Effective LLM evaluation requires a multi-faceted approach combining automated metrics, human assessment, and benchmark testing. Key principles:
- Multiple Metrics: No single metric captures all aspects of model quality
- Task Alignment: Evaluation should match intended use cases
- Human Validation: Automated metrics should be validated with human judgment
- Continuous Monitoring: Evaluation should be ongoing, not one-time
- Transparency: Results should be reproducible and well-documented
The evaluation landscape continues to evolve with new metrics and methodologies emerging regularly. Stay updated with the latest research and adapt your evaluation strategy accordingly.