Prompt Engineering: Mastering Communication with Large Language Models
Prompt engineering is the art and science of crafting effective inputs to guide large language models toward desired outputs. This comprehensive guide covers fundamental principles, advanced techniques, and systematic approaches to prompt optimization.
Fundamentals of Prompt Engineering
Core Principles
- Clarity: Unambiguous instructions that leave no room for misinterpretation
- Context: Relevant background information
- Specificity: Precise requirements and constraints
- Structure: Logical organization of information
- Examples: Demonstrations of desired behavior
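To make these principles concrete, the snippet below sketches a small sentiment-classification prompt; the task and wording are purely illustrative, but the prompt applies all five principles at once: context up front, a clear and specific instruction with a fixed label set, a predictable structure, and one example of the desired behavior.

# Illustrative prompt applying the five core principles (hypothetical task).
example_prompt = """You are reviewing customer feedback for a software product.

Classify the sentiment of the text below as exactly one of: positive, negative, neutral.

Example:
Text: "Setup took five minutes and everything just worked."
Sentiment: positive

Text: "{user_text}"
Sentiment:"""

print(example_prompt.format(user_text="The app crashes every time I open it."))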
Basic Prompt Structure
import json

import numpy as np
import pandas as pd  # used by the evaluation utilities later in this guide
from typing import List, Dict, Optional
from dataclasses import dataclass
from sentence_transformers import SentenceTransformer
@dataclass
class PromptTemplate:
"""Structure for organizing prompt components"""
system_message: str = ""
context: str = ""
instruction: str = ""
    examples: Optional[List[Dict[str, str]]] = None
    constraints: Optional[List[str]] = None
output_format: str = ""
def __post_init__(self):
if self.examples is None:
self.examples = []
if self.constraints is None:
self.constraints = []
def build_prompt(self) -> str:
"""Build complete prompt from components"""
prompt_parts = []
# System message
if self.system_message:
prompt_parts.append(f"System: {self.system_message}")
# Context
if self.context:
prompt_parts.append(f"Context: {self.context}")
# Examples (few-shot)
if self.examples:
prompt_parts.append("Examples:")
for i, example in enumerate(self.examples, 1):
prompt_parts.append(f"Example {i}:")
for key, value in example.items():
prompt_parts.append(f"{key}: {value}")
# Constraints
if self.constraints:
prompt_parts.append("Constraints:")
for constraint in self.constraints:
prompt_parts.append(f"- {constraint}")
# Output format
if self.output_format:
prompt_parts.append(f"Output format: {self.output_format}")
# Main instruction
if self.instruction:
prompt_parts.append(f"Task: {self.instruction}")
return "\n\n".join(prompt_parts)
class PromptEngineer:
def __init__(self, model_name="gpt-3.5-turbo"):
self.model_name = model_name
self.conversation_history = []
self.prompt_templates = {}
# Initialize sentence transformer for similarity calculations
try:
self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception:
print("Warning: SentenceTransformer not available. Some features may be limited.")
self.sentence_model = None
def create_basic_prompt(self, task: str, context: str = "", examples: List[str] = None) -> str:
"""Create a basic prompt with task, context, and examples"""
prompt_parts = []
if context:
prompt_parts.append(f"Context: {context}")
if examples:
prompt_parts.append("Examples:")
for i, example in enumerate(examples, 1):
prompt_parts.append(f"{i}. {example}")
prompt_parts.append(f"Task: {task}")
return "\n\n".join(prompt_parts)
def zero_shot_prompt(self, task: str, context: str = "") -> str:
"""Create zero-shot prompt"""
template = PromptTemplate(
instruction=task,
context=context
)
return template.build_prompt()
def few_shot_prompt(self, task: str, examples: List[Dict[str, str]], context: str = "") -> str:
"""Create few-shot prompt with examples"""
template = PromptTemplate(
instruction=task,
context=context,
examples=examples
)
return template.build_prompt()
def chain_of_thought_prompt(self, task: str, examples: List[Dict[str, str]] = None) -> str:
"""Create chain-of-thought prompt for reasoning tasks"""
cot_instruction = f"""
{task}
Let's think step by step to solve this problem:
1. First, identify the key information
2. Then, break down the problem into smaller parts
3. Solve each part systematically
4. Combine the results for the final answer
Please show your reasoning process clearly.
"""
template = PromptTemplate(
instruction=cot_instruction,
examples=examples or []
)
return template.build_prompt()
def role_based_prompt(self, task: str, role: str, context: str = "") -> str:
"""Create role-based prompt"""
system_message = f"You are a {role}. Respond in character with appropriate expertise and tone."
template = PromptTemplate(
system_message=system_message,
instruction=task,
context=context
)
return template.build_prompt()
# Example usage
def demonstrate_basic_prompting():
engineer = PromptEngineer()
# Zero-shot example
task = "Classify the sentiment of this text as positive, negative, or neutral"
context = "Text: 'I love this new restaurant! The food is amazing.'"
zero_shot = engineer.zero_shot_prompt(task, context)
print("Zero-shot prompt:")
print(zero_shot)
print("\n" + "="*50 + "\n")
# Few-shot example
examples = [
{
"Input": "The movie was terrible and boring.",
"Output": "negative"
},
{
"Input": "It's an okay product, nothing special.",
"Output": "neutral"
},
{
"Input": "Absolutely fantastic experience!",
"Output": "positive"
}
]
few_shot = engineer.few_shot_prompt(task, examples, context)
print("Few-shot prompt:")
print(few_shot)
print("\n" + "="*50 + "\n")
# Chain-of-thought example
math_task = "Solve: If a train travels 120 miles in 2 hours, and then 180 miles in 3 hours, what is its average speed for the entire journey?"
cot_examples = [
{
"Problem": "A car travels 60 miles in 1 hour, then 90 miles in 2 hours. What's the average speed?",
"Solution": "Step 1: Total distance = 60 + 90 = 150 miles\nStep 2: Total time = 1 + 2 = 3 hours\nStep 3: Average speed = 150 ÷ 3 = 50 mph"
}
]
cot_prompt = engineer.chain_of_thought_prompt(math_task, cot_examples)
print("Chain-of-thought prompt:")
print(cot_prompt)
demonstrate_basic_prompting()
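The PromptEngineer above only builds prompt strings; getting an actual completion still requires a model call. Below is a minimal sketch, assuming the OpenAI Python SDK (v1-style client) and an OPENAI_API_KEY in the environment; any other provider's chat client can be swapped in the same way, and this helper is reused by later sketches in this guide.

from openai import OpenAI

def run_prompt(prompt: str, model: str = "gpt-3.5-turbo") -> str:
    """Send a single-turn prompt to a chat model and return the text reply."""
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,  # keep outputs stable for evaluation and testing
    )
    return response.choices[0].message.content

# Example usage with one of the prompts built above:
# print(run_prompt(PromptEngineer().zero_shot_prompt("Summarize this text", "Text: ...")))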
Advanced Prompting Techniques
Chain-of-Thought Reasoning
class ChainOfThoughtPrompting:
def __init__(self):
self.reasoning_patterns = {
'mathematical': [
"Identify the given information",
"Determine what needs to be found",
"Choose the appropriate formula or method",
"Perform the calculations step by step",
"Verify the answer makes sense"
],
'logical': [
"Identify the premises",
"Determine the logical structure",
"Apply logical rules",
"Draw conclusions",
"Check for consistency"
],
'analytical': [
"Break down the problem into components",
"Analyze each component separately",
"Identify relationships between components",
"Synthesize findings",
"Draw overall conclusions"
]
}
def create_cot_prompt(self, problem: str, reasoning_type: str = 'analytical') -> str:
"""Create chain-of-thought prompt with structured reasoning"""
steps = self.reasoning_patterns.get(reasoning_type, self.reasoning_patterns['analytical'])
prompt = f"""
Problem: {problem}
Let's solve this step by step:
"""
for i, step in enumerate(steps, 1):
prompt += f"Step {i}: {step}\n"
prompt += "\nNow, let's work through each step:\n"
return prompt
def self_consistency_prompting(self, problem: str, num_paths: int = 3) -> List[str]:
"""Generate multiple reasoning paths for self-consistency"""
base_prompt = f"""
Problem: {problem}
Let's think about this problem from different angles and solve it step by step.
"""
prompts = []
reasoning_approaches = [
"Approach 1: Start with the most obvious facts and build up",
"Approach 2: Work backwards from what we want to find",
"Approach 3: Consider alternative interpretations and methods"
]
for i in range(min(num_paths, len(reasoning_approaches))):
prompt = base_prompt + f"\n{reasoning_approaches[i]}\n"
prompts.append(prompt)
return prompts
def tree_of_thoughts_prompt(self, problem: str) -> str:
"""Create tree-of-thoughts prompt for complex reasoning"""
prompt = f"""
Problem: {problem}
Let's explore this problem using a tree of thoughts approach:
1. Generate multiple possible approaches:
- Approach A: [Describe first approach]
- Approach B: [Describe second approach]
- Approach C: [Describe third approach]
2. For each approach, consider:
- What are the key steps?
- What assumptions are we making?
- What could go wrong?
- How confident are we in this path?
3. Evaluate and compare approaches:
- Which approach seems most promising?
- Can we combine insights from different approaches?
- What additional information would help?
4. Execute the best approach:
- Work through the chosen method step by step
- Double-check each step
- Verify the final answer
Let's begin:
"""
return prompt
# Example usage
def demonstrate_advanced_reasoning():
cot = ChainOfThoughtPrompting()
# Mathematical reasoning
math_problem = "A store offers a 20% discount on all items. If an item originally costs $80, and there's an additional 5% tax on the discounted price, what is the final amount a customer pays?"
math_prompt = cot.create_cot_prompt(math_problem, 'mathematical')
print("Mathematical Chain-of-Thought:")
print(math_prompt)
print("\n" + "="*50 + "\n")
# Self-consistency prompting
logic_problem = "All birds can fly. Penguins are birds. Can penguins fly? Explain the logical issue with this reasoning."
consistency_prompts = cot.self_consistency_prompting(logic_problem)
print("Self-Consistency Prompts:")
for i, prompt in enumerate(consistency_prompts, 1):
print(f"Path {i}:")
print(prompt)
print("-" * 30)
# Tree of thoughts
complex_problem = "Design a system to reduce traffic congestion in a major city while considering environmental impact, cost, and citizen satisfaction."
tot_prompt = cot.tree_of_thoughts_prompt(complex_problem)
print("Tree-of-Thoughts Prompt:")
print(tot_prompt)
demonstrate_advanced_reasoning()
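Note that self_consistency_prompting only generates the alternative reasoning paths. The full self-consistency technique also samples an answer along each path and keeps the majority answer; a minimal aggregation sketch follows, where run_prompt is the hypothetical model-call helper sketched earlier and extract_answer is any caller-supplied function that pulls the final answer out of a raw response.

from collections import Counter

def self_consistent_answer(prompts, run_prompt, extract_answer):
    """Run every reasoning path, extract each final answer, and majority-vote."""
    answers = [extract_answer(run_prompt(p)) for p in prompts]
    winner, votes = Counter(answers).most_common(1)[0]
    return winner, votes / len(answers)  # chosen answer plus agreement ratio

# Example with a naive extractor that treats the last line as the answer:
# answer, agreement = self_consistent_answer(
#     consistency_prompts, run_prompt, lambda r: r.strip().splitlines()[-1]
# )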
Prompt Optimization Techniques
class PromptOptimizer:
def __init__(self):
self.optimization_history = []
self.performance_metrics = {}
def iterative_refinement(self, base_prompt: str, test_cases: List[Dict],
max_iterations: int = 5) -> str:
"""Iteratively refine prompt based on test case performance"""
current_prompt = base_prompt
best_prompt = base_prompt
best_score = 0
for iteration in range(max_iterations):
# Evaluate current prompt
score = self.evaluate_prompt(current_prompt, test_cases)
if score > best_score:
best_score = score
best_prompt = current_prompt
# Generate refinement suggestions
refinements = self.suggest_refinements(current_prompt, test_cases)
# Apply best refinement
if refinements:
current_prompt = self.apply_refinement(current_prompt, refinements[0])
self.optimization_history.append({
'iteration': iteration,
'prompt': current_prompt,
'score': score
})
return best_prompt
def evaluate_prompt(self, prompt: str, test_cases: List[Dict]) -> float:
"""Evaluate prompt performance on test cases"""
# Simplified evaluation - in practice, you'd use actual model responses
score = 0
total_cases = len(test_cases)
for case in test_cases:
# Check if prompt contains relevant keywords for the task
task_keywords = case.get('keywords', [])
prompt_lower = prompt.lower()
keyword_score = sum(1 for keyword in task_keywords if keyword.lower() in prompt_lower)
case_score = min(keyword_score / len(task_keywords), 1.0) if task_keywords else 0.5
score += case_score
return score / total_cases if total_cases > 0 else 0
def suggest_refinements(self, prompt: str, test_cases: List[Dict]) -> List[str]:
"""Suggest prompt refinements based on test case analysis"""
refinements = []
# Analyze common patterns in test cases
all_keywords = []
for case in test_cases:
all_keywords.extend(case.get('keywords', []))
# Find missing important keywords
prompt_lower = prompt.lower()
missing_keywords = [kw for kw in set(all_keywords) if kw.lower() not in prompt_lower]
if missing_keywords:
refinements.append(f"Add context about: {', '.join(missing_keywords[:3])}")
# Suggest structure improvements
if "step by step" not in prompt_lower and any("reasoning" in str(case) for case in test_cases):
refinements.append("Add 'Let's think step by step' for better reasoning")
if "example" not in prompt_lower:
refinements.append("Consider adding examples to clarify the task")
return refinements
def apply_refinement(self, prompt: str, refinement: str) -> str:
"""Apply a specific refinement to the prompt"""
if "Add context about:" in refinement:
keywords = refinement.split("Add context about: ")[1]
return f"{prompt}\n\nAdditional context: Consider {keywords} when responding."
elif "step by step" in refinement:
return f"{prompt}\n\nLet's think step by step to ensure accuracy."
elif "examples" in refinement:
return f"{prompt}\n\nPlease provide examples to illustrate your response when appropriate."
return prompt
def a_b_test_prompts(self, prompt_a: str, prompt_b: str, test_cases: List[Dict]) -> Dict:
"""A/B test two prompts"""
score_a = self.evaluate_prompt(prompt_a, test_cases)
score_b = self.evaluate_prompt(prompt_b, test_cases)
return {
'prompt_a_score': score_a,
'prompt_b_score': score_b,
'winner': 'A' if score_a > score_b else 'B',
'improvement': abs(score_a - score_b)
}
def genetic_prompt_optimization(self, base_prompts: List[str], test_cases: List[Dict],
generations: int = 5, population_size: int = 10) -> str:
"""Use genetic algorithm approach for prompt optimization"""
population = base_prompts[:population_size]
# Fill population if needed
while len(population) < population_size:
population.append(self.mutate_prompt(population[0]))
for generation in range(generations):
# Evaluate fitness
fitness_scores = [(prompt, self.evaluate_prompt(prompt, test_cases))
for prompt in population]
# Sort by fitness
fitness_scores.sort(key=lambda x: x[1], reverse=True)
# Select top performers
top_half = [prompt for prompt, score in fitness_scores[:population_size//2]]
# Generate new population
new_population = top_half.copy()
# Crossover and mutation
while len(new_population) < population_size:
parent1, parent2 = np.random.choice(top_half, 2, replace=False)
child = self.crossover_prompts(parent1, parent2)
child = self.mutate_prompt(child)
new_population.append(child)
population = new_population
# Return best prompt
final_scores = [(prompt, self.evaluate_prompt(prompt, test_cases))
for prompt in population]
best_prompt = max(final_scores, key=lambda x: x[1])[0]
return best_prompt
def mutate_prompt(self, prompt: str) -> str:
"""Apply random mutations to a prompt"""
mutations = [
lambda p: p + "\n\nPlease be specific and detailed in your response.",
lambda p: p + "\n\nConsider multiple perspectives when answering.",
lambda p: p.replace(".", ". Think carefully about this."),
lambda p: f"Important: {p}",
lambda p: p + "\n\nProvide reasoning for your answer."
]
mutation = np.random.choice(mutations)
return mutation(prompt)
def crossover_prompts(self, prompt1: str, prompt2: str) -> str:
"""Combine two prompts to create a new one"""
sentences1 = prompt1.split('.')
sentences2 = prompt2.split('.')
# Randomly select sentences from each prompt
combined_sentences = []
max_len = max(len(sentences1), len(sentences2))
for i in range(max_len):
if i < len(sentences1) and i < len(sentences2):
chosen = sentences1[i] if np.random.random() < 0.5 else sentences2[i]
elif i < len(sentences1):
chosen = sentences1[i]
else:
chosen = sentences2[i]
combined_sentences.append(chosen)
return '.'.join(combined_sentences)
# Example usage
def demonstrate_prompt_optimization():
optimizer = PromptOptimizer()
# Test cases for sentiment analysis
test_cases = [
{
'input': 'I love this product!',
'expected_output': 'positive',
'keywords': ['sentiment', 'emotion', 'positive', 'negative']
},
{
'input': 'This is terrible.',
'expected_output': 'negative',
'keywords': ['sentiment', 'emotion', 'positive', 'negative']
},
{
'input': 'It\'s okay, nothing special.',
'expected_output': 'neutral',
'keywords': ['sentiment', 'emotion', 'neutral']
}
]
# Base prompts to optimize
base_prompts = [
"Classify the sentiment of the given text.",
"Determine if the text expresses positive, negative, or neutral sentiment.",
"Analyze the emotional tone of the text and categorize it."
]
print("Original prompts:")
for i, prompt in enumerate(base_prompts):
score = optimizer.evaluate_prompt(prompt, test_cases)
print(f"{i+1}. {prompt} (Score: {score:.2f})")
# A/B test
ab_result = optimizer.a_b_test_prompts(base_prompts[0], base_prompts[1], test_cases)
print(f"\nA/B Test Result: Prompt {ab_result['winner']} wins with improvement of {ab_result['improvement']:.2f}")
# Iterative refinement
refined_prompt = optimizer.iterative_refinement(base_prompts[0], test_cases)
print(f"\nRefined prompt: {refined_prompt}")
# Genetic optimization
best_prompt = optimizer.genetic_prompt_optimization(base_prompts, test_cases, generations=3)
print(f"\nGenetically optimized prompt: {best_prompt}")
demonstrate_prompt_optimization()
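The keyword-overlap heuristic in evaluate_prompt is only a placeholder for real feedback. Once a model is in the loop, the natural fitness function for iterative refinement, A/B testing, and the genetic search is accuracy against the expected outputs in the test cases. A hedged sketch is shown below; it reuses the hypothetical run_prompt helper from earlier, and loose substring matching stands in for task-specific scoring.

def evaluate_prompt_with_model(prompt: str, test_cases: List[Dict], run_prompt) -> float:
    """Fraction of test cases whose expected output appears in the model's reply."""
    if not test_cases:
        return 0.0
    hits = 0
    for case in test_cases:
        response = run_prompt(f"{prompt}\n\nInput: {case['input']}")
        if case['expected_output'].lower() in response.lower():
            hits += 1
    return hits / len(test_cases)

# This can replace PromptOptimizer.evaluate_prompt (or be injected as a scoring
# callback) without changing any of the optimization loops above.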
Specialized Prompting Strategies
Task-Specific Prompting
class TaskSpecificPrompting:
def __init__(self):
self.task_templates = {
'classification': self.classification_prompt,
'summarization': self.summarization_prompt,
'question_answering': self.qa_prompt,
'code_generation': self.code_generation_prompt,
'creative_writing': self.creative_writing_prompt,
'data_analysis': self.data_analysis_prompt
}
def classification_prompt(self, text: str, categories: List[str],
context: str = "") -> str:
"""Create classification prompt"""
categories_str = ", ".join(categories)
prompt = f"""
Task: Text Classification
{f"Context: {context}" if context else ""}
Categories: {categories_str}
Text to classify: "{text}"
Instructions:
1. Read the text carefully
2. Consider the context and meaning
3. Choose the most appropriate category
4. Provide a brief explanation for your choice
Classification:"""
return prompt
def summarization_prompt(self, text: str, length: str = "medium",
style: str = "neutral") -> str:
"""Create summarization prompt"""
length_instructions = {
"short": "in 1-2 sentences",
"medium": "in 3-5 sentences",
"long": "in 1-2 paragraphs"
}
style_instructions = {
"neutral": "using objective, factual language",
"casual": "using conversational, accessible language",
"formal": "using professional, academic language",
"bullet": "using bullet points for key information"
}
prompt = f"""
Task: Text Summarization
Instructions:
- Summarize the following text {length_instructions.get(length, "concisely")}
- Use {style_instructions.get(style, "clear and concise language")}
- Capture the main ideas and key points
- Maintain the original meaning and context
Text to summarize:
{text}
Summary:"""
return prompt
def qa_prompt(self, context: str, question: str, answer_type: str = "comprehensive") -> str:
"""Create question-answering prompt"""
answer_instructions = {
"brief": "Provide a concise, direct answer",
"comprehensive": "Provide a detailed, thorough answer with explanations",
"step_by_step": "Break down the answer into clear steps",
"examples": "Include relevant examples to illustrate your answer"
}
prompt = f"""
Task: Question Answering
Context:
{context}
Question: {question}
Instructions:
- {answer_instructions.get(answer_type, "Provide a clear and accurate answer")}
- Base your answer on the provided context
- If the context doesn't contain enough information, state this clearly
- Cite specific parts of the context when relevant
Answer:"""
return prompt
def code_generation_prompt(self, task: str, language: str = "Python",
requirements: List[str] = None) -> str:
"""Create code generation prompt"""
requirements = requirements or []
prompt = f"""
Task: Code Generation
Programming Language: {language}
Task Description: {task}
Requirements:
"""
if requirements:
for req in requirements:
prompt += f"- {req}\n"
else:
prompt += "- Write clean, readable code\n- Include appropriate comments\n- Handle edge cases\n"
prompt += f"""
Additional Instructions:
- Follow {language} best practices and conventions
- Include error handling where appropriate
- Provide example usage if applicable
- Explain complex logic with comments
Code:
```{language.lower()}
"""
return prompt
def creative_writing_prompt(self, genre: str, theme: str, length: str = "short",
constraints: List[str] = None) -> str:
"""Create creative writing prompt"""
constraints = constraints or []
length_guide = {
"flash": "under 100 words",
"short": "200-500 words",
"medium": "500-1000 words",
"long": "1000+ words"
}
prompt = f"""
Task: Creative Writing
Genre: {genre}
Theme: {theme}
Target Length: {length_guide.get(length, "as appropriate")}
"""
if constraints:
prompt += "Constraints:\n"
for constraint in constraints:
prompt += f"- {constraint}\n"
prompt += "\n"
prompt += f"""
Instructions:
- Create an engaging {genre} piece centered around the theme of {theme}
- Develop compelling characters and/or scenarios
- Use vivid, descriptive language
- Maintain consistency in tone and style
- Create a satisfying narrative arc
Story:"""
return prompt
def data_analysis_prompt(self, data_description: str, analysis_type: str,
specific_questions: List[str] = None) -> str:
"""Create data analysis prompt"""
specific_questions = specific_questions or []
analysis_instructions = {
"descriptive": "Provide descriptive statistics and summarize key patterns",
"exploratory": "Explore relationships and identify interesting insights",
"diagnostic": "Investigate causes and explain observed patterns",
"predictive": "Identify trends and make predictions based on the data"
}
prompt = f"""
Task: Data Analysis
Data Description: {data_description}
Analysis Type: {analysis_type}
Objective: {analysis_instructions.get(analysis_type, "Analyze the data comprehensively")}
"""
if specific_questions:
prompt += "Specific Questions to Address:\n"
for i, question in enumerate(specific_questions, 1):
prompt += f"{i}. {question}\n"
prompt += "\n"
prompt += """
Instructions:
- Examine the data systematically
- Identify key patterns, trends, and anomalies
- Provide statistical evidence for your findings
- Suggest actionable insights or recommendations
- Highlight any limitations or assumptions
Analysis:"""
return prompt
def get_task_prompt(self, task_type: str, **kwargs) -> str:
"""Get prompt for specific task type"""
if task_type in self.task_templates:
return self.task_templates[task_type](**kwargs)
else:
raise ValueError(f"Unknown task type: {task_type}")
# Example usage
def demonstrate_task_specific_prompting():
task_prompter = TaskSpecificPrompting()
# Classification example
classification_prompt = task_prompter.classification_prompt(
text="The new smartphone has an amazing camera and long battery life, but the price is quite high.",
categories=["positive", "negative", "neutral"],
context="Product review analysis"
)
print("Classification Prompt:")
print(classification_prompt)
print("\n" + "="*50 + "\n")
# Code generation example
code_prompt = task_prompter.code_generation_prompt(
task="Create a function to calculate the factorial of a number",
language="Python",
requirements=[
"Handle negative numbers appropriately",
"Use recursion",
"Include input validation"
]
)
print("Code Generation Prompt:")
print(code_prompt)
print("\n" + "="*50 + "\n")
# Creative writing example
creative_prompt = task_prompter.creative_writing_prompt(
genre="science fiction",
theme="artificial intelligence",
length="short",
constraints=[
"Set in the year 2050",
"Include a moral dilemma",
"First-person perspective"
]
)
print("Creative Writing Prompt:")
print(creative_prompt)
demonstrate_task_specific_prompting()
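The get_task_prompt dispatcher defined above can also be driven by the task name alone; the keyword arguments just need to match the corresponding template method's signature. A short usage example with placeholder text:

prompter = TaskSpecificPrompting()
summary_prompt = prompter.get_task_prompt(
    "summarization",
    text="Large language models are highly sensitive to how a task is phrased...",
    length="short",
    style="bullet",
)
print(summary_prompt)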
Prompt Evaluation and Testing
Systematic Evaluation Framework
class PromptEvaluator:
def __init__(self):
self.evaluation_metrics = {}
self.test_results = []
def create_test_suite(self, task_type: str, test_cases: List[Dict]) -> Dict:
"""Create comprehensive test suite for prompt evaluation"""
test_suite = {
'task_type': task_type,
'test_cases': test_cases,
'metrics': self.get_metrics_for_task(task_type),
'created_at': pd.Timestamp.now()
}
return test_suite
def get_metrics_for_task(self, task_type: str) -> List[str]:
"""Get appropriate metrics for different task types"""
task_metrics = {
'classification': ['accuracy', 'precision', 'recall', 'f1_score'],
'generation': ['fluency', 'coherence', 'relevance', 'creativity'],
'summarization': ['coverage', 'conciseness', 'faithfulness'],
'qa': ['correctness', 'completeness', 'clarity'],
'reasoning': ['logical_consistency', 'step_completeness', 'conclusion_validity']
}
return task_metrics.get(task_type, ['quality', 'relevance', 'clarity'])
def evaluate_response_quality(self, response: str, expected: str = None,
criteria: List[str] = None) -> Dict[str, float]:
"""Evaluate response quality using multiple criteria"""
criteria = criteria or ['clarity', 'relevance', 'completeness']
scores = {}
# Length-based metrics
scores['length_score'] = min(len(response.split()) / 50, 1.0) # Normalize to 50 words
# Clarity (simplified - based on sentence structure)
sentences = response.split('.')
avg_sentence_length = np.mean([len(s.split()) for s in sentences if s.strip()])
scores['clarity'] = max(0, 1 - abs(avg_sentence_length - 15) / 15) # Optimal ~15 words
# Relevance (keyword matching if expected answer provided)
if expected:
response_words = set(response.lower().split())
expected_words = set(expected.lower().split())
overlap = len(response_words.intersection(expected_words))
scores['relevance'] = overlap / len(expected_words) if expected_words else 0
else:
scores['relevance'] = 0.8 # Default score when no expected answer
# Completeness (based on response length and structure)
has_intro = any(word in response.lower() for word in ['first', 'initially', 'to begin'])
has_body = len(response.split()) > 20
has_conclusion = any(word in response.lower() for word in ['conclusion', 'summary', 'finally'])
scores['completeness'] = (has_intro + has_body + has_conclusion) / 3
return scores
def batch_evaluate_prompts(self, prompts: List[str], test_suite: Dict) -> pd.DataFrame:
"""Evaluate multiple prompts on a test suite"""
results = []
for i, prompt in enumerate(prompts):
prompt_results = {
'prompt_id': i,
'prompt': prompt[:100] + "..." if len(prompt) > 100 else prompt
}
# Evaluate on each test case
case_scores = []
for case in test_suite['test_cases']:
# Simulate model response (in practice, you'd call the actual model)
simulated_response = self.simulate_model_response(prompt, case['input'])
# Evaluate response
scores = self.evaluate_response_quality(
simulated_response,
case.get('expected_output'),
test_suite['metrics']
)
case_scores.append(scores)
            # Aggregate across test cases, using the metric keys the response
            # evaluator actually produced (the task-level metric names in
            # test_suite['metrics'] are aspirational labels, not computed here)
            produced_metrics = list(case_scores[0].keys())
            for metric in produced_metrics:
                prompt_results[metric] = np.mean([scores.get(metric, 0) for scores in case_scores])
            # Overall score
            prompt_results['overall_score'] = np.mean([
                prompt_results[metric] for metric in produced_metrics
            ])
results.append(prompt_results)
return pd.DataFrame(results)
def simulate_model_response(self, prompt: str, input_text: str) -> str:
"""Simulate model response for testing purposes"""
# This is a simplified simulation - replace with actual model calls
combined_input = f"{prompt}\n\nInput: {input_text}"
# Simple response generation based on prompt keywords
if 'classify' in prompt.lower():
return "positive" # Simplified classification
elif 'summarize' in prompt.lower():
return f"Summary of the input: {input_text[:50]}..."
elif 'question' in prompt.lower():
return f"Based on the context, the answer is related to {input_text[:30]}..."
else:
return f"Response to: {input_text[:50]}..."
def compare_prompt_versions(self, prompt_versions: Dict[str, str],
test_suite: Dict) -> Dict:
"""Compare different versions of a prompt"""
comparison_results = {}
for version_name, prompt in prompt_versions.items():
results = self.batch_evaluate_prompts([prompt], test_suite)
comparison_results[version_name] = results.iloc[0].to_dict()
        # Create comparison DataFrame (one row per prompt version, with
        # numeric dtypes preserved so idxmax works on the score columns)
        comparison_df = pd.DataFrame.from_dict(comparison_results, orient='index')
        # Find the best-performing version for each metric column actually computed
        best_versions = {}
        metric_columns = [col for col in comparison_df.columns
                          if col not in ('prompt_id', 'prompt', 'overall_score')]
        for metric in metric_columns:
            best_versions[metric] = comparison_df[metric].idxmax()
return {
'comparison_table': comparison_df,
'best_versions': best_versions,
'overall_winner': comparison_df['overall_score'].idxmax()
}
def statistical_significance_test(self, results_a: List[float],
results_b: List[float]) -> Dict:
"""Test statistical significance between two prompt versions"""
from scipy import stats
# Perform t-test
t_stat, p_value = stats.ttest_ind(results_a, results_b)
        # Calculate effect size (Cohen's d) with the pooled sample standard
        # deviation; ddof=1 matches the (n - 1) weights in the pooling formula
        pooled_std = np.sqrt(((len(results_a) - 1) * np.var(results_a, ddof=1) +
                              (len(results_b) - 1) * np.var(results_b, ddof=1)) /
                             (len(results_a) + len(results_b) - 2))
cohens_d = (np.mean(results_a) - np.mean(results_b)) / pooled_std
return {
't_statistic': t_stat,
'p_value': p_value,
'significant': p_value < 0.05,
'effect_size': cohens_d,
'interpretation': self.interpret_effect_size(cohens_d)
}
def interpret_effect_size(self, cohens_d: float) -> str:
"""Interpret Cohen's d effect size"""
abs_d = abs(cohens_d)
if abs_d < 0.2:
return "negligible"
elif abs_d < 0.5:
return "small"
elif abs_d < 0.8:
return "medium"
else:
return "large"
# Example usage
def demonstrate_prompt_evaluation():
evaluator = PromptEvaluator()
# Create test suite
test_cases = [
{
'input': 'The movie was fantastic with great acting and plot.',
'expected_output': 'positive',
'category': 'clear_positive'
},
{
'input': 'I hate this product, it broke after one day.',
'expected_output': 'negative',
'category': 'clear_negative'
},
{
'input': 'The service was okay, nothing special but not bad either.',
'expected_output': 'neutral',
'category': 'neutral'
}
]
test_suite = evaluator.create_test_suite('classification', test_cases)
# Prompt versions to compare
prompt_versions = {
'basic': 'Classify the sentiment as positive, negative, or neutral.',
'detailed': 'Analyze the sentiment of the given text. Consider the emotional tone, word choice, and overall message. Classify as positive, negative, or neutral.',
'structured': '''Task: Sentiment Classification
Instructions:
1. Read the text carefully
2. Identify emotional indicators
3. Consider context and nuance
4. Classify as positive, negative, or neutral
5. Provide brief reasoning
Text:'''
}
# Compare prompt versions
comparison = evaluator.compare_prompt_versions(prompt_versions, test_suite)
print("Prompt Comparison Results:")
print(comparison['comparison_table'].round(3))
print(f"\nOverall Winner: {comparison['overall_winner']}")
print(f"Best versions by metric: {comparison['best_versions']}")
# Statistical significance test
basic_scores = [0.7, 0.8, 0.6, 0.9, 0.7]
detailed_scores = [0.8, 0.9, 0.8, 0.9, 0.8]
sig_test = evaluator.statistical_significance_test(basic_scores, detailed_scores)
print(f"\nStatistical Significance Test:")
print(f"P-value: {sig_test['p_value']:.4f}")
print(f"Significant: {sig_test['significant']}")
print(f"Effect size: {sig_test['effect_size']:.3f} ({sig_test['interpretation']})")
demonstrate_prompt_evaluation()
Conclusion
Prompt engineering is a critical skill for maximizing the effectiveness of large language models. By understanding fundamental principles, mastering advanced techniques, and implementing systematic evaluation approaches, you can significantly improve model performance across various tasks.
Next Steps
- Practice with different prompt types and techniques
- Experiment with prompt optimization methods
- Build automated prompt testing pipelines (a minimal sketch follows this list)
- Study task-specific prompting strategies
- Learn about fine-tuning as an alternative to prompting
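For the automated prompt-testing item above, here is a minimal regression-test sketch; pytest is an assumed choice of runner, the prompt and cases are illustrative, and run_prompt is the hypothetical model-call helper sketched earlier in this guide.

import pytest  # assumed test runner

SENTIMENT_PROMPT = (
    "Classify the sentiment of the text as positive, negative, or neutral.\n\n"
    "Text: {text}\nSentiment:"
)
CASES = [
    ("I love this new restaurant!", "positive"),
    ("It broke after one day.", "negative"),
]

@pytest.mark.parametrize("text,expected", CASES)
def test_sentiment_prompt(text, expected):
    # run_prompt: the hypothetical prompt -> response helper from earlier
    response = run_prompt(SENTIMENT_PROMPT.format(text=text))
    assert expected in response.lower()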