Prompt Engineering: Mastering Communication with Large Language Models

Prompt engineering is the art and science of crafting effective inputs to guide large language models toward desired outputs. This comprehensive guide covers fundamental principles, advanced techniques, and systematic approaches to prompt optimization.

Fundamentals of Prompt Engineering

Core Principles

  1. Clarity: Clear, unambiguous instructions
  2. Context: Relevant background information
  3. Specificity: Precise requirements and constraints
  4. Structure: Logical organization of information
  5. Examples: Demonstrations of desired behavior
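
A short prompt that applies all five principles at once (an illustrative sketch, not a prescribed wording):

You are a customer-support analyst reviewing product feedback (context). Classify the review below as positive, negative, or neutral (clarity, specificity). Reply with a single lowercase label followed by one sentence of justification (structure, output constraint). Example: "Great battery life!" -> positive (example).
Review: "The screen scratches far too easily."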

Basic Prompt Structure

import numpy as np
import pandas as pd
from typing import List, Dict
from dataclasses import dataclass
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

@dataclass
class PromptTemplate:
    """Structure for organizing prompt components"""
    system_message: str = ""
    context: str = ""
    instruction: str = ""
    examples: List[Dict[str, str]] = None
    constraints: List[str] = None
    output_format: str = ""

    def __post_init__(self):
        if self.examples is None:
            self.examples = []
        if self.constraints is None:
            self.constraints = []

    def build_prompt(self) -> str:
        """Build complete prompt from components"""
        prompt_parts = []

        # System message
        if self.system_message:
            prompt_parts.append(f"System: {self.system_message}")

        # Context
        if self.context:
            prompt_parts.append(f"Context: {self.context}")

        # Examples (few-shot)
        if self.examples:
            prompt_parts.append("Examples:")
            for i, example in enumerate(self.examples, 1):
                prompt_parts.append(f"Example {i}:")
                for key, value in example.items():
                    prompt_parts.append(f"{key}: {value}")

        # Constraints
        if self.constraints:
            prompt_parts.append("Constraints:")
            for constraint in self.constraints:
                prompt_parts.append(f"- {constraint}")

        # Output format
        if self.output_format:
            prompt_parts.append(f"Output format: {self.output_format}")

        # Main instruction
        if self.instruction:
            prompt_parts.append(f"Task: {self.instruction}")

        return "\n\n".join(prompt_parts)

class PromptEngineer:
    def __init__(self, model_name="gpt-3.5-turbo"):
        self.model_name = model_name
        self.conversation_history = []
        self.prompt_templates = {}

        # Initialize sentence transformer for similarity calculations
        try:
            self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception:
            print("Warning: SentenceTransformer model could not be loaded. Similarity-based features will be limited.")
            self.sentence_model = None

    def create_basic_prompt(self, task: str, context: str = "", examples: List[str] = None) -> str:
        """Create a basic prompt with task, context, and examples"""
        prompt_parts = []

        if context:
            prompt_parts.append(f"Context: {context}")

        if examples:
            prompt_parts.append("Examples:")
            for i, example in enumerate(examples, 1):
                prompt_parts.append(f"{i}. {example}")

        prompt_parts.append(f"Task: {task}")

        return "\n\n".join(prompt_parts)

    def zero_shot_prompt(self, task: str, context: str = "") -> str:
        """Create zero-shot prompt"""
        template = PromptTemplate(
            instruction=task,
            context=context
        )
        return template.build_prompt()

    def few_shot_prompt(self, task: str, examples: List[Dict[str, str]], context: str = "") -> str:
        """Create few-shot prompt with examples"""
        template = PromptTemplate(
            instruction=task,
            context=context,
            examples=examples
        )
        return template.build_prompt()

    def chain_of_thought_prompt(self, task: str, examples: List[Dict[str, str]] = None) -> str:
        """Create chain-of-thought prompt for reasoning tasks"""
        cot_instruction = f"""
        {task}

        Let's think step by step to solve this problem:
        1. First, identify the key information
        2. Then, break down the problem into smaller parts
        3. Solve each part systematically
        4. Combine the results for the final answer

        Please show your reasoning process clearly.
        """

        template = PromptTemplate(
            instruction=cot_instruction,
            examples=examples or []
        )
        return template.build_prompt()

    def role_based_prompt(self, task: str, role: str, context: str = "") -> str:
        """Create role-based prompt"""
        system_message = f"You are a {role}. Respond in character with appropriate expertise and tone."

        template = PromptTemplate(
            system_message=system_message,
            instruction=task,
            context=context
        )
        return template.build_prompt()
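
    # Sketch: one way to put the sentence-transformer initialised above to work.
    # Ranks candidate few-shot examples by embedding similarity to the task so
    # the most relevant ones can be passed to few_shot_prompt(). Illustrative
    # only; adapt the ranking to your own data.
    def select_relevant_examples(self, task: str, candidate_examples: List[Dict[str, str]],
                                 k: int = 3) -> List[Dict[str, str]]:
        """Return the k candidate examples most similar to the task description."""
        if not candidate_examples:
            return []
        if self.sentence_model is None:
            # Embedding model unavailable; fall back to the first k examples
            return candidate_examples[:k]

        # Embed the task and the text of each candidate example
        example_texts = [" ".join(example.values()) for example in candidate_examples]
        embeddings = self.sentence_model.encode([task] + example_texts)

        # Cosine similarity between the task embedding and each example embedding
        similarities = cosine_similarity(embeddings[:1], embeddings[1:])[0]

        # Keep the k most similar examples, highest similarity first
        top_indices = np.argsort(similarities)[::-1][:k]
        return [candidate_examples[i] for i in top_indices]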

# Example usage
def demonstrate_basic_prompting():
    engineer = PromptEngineer()

    # Zero-shot example
    task = "Classify the sentiment of this text as positive, negative, or neutral"
    context = "Text: 'I love this new restaurant! The food is amazing.'"

    zero_shot = engineer.zero_shot_prompt(task, context)
    print("Zero-shot prompt:")
    print(zero_shot)
    print("\n" + "="*50 + "\n")

    # Few-shot example
    examples = [
        {
            "Input": "The movie was terrible and boring.",
            "Output": "negative"
        },
        {
            "Input": "It's an okay product, nothing special.",
            "Output": "neutral"
        },
        {
            "Input": "Absolutely fantastic experience!",
            "Output": "positive"
        }
    ]

    few_shot = engineer.few_shot_prompt(task, examples, context)
    print("Few-shot prompt:")
    print(few_shot)
    print("\n" + "="*50 + "\n")

    # Chain-of-thought example
    math_task = "Solve: If a train travels 120 miles in 2 hours, and then 180 miles in 3 hours, what is its average speed for the entire journey?"

    cot_examples = [
        {
            "Problem": "A car travels 60 miles in 1 hour, then 90 miles in 2 hours. What's the average speed?",
            "Solution": "Step 1: Total distance = 60 + 90 = 150 miles\nStep 2: Total time = 1 + 2 = 3 hours\nStep 3: Average speed = 150 ÷ 3 = 50 mph"
        }
    ]

    cot_prompt = engineer.chain_of_thought_prompt(math_task, cot_examples)
    print("Chain-of-thought prompt:")
    print(cot_prompt)

demonstrate_basic_prompting()

Advanced Prompting Techniques

Chain-of-Thought Reasoning

class ChainOfThoughtPrompting:
    def __init__(self):
        self.reasoning_patterns = {
            'mathematical': [
                "Identify the given information",
                "Determine what needs to be found",
                "Choose the appropriate formula or method",
                "Perform the calculations step by step",
                "Verify the answer makes sense"
            ],
            'logical': [
                "Identify the premises",
                "Determine the logical structure",
                "Apply logical rules",
                "Draw conclusions",
                "Check for consistency"
            ],
            'analytical': [
                "Break down the problem into components",
                "Analyze each component separately",
                "Identify relationships between components",
                "Synthesize findings",
                "Draw overall conclusions"
            ]
        }

    def create_cot_prompt(self, problem: str, reasoning_type: str = 'analytical') -> str:
        """Create chain-of-thought prompt with structured reasoning"""
        steps = self.reasoning_patterns.get(reasoning_type, self.reasoning_patterns['analytical'])

        prompt = f"""
Problem: {problem}

Let's solve this step by step:

"""
        for i, step in enumerate(steps, 1):
            prompt += f"Step {i}: {step}\n"

        prompt += "\nNow, let's work through each step:\n"

        return prompt

    def self_consistency_prompting(self, problem: str, num_paths: int = 3) -> List[str]:
        """Generate multiple reasoning paths for self-consistency"""
        base_prompt = f"""
Problem: {problem}

Let's think about this problem from different angles and solve it step by step.
"""

        prompts = []
        reasoning_approaches = [
            "Approach 1: Start with the most obvious facts and build up",
            "Approach 2: Work backwards from what we want to find",
            "Approach 3: Consider alternative interpretations and methods"
        ]

        for i in range(min(num_paths, len(reasoning_approaches))):
            prompt = base_prompt + f"\n{reasoning_approaches[i]}\n"
            prompts.append(prompt)

        return prompts

    def tree_of_thoughts_prompt(self, problem: str) -> str:
        """Create tree-of-thoughts prompt for complex reasoning"""
        prompt = f"""
Problem: {problem}

Let's explore this problem using a tree of thoughts approach:

1. Generate multiple possible approaches:
   - Approach A: [Describe first approach]
   - Approach B: [Describe second approach]
   - Approach C: [Describe third approach]

2. For each approach, consider:
   - What are the key steps?
   - What assumptions are we making?
   - What could go wrong?
   - How confident are we in this path?

3. Evaluate and compare approaches:
   - Which approach seems most promising?
   - Can we combine insights from different approaches?
   - What additional information would help?

4. Execute the best approach:
   - Work through the chosen method step by step
   - Double-check each step
   - Verify the final answer

Let's begin:
"""
        return prompt

# Example usage
def demonstrate_advanced_reasoning():
    cot = ChainOfThoughtPrompting()

    # Mathematical reasoning
    math_problem = "A store offers a 20% discount on all items. If an item originally costs $80, and there's an additional 5% tax on the discounted price, what is the final amount a customer pays?"

    math_prompt = cot.create_cot_prompt(math_problem, 'mathematical')
    print("Mathematical Chain-of-Thought:")
    print(math_prompt)
    print("\n" + "="*50 + "\n")

    # Self-consistency prompting
    logic_problem = "All birds can fly. Penguins are birds. Can penguins fly? Explain the logical issue with this reasoning."

    consistency_prompts = cot.self_consistency_prompting(logic_problem)
    print("Self-Consistency Prompts:")
    for i, prompt in enumerate(consistency_prompts, 1):
        print(f"Path {i}:")
        print(prompt)
        print("-" * 30)

    # Tree of thoughts
    complex_problem = "Design a system to reduce traffic congestion in a major city while considering environmental impact, cost, and citizen satisfaction."

    tot_prompt = cot.tree_of_thoughts_prompt(complex_problem)
    print("Tree-of-Thoughts Prompt:")
    print(tot_prompt)

demonstrate_advanced_reasoning()
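
Generating several reasoning paths is only half of self-consistency: the technique then samples one answer per path and keeps the answer the paths agree on most often. A minimal aggregation sketch, assuming you have already collected one final answer string per reasoning path from your model:

from collections import Counter

def self_consistency_vote(answers: List[str]) -> str:
    """Majority vote over the final answers from different reasoning paths."""
    normalized = [answer.strip().lower() for answer in answers if answer.strip()]
    if not normalized:
        return ""
    return Counter(normalized).most_common(1)[0][0]

# Example: three paths that mostly agree on the train's average speed
print(self_consistency_vote(["60 mph", "60 mph", "58 mph"]))  # -> "60 mph"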

Prompt Optimization Techniques

class PromptOptimizer:
    def __init__(self):
        self.optimization_history = []
        self.performance_metrics = {}

    def iterative_refinement(self, base_prompt: str, test_cases: List[Dict],
                           max_iterations: int = 5) -> str:
        """Iteratively refine prompt based on test case performance"""
        current_prompt = base_prompt
        best_prompt = base_prompt
        best_score = 0

        for iteration in range(max_iterations):
            # Evaluate current prompt
            score = self.evaluate_prompt(current_prompt, test_cases)

            if score > best_score:
                best_score = score
                best_prompt = current_prompt

            # Generate refinement suggestions
            refinements = self.suggest_refinements(current_prompt, test_cases)

            # Apply best refinement
            if refinements:
                current_prompt = self.apply_refinement(current_prompt, refinements[0])

            self.optimization_history.append({
                'iteration': iteration,
                'prompt': current_prompt,
                'score': score
            })

        return best_prompt

    def evaluate_prompt(self, prompt: str, test_cases: List[Dict]) -> float:
        """Evaluate prompt performance on test cases"""
        # Simplified evaluation - in practice, you'd use actual model responses
        score = 0
        total_cases = len(test_cases)

        for case in test_cases:
            # Check if prompt contains relevant keywords for the task
            task_keywords = case.get('keywords', [])
            prompt_lower = prompt.lower()

            keyword_score = sum(1 for keyword in task_keywords if keyword.lower() in prompt_lower)
            case_score = min(keyword_score / len(task_keywords), 1.0) if task_keywords else 0.5

            score += case_score

        return score / total_cases if total_cases > 0 else 0

    def suggest_refinements(self, prompt: str, test_cases: List[Dict]) -> List[str]:
        """Suggest prompt refinements based on test case analysis"""
        refinements = []

        # Analyze common patterns in test cases
        all_keywords = []
        for case in test_cases:
            all_keywords.extend(case.get('keywords', []))

        # Find missing important keywords
        prompt_lower = prompt.lower()
        missing_keywords = [kw for kw in set(all_keywords) if kw.lower() not in prompt_lower]

        if missing_keywords:
            refinements.append(f"Add context about: {', '.join(missing_keywords[:3])}")

        # Suggest structure improvements
        if "step by step" not in prompt_lower and any("reasoning" in str(case) for case in test_cases):
            refinements.append("Add 'Let's think step by step' for better reasoning")

        if "example" not in prompt_lower:
            refinements.append("Consider adding examples to clarify the task")

        return refinements

    def apply_refinement(self, prompt: str, refinement: str) -> str:
        """Apply a specific refinement to the prompt"""
        if "Add context about:" in refinement:
            keywords = refinement.split("Add context about: ")[1]
            return f"{prompt}\n\nAdditional context: Consider {keywords} when responding."

        elif "step by step" in refinement:
            return f"{prompt}\n\nLet's think step by step to ensure accuracy."

        elif "examples" in refinement:
            return f"{prompt}\n\nPlease provide examples to illustrate your response when appropriate."

        return prompt

    def a_b_test_prompts(self, prompt_a: str, prompt_b: str, test_cases: List[Dict]) -> Dict:
        """A/B test two prompts"""
        score_a = self.evaluate_prompt(prompt_a, test_cases)
        score_b = self.evaluate_prompt(prompt_b, test_cases)

        return {
            'prompt_a_score': score_a,
            'prompt_b_score': score_b,
            'winner': 'A' if score_a > score_b else 'B',
            'improvement': abs(score_a - score_b)
        }

    def genetic_prompt_optimization(self, base_prompts: List[str], test_cases: List[Dict],
                                  generations: int = 5, population_size: int = 10) -> str:
        """Use genetic algorithm approach for prompt optimization"""
        population = base_prompts[:population_size]

        # Fill population if needed
        while len(population) < population_size:
            population.append(self.mutate_prompt(population[0]))

        for generation in range(generations):
            # Evaluate fitness
            fitness_scores = [(prompt, self.evaluate_prompt(prompt, test_cases))
                            for prompt in population]

            # Sort by fitness
            fitness_scores.sort(key=lambda x: x[1], reverse=True)

            # Select top performers
            top_half = [prompt for prompt, score in fitness_scores[:population_size//2]]

            # Generate new population
            new_population = top_half.copy()

            # Crossover and mutation
            while len(new_population) < population_size:
                parent1, parent2 = np.random.choice(top_half, 2, replace=False)
                child = self.crossover_prompts(parent1, parent2)
                child = self.mutate_prompt(child)
                new_population.append(child)

            population = new_population

        # Return best prompt
        final_scores = [(prompt, self.evaluate_prompt(prompt, test_cases))
                       for prompt in population]
        best_prompt = max(final_scores, key=lambda x: x[1])[0]

        return best_prompt

    def mutate_prompt(self, prompt: str) -> str:
        """Apply random mutations to a prompt"""
        mutations = [
            lambda p: p + "\n\nPlease be specific and detailed in your response.",
            lambda p: p + "\n\nConsider multiple perspectives when answering.",
            lambda p: p.replace(".", ". Think carefully about this."),
            lambda p: f"Important: {p}",
            lambda p: p + "\n\nProvide reasoning for your answer."
        ]

        mutation = np.random.choice(mutations)
        return mutation(prompt)

    def crossover_prompts(self, prompt1: str, prompt2: str) -> str:
        """Combine two prompts to create a new one"""
        sentences1 = prompt1.split('.')
        sentences2 = prompt2.split('.')

        # Randomly select sentences from each prompt
        combined_sentences = []
        max_len = max(len(sentences1), len(sentences2))

        for i in range(max_len):
            if i < len(sentences1) and i < len(sentences2):
                chosen = sentences1[i] if np.random.random() < 0.5 else sentences2[i]
            elif i < len(sentences1):
                chosen = sentences1[i]
            else:
                chosen = sentences2[i]

            combined_sentences.append(chosen)

        return '.'.join(combined_sentences)

# Example usage
def demonstrate_prompt_optimization():
    optimizer = PromptOptimizer()

    # Test cases for sentiment analysis
    test_cases = [
        {
            'input': 'I love this product!',
            'expected_output': 'positive',
            'keywords': ['sentiment', 'emotion', 'positive', 'negative']
        },
        {
            'input': 'This is terrible.',
            'expected_output': 'negative',
            'keywords': ['sentiment', 'emotion', 'positive', 'negative']
        },
        {
            'input': 'It\'s okay, nothing special.',
            'expected_output': 'neutral',
            'keywords': ['sentiment', 'emotion', 'neutral']
        }
    ]

    # Base prompts to optimize
    base_prompts = [
        "Classify the sentiment of the given text.",
        "Determine if the text expresses positive, negative, or neutral sentiment.",
        "Analyze the emotional tone of the text and categorize it."
    ]

    print("Original prompts:")
    for i, prompt in enumerate(base_prompts):
        score = optimizer.evaluate_prompt(prompt, test_cases)
        print(f"{i+1}. {prompt} (Score: {score:.2f})")

    # A/B test
    ab_result = optimizer.a_b_test_prompts(base_prompts[0], base_prompts[1], test_cases)
    print(f"\nA/B Test Result: Prompt {ab_result['winner']} wins with improvement of {ab_result['improvement']:.2f}")

    # Iterative refinement
    refined_prompt = optimizer.iterative_refinement(base_prompts[0], test_cases)
    print(f"\nRefined prompt: {refined_prompt}")

    # Genetic optimization
    best_prompt = optimizer.genetic_prompt_optimization(base_prompts, test_cases, generations=3)
    print(f"\nGenetically optimized prompt: {best_prompt}")

demonstrate_prompt_optimization()
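
The keyword-overlap scoring in evaluate_prompt is only a stand-in. In practice you would run each test input through the model with the candidate prompt and score the real responses. A minimal sketch using the OpenAI Python client (assumes the openai package >= 1.0 is installed and an API key is configured; adapt it to whichever model provider you use):

from openai import OpenAI

def evaluate_prompt_with_model(prompt: str, test_cases: List[Dict],
                               model: str = "gpt-3.5-turbo") -> float:
    """Score a prompt by checking real model responses against expected outputs."""
    client = OpenAI()
    correct = 0
    for case in test_cases:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": case['input']},
            ],
            temperature=0,
        )
        answer = response.choices[0].message.content.strip().lower()
        if case.get('expected_output', '').lower() in answer:
            correct += 1
    return correct / len(test_cases) if test_cases else 0.0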

Specialized Prompting Strategies

Task-Specific Prompting

class TaskSpecificPrompting:
    def __init__(self):
        self.task_templates = {
            'classification': self.classification_prompt,
            'summarization': self.summarization_prompt,
            'question_answering': self.qa_prompt,
            'code_generation': self.code_generation_prompt,
            'creative_writing': self.creative_writing_prompt,
            'data_analysis': self.data_analysis_prompt
        }

    def classification_prompt(self, text: str, categories: List[str],
                            context: str = "") -> str:
        """Create classification prompt"""
        categories_str = ", ".join(categories)

        prompt = f"""
Task: Text Classification

{f"Context: {context}" if context else ""}

Categories: {categories_str}

Text to classify: "{text}"

Instructions:
1. Read the text carefully
2. Consider the context and meaning
3. Choose the most appropriate category
4. Provide a brief explanation for your choice

Classification:"""

        return prompt

    def summarization_prompt(self, text: str, length: str = "medium",
                           style: str = "neutral") -> str:
        """Create summarization prompt"""
        length_instructions = {
            "short": "in 1-2 sentences",
            "medium": "in 3-5 sentences",
            "long": "in 1-2 paragraphs"
        }

        style_instructions = {
            "neutral": "using objective, factual language",
            "casual": "using conversational, accessible language",
            "formal": "using professional, academic language",
            "bullet": "using bullet points for key information"
        }

        prompt = f"""
Task: Text Summarization

Instructions:
- Summarize the following text {length_instructions.get(length, "concisely")}
- Use {style_instructions.get(style, "clear and concise language")}
- Capture the main ideas and key points
- Maintain the original meaning and context

Text to summarize:
{text}

Summary:"""

        return prompt

    def qa_prompt(self, context: str, question: str, answer_type: str = "comprehensive") -> str:
        """Create question-answering prompt"""
        answer_instructions = {
            "brief": "Provide a concise, direct answer",
            "comprehensive": "Provide a detailed, thorough answer with explanations",
            "step_by_step": "Break down the answer into clear steps",
            "examples": "Include relevant examples to illustrate your answer"
        }

        prompt = f"""
Task: Question Answering

Context:
{context}

Question: {question}

Instructions:
- {answer_instructions.get(answer_type, "Provide a clear and accurate answer")}
- Base your answer on the provided context
- If the context doesn't contain enough information, state this clearly
- Cite specific parts of the context when relevant

Answer:"""

        return prompt

    def code_generation_prompt(self, task: str, language: str = "Python",
                             requirements: List[str] = None) -> str:
        """Create code generation prompt"""
        requirements = requirements or []

        prompt = f"""
Task: Code Generation

Programming Language: {language}

Task Description: {task}

Requirements:
"""

        if requirements:
            for req in requirements:
                prompt += f"- {req}\n"
        else:
            prompt += "- Write clean, readable code\n- Include appropriate comments\n- Handle edge cases\n"

        prompt += f"""
Additional Instructions:
- Follow {language} best practices and conventions
- Include error handling where appropriate
- Provide example usage if applicable
- Explain complex logic with comments

Code:
```{language.lower()}
"""

        return prompt

    def creative_writing_prompt(self, genre: str, theme: str, length: str = "short",
                              constraints: List[str] = None) -> str:
        """Create creative writing prompt"""
        constraints = constraints or []

        length_guide = {
            "flash": "under 100 words",
            "short": "200-500 words",
            "medium": "500-1000 words",
            "long": "1000+ words"
        }

        prompt = f"""
Task: Creative Writing

Genre: {genre}
Theme: {theme}
Target Length: {length_guide.get(length, "as appropriate")}

"""

        if constraints:
            prompt += "Constraints:\n"
            for constraint in constraints:
                prompt += f"- {constraint}\n"
            prompt += "\n"

        prompt += f"""
Instructions:
- Create an engaging {genre} piece centered around the theme of {theme}
- Develop compelling characters and/or scenarios
- Use vivid, descriptive language
- Maintain consistency in tone and style
- Create a satisfying narrative arc

Story:"""

        return prompt

    def data_analysis_prompt(self, data_description: str, analysis_type: str,
                           specific_questions: List[str] = None) -> str:
        """Create data analysis prompt"""
        specific_questions = specific_questions or []

        analysis_instructions = {
            "descriptive": "Provide descriptive statistics and summarize key patterns",
            "exploratory": "Explore relationships and identify interesting insights",
            "diagnostic": "Investigate causes and explain observed patterns",
            "predictive": "Identify trends and make predictions based on the data"
        }

        prompt = f"""
Task: Data Analysis

Data Description: {data_description}

Analysis Type: {analysis_type}

Objective: {analysis_instructions.get(analysis_type, "Analyze the data comprehensively")}

"""

        if specific_questions:
            prompt += "Specific Questions to Address:\n"
            for i, question in enumerate(specific_questions, 1):
                prompt += f"{i}. {question}\n"
            prompt += "\n"

        prompt += """
Instructions:
- Examine the data systematically
- Identify key patterns, trends, and anomalies
- Provide statistical evidence for your findings
- Suggest actionable insights or recommendations
- Highlight any limitations or assumptions

Analysis:"""

        return prompt

    def get_task_prompt(self, task_type: str, **kwargs) -> str:
        """Get prompt for specific task type"""
        if task_type in self.task_templates:
            return self.task_templates[task_type](**kwargs)
        else:
            raise ValueError(f"Unknown task type: {task_type}")

# Example usage
def demonstrate_task_specific_prompting():
    task_prompter = TaskSpecificPrompting()

    # Classification example
    classification_prompt = task_prompter.classification_prompt(
        text="The new smartphone has an amazing camera and long battery life, but the price is quite high.",
        categories=["positive", "negative", "neutral"],
        context="Product review analysis"
    )
    print("Classification Prompt:")
    print(classification_prompt)
    print("\n" + "="*50 + "\n")

    # Code generation example
    code_prompt = task_prompter.code_generation_prompt(
        task="Create a function to calculate the factorial of a number",
        language="Python",
        requirements=[
            "Handle negative numbers appropriately",
            "Use recursion",
            "Include input validation"
        ]
    )
    print("Code Generation Prompt:")
    print(code_prompt)
    print("\n" + "="*50 + "\n")

    # Creative writing example
    creative_prompt = task_prompter.creative_writing_prompt(
        genre="science fiction",
        theme="artificial intelligence",
        length="short",
        constraints=[
            "Set in the year 2050",
            "Include a moral dilemma",
            "First-person perspective"
        ]
    )
    print("Creative Writing Prompt:")
    print(creative_prompt)

demonstrate_task_specific_prompting()
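
The demonstration above calls each template method directly; the get_task_prompt dispatcher produces the same prompts through a single entry point, which is convenient when the task type is only known at runtime. A short usage example:

task_prompter = TaskSpecificPrompting()

summary_prompt = task_prompter.get_task_prompt(
    'summarization',
    text="Large language models can follow natural-language instructions without task-specific training.",
    length="short",
    style="bullet"
)
print(summary_prompt)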

Prompt Evaluation and Testing

Systematic Evaluation Framework

class PromptEvaluator:
    def __init__(self):
        self.evaluation_metrics = {}
        self.test_results = []

    def create_test_suite(self, task_type: str, test_cases: List[Dict]) -> Dict:
        """Create comprehensive test suite for prompt evaluation"""
        test_suite = {
            'task_type': task_type,
            'test_cases': test_cases,
            'metrics': self.get_metrics_for_task(task_type),
            'created_at': pd.Timestamp.now()
        }
        return test_suite

    def get_metrics_for_task(self, task_type: str) -> List[str]:
        """Get appropriate metrics for different task types"""
        task_metrics = {
            'classification': ['accuracy', 'precision', 'recall', 'f1_score'],
            'generation': ['fluency', 'coherence', 'relevance', 'creativity'],
            'summarization': ['coverage', 'conciseness', 'faithfulness'],
            'qa': ['correctness', 'completeness', 'clarity'],
            'reasoning': ['logical_consistency', 'step_completeness', 'conclusion_validity']
        }
        return task_metrics.get(task_type, ['quality', 'relevance', 'clarity'])

    def evaluate_response_quality(self, response: str, expected: str = None,
                                criteria: List[str] = None) -> Dict[str, float]:
        """Evaluate response quality using multiple criteria"""
        criteria = criteria or ['clarity', 'relevance', 'completeness']
        scores = {}

        # Length-based metrics
        scores['length_score'] = min(len(response.split()) / 50, 1.0)  # Normalize to 50 words

        # Clarity (simplified - based on sentence structure)
        sentences = response.split('.')
        avg_sentence_length = np.mean([len(s.split()) for s in sentences if s.strip()])
        scores['clarity'] = max(0, 1 - abs(avg_sentence_length - 15) / 15)  # Optimal ~15 words

        # Relevance (keyword matching if expected answer provided)
        if expected:
            response_words = set(response.lower().split())
            expected_words = set(expected.lower().split())
            overlap = len(response_words.intersection(expected_words))
            scores['relevance'] = overlap / len(expected_words) if expected_words else 0
        else:
            scores['relevance'] = 0.8  # Default score when no expected answer

        # Completeness (based on response length and structure)
        has_intro = any(word in response.lower() for word in ['first', 'initially', 'to begin'])
        has_body = len(response.split()) > 20
        has_conclusion = any(word in response.lower() for word in ['conclusion', 'summary', 'finally'])
        scores['completeness'] = (has_intro + has_body + has_conclusion) / 3

        return scores

    def batch_evaluate_prompts(self, prompts: List[str], test_suite: Dict) -> pd.DataFrame:
        """Evaluate multiple prompts on a test suite"""
        results = []

        for i, prompt in enumerate(prompts):
            prompt_results = {
                'prompt_id': i,
                'prompt': prompt[:100] + "..." if len(prompt) > 100 else prompt
            }

            # Evaluate on each test case
            case_scores = []
            for case in test_suite['test_cases']:
                # Simulate model response (in practice, you'd call the actual model)
                simulated_response = self.simulate_model_response(prompt, case['input'])

                # Evaluate response
                scores = self.evaluate_response_quality(
                    simulated_response,
                    case.get('expected_output'),
                    test_suite['metrics']
                )
                case_scores.append(scores)

            # Aggregate scores across test cases. The simplified evaluator returns
            # generic quality scores (clarity, relevance, ...), so aggregate the keys
            # it actually produced rather than assuming the task-specific metric names.
            for metric in case_scores[0]:
                prompt_results[metric] = np.mean([scores.get(metric, 0) for scores in case_scores])

            # Overall score across the aggregated quality metrics
            prompt_results['overall_score'] = np.mean([
                prompt_results[metric] for metric in case_scores[0]
            ])

            results.append(prompt_results)

        return pd.DataFrame(results)

    def simulate_model_response(self, prompt: str, input_text: str) -> str:
        """Simulate model response for testing purposes"""
        # This is a simplified simulation - replace with actual model calls
        combined_input = f"{prompt}\n\nInput: {input_text}"

        # Simple response generation based on prompt keywords
        if 'classify' in prompt.lower():
            return "positive"  # Simplified classification
        elif 'summarize' in prompt.lower():
            return f"Summary of the input: {input_text[:50]}..."
        elif 'question' in prompt.lower():
            return f"Based on the context, the answer is related to {input_text[:30]}..."
        else:
            return f"Response to: {input_text[:50]}..."

    def compare_prompt_versions(self, prompt_versions: Dict[str, str],
                              test_suite: Dict) -> Dict:
        """Compare different versions of a prompt"""
        comparison_results = {}

        for version_name, prompt in prompt_versions.items():
            results = self.batch_evaluate_prompts([prompt], test_suite)
            comparison_results[version_name] = results.iloc[0].to_dict()

        # Create comparison DataFrame; infer numeric dtypes so rounding and idxmax work
        comparison_df = pd.DataFrame(comparison_results).T.infer_objects()

        # Find best performing version for each numeric metric column
        best_versions = {}
        numeric_columns = comparison_df.select_dtypes(include='number').columns
        for metric in numeric_columns:
            if metric not in ('prompt_id', 'overall_score'):
                best_versions[metric] = comparison_df[metric].idxmax()

        return {
            'comparison_table': comparison_df,
            'best_versions': best_versions,
            'overall_winner': comparison_df['overall_score'].idxmax()
        }

    def statistical_significance_test(self, results_a: List[float],
                                    results_b: List[float]) -> Dict:
        """Test statistical significance between two prompt versions"""
        from scipy import stats

        # Perform t-test
        t_stat, p_value = stats.ttest_ind(results_a, results_b)

        # Calculate effect size (Cohen's d) using sample variances (ddof=1)
        pooled_std = np.sqrt(((len(results_a) - 1) * np.var(results_a, ddof=1) +
                              (len(results_b) - 1) * np.var(results_b, ddof=1)) /
                             (len(results_a) + len(results_b) - 2))
        cohens_d = (np.mean(results_a) - np.mean(results_b)) / pooled_std

        return {
            't_statistic': t_stat,
            'p_value': p_value,
            'significant': p_value < 0.05,
            'effect_size': cohens_d,
            'interpretation': self.interpret_effect_size(cohens_d)
        }

    def interpret_effect_size(self, cohens_d: float) -> str:
        """Interpret Cohen's d effect size"""
        abs_d = abs(cohens_d)
        if abs_d < 0.2:
            return "negligible"
        elif abs_d < 0.5:
            return "small"
        elif abs_d < 0.8:
            return "medium"
        else:
            return "large"

# Example usage
def demonstrate_prompt_evaluation():
    evaluator = PromptEvaluator()

    # Create test suite
    test_cases = [
        {
            'input': 'The movie was fantastic with great acting and plot.',
            'expected_output': 'positive',
            'category': 'clear_positive'
        },
        {
            'input': 'I hate this product, it broke after one day.',
            'expected_output': 'negative',
            'category': 'clear_negative'
        },
        {
            'input': 'The service was okay, nothing special but not bad either.',
            'expected_output': 'neutral',
            'category': 'neutral'
        }
    ]

    test_suite = evaluator.create_test_suite('classification', test_cases)

    # Prompt versions to compare
    prompt_versions = {
        'basic': 'Classify the sentiment as positive, negative, or neutral.',
        'detailed': 'Analyze the sentiment of the given text. Consider the emotional tone, word choice, and overall message. Classify as positive, negative, or neutral.',
        'structured': '''Task: Sentiment Classification

Instructions:
1. Read the text carefully
2. Identify emotional indicators
3. Consider context and nuance
4. Classify as positive, negative, or neutral
5. Provide brief reasoning

Text:'''
    }

    # Compare prompt versions
    comparison = evaluator.compare_prompt_versions(prompt_versions, test_suite)

    print("Prompt Comparison Results:")
    print(comparison['comparison_table'].round(3))
    print(f"\nOverall Winner: {comparison['overall_winner']}")
    print(f"Best versions by metric: {comparison['best_versions']}")

    # Statistical significance test
    basic_scores = [0.7, 0.8, 0.6, 0.9, 0.7]
    detailed_scores = [0.8, 0.9, 0.8, 0.9, 0.8]

    sig_test = evaluator.statistical_significance_test(basic_scores, detailed_scores)
    print(f"\nStatistical Significance Test:")
    print(f"P-value: {sig_test['p_value']:.4f}")
    print(f"Significant: {sig_test['significant']}")
    print(f"Effect size: {sig_test['effect_size']:.3f} ({sig_test['interpretation']})")

demonstrate_prompt_evaluation()
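
The hard-coded score lists above stand in for real measurements; with only a handful of test cases a t-test has little power, so in practice you would collect one score per test case (or per repeated run) for each prompt version. A sketch that reuses the evaluator's own scoring:

def per_case_scores(evaluator: PromptEvaluator, prompt: str, test_cases: List[Dict]) -> List[float]:
    """Collect one overall quality score per test case for a given prompt."""
    scores = []
    for case in test_cases:
        response = evaluator.simulate_model_response(prompt, case['input'])
        quality = evaluator.evaluate_response_quality(response, case.get('expected_output'))
        scores.append(float(np.mean(list(quality.values()))))
    return scores

# Inside demonstrate_prompt_evaluation(), the hard-coded lists could then be replaced with:
#   basic_scores = per_case_scores(evaluator, prompt_versions['basic'], test_cases)
#   detailed_scores = per_case_scores(evaluator, prompt_versions['detailed'], test_cases)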

Conclusion

Prompt engineering is a critical skill for maximizing the effectiveness of large language models. By understanding fundamental principles, mastering advanced techniques, and implementing systematic evaluation approaches, you can significantly improve model performance across various tasks.

Next Steps

  1. Practice with different prompt types and techniques
  2. Experiment with prompt optimization methods
  3. Build automated prompt testing pipelines (see the sketch after this list)
  4. Study task-specific prompting strategies
  5. Learn about fine-tuning as an alternative to prompting
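
A minimal automated testing pipeline (the sketch promised in step 3) can be a regression test that fails when a prompt's evaluation score drops below a threshold. The example below assumes the PromptOptimizer class from this article is saved in a module hypothetically named prompt_tools and is run with pytest:

# test_prompts.py -- run with `pytest`
from prompt_tools import PromptOptimizer  # hypothetical module holding the class above

PRODUCTION_PROMPT = "Determine if the text expresses positive, negative, or neutral sentiment."
TEST_CASES = [
    {'input': 'I love this product!', 'expected_output': 'positive',
     'keywords': ['sentiment', 'positive', 'negative', 'neutral']},
    {'input': 'This is terrible.', 'expected_output': 'negative',
     'keywords': ['sentiment', 'positive', 'negative', 'neutral']},
]

def test_production_prompt_meets_threshold():
    """Fail the build if the prompt's keyword-based evaluation score regresses."""
    optimizer = PromptOptimizer()
    score = optimizer.evaluate_prompt(PRODUCTION_PROMPT, TEST_CASES)
    assert score >= 0.7, f"Prompt quality regressed: score={score:.2f}"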