An LLM agent is a language model that takes actions in an environment: calling APIs, executing code, reading files, querying databases. The core challenge is decision-making under uncertainty: the agent observes partial state, selects an action, observes the result, and repeats until the task is complete. This requires reasoning (choosing the right action), grounding (understanding the result), and planning (sequencing actions toward a goal).
ReAct (Yao et al., 2022) interleaves reasoning and acting in a single generation loop. The model generates a thought (reasoning about what to do), an action (the specific tool call), and processes the observation (the tool’s output). Tool use extends this with structured function calling. Multi-step planning adds lookahead via tree search over possible action sequences. Memory systems provide the agent with persistent state across interactions.
This post covers each component with full implementations, performance analysis, and production considerations.
ReAct: Reasoning + Acting
The ReAct Loop
ReAct generates a trace of interleaved thoughts and actions. Each step has three parts:
- Thought: natural language reasoning about what to do next
- Action: a structured tool invocation
- Observation: the tool’s return value, injected back into the context
import json
import re
from dataclasses import dataclass, field
from typing import Any, Callable
@dataclass
class Tool:
    """A callable tool the agent can invoke.

    Attributes:
        name: Identifier the model uses to select the tool.
        description: Human-readable summary shown in the prompt.
        parameters: JSON-schema-style parameter specification.
        function: The Python callable that implements the tool.
    """
    name: str
    description: str
    parameters: dict
    function: Callable

    def to_schema(self):
        """Convert to JSON schema for the model prompt."""
        schema = {"name": self.name}
        schema["description"] = self.description
        schema["parameters"] = self.parameters
        return schema
@dataclass
class AgentStep:
    """One completed step of a ReAct trace.

    Captures the model's reasoning, the tool call it chose, and the
    observation the tool returned.
    """
    thought: str
    action_name: str
    action_input: dict
    observation: str
class ReActAgent:
    """
    ReAct agent: Reasoning + Acting.

    Interleaves chain-of-thought ("Thought:") with tool calls ("Action:" /
    "Action Input:") and feeds each tool result back into the context as an
    "Observation:" message, until the model emits "Final Answer:" or the
    step limit is reached.
    """

    def __init__(self, model, tokenizer, tools, max_steps=10):
        self.model = model
        self.tokenizer = tokenizer
        self.tools = {t.name: t for t in tools}  # tool name -> Tool
        self.max_steps = max_steps

    def _build_system_prompt(self):
        """Build the system prompt with tool descriptions."""
        tool_descriptions = "\n".join(
            f"- {t.name}: {t.description}\n"
            f" Parameters: {json.dumps(t.parameters)}"
            for t in self.tools.values()
        )
        return f"""You are a helpful assistant with access to the following tools:
{tool_descriptions}
To use a tool, respond in this exact format:
Thought: [your reasoning about what to do next]
Action: [tool_name]
Action Input: [JSON object with parameters]
After receiving the observation, continue reasoning.
When you have the final answer, respond with:
Thought: [your final reasoning]
Final Answer: [your answer]
Always think step by step. Do not make up information; use tools to verify."""

    def _parse_response(self, response):
        """Parse a model response.

        Returns a 4-tuple (thought, action_name, action_input, final_answer):
        final answers yield (thought, None, None, answer); tool calls yield
        (thought, name, input_dict, None).
        """
        # Check for final answer
        if "Final Answer:" in response:
            thought = response.split("Final Answer:")[0].strip()
            answer = response.split("Final Answer:")[1].strip()
            return thought, None, None, answer

        # Parse thought
        thought = ""
        if "Thought:" in response:
            thought = response.split("Thought:")[1].split("Action:")[0].strip()

        # Parse action
        action_name = ""
        if "Action:" in response:
            action_line = response.split("Action:")[1].split("\n")[0].strip()
            action_name = action_line.strip()

        # Parse action input
        action_input = {}
        if "Action Input:" in response:
            input_text = response.split("Action Input:")[1].strip()
            # BUG FIX: the old regex r'\{[^}]+\}' stopped at the first '}',
            # so any nested JSON object was truncated and failed to parse.
            # json.JSONDecoder().raw_decode consumes one complete (possibly
            # nested) JSON value starting at the first '{'.
            brace = input_text.find("{")
            if brace != -1:
                try:
                    action_input, _ = json.JSONDecoder().raw_decode(
                        input_text[brace:]
                    )
                except json.JSONDecodeError:
                    # Unparseable input: pass the first line through as raw text.
                    action_input = {"raw": input_text.split("\n")[0]}
        return thought, action_name, action_input, None

    def _execute_action(self, action_name, action_input):
        """Execute a tool and return the observation as a string."""
        if action_name not in self.tools:
            return f"Error: Unknown tool '{action_name}'. Available tools: {list(self.tools.keys())}"
        tool = self.tools[action_name]
        try:
            result = tool.function(**action_input)
            return str(result)
        except Exception as e:
            return f"Error executing {action_name}: {str(e)}"

    def run(self, query):
        """
        Run the ReAct loop until the agent produces a final answer
        or hits the step limit.

        Returns a dict with 'answer', 'steps' (list of AgentStep), and
        'num_steps'.
        """
        # PERF FIX: import torch once per run; the previous version re-ran
        # the import statement on every loop iteration.
        import torch

        system_prompt = self._build_system_prompt()
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ]
        steps = []
        for step_num in range(self.max_steps):
            # Generate model response
            prompt = self._format_messages(messages)
            inputs = self.tokenizer(prompt, return_tensors="pt").to(
                self.model.device
            )
            with torch.no_grad():
                # NOTE(review): recent transformers versions require a
                # `tokenizer=` argument to generate() when `stop_strings`
                # is used -- confirm against the installed release.
                output = self.model.generate(
                    **inputs,
                    max_new_tokens=512,
                    temperature=0.1,
                    do_sample=True,
                    stop_strings=["Observation:"],
                )
            # Decode only the newly generated tokens.
            response = self.tokenizer.decode(
                output[0][inputs['input_ids'].shape[1]:],
                skip_special_tokens=True,
            )

            # Parse response
            thought, action_name, action_input, final_answer = \
                self._parse_response(response)
            if final_answer is not None:
                return {
                    'answer': final_answer,
                    'steps': steps,
                    'num_steps': step_num + 1,
                }

            # Execute action and record the step
            observation = self._execute_action(action_name, action_input)
            steps.append(AgentStep(
                thought=thought,
                action_name=action_name,
                action_input=action_input,
                observation=observation,
            ))
            # Feed the tool result back into the conversation.
            messages.append({
                "role": "assistant",
                "content": response,
            })
            messages.append({
                "role": "user",
                "content": f"Observation: {observation}",
            })
        return {
            'answer': "Reached maximum steps without final answer.",
            'steps': steps,
            'num_steps': self.max_steps,
        }

    def _format_messages(self, messages):
        """Format chat messages into a single prompt string."""
        parts = [f"<|{msg['role']}|>\n{msg['content']}" for msg in messages]
        parts.append("<|assistant|>\n")
        return "\n".join(parts)
Example: Building a Research Agent
# Define tools
def search_web(query):
    """Search the web and return top results (stub implementation)."""
    # Placeholder: swap this stub for a real search API integration.
    fake_results = "[Result 1, Result 2, Result 3]"
    return f"Search results for '{query}': {fake_results}"
def read_url(url):
    """Fetch a URL and return its body, truncated to 2000 characters.

    Returns an "Error: ..." string on any request failure.
    """
    import requests
    try:
        response = requests.get(url, timeout=10)
        # Truncate so a large page cannot flood the agent's context window.
        return response.text[:2000]
    except Exception as e:
        return f"Error: {e}"
def calculate(expression):
    """Evaluate a mathematical expression.

    Only a small whitelist of builtins plus the public names of the
    ``math`` module are available; ``__builtins__`` is disabled. Returns
    the numeric result, or an "Error: ..." string on failure.

    SECURITY NOTE: eval() with restricted globals is NOT a real sandbox --
    expressions like "().__class__.__bases__[0].__subclasses__()" can reach
    arbitrary objects through dunder attributes. Rejecting "__" below is a
    cheap mitigation; do not expose this to untrusted users without a
    proper expression parser.
    """
    try:
        # Block the dunder-attribute sandbox-escape vector.
        if "__" in expression:
            raise ValueError("double underscores are not allowed")
        allowed = {
            'abs': abs, 'round': round, 'min': min, 'max': max,
            'sum': sum, 'len': len, 'pow': pow,
        }
        import math
        # Expose math functions and constants (sqrt, pi, log, ...).
        allowed.update({k: getattr(math, k) for k in dir(math)
                        if not k.startswith('_')})
        return eval(expression, {"__builtins__": {}}, allowed)
    except Exception as e:
        return f"Error: {e}"
def python_exec(code):
    """Execute Python code and return its captured stdout.

    Returns the printed output, a success message when there is no output,
    or an "Error: ..." string if execution raises.

    SECURITY NOTE: exec() on model-generated code is arbitrary code
    execution; in production run this only inside a sandbox (container/VM).
    """
    import io
    from contextlib import redirect_stdout

    buffer = io.StringIO()
    try:
        # redirect_stdout restores sys.stdout even if exec() raises,
        # replacing the manual save/restore of the original version
        # (which was also unsafe under concurrent use of sys.stdout).
        with redirect_stdout(buffer):
            exec(code, {})
        output = buffer.getvalue()
        return output if output else "Code executed successfully (no output)."
    except Exception as e:
        return f"Error: {e}"
# Register the example tools with their names, descriptions, and
# JSON-schema parameter specs.
tools = [
    Tool(
        name="search",
        description="Search the web for information",
        parameters={"query": {"type": "string"}},
        function=search_web,
    ),
    Tool(
        name="read_url",
        description="Read content from a URL",
        parameters={"url": {"type": "string"}},
        function=read_url,
    ),
    Tool(
        name="calculate",
        description="Evaluate a math expression",
        parameters={"expression": {"type": "string"}},
        function=calculate,
    ),
    Tool(
        name="python",
        description="Execute Python code",
        parameters={"code": {"type": "string"}},
        function=python_exec,
    ),
]
Structured Tool Use (Function Calling)
JSON Schema-Based Function Calling
import inspect
from typing import get_type_hints
class FunctionCallingAgent:
    """
    Agent with structured function calling.

    Tools are exposed to the model as JSON schemas; the model responds
    with structured tool_calls, which are executed locally and fed back
    as "tool" role messages until a plain-text answer is produced.
    """

    def __init__(self, model_client, tools):
        self.client = model_client
        self.tools = {t.name: t for t in tools}

    def _build_tool_schemas(self):
        """Convert tools to OpenAI-compatible function schemas."""
        return [
            {
                "type": "function",
                "function": {
                    "name": tool.name,
                    "description": tool.description,
                    "parameters": {
                        "type": "object",
                        "properties": tool.parameters,
                        # Every declared parameter is treated as required.
                        "required": list(tool.parameters.keys()),
                    },
                },
            }
            for tool in self.tools.values()
        ]

    def run(self, messages, max_turns=10):
        """
        Run the agent with automatic tool execution.

        Loops until the model answers in plain text or max_turns is
        exhausted; returns (final_text, messages).
        """
        tool_schemas = self._build_tool_schemas()
        for _ in range(max_turns):
            response = self.client.chat_completions_create(
                messages=messages,
                tools=tool_schemas,
                tool_choice="auto",
            )
            message = response.choices[0].message
            if not message.tool_calls:
                # Plain-text response: the agent is done.
                return message.content, messages
            messages.append(message)
            for tool_call in message.tool_calls:
                call_name = tool_call.function.name
                call_args = json.loads(tool_call.function.arguments)
                outcome = self._execute_tool(call_name, call_args)
                # Feed the tool result back as a "tool" message.
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": str(outcome),
                })
        return "Reached maximum turns.", messages

    def _execute_tool(self, name, arguments):
        """Execute a tool by name; returns its result or an error string."""
        if name not in self.tools:
            return f"Error: Unknown tool '{name}'"
        try:
            return self.tools[name].function(**arguments)
        except Exception as e:
            return f"Error: {e}"
def auto_register_tool(func):
    """
    Decorator that builds a Tool from a function's signature.

    JSON parameter types are derived from the function's type hints
    (defaulting to "string"); the docstring becomes the tool description.
    """
    hints = get_type_hints(func)
    # Map Python annotations onto JSON schema primitive types.
    type_map = {
        str: "string", int: "integer", float: "number",
        bool: "boolean", list: "array", dict: "object",
    }
    parameters = {}
    for param_name in inspect.signature(func).parameters:
        if param_name == 'self':
            continue
        annotation = hints.get(param_name, str)
        parameters[param_name] = {
            "type": type_map.get(annotation, "string"),
            "description": "",  # Extract from docstring if available
        }
    return Tool(
        name=func.__name__,
        description=func.__doc__ or "",
        parameters=parameters,
        function=func,
    )
Tool Calling Accuracy by Model
| Model | Schema Compliance | Correct Tool Selection | Correct Arguments | End-to-End Success |
|---|---|---|---|---|
| GPT-4o | 98.5% | 94.2% | 91.8% | 87.3% |
| Claude 3.5 Sonnet | 97.8% | 93.5% | 90.7% | 86.1% |
| Llama 3.1 70B | 95.2% | 88.3% | 84.5% | 78.2% |
| Llama 3.1 8B | 89.7% | 78.5% | 72.1% | 64.3% |
| Mistral 7B | 87.3% | 74.8% | 68.9% | 58.7% |
Multi-Step Planning
Tree Search Over Actions
For complex tasks, the agent needs to plan multiple steps ahead. Greedy action selection (pick the best action at each step) can lead to dead ends. Tree search explores multiple possible action sequences and selects the best trajectory.
import heapq
from copy import deepcopy
@dataclass
class PlanNode:
    """A node in the planning search tree.

    Holds the simulated state, the action/observation history that led to
    it, a value-model score, and the node's depth in the tree.
    """
    state: dict
    actions_taken: list
    observations: list
    value_estimate: float = 0.0
    depth: int = 0

    def __lt__(self, other):
        # Inverted comparison so heapq treats higher values as "smaller",
        # i.e. behaves as a max-heap over value_estimate.
        return other.value_estimate < self.value_estimate
class TreeSearchPlanner:
    """
    Plan multi-step actions using beam search over action sequences.

    At each depth every surviving node is expanded with candidate actions,
    resulting states are scored by the value model, and only the top-k
    candidates are kept for the next depth.
    """

    def __init__(self, agent, value_model, max_depth=5, beam_width=3):
        self.agent = agent
        self.value_model = value_model  # Scores how promising a state is.
        self.max_depth = max_depth
        self.beam_width = beam_width

    def plan(self, initial_state, goal):
        """
        Search for the best action sequence to achieve the goal.

        Returns a dict with the chosen actions, their observations, the
        estimated value of the final state, and the depth reached.
        """
        root = PlanNode(
            state=initial_state,
            actions_taken=[],
            observations=[],
            value_estimate=self._estimate_value(initial_state, goal),
            depth=0,
        )
        best_plan = root
        beam = [root]
        for depth in range(self.max_depth):
            candidates = []
            for node in beam:
                for action in self._generate_actions(node.state, goal):
                    next_state, observation = self._simulate_action(
                        node.state, action
                    )
                    child = PlanNode(
                        state=next_state,
                        actions_taken=node.actions_taken + [action],
                        observations=node.observations + [observation],
                        value_estimate=self._estimate_value(next_state, goal),
                        depth=depth + 1,
                    )
                    candidates.append(child)
                    # Track the best goal-reaching node seen so far.
                    if (self._is_goal_achieved(next_state, goal)
                            and child.value_estimate > best_plan.value_estimate):
                        best_plan = child
            # Beam selection: keep only the top-k scored candidates.
            beam = sorted(
                candidates, key=lambda n: n.value_estimate, reverse=True
            )[:self.beam_width]
            if not beam:
                break
            if beam[0].value_estimate > best_plan.value_estimate:
                best_plan = beam[0]
        return {
            'actions': best_plan.actions_taken,
            'observations': best_plan.observations,
            'estimated_value': best_plan.value_estimate,
            'depth': best_plan.depth,
        }

    def _generate_actions(self, state, goal):
        """Generate candidate next actions for a state (model-driven)."""
        prompt = (
            f"Current state:\n{json.dumps(state, indent=2)}\n\n"
            f"Goal: {goal}\n\n"
            f"List 3-5 possible next actions as JSON array:\n"
        )
        # Use the agent's model to generate actions
        # Parse and return as list of action dicts
        return [{"action": "placeholder"}]  # Replace with model inference

    def _simulate_action(self, state, action):
        """Simulate an action and return (new_state, observation).

        NOTE(review): this calls the agent's real tool executor, so
        "simulation" has real side effects unless the tools themselves
        are sandboxed -- confirm before planning with mutating tools.
        """
        observation = self.agent._execute_action(
            action.get('name', ''),
            action.get('params', {}),
        )
        next_state = deepcopy(state)
        next_state['history'] = state.get('history', []) + [
            {'action': action, 'observation': observation}
        ]
        return next_state, observation

    def _estimate_value(self, state, goal):
        """Score a state's closeness to the goal via the value model."""
        return self.value_model.score(state, goal)

    def _is_goal_achieved(self, state, goal):
        """Check whether the state flags the goal as achieved."""
        return state.get('goal_achieved', False)
Memory Systems
Short-Term and Long-Term Memory
import time
import hashlib
import numpy as np
from collections import deque
class ShortTermMemory:
    """
    Short-term memory: the recent conversation context.

    Implemented as a sliding window over the message history: the oldest
    messages are evicted once the estimated token count exceeds
    max_tokens, but the most recent message is always kept.
    """

    def __init__(self, max_tokens=8000, tokenizer=None):
        self.max_tokens = max_tokens
        self.tokenizer = tokenizer  # Optional; None falls back to a word heuristic.
        self.messages = []

    def _count_tokens(self, content):
        """Estimate the token count of one message body."""
        if self.tokenizer:
            return len(self.tokenizer.encode(content))
        # Heuristic: ~1.3 tokens per whitespace-separated word.
        return len(content.split()) * 1.3

    def add(self, role, content):
        """Add a message to short-term memory and trim if needed."""
        self.messages.append({
            'role': role,
            'content': content,
            'timestamp': time.time(),
        })
        self._trim()

    def _trim(self):
        """Evict oldest messages while the window exceeds max_tokens.

        PERF FIX: the previous version re-tokenized every message on each
        eviction iteration (O(n^2) in the number of messages); we now
        compute per-message counts once and subtract as messages are
        evicted.
        """
        counts = [self._count_tokens(m['content']) for m in self.messages]
        total = sum(counts)
        while len(self.messages) > 1 and total > self.max_tokens:
            total -= counts.pop(0)
            self.messages.pop(0)  # Remove oldest

    def get_context(self):
        """Get the current context as a list of messages."""
        return self.messages
class LongTermMemory:
    """
    Long-term memory: persistent storage with embedding-based retrieval.

    Entries are scored at query time by a blend of semantic similarity
    and recency. Access counts are recorded on retrieval (bookkeeping
    only -- they do not currently affect the score).
    """

    def __init__(self, embedding_model, index_type="flat"):
        self.embedding_model = embedding_model
        self.memories = []       # Stored entries, in insertion order.
        self.embeddings = None   # numpy matrix aligned with self.memories.
        self._dirty = True       # True when the index must be rebuilt.

    def store(self, content, metadata=None):
        """Store a memory entry and mark the index stale."""
        self.memories.append({
            'id': hashlib.md5(content.encode()).hexdigest()[:12],
            'content': content,
            'metadata': metadata or {},
            'timestamp': time.time(),
            'access_count': 0,
        })
        self._dirty = True

    def _build_index(self):
        """(Re)encode all stored entries into the embedding matrix."""
        if not self._dirty or not self.memories:
            return
        self.embeddings = self.embedding_model.encode(
            [m['content'] for m in self.memories],
            normalize_embeddings=True,
        )
        self._dirty = False

    def retrieve(self, query, top_k=5, recency_weight=0.1):
        """
        Retrieve the top_k most relevant memories.

        The score blends cosine similarity (weight 1 - recency_weight)
        with an exponentially decayed recency term (weight recency_weight).
        """
        self._build_index()
        if self.embeddings is None or len(self.embeddings) == 0:
            return []
        query_embedding = self.embedding_model.encode(
            [query], normalize_embeddings=True
        )
        # Cosine similarity (embeddings are normalized).
        similarities = (query_embedding @ self.embeddings.T).flatten()
        # Recency score: exponential decay in seconds since storage.
        now = time.time()
        recency_scores = np.array([
            np.exp(-0.001 * (now - m['timestamp']))
            for m in self.memories
        ])
        combined = (1 - recency_weight) * similarities + recency_weight * recency_scores
        results = []
        for idx in np.argsort(-combined)[:top_k]:
            entry = self.memories[idx]
            entry['access_count'] += 1  # Bookkeeping for future policies.
            results.append({
                'content': entry['content'],
                'score': float(combined[idx]),
                'similarity': float(similarities[idx]),
                'metadata': entry['metadata'],
            })
        return results
class WorkingMemory:
    """
    Working memory: a structured scratchpad for the current task.

    Tracks discovered facts, hypotheses under test, the active plan, and
    free-form notes, and can serialize itself into a prompt section.
    """

    def __init__(self):
        self.facts = {}        # key -> {'value', 'source', 'timestamp'}
        self.hypotheses = []   # Hypotheses with confidence and status.
        self.plan = []         # Plan steps with status.
        self.plan_idx = 0      # Index of the active plan step.
        self.notes = []        # Free-form notes.

    def add_fact(self, key, value, source="observation"):
        """Record a discovered fact with its provenance."""
        self.facts[key] = {
            'value': value,
            'source': source,
            'timestamp': time.time(),
        }

    def add_hypothesis(self, hypothesis, confidence=0.5):
        """Register a new, untested hypothesis."""
        entry = {'text': hypothesis, 'confidence': confidence, 'status': 'untested'}
        self.hypotheses.append(entry)

    def update_hypothesis(self, index, status, new_confidence=None):
        """Update a hypothesis's status (and optionally its confidence)."""
        if not 0 <= index < len(self.hypotheses):
            return  # Out-of-range indices are ignored.
        hyp = self.hypotheses[index]
        hyp['status'] = status
        if new_confidence is not None:
            hyp['confidence'] = new_confidence

    def set_plan(self, steps):
        """Replace the current plan with fresh pending steps."""
        self.plan = [{'step': step, 'status': 'pending'} for step in steps]
        self.plan_idx = 0

    def advance_plan(self, result):
        """Mark the active step done, record its result, and move on."""
        if self.plan_idx >= len(self.plan):
            return
        current = self.plan[self.plan_idx]
        current['status'] = 'done'
        current['result'] = result
        self.plan_idx += 1

    def to_prompt(self):
        """Serialize working memory into a prompt section."""
        parts = []
        if self.facts:
            parts.append("Known facts:")
            parts.extend(
                f" {key}: {val['value']}" for key, val in self.facts.items()
            )
        if self.hypotheses:
            parts.append("\nHypotheses:")
            for i, h in enumerate(self.hypotheses):
                parts.append(f" {i + 1}. [{h['status']}] {h['text']} "
                             f"(confidence: {h['confidence']:.1f})")
        if self.plan:
            parts.append("\nPlan:")
            for i, step in enumerate(self.plan):
                marker = ">>>" if i == self.plan_idx else " "
                parts.append(f" {marker} {i + 1}. [{step['status']}] {step['step']}")
        return "\n".join(parts)
Integrated Memory Agent
class MemoryAgent(ReActAgent):
    """
    ReAct agent augmented with short-term, long-term, and working memory.

    Before each run, relevant long-term memories are prepended to the
    query; after the run, the answer and substantive tool observations
    are written back to long-term memory.
    """

    def __init__(self, model, tokenizer, tools, embedding_model,
                 max_steps=15):
        super().__init__(model, tokenizer, tools, max_steps)
        self.short_term = ShortTermMemory(max_tokens=6000, tokenizer=tokenizer)
        self.long_term = LongTermMemory(embedding_model)
        self.working = WorkingMemory()

    def run(self, query):
        """Run the ReAct loop with memory-augmented context."""
        # Pull the most relevant prior memories into the prompt.
        memories = self.long_term.retrieve(query, top_k=3)
        memory_context = "\n".join(f"- {m['content']}" for m in memories)
        if memory_context:
            augmented_prompt = (
                f"Relevant information from previous interactions:\n"
                f"{memory_context}\n\n"
                f"Current request: {query}"
            )
        else:
            augmented_prompt = query
        result = super().run(augmented_prompt)
        # Persist the Q/A pair unless the run hit the step limit.
        if result['answer'] and result['answer'] != "Reached maximum steps without final answer.":
            self.long_term.store(
                f"Query: {query}\nAnswer: {result['answer']}",
                metadata={'type': 'qa_pair', 'query': query},
            )
        # Persist substantive tool observations for future retrieval.
        for step in result['steps']:
            if step.observation and len(step.observation) > 20:
                self.long_term.store(
                    f"Observation from {step.action_name}: {step.observation[:500]}",
                    metadata={'type': 'observation', 'tool': step.action_name},
                )
        return result
Memory retrieval adds 20-50ms latency per agent step (embedding generation + vector search). For a 10-step agent trace, this is 200-500ms total — negligible compared to the 2-10 seconds per LLM generation. The benefit is substantial: memory-augmented agents solve 15-25% more tasks than memoryless agents on multi-session benchmarks.
Agent Evaluation Benchmarks
# Reference metadata for common LLM-agent benchmarks: task counts,
# execution environments, reported metrics, and published top scores.
# Scores are fractions of tasks solved (0.0-1.0), not percentages.
AGENT_BENCHMARKS = {
    "WebArena": {
        "description": "Web navigation tasks on real websites",
        "tasks": 812,
        "environment": "Browser (Playwright)",
        "metrics": ["task_success_rate", "steps_to_completion"],
        "top_scores": {"GPT-4o": 0.357, "Claude_3_5": 0.312, "Llama_70B": 0.142},
    },
    "SWE-bench": {
        "description": "Real GitHub issues requiring code changes",
        "tasks": 2294,
        "environment": "Code repository + tests",
        "metrics": ["resolved_rate", "test_pass_rate"],
        "top_scores": {"GPT-4o_agent": 0.235, "Claude_3_5_agent": 0.491, "Devin": 0.137},
    },
    "GAIA": {
        "description": "General AI assistants: multi-step reasoning + tools",
        "tasks": 466,
        "environment": "Web + calculator + code",
        "metrics": ["exact_match_accuracy"],
        "top_scores": {"GPT-4o_plugins": 0.393, "Human": 0.920},
    },
    "ToolBench": {
        "description": "API tool selection and composition",
        "tasks": 16000,
        "environment": "REST APIs",
        "metrics": ["pass_rate", "win_rate_vs_chatgpt"],
        "top_scores": {"GPT-4": 0.78, "ToolLLaMA_7B": 0.58},
    },
}
Agent Benchmark Performance (2025)
| Benchmark | Best Agent | Score | Human Baseline | Gap |
|---|---|---|---|---|
| WebArena | GPT-4o + ReAct | 35.7% | 78.2% | 42.5% |
| SWE-bench Verified | Claude 3.5 Sonnet | 49.1% | ~95% | 45.9% |
| GAIA Level 1 | GPT-4o + plugins | 58.3% | 92.0% | 33.7% |
| GAIA Level 3 | GPT-4o + plugins | 12.3% | 92.0% | 79.7% |
Production Considerations
Error Recovery and Retry Logic
class ProductionAgent(MemoryAgent):
    """
    Production-ready agent with error recovery, output truncation, and
    execution logging.

    NOTE(review): timeout_seconds is accepted but not yet enforced;
    enforcing it requires running tools in a worker with a deadline
    (signal-based timeouts only work on the main thread).
    """

    def __init__(self, *args, max_retries=2, timeout_seconds=30, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_retries = max_retries
        self.timeout_seconds = timeout_seconds  # Reserved; not enforced yet.
        self.execution_log = []

    def _execute_action_safe(self, action_name, action_input):
        """Execute a tool with retries, result validation, and logging.

        BUG FIX: _execute_action() catches tool exceptions internally and
        returns "Error..." strings, so the original except-based retry
        never fired. We now retry on error-string results as well as on
        raised exceptions, so max_retries actually takes effect.
        """
        last_error = None
        for attempt in range(self.max_retries + 1):
            try:
                result = self._execute_action(action_name, action_input)
            except Exception as e:
                last_error = str(e)
                self.execution_log.append({
                    'action': action_name,
                    'input': action_input,
                    'error': last_error,
                    'attempt': attempt + 1,
                    'status': 'error',
                })
                continue
            if result is None:
                result = "Tool returned no output."
            result = str(result)
            if result.startswith("Error"):
                # The tool layer reported a failure as a string; retry.
                last_error = result
                self.execution_log.append({
                    'action': action_name,
                    'input': action_input,
                    'error': result,
                    'attempt': attempt + 1,
                    'status': 'error',
                })
                continue
            if len(result) > 5000:
                # Bound tool output so it cannot blow up the context window.
                result = result[:5000] + "\n... [truncated]"
            self.execution_log.append({
                'action': action_name,
                'input': action_input,
                'output': result,
                'attempt': attempt + 1,
                'status': 'success',
            })
            return result
        return f"Error after {self.max_retries + 1} attempts: {last_error}"

    def get_cost_estimate(self):
        """Roughly estimate token usage and cost from the execution log.

        NOTE(review): only tool inputs/outputs are counted, not the LLM
        prompt/completion tokens, so this underestimates true run cost.
        """
        total_input_tokens = 0.0
        total_output_tokens = 0.0
        for entry in self.execution_log:
            # Heuristic: ~1.3 tokens per whitespace-separated word.
            total_input_tokens += len(str(entry.get('input', '')).split()) * 1.3
            total_output_tokens += len(str(entry.get('output', '')).split()) * 1.3
        return {
            'input_tokens': int(total_input_tokens),
            'output_tokens': int(total_output_tokens),
            'estimated_cost_usd': (
                total_input_tokens * 0.000003 +  # $3/M input tokens
                total_output_tokens * 0.000015   # $15/M output tokens
            ),
        }
Agent Cost per Task by Complexity
| Model | 1-2 steps | 3-5 steps | 6-10 steps | 11-20 steps | 20+ steps |
|---|---|---|---|---|---|
| GPT-4o | |||||
| Claude 3.5 Sonnet | |||||
| Llama 3.1 70B (self-hosted) |
Agent architectures are converging on a common pattern: ReAct-style interleaving of reasoning and tool use, with structured function calling for reliable tool invocation, tree search for complex planning, and layered memory for context management. The primary bottleneck is not architecture but model capability: current models succeed 30-50% of the time on multi-step tasks. Improvements in base model reasoning, tool use training data, and error recovery will drive agent capability forward faster than architectural innovations.