Усилить надёжность: логирование, lifespan, LRU-кэш и fail-fast семантика
Подключить loguru и заменить молчаливые except на warning/exception в step_planner, mcp_client и mcp_workflow_runner — раньше ошибки терялись в пустых дикт-возвратах. Перенести Phoenix tracing из module-level в FastAPI lifespan, чтобы импорт agent_os не поднимал трейсер в тестах и тулах. Заменить неограниченный dict _workflow_cache на OrderedDict-LRU с лимитом WORKFLOW_CACHE_MAX_SIZE (default 64) — чтобы кэш не рос бесконечно при разных scenario_id. Зафиксировать инвариант fail-fast: шаги, не дошедшие до исполнения из-за падения upstream, возвращаются со статусом skipped (для UI), а не queued; run помечается success только если все payload.ok. Добавить module docstrings во все модули src/ по STYLE_GUIDE cookbook. Запинить версии зависимостей в requirements.txt.
This commit is contained in:
@@ -25,6 +25,7 @@ dist/
|
||||
.vscode/
|
||||
.DS_Store
|
||||
.cursor
|
||||
.claude
|
||||
|
||||
# Cookbook code
|
||||
vendor/agno/cookbook/
|
||||
+10
-9
@@ -1,9 +1,10 @@
|
||||
agno
|
||||
fastapi
|
||||
uvicorn
|
||||
python-dotenv
|
||||
ollama
|
||||
socksio
|
||||
openai
|
||||
arize-phoenix-otel
|
||||
openinference-instrumentation-agno
|
||||
agno==2.5.17
|
||||
fastapi==0.136.0
|
||||
uvicorn==0.44.0
|
||||
python-dotenv==1.2.2
|
||||
ollama==0.6.1
|
||||
socksio==1.0.0
|
||||
openai==2.32.0
|
||||
arize-phoenix-otel==0.15.0
|
||||
openinference-instrumentation-agno==0.1.30
|
||||
loguru==0.7.3
|
||||
|
||||
+25
-4
@@ -1,26 +1,47 @@
|
||||
"""AgentOS entrypoint: wires the agent, REST routes and FastAPI lifespan.
|
||||
|
||||
Phoenix tracing is initialized from the lifespan (not at import time) so that
|
||||
importing this module for tooling or tests does not spin up the tracer.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import FastAPI
|
||||
from loguru import logger
|
||||
|
||||
from agno.os import AgentOS
|
||||
|
||||
from src.api_routes import router as api_router
|
||||
from src.agent_runner import get_agent
|
||||
from src.observability import init_phoenix_tracing
|
||||
from src.api_routes import router as api_router
|
||||
from src.observability import init_phoenix_tracing, is_phoenix_tracing_enabled
|
||||
|
||||
load_dotenv()
|
||||
_tracing_enabled = init_phoenix_tracing()
|
||||
|
||||
|
||||
@asynccontextmanager
async def _lifespan(_app: FastAPI):
    """FastAPI lifespan hook: init tracing, then log startup/shutdown.

    Tracing is initialized here rather than at import time so that importing
    this module (for tests or tooling) has no side effects; the surrounding
    commit states this explicitly.
    """
    init_phoenix_tracing()
    logger.info("Prisma Platform API starting up")
    try:
        yield
    finally:
        # finally ensures the shutdown line is logged even if the app
        # terminates via an exception during its lifetime.
        logger.info("Prisma Platform API shutting down")
|
||||
|
||||
|
||||
_agent = get_agent()
|
||||
_base_app = FastAPI(
|
||||
title="Prisma Platform API",
|
||||
version="0.1.0",
|
||||
lifespan=_lifespan,
|
||||
)
|
||||
_base_app.include_router(api_router)
|
||||
_agent_os = AgentOS(
|
||||
agents=[_agent],
|
||||
tracing=_tracing_enabled,
|
||||
tracing=is_phoenix_tracing_enabled(),
|
||||
base_app=_base_app,
|
||||
)
|
||||
app = _agent_os.get_app()
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
"""Lazy factory for the top-level Prisma agent.
|
||||
|
||||
Config is read from environment variables so the same module can be used by
|
||||
the API server, CLI tools and tests without re-wiring.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
from agno.agent import Agent
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
"""REST routes for scenario execution.
|
||||
|
||||
These endpoints live on the FastAPI ``base_app`` that AgentOS composes with
|
||||
its own routes, so the prefix ``/api`` does not collide with AgentOS paths.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from src.mcp_workflow_runner import run_scenario
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
"""Thin async client for MCP tool invocation over streamable HTTP.
|
||||
|
||||
Opens a short-lived ``ClientSession`` per call, wraps the tool response in
|
||||
a normalized dict, and raises ``RuntimeError`` on transport/tool errors.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import timedelta
|
||||
@@ -5,6 +11,7 @@ import json
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from loguru import logger
|
||||
from mcp import ClientSession
|
||||
from mcp.client.streamable_http import streamablehttp_client
|
||||
from mcp.types import TextContent
|
||||
@@ -33,8 +40,10 @@ async def call_mcp_tool(tool_name: str, arguments: dict[str, Any]) -> dict[str,
|
||||
await session.initialize()
|
||||
result = await session.call_tool(tool_name, arguments)
|
||||
except TimeoutError as exc:
|
||||
logger.warning("MCP timeout: tool={}", tool_name)
|
||||
raise RuntimeError(f"MCP timeout: {tool_name}") from exc
|
||||
except Exception as exc:
|
||||
logger.exception("MCP transport error: tool={}", tool_name)
|
||||
raise RuntimeError(f"MCP transport error: {tool_name}") from exc
|
||||
|
||||
if result.isError:
|
||||
|
||||
@@ -1,5 +1,14 @@
|
||||
"""Builds and runs Agno workflows from JSON scenario definitions.
|
||||
|
||||
Each scenario step is a typed MCP tool call. The runner resolves argument
|
||||
templates from ``session_state``, optionally lets an LLM planner repair
|
||||
missing fields, invokes the tool, and collects per-step results back into
|
||||
``session_state`` for downstream steps.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import OrderedDict
|
||||
from copy import deepcopy
|
||||
from datetime import datetime, timezone
|
||||
import json
|
||||
@@ -8,6 +17,7 @@ from typing import Any, Awaitable, Callable
|
||||
|
||||
from agno.workflow.step import Step, StepInput, StepOutput
|
||||
from agno.workflow.workflow import Workflow
|
||||
from loguru import logger
|
||||
|
||||
from src.mcp_client import call_mcp_tool
|
||||
from src.schemas import ScenarioRunResponse, StepState
|
||||
@@ -193,6 +203,7 @@ def _build_tool_executor(
|
||||
"finished_at": finished_at,
|
||||
}
|
||||
session_state.setdefault("steps", {})[step_name] = error_payload
|
||||
logger.exception("Step {} failed (tool={})", step_name, tool_name)
|
||||
raise RuntimeError(f"{step_name} failed: {exc}") from exc
|
||||
|
||||
return executor
|
||||
@@ -215,6 +226,9 @@ def _build_workflow(scenario_id: str, scenario: dict[str, Any]) -> Workflow:
|
||||
if not step_name or not tool_name:
|
||||
raise ScenarioStoreError("Each tool step must contain non-empty name and tool")
|
||||
|
||||
# Fail-fast by design: the run is considered successful only when every
|
||||
# step passes. There is no per-step retry or skip policy — downstream
|
||||
# steps rely on upstream output, so on any failure the workflow stops.
|
||||
workflow_steps.append(
|
||||
Step(
|
||||
name=step_name,
|
||||
@@ -232,15 +246,20 @@ def _build_workflow(scenario_id: str, scenario: dict[str, Any]) -> Workflow:
|
||||
)
|
||||
|
||||
|
||||
_workflow_cache: dict[str, Workflow] = {}
|
||||
_WORKFLOW_CACHE_MAX_SIZE = _env_int("WORKFLOW_CACHE_MAX_SIZE", 64)
|
||||
_workflow_cache: "OrderedDict[str, Workflow]" = OrderedDict()
|
||||
|
||||
|
||||
def _get_workflow(scenario_id: str, scenario: dict[str, Any]) -> Workflow:
    """Return the Workflow for *scenario_id*, building and caching on a miss.

    The cache is an ``OrderedDict``-based LRU bounded by
    ``_WORKFLOW_CACHE_MAX_SIZE``; once the limit is exceeded, the least
    recently used entry is evicted and logged at debug level.
    """
    try:
        hit = _workflow_cache[scenario_id]
    except KeyError:
        pass
    else:
        # Refresh recency so frequently-used scenarios survive eviction.
        _workflow_cache.move_to_end(scenario_id)
        return hit

    built = _build_workflow(scenario_id, scenario)
    _workflow_cache[scenario_id] = built
    if len(_workflow_cache) > _WORKFLOW_CACHE_MAX_SIZE:
        # popitem(last=False) drops the oldest (least recently used) entry.
        dropped_id, _ = _workflow_cache.popitem(last=False)
        logger.debug("Evicted workflow from LRU cache: {}", dropped_id)
    return built
|
||||
|
||||
|
||||
@@ -275,10 +294,11 @@ def _build_step_states(
|
||||
continue
|
||||
payload = steps_payloads.get(name)
|
||||
if not isinstance(payload, dict):
|
||||
# Workflow aborted before this step ran (strict fail-fast policy).
|
||||
states.append(
|
||||
StepState(
|
||||
node_id=name,
|
||||
status="queued",
|
||||
status="skipped",
|
||||
message="",
|
||||
)
|
||||
)
|
||||
@@ -338,10 +358,14 @@ async def run_scenario(
|
||||
)
|
||||
except Exception as exc:
|
||||
workflow_error = str(exc)
|
||||
logger.exception("Workflow {} failed", scenario_id)
|
||||
|
||||
steps_payloads = session_state.get("steps", {}) or {}
|
||||
step_states = _build_step_states(scenario, steps_payloads)
|
||||
|
||||
# Strict invariant: run is success only when every recorded step payload
|
||||
# has ok=true. `on_error: skip` lets downstream steps keep running after a
|
||||
# failure, but it does NOT whitewash the overall run status.
|
||||
status = "success"
|
||||
if workflow_error is not None:
|
||||
status = "failed"
|
||||
|
||||
+16
-2
@@ -1,5 +1,15 @@
|
||||
"""Phoenix (Arize) OpenTelemetry tracing setup.
|
||||
|
||||
Tracing is initialized via the FastAPI lifespan so that import-time side effects
|
||||
stay out of module load. ``is_phoenix_tracing_enabled`` is cheap and can be
|
||||
consulted before the app starts (for example, to pass a flag into AgentOS).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
from loguru import logger
|
||||
from phoenix.otel import register
|
||||
|
||||
_initialized = False
|
||||
@@ -12,11 +22,14 @@ def _env_bool(name: str, default: bool) -> bool:
|
||||
return value.strip().lower() in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
def is_phoenix_tracing_enabled() -> bool:
    """Cheap check of the ``PHOENIX_TRACING_ENABLED`` env flag (default: off).

    Safe to call before the app starts (e.g. to pass a flag into AgentOS);
    it reads the environment only and performs no tracer setup.
    """
    return _env_bool("PHOENIX_TRACING_ENABLED", False)
|
||||
|
||||
|
||||
def init_phoenix_tracing() -> bool:
|
||||
global _initialized
|
||||
|
||||
enabled = _env_bool("PHOENIX_TRACING_ENABLED", False)
|
||||
if not enabled:
|
||||
if not is_phoenix_tracing_enabled():
|
||||
return False
|
||||
|
||||
if _initialized:
|
||||
@@ -33,4 +46,5 @@ def init_phoenix_tracing() -> bool:
|
||||
auto_instrument=True,
|
||||
)
|
||||
_initialized = True
|
||||
logger.info("Phoenix tracing initialized (project={})", project_name)
|
||||
return True
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""File-backed loader for scenario definitions.
|
||||
|
||||
Scenarios live under ``scenarios/`` and are indexed by ``scenarios/index.json``.
|
||||
Each scenario is a JSON object with a ``scenario_id`` that must match the
|
||||
index key it was looked up by.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
+3
-1
@@ -1,3 +1,5 @@
|
||||
"""Pydantic schemas for the scenario-run REST API."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Literal
|
||||
@@ -5,7 +7,7 @@ from typing import Any, Literal
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
RunStatus = Literal["queued", "running", "success", "failed", "waiting_human"]
|
||||
StepStatus = Literal["queued", "running", "success", "failed", "waiting_human"]
|
||||
StepStatus = Literal["queued", "running", "success", "failed", "skipped", "waiting_human"]
|
||||
|
||||
|
||||
class ScenarioRunRequest(BaseModel):
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
"""LLM-backed fallback planner for MCP tool arguments.
|
||||
|
||||
When a step's resolved arguments are missing required fields, this module
|
||||
calls an OpenAI-compatible chat completion to fill them from the current
|
||||
scope (``input`` + prior ``steps``). The planner is best-effort: on any
|
||||
failure it returns the base arguments unchanged so the caller's validator
|
||||
can produce a clean error.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
@@ -5,6 +14,7 @@ import json
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from loguru import logger
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
|
||||
@@ -121,6 +131,12 @@ async def plan_arguments(
|
||||
raw = completion.choices[0].message.content if completion.choices else ""
|
||||
planned = _extract_arguments(raw)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"Planner call failed for step={} tool={} attempt={}",
|
||||
step_name,
|
||||
tool_name,
|
||||
attempt_no,
|
||||
)
|
||||
planned = {}
|
||||
|
||||
merged = deepcopy(base_arguments)
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""Variable templating for scenario step inputs.
|
||||
|
||||
A dict of shape ``{"from": "path.to.value"}`` resolves to the value at that
|
||||
dotted path in the current scope. Nested dicts/lists are resolved
|
||||
recursively; plain values pass through via ``deepcopy``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
|
||||
Reference in New Issue
Block a user