Усилить надёжность: логирование, lifespan, LRU-кэш и fail-fast семантика
Подключить loguru и заменить молчаливые except на warning/exception в step_planner, mcp_client и mcp_workflow_runner — раньше ошибки терялись в пустых дикт-возвратах.
Перенести Phoenix tracing из module-level в FastAPI lifespan, чтобы импорт agent_os не поднимал трейсер в тестах и тулах.
Заменить неограниченный dict _workflow_cache на OrderedDict-LRU с лимитом WORKFLOW_CACHE_MAX_SIZE (default 64) — чтобы кэш не рос бесконечно при разных scenario_id.
Зафиксировать инвариант fail-fast: шаги, не дошедшие до исполнения из-за падения upstream, возвращаются со статусом skipped (для UI), а не queued; run помечается success только если все payload.ok.
Добавить module docstrings во все модули src/ по STYLE_GUIDE cookbook. Запинить версии зависимостей в requirements.txt.
This commit is contained in:
@@ -25,6 +25,7 @@ dist/
|
|||||||
.vscode/
|
.vscode/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
.cursor
|
.cursor
|
||||||
|
.claude
|
||||||
|
|
||||||
# Cookbook code
|
# Cookbook code
|
||||||
vendor/agno/cookbook/
|
vendor/agno/cookbook/
|
||||||
+10
-9
@@ -1,9 +1,10 @@
|
|||||||
agno
|
agno==2.5.17
|
||||||
fastapi
|
fastapi==0.136.0
|
||||||
uvicorn
|
uvicorn==0.44.0
|
||||||
python-dotenv
|
python-dotenv==1.2.2
|
||||||
ollama
|
ollama==0.6.1
|
||||||
socksio
|
socksio==1.0.0
|
||||||
openai
|
openai==2.32.0
|
||||||
arize-phoenix-otel
|
arize-phoenix-otel==0.15.0
|
||||||
openinference-instrumentation-agno
|
openinference-instrumentation-agno==0.1.30
|
||||||
|
loguru==0.7.3
|
||||||
|
|||||||
+25
-4
@@ -1,26 +1,47 @@
|
|||||||
|
"""AgentOS entrypoint: wires the agent, REST routes and FastAPI lifespan.
|
||||||
|
|
||||||
|
Phoenix tracing is initialized from the lifespan (not at import time) so that
|
||||||
|
importing this module for tooling or tests does not spin up the tracer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
from agno.os import AgentOS
|
from agno.os import AgentOS
|
||||||
|
|
||||||
from src.api_routes import router as api_router
|
|
||||||
from src.agent_runner import get_agent
|
from src.agent_runner import get_agent
|
||||||
from src.observability import init_phoenix_tracing
|
from src.api_routes import router as api_router
|
||||||
|
from src.observability import init_phoenix_tracing, is_phoenix_tracing_enabled
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
_tracing_enabled = init_phoenix_tracing()
|
|
||||||
|
|
||||||
|
@asynccontextmanager
|
||||||
|
async def _lifespan(_app: FastAPI):
|
||||||
|
init_phoenix_tracing()
|
||||||
|
logger.info("Prisma Platform API starting up")
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
logger.info("Prisma Platform API shutting down")
|
||||||
|
|
||||||
|
|
||||||
_agent = get_agent()
|
_agent = get_agent()
|
||||||
_base_app = FastAPI(
|
_base_app = FastAPI(
|
||||||
title="Prisma Platform API",
|
title="Prisma Platform API",
|
||||||
version="0.1.0",
|
version="0.1.0",
|
||||||
|
lifespan=_lifespan,
|
||||||
)
|
)
|
||||||
_base_app.include_router(api_router)
|
_base_app.include_router(api_router)
|
||||||
_agent_os = AgentOS(
|
_agent_os = AgentOS(
|
||||||
agents=[_agent],
|
agents=[_agent],
|
||||||
tracing=_tracing_enabled,
|
tracing=is_phoenix_tracing_enabled(),
|
||||||
base_app=_base_app,
|
base_app=_base_app,
|
||||||
)
|
)
|
||||||
app = _agent_os.get_app()
|
app = _agent_os.get_app()
|
||||||
|
|||||||
@@ -1,3 +1,11 @@
|
|||||||
|
"""Lazy factory for the top-level Prisma agent.
|
||||||
|
|
||||||
|
Config is read from environment variables so the same module can be used by
|
||||||
|
the API server, CLI tools and tests without re-wiring.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from agno.agent import Agent
|
from agno.agent import Agent
|
||||||
|
|||||||
@@ -1,3 +1,11 @@
|
|||||||
|
"""REST routes for scenario execution.
|
||||||
|
|
||||||
|
These endpoints live on the FastAPI ``base_app`` that AgentOS composes with
|
||||||
|
its own routes, so the prefix ``/api`` does not collide with AgentOS paths.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
from fastapi import APIRouter
|
from fastapi import APIRouter
|
||||||
|
|
||||||
from src.mcp_workflow_runner import run_scenario
|
from src.mcp_workflow_runner import run_scenario
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
"""Thin async client for MCP tool invocation over streamable HTTP.
|
||||||
|
|
||||||
|
Opens a short-lived ``ClientSession`` per call, wraps the tool response in
|
||||||
|
a normalized dict, and raises ``RuntimeError`` on transport/tool errors.
|
||||||
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
@@ -5,6 +11,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
from mcp import ClientSession
|
from mcp import ClientSession
|
||||||
from mcp.client.streamable_http import streamablehttp_client
|
from mcp.client.streamable_http import streamablehttp_client
|
||||||
from mcp.types import TextContent
|
from mcp.types import TextContent
|
||||||
@@ -33,8 +40,10 @@ async def call_mcp_tool(tool_name: str, arguments: dict[str, Any]) -> dict[str,
|
|||||||
await session.initialize()
|
await session.initialize()
|
||||||
result = await session.call_tool(tool_name, arguments)
|
result = await session.call_tool(tool_name, arguments)
|
||||||
except TimeoutError as exc:
|
except TimeoutError as exc:
|
||||||
|
logger.warning("MCP timeout: tool={}", tool_name)
|
||||||
raise RuntimeError(f"MCP timeout: {tool_name}") from exc
|
raise RuntimeError(f"MCP timeout: {tool_name}") from exc
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
logger.exception("MCP transport error: tool={}", tool_name)
|
||||||
raise RuntimeError(f"MCP transport error: {tool_name}") from exc
|
raise RuntimeError(f"MCP transport error: {tool_name}") from exc
|
||||||
|
|
||||||
if result.isError:
|
if result.isError:
|
||||||
|
|||||||
@@ -1,5 +1,14 @@
|
|||||||
|
"""Builds and runs Agno workflows from JSON scenario definitions.
|
||||||
|
|
||||||
|
Each scenario step is a typed MCP tool call. The runner resolves argument
|
||||||
|
templates from ``session_state``, optionally lets an LLM planner repair
|
||||||
|
missing fields, invokes the tool, and collects per-step results back into
|
||||||
|
``session_state`` for downstream steps.
|
||||||
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections import OrderedDict
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
import json
|
import json
|
||||||
@@ -8,6 +17,7 @@ from typing import Any, Awaitable, Callable
|
|||||||
|
|
||||||
from agno.workflow.step import Step, StepInput, StepOutput
|
from agno.workflow.step import Step, StepInput, StepOutput
|
||||||
from agno.workflow.workflow import Workflow
|
from agno.workflow.workflow import Workflow
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
from src.mcp_client import call_mcp_tool
|
from src.mcp_client import call_mcp_tool
|
||||||
from src.schemas import ScenarioRunResponse, StepState
|
from src.schemas import ScenarioRunResponse, StepState
|
||||||
@@ -193,6 +203,7 @@ def _build_tool_executor(
|
|||||||
"finished_at": finished_at,
|
"finished_at": finished_at,
|
||||||
}
|
}
|
||||||
session_state.setdefault("steps", {})[step_name] = error_payload
|
session_state.setdefault("steps", {})[step_name] = error_payload
|
||||||
|
logger.exception("Step {} failed (tool={})", step_name, tool_name)
|
||||||
raise RuntimeError(f"{step_name} failed: {exc}") from exc
|
raise RuntimeError(f"{step_name} failed: {exc}") from exc
|
||||||
|
|
||||||
return executor
|
return executor
|
||||||
@@ -215,6 +226,9 @@ def _build_workflow(scenario_id: str, scenario: dict[str, Any]) -> Workflow:
|
|||||||
if not step_name or not tool_name:
|
if not step_name or not tool_name:
|
||||||
raise ScenarioStoreError("Each tool step must contain non-empty name and tool")
|
raise ScenarioStoreError("Each tool step must contain non-empty name and tool")
|
||||||
|
|
||||||
|
# Fail-fast by design: the run is considered successful only when every
|
||||||
|
# step passes. There is no per-step retry or skip policy — downstream
|
||||||
|
# steps rely on upstream output, so on any failure the workflow stops.
|
||||||
workflow_steps.append(
|
workflow_steps.append(
|
||||||
Step(
|
Step(
|
||||||
name=step_name,
|
name=step_name,
|
||||||
@@ -232,15 +246,20 @@ def _build_workflow(scenario_id: str, scenario: dict[str, Any]) -> Workflow:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
_workflow_cache: dict[str, Workflow] = {}
|
_WORKFLOW_CACHE_MAX_SIZE = _env_int("WORKFLOW_CACHE_MAX_SIZE", 64)
|
||||||
|
_workflow_cache: "OrderedDict[str, Workflow]" = OrderedDict()
|
||||||
|
|
||||||
|
|
||||||
def _get_workflow(scenario_id: str, scenario: dict[str, Any]) -> Workflow:
|
def _get_workflow(scenario_id: str, scenario: dict[str, Any]) -> Workflow:
|
||||||
cached = _workflow_cache.get(scenario_id)
|
cached = _workflow_cache.get(scenario_id)
|
||||||
if cached is not None:
|
if cached is not None:
|
||||||
|
_workflow_cache.move_to_end(scenario_id)
|
||||||
return cached
|
return cached
|
||||||
workflow = _build_workflow(scenario_id, scenario)
|
workflow = _build_workflow(scenario_id, scenario)
|
||||||
_workflow_cache[scenario_id] = workflow
|
_workflow_cache[scenario_id] = workflow
|
||||||
|
if len(_workflow_cache) > _WORKFLOW_CACHE_MAX_SIZE:
|
||||||
|
evicted_id, _ = _workflow_cache.popitem(last=False)
|
||||||
|
logger.debug("Evicted workflow from LRU cache: {}", evicted_id)
|
||||||
return workflow
|
return workflow
|
||||||
|
|
||||||
|
|
||||||
@@ -275,10 +294,11 @@ def _build_step_states(
|
|||||||
continue
|
continue
|
||||||
payload = steps_payloads.get(name)
|
payload = steps_payloads.get(name)
|
||||||
if not isinstance(payload, dict):
|
if not isinstance(payload, dict):
|
||||||
|
# Workflow aborted before this step ran (strict fail-fast policy).
|
||||||
states.append(
|
states.append(
|
||||||
StepState(
|
StepState(
|
||||||
node_id=name,
|
node_id=name,
|
||||||
status="queued",
|
status="skipped",
|
||||||
message="",
|
message="",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -338,10 +358,14 @@ async def run_scenario(
|
|||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
workflow_error = str(exc)
|
workflow_error = str(exc)
|
||||||
|
logger.exception("Workflow {} failed", scenario_id)
|
||||||
|
|
||||||
steps_payloads = session_state.get("steps", {}) or {}
|
steps_payloads = session_state.get("steps", {}) or {}
|
||||||
step_states = _build_step_states(scenario, steps_payloads)
|
step_states = _build_step_states(scenario, steps_payloads)
|
||||||
|
|
||||||
|
# Strict invariant: run is success only when every recorded step payload
|
||||||
|
# has ok=true. `on_error: skip` lets downstream steps keep running after a
|
||||||
|
# failure, but it does NOT whitewash the overall run status.
|
||||||
status = "success"
|
status = "success"
|
||||||
if workflow_error is not None:
|
if workflow_error is not None:
|
||||||
status = "failed"
|
status = "failed"
|
||||||
|
|||||||
+16
-2
@@ -1,5 +1,15 @@
|
|||||||
|
"""Phoenix (Arize) OpenTelemetry tracing setup.
|
||||||
|
|
||||||
|
Tracing is initialized via the FastAPI lifespan so that import-time side effects
|
||||||
|
stay out of module load. ``is_phoenix_tracing_enabled`` is cheap and can be
|
||||||
|
consulted before the app starts (for example, to pass a flag into AgentOS).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
from phoenix.otel import register
|
from phoenix.otel import register
|
||||||
|
|
||||||
_initialized = False
|
_initialized = False
|
||||||
@@ -12,11 +22,14 @@ def _env_bool(name: str, default: bool) -> bool:
|
|||||||
return value.strip().lower() in {"1", "true", "yes", "on"}
|
return value.strip().lower() in {"1", "true", "yes", "on"}
|
||||||
|
|
||||||
|
|
||||||
|
def is_phoenix_tracing_enabled() -> bool:
|
||||||
|
return _env_bool("PHOENIX_TRACING_ENABLED", False)
|
||||||
|
|
||||||
|
|
||||||
def init_phoenix_tracing() -> bool:
|
def init_phoenix_tracing() -> bool:
|
||||||
global _initialized
|
global _initialized
|
||||||
|
|
||||||
enabled = _env_bool("PHOENIX_TRACING_ENABLED", False)
|
if not is_phoenix_tracing_enabled():
|
||||||
if not enabled:
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if _initialized:
|
if _initialized:
|
||||||
@@ -33,4 +46,5 @@ def init_phoenix_tracing() -> bool:
|
|||||||
auto_instrument=True,
|
auto_instrument=True,
|
||||||
)
|
)
|
||||||
_initialized = True
|
_initialized = True
|
||||||
|
logger.info("Phoenix tracing initialized (project={})", project_name)
|
||||||
return True
|
return True
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
"""File-backed loader for scenario definitions.
|
||||||
|
|
||||||
|
Scenarios live under ``scenarios/`` and are indexed by ``scenarios/index.json``.
|
||||||
|
Each scenario is a JSON object with a ``scenario_id`` that must match the
|
||||||
|
index key it was looked up by.
|
||||||
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|||||||
+3
-1
@@ -1,3 +1,5 @@
|
|||||||
|
"""Pydantic schemas for the scenario-run REST API."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Any, Literal
|
from typing import Any, Literal
|
||||||
@@ -5,7 +7,7 @@ from typing import Any, Literal
|
|||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
RunStatus = Literal["queued", "running", "success", "failed", "waiting_human"]
|
RunStatus = Literal["queued", "running", "success", "failed", "waiting_human"]
|
||||||
StepStatus = Literal["queued", "running", "success", "failed", "waiting_human"]
|
StepStatus = Literal["queued", "running", "success", "failed", "skipped", "waiting_human"]
|
||||||
|
|
||||||
|
|
||||||
class ScenarioRunRequest(BaseModel):
|
class ScenarioRunRequest(BaseModel):
|
||||||
|
|||||||
@@ -1,3 +1,12 @@
|
|||||||
|
"""LLM-backed fallback planner for MCP tool arguments.
|
||||||
|
|
||||||
|
When a step's resolved arguments are missing required fields, this module
|
||||||
|
calls an OpenAI-compatible chat completion to fill them from the current
|
||||||
|
scope (``input`` + prior ``steps``). The planner is best-effort: on any
|
||||||
|
failure it returns the base arguments unchanged so the caller's validator
|
||||||
|
can produce a clean error.
|
||||||
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
@@ -5,6 +14,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
from openai import AsyncOpenAI
|
from openai import AsyncOpenAI
|
||||||
|
|
||||||
|
|
||||||
@@ -121,6 +131,12 @@ async def plan_arguments(
|
|||||||
raw = completion.choices[0].message.content if completion.choices else ""
|
raw = completion.choices[0].message.content if completion.choices else ""
|
||||||
planned = _extract_arguments(raw)
|
planned = _extract_arguments(raw)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
logger.warning(
|
||||||
|
"Planner call failed for step={} tool={} attempt={}",
|
||||||
|
step_name,
|
||||||
|
tool_name,
|
||||||
|
attempt_no,
|
||||||
|
)
|
||||||
planned = {}
|
planned = {}
|
||||||
|
|
||||||
merged = deepcopy(base_arguments)
|
merged = deepcopy(base_arguments)
|
||||||
|
|||||||
@@ -1,3 +1,10 @@
|
|||||||
|
"""Variable templating for scenario step inputs.
|
||||||
|
|
||||||
|
A dict of shape ``{"from": "path.to.value"}`` resolves to the value at that
|
||||||
|
dotted path in the current scope. Nested dicts/lists are resolved
|
||||||
|
recursively; plain values pass through via ``deepcopy``.
|
||||||
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
|||||||
Reference in New Issue
Block a user