Усилить надёжность: логирование, lifespan, LRU-кэш и fail-fast семантика

Подключить loguru и заменить молчаливые except на warning/exception

в step_planner, mcp_client и mcp_workflow_runner — раньше ошибки

терялись в пустых дикт-возвратах.

Перенести Phoenix tracing из module-level в FastAPI lifespan, чтобы

импорт agent_os не поднимал трейсер в тестах и тулах.

Заменить неограниченный dict _workflow_cache на OrderedDict-LRU

с лимитом WORKFLOW_CACHE_MAX_SIZE (default 64) — чтобы кэш не рос

бесконечно при разных scenario_id.

Зафиксировать инвариант fail-fast: шаги, не дошедшие до исполнения

из-за падения upstream, возвращаются со статусом skipped (для UI),

а не queued; run помечается success только если все payload.ok.

Добавить module docstrings во все модули src/ по STYLE_GUIDE cookbook.

Запинить версии зависимостей в requirements.txt.
This commit is contained in:
Barabashka
2026-04-24 11:56:37 +03:00
parent 4d037e52eb
commit 3357b3c4dd
12 changed files with 136 additions and 18 deletions
+1
View File
@@ -25,6 +25,7 @@ dist/
.vscode/
.DS_Store
.cursor
.claude
# Cookbook code
vendor/agno/cookbook/
+10 -9
View File
@@ -1,9 +1,10 @@
agno
fastapi
uvicorn
python-dotenv
ollama
socksio
openai
arize-phoenix-otel
openinference-instrumentation-agno
agno==2.5.17
fastapi==0.136.0
uvicorn==0.44.0
python-dotenv==1.2.2
ollama==0.6.1
socksio==1.0.0
openai==2.32.0
arize-phoenix-otel==0.15.0
openinference-instrumentation-agno==0.1.30
loguru==0.7.3
+25 -4
View File
@@ -1,26 +1,47 @@
"""AgentOS entrypoint: wires the agent, REST routes and FastAPI lifespan.
Phoenix tracing is initialized from the lifespan (not at import time) so that
importing this module for tooling or tests does not spin up the tracer.
"""
from __future__ import annotations
import os
from contextlib import asynccontextmanager
from dotenv import load_dotenv
from fastapi import FastAPI
from loguru import logger
from agno.os import AgentOS
from src.api_routes import router as api_router
from src.agent_runner import get_agent
from src.observability import init_phoenix_tracing
from src.api_routes import router as api_router
from src.observability import init_phoenix_tracing, is_phoenix_tracing_enabled
load_dotenv()
_tracing_enabled = init_phoenix_tracing()
@asynccontextmanager
async def _lifespan(_app: FastAPI):
    """FastAPI lifespan: set up tracing on startup and log both ends of the run.

    Phoenix tracing is initialized here rather than at import time, so that
    importing this module (e.g. from tests or tooling) does not spin up the
    tracer.
    """
    init_phoenix_tracing()
    logger.info("Prisma Platform API starting up")
    try:
        yield
    finally:
        # Always logged on shutdown, regardless of how the app exits.
        logger.info("Prisma Platform API shutting down")
_agent = get_agent()
_base_app = FastAPI(
title="Prisma Platform API",
version="0.1.0",
lifespan=_lifespan,
)
_base_app.include_router(api_router)
_agent_os = AgentOS(
agents=[_agent],
tracing=_tracing_enabled,
tracing=is_phoenix_tracing_enabled(),
base_app=_base_app,
)
app = _agent_os.get_app()
+8
View File
@@ -1,3 +1,11 @@
"""Lazy factory for the top-level Prisma agent.
Config is read from environment variables so the same module can be used by
the API server, CLI tools and tests without re-wiring.
"""
from __future__ import annotations
import os
from agno.agent import Agent
+8
View File
@@ -1,3 +1,11 @@
"""REST routes for scenario execution.
These endpoints live on the FastAPI ``base_app`` that AgentOS composes with
its own routes, so the prefix ``/api`` does not collide with AgentOS paths.
"""
from __future__ import annotations
from fastapi import APIRouter
from src.mcp_workflow_runner import run_scenario
+9
View File
@@ -1,3 +1,9 @@
"""Thin async client for MCP tool invocation over streamable HTTP.
Opens a short-lived ``ClientSession`` per call, wraps the tool response in
a normalized dict, and raises ``RuntimeError`` on transport/tool errors.
"""
from __future__ import annotations
from datetime import timedelta
@@ -5,6 +11,7 @@ import json
import os
from typing import Any
from loguru import logger
from mcp import ClientSession
from mcp.client.streamable_http import streamablehttp_client
from mcp.types import TextContent
@@ -33,8 +40,10 @@ async def call_mcp_tool(tool_name: str, arguments: dict[str, Any]) -> dict[str,
await session.initialize()
result = await session.call_tool(tool_name, arguments)
except TimeoutError as exc:
logger.warning("MCP timeout: tool={}", tool_name)
raise RuntimeError(f"MCP timeout: {tool_name}") from exc
except Exception as exc:
logger.exception("MCP transport error: tool={}", tool_name)
raise RuntimeError(f"MCP transport error: {tool_name}") from exc
if result.isError:
+26 -2
View File
@@ -1,5 +1,14 @@
"""Builds and runs Agno workflows from JSON scenario definitions.
Each scenario step is a typed MCP tool call. The runner resolves argument
templates from ``session_state``, optionally lets an LLM planner repair
missing fields, invokes the tool, and collects per-step results back into
``session_state`` for downstream steps.
"""
from __future__ import annotations
from collections import OrderedDict
from copy import deepcopy
from datetime import datetime, timezone
import json
@@ -8,6 +17,7 @@ from typing import Any, Awaitable, Callable
from agno.workflow.step import Step, StepInput, StepOutput
from agno.workflow.workflow import Workflow
from loguru import logger
from src.mcp_client import call_mcp_tool
from src.schemas import ScenarioRunResponse, StepState
@@ -193,6 +203,7 @@ def _build_tool_executor(
"finished_at": finished_at,
}
session_state.setdefault("steps", {})[step_name] = error_payload
logger.exception("Step {} failed (tool={})", step_name, tool_name)
raise RuntimeError(f"{step_name} failed: {exc}") from exc
return executor
@@ -215,6 +226,9 @@ def _build_workflow(scenario_id: str, scenario: dict[str, Any]) -> Workflow:
if not step_name or not tool_name:
raise ScenarioStoreError("Each tool step must contain non-empty name and tool")
# Fail-fast by design: the run is considered successful only when every
# step passes. There is no per-step retry or skip policy — downstream
# steps rely on upstream output, so on any failure the workflow stops.
workflow_steps.append(
Step(
name=step_name,
@@ -232,15 +246,20 @@ def _build_workflow(scenario_id: str, scenario: dict[str, Any]) -> Workflow:
)
_workflow_cache: dict[str, Workflow] = {}
_WORKFLOW_CACHE_MAX_SIZE = _env_int("WORKFLOW_CACHE_MAX_SIZE", 64)
_workflow_cache: "OrderedDict[str, Workflow]" = OrderedDict()
def _get_workflow(scenario_id: str, scenario: dict[str, Any]) -> Workflow:
    """Return a cached ``Workflow`` for ``scenario_id``, building it on a miss.

    The cache is an LRU backed by an ``OrderedDict``: a hit moves the entry to
    the most-recently-used end; after an insertion, the least-recently-used
    entry is evicted once the size exceeds ``_WORKFLOW_CACHE_MAX_SIZE``.

    NOTE(review): a cache hit ignores the ``scenario`` argument — if the
    definition behind an existing ``scenario_id`` can change at runtime, the
    stale cached workflow would be returned; confirm scenario definitions are
    immutable per id.
    """
    cached = _workflow_cache.get(scenario_id)
    if cached is not None:
        # Hit: promote to most-recently-used and reuse the built workflow.
        _workflow_cache.move_to_end(scenario_id)
        return cached
    workflow = _build_workflow(scenario_id, scenario)
    _workflow_cache[scenario_id] = workflow
    if len(_workflow_cache) > _WORKFLOW_CACHE_MAX_SIZE:
        # Evict the least-recently-used entry (front of the OrderedDict).
        evicted_id, _ = _workflow_cache.popitem(last=False)
        logger.debug("Evicted workflow from LRU cache: {}", evicted_id)
    return workflow
@@ -275,10 +294,11 @@ def _build_step_states(
continue
payload = steps_payloads.get(name)
if not isinstance(payload, dict):
# Workflow aborted before this step ran (strict fail-fast policy).
states.append(
StepState(
node_id=name,
status="queued",
status="skipped",
message="",
)
)
@@ -338,10 +358,14 @@ async def run_scenario(
)
except Exception as exc:
workflow_error = str(exc)
logger.exception("Workflow {} failed", scenario_id)
steps_payloads = session_state.get("steps", {}) or {}
step_states = _build_step_states(scenario, steps_payloads)
# Strict invariant: run is success only when every recorded step payload
# has ok=true. `on_error: skip` lets downstream steps keep running after a
# failure, but it does NOT whitewash the overall run status.
status = "success"
if workflow_error is not None:
status = "failed"
+16 -2
View File
@@ -1,5 +1,15 @@
"""Phoenix (Arize) OpenTelemetry tracing setup.
Tracing is initialized via the FastAPI lifespan so that import-time side effects
stay out of module load. ``is_phoenix_tracing_enabled`` is cheap and can be
consulted before the app starts (for example, to pass a flag into AgentOS).
"""
from __future__ import annotations
import os
from loguru import logger
from phoenix.otel import register
_initialized = False
@@ -12,11 +22,14 @@ def _env_bool(name: str, default: bool) -> bool:
return value.strip().lower() in {"1", "true", "yes", "on"}
def is_phoenix_tracing_enabled() -> bool:
    """Report whether the ``PHOENIX_TRACING_ENABLED`` env flag is truthy.

    Cheap environment lookup with no side effects — safe to call before the
    app starts (for example, to pass a tracing flag into AgentOS) without
    initializing the tracer itself.
    """
    return _env_bool("PHOENIX_TRACING_ENABLED", False)
def init_phoenix_tracing() -> bool:
global _initialized
enabled = _env_bool("PHOENIX_TRACING_ENABLED", False)
if not enabled:
if not is_phoenix_tracing_enabled():
return False
if _initialized:
@@ -33,4 +46,5 @@ def init_phoenix_tracing() -> bool:
auto_instrument=True,
)
_initialized = True
logger.info("Phoenix tracing initialized (project={})", project_name)
return True
+7
View File
@@ -1,3 +1,10 @@
"""File-backed loader for scenario definitions.
Scenarios live under ``scenarios/`` and are indexed by ``scenarios/index.json``.
Each scenario is a JSON object with a ``scenario_id`` that must match the
index key it was looked up by.
"""
from __future__ import annotations
import json
+3 -1
View File
@@ -1,3 +1,5 @@
"""Pydantic schemas for the scenario-run REST API."""
from __future__ import annotations
from typing import Any, Literal
@@ -5,7 +7,7 @@ from typing import Any, Literal
from pydantic import BaseModel, Field
RunStatus = Literal["queued", "running", "success", "failed", "waiting_human"]
StepStatus = Literal["queued", "running", "success", "failed", "waiting_human"]
StepStatus = Literal["queued", "running", "success", "failed", "skipped", "waiting_human"]
class ScenarioRunRequest(BaseModel):
+16
View File
@@ -1,3 +1,12 @@
"""LLM-backed fallback planner for MCP tool arguments.
When a step's resolved arguments are missing required fields, this module
calls an OpenAI-compatible chat completion to fill them from the current
scope (``input`` + prior ``steps``). The planner is best-effort: on any
failure it returns the base arguments unchanged so the caller's validator
can produce a clean error.
"""
from __future__ import annotations
from copy import deepcopy
@@ -5,6 +14,7 @@ import json
import os
from typing import Any
from loguru import logger
from openai import AsyncOpenAI
@@ -121,6 +131,12 @@ async def plan_arguments(
raw = completion.choices[0].message.content if completion.choices else ""
planned = _extract_arguments(raw)
except Exception:
logger.warning(
"Planner call failed for step={} tool={} attempt={}",
step_name,
tool_name,
attempt_no,
)
planned = {}
merged = deepcopy(base_arguments)
+7
View File
@@ -1,3 +1,10 @@
"""Variable templating for scenario step inputs.
A dict of shape ``{"from": "path.to.value"}`` resolves to the value at that
dotted path in the current scope. Nested dicts/lists are resolved
recursively; plain values pass through via ``deepcopy``.
"""
from __future__ import annotations
from copy import deepcopy