Merge pull request 'Agent Error fix' (#21 ) from zfp-oss into main

Reviewed-on: #21 Reviewed-by: jonny <jonny@noreply.localhost>
Merge branch 'main' into zfp-oss
2026-04-13 09:11:41 -04:00 · 2026-04-13 09:11:09 -04:00 · 2026-04-10 16:39:11 -04:00
1 changed files with 438 additions and 31 deletions
--- a/src/ria_toolkit_oss/agent.py
+++ b/src/ria_toolkit_oss/agent.py
@ -14,19 +14,40 @@ Usage::
        [--device plutosdr] \\
        [--insecure]
    # Or store credentials in a config file and omit them from the command line:
    ria-agent --config ~/.config/ria-agent/config.json --name lab-bench-1
 The agent:
  1. Registers with RIA Hub and receives a ``node_id``.
  2. Sends a heartbeat every 30 s so the hub knows it is online.
  3. Long-polls ``GET /orchestrator/nodes/{id}/commands`` (30 s timeout).
-  4. Executes received campaigns via :class:`ria_toolkit_oss.orchestration.executor.CampaignExecutor`.
+  4. Dispatches received commands:
-  5. Uploads recordings to the hub via chunked POST, keeping each request
+     - ``run_campaign``: executes via CampaignExecutor, uploads recordings.
-     under 50 MB so it passes through Cloudflare without needing the bypass
+     - ``load_model``: loads an ONNX fingerprint or detector model.
-     subdomain.
+     - ``start_inference``: opens the SDR, runs the inference loop, posts
-  6. Deregisters cleanly on SIGINT / SIGTERM.
+       detection events to the hub for SSE fan-out to browsers.
     - ``stop_inference``: gracefully stops the inference loop.
     - ``configure_inference``: queues an SDR parameter update (applied at the
       next capture boundary without restarting the loop).
  5. Deregisters cleanly on SIGINT / SIGTERM.
 Config file (JSON, optional)::
    {
        "hub": "https://riahub.company.com",
        "key": "secret",
        "name": "lab-bench-1",
        "device": "plutosdr",
        "insecure": false,
        "log_level": "INFO"
    }
 CLI arguments always override config file values.
 """
 from __future__ import annotations
 import json
 import logging
 import math
 import os
@ -49,6 +70,8 @@ _POLL_CLIENT_TIMEOUT = 40  # client read timeout — slightly longer than server
 _RECONNECT_PAUSE = 5  # seconds to wait after a poll error before retrying
 _CHUNK_SIZE = 50 * 1024 * 1024  # 50 MB — well below Cloudflare's 100 MB limit
 _DIRECT_THRESHOLD = 90 * 1024 * 1024  # files above this use chunked upload
 _CAPTURE_SAMPLES = 4096  # IQ samples per inference window
 _IDLE_LABELS = frozenset({"noise", "idle", "no_signal", "unknown_protocol", "background"})
 # ---------------------------------------------------------------------------
@ -80,6 +103,30 @@ class NodeAgent:
        self.node_id: str | None = None
        self._stop = threading.Event()
        # ── Inference state ─────────────────────────────────────────────────
        # Protected by _inf_lock for cross-thread model swaps.
        self._inf_lock = threading.Lock()
        self._inf_session: Any = None  # primary fingerprint ONNX session
        self._inf_index_to_label: dict[int, str] = {}
        self._inf_detector_session: Any = None  # optional protocol-detector session
        self._inf_detector_index_to_label: dict[int, str] = {}
        self._inf_detector_threshold: float = 0.7
        self._inf_pending_config: dict = {}  # queued SDR attribute updates
        self._inf_stop = threading.Event()
        self._inf_thread: threading.Thread | None = None
        # Detect optional dependencies once at startup so capability
        # advertising is accurate from the first registration.
        try:
            import onnxruntime as _ort_mod
            self._ort: Any = _ort_mod
            self._ort_available = True
        except ImportError:
            self._ort = None
            self._ort_available = False
        try:
            import ria_toolkit_oss
@ -114,6 +161,7 @@ class NodeAgent:
            self._command_loop()
        finally:
            self._stop.set()
            self._stop_inference()
            self._deregister()
    # ------------------------------------------------------------------
@ -121,13 +169,16 @@ class NodeAgent:
    # ------------------------------------------------------------------
    def _register(self) -> None:
        capabilities = ["campaign"]
        if self._ort_available:
            capabilities.append("inference")
        resp = self._post(
            "/orchestrator/nodes/register",
            json={
                "name": self.name,
                "sdr_device": self.sdr_device,
                "ria_toolkit_version": self._ria_version,
-                "capabilities": ["inference", "campaign"],
+                "capabilities": capabilities,
            },
            timeout=15,
        )
@ -200,6 +251,24 @@ class NodeAgent:
                daemon=True,
                name=f"campaign-{campaign_id[:8]}",
            ).start()
        elif command == "load_model":
            threading.Thread(
                target=self._load_model,
                args=(cmd,),
                daemon=True,
                name="ria-load-model",
            ).start()
        elif command == "start_inference":
            threading.Thread(
                target=self._start_inference,
                args=(cmd,),
                daemon=True,
                name="ria-start-inf",
            ).start()
        elif command == "stop_inference":
            self._stop_inference()
        elif command == "configure_inference":
            self._queue_sdr_config(cmd)
        else:
            logger.warning("Unknown command %r — ignored", command)
@ -232,6 +301,270 @@ class NodeAgent:
            logger.error("Campaign %s failed: %s", campaign_id[:8], exc)
            self._report_campaign_status(campaign_id, "failed", error=str(exc))
    # ------------------------------------------------------------------
    # Inference — model loading
    # ------------------------------------------------------------------
    def _load_model(self, cmd: dict) -> None:
        """Load an ONNX model into the fingerprint or detector slot.

        Fields read from *cmd*:

        - ``model_path``: a local filesystem path or an ``http(s)://`` URL; a
          URL is downloaded to a temporary file first.
        - ``label_map``: mapping of label name to class index.  It is inverted
          here into an index-to-label lookup.
        - ``stage``: ``"detector"`` selects the detector slot; any other value
          (default ``"fingerprint"``) selects the fingerprint slot.
        - ``detector_threshold``: stored alongside the detector model
          (default 0.7).

        Every failure is logged and leaves the currently loaded models
        untouched.
        """
        if not self._ort_available:
            logger.error("load_model: onnxruntime is not installed — cannot load model")
            return
        source: str = cmd.get("model_path") or ""
        label_map: dict[str, int] = cmd.get("label_map") or {}
        stage: str = cmd.get("stage", "fingerprint")
        # Test for "not supplied" explicitly: with ``value or 0.7`` an explicit
        # threshold of 0 would silently be replaced by the default.
        raw_threshold = cmd.get("detector_threshold")
        if raw_threshold is None or raw_threshold == "":
            detector_threshold = 0.7
        else:
            detector_threshold = float(raw_threshold)
        if not source:
            logger.error("load_model: no model_path given — cannot load model")
            return
        downloaded = source.startswith(("http://", "https://"))
        model_path: str | None = source
        if downloaded:
            model_path = self._download_model(source)
            if model_path is None:
                return
        try:
            session = self._ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
        except Exception as exc:
            logger.error("Failed to load model %r: %s", source, exc)
            return
        finally:
            # The temp file created by _download_model is only needed while the
            # session is being constructed; remove it so repeated load_model
            # commands do not accumulate files in the temp directory.
            if downloaded:
                try:
                    os.unlink(model_path)
                except OSError as exc:
                    logger.debug("Could not remove temporary model file %s: %s", model_path, exc)
        index_to_label = {v: k for k, v in label_map.items()}
        # Swap all fields of a slot under the lock so readers never observe a
        # session paired with another model's label map.
        with self._inf_lock:
            if stage == "detector":
                self._inf_detector_session = session
                self._inf_detector_index_to_label = index_to_label
                self._inf_detector_threshold = detector_threshold
                logger.info(
                    "Detector model loaded: path=%s classes=%d threshold=%.2f",
                    source,
                    len(label_map),
                    detector_threshold,
                )
            else:
                self._inf_session = session
                self._inf_index_to_label = index_to_label
                logger.info(
                    "Fingerprint model loaded: path=%s classes=%d",
                    source,
                    len(label_map),
                )
    def _download_model(self, url: str) -> str | None:
        """Download a model from *url* to a temp file and return the local path.

        The response is streamed to disk in chunks so the model is never held
        in memory in full.  The temp file is created with ``delete=False``;
        the caller owns it and is responsible for removing it.

        Returns:
            The path of the downloaded ``.onnx`` file, or ``None`` when the
            download failed (the failure is logged and any partially written
            file is removed).
        """
        import tempfile

        import requests as _requests

        path: str | None = None
        try:
            logger.info("Downloading model from %s", url)
            # NOTE(review): the API key is sent to whatever host *url* names,
            # including plain-http URLs — confirm the hub only ever hands out
            # URLs that are entitled to see this credential.
            with _requests.get(
                url,
                headers={"X-API-Key": self.api_key},
                verify=not self.insecure,
                timeout=120,
                stream=True,
            ) as resp:
                resp.raise_for_status()
                size = 0
                with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as fh:
                    path = fh.name
                    for chunk in resp.iter_content(chunk_size=1024 * 1024):
                        fh.write(chunk)
                        size += len(chunk)
            logger.info("Model downloaded to %s (%d bytes)", path, size)
            return path
        except Exception as exc:
            logger.error("Model download from %s failed: %s", url, exc)
            # Do not leave a truncated model behind in the temp directory.
            if path is not None:
                try:
                    os.unlink(path)
                except OSError as cleanup_exc:
                    logger.debug("Could not remove partial download %s: %s", path, cleanup_exc)
            return None
    # ------------------------------------------------------------------
    # Inference — loop lifecycle
    # ------------------------------------------------------------------
    def _start_inference(self, cmd: dict) -> None:
        """Start the SDR capture + ONNX inference loop on a background thread.

        Fields read from *cmd* (a missing or ``None`` value selects the default):

        - ``center_freq``: Hz, default 2.4e9.
        - ``sample_rate``: Hz, default 10e6.
        - ``gain``: handed to the SDR unchanged, default ``"auto"``.
        - ``device``: SDR device type, default ``self.sdr_device``.

        The request is ignored (with a log message) when onnxruntime is
        missing, a numeric parameter cannot be parsed, no fingerprint model is
        loaded, or a loop is already running.
        """
        if not self._ort_available:
            logger.error("start_inference: onnxruntime is not installed")
            return
        raw_freq = cmd.get("center_freq")
        raw_rate = cmd.get("sample_rate")
        raw_gain = cmd.get("gain")
        try:
            # ``cmd.get(key, default)`` would return None for a key that is
            # present but null, and float(None) raises; treat null as unset.
            center_freq = 2.4e9 if raw_freq is None else float(raw_freq)
            sample_rate = 10e6 if raw_rate is None else float(raw_rate)
        except (TypeError, ValueError) as exc:
            logger.error("start_inference: invalid SDR parameter: %s", exc)
            return
        gain: float | str = "auto" if raw_gain is None else raw_gain
        device_type: str = cmd.get("device") or self.sdr_device
        # Each start_inference command runs on its own thread, so the
        # "is a loop already running?" check and the thread start must happen
        # atomically; otherwise two commands could both start a loop.
        with self._inf_lock:
            if self._inf_session is None:
                logger.error("start_inference: no fingerprint model loaded — call load_model first")
                return
            if self._inf_thread is not None and self._inf_thread.is_alive():
                logger.warning("start_inference: inference loop is already running — ignoring")
                return
            self._inf_stop.clear()
            self._inf_thread = threading.Thread(
                target=self._inference_loop,
                args=(device_type, center_freq, sample_rate, gain),
                daemon=True,
                name="ria-agent-inference",
            )
            self._inf_thread.start()
        logger.info(
            "Inference started (device=%s freq=%.3f MHz rate=%.1f MHz)",
            device_type,
            center_freq / 1e6,
            sample_rate / 1e6,
        )
    def _stop_inference(self) -> None:
        """Signal the inference loop to stop and wait up to 5 s for it to exit.

        "Inference stopped" is only logged when a running loop actually
        exited; a loop that is still alive after the timeout is reported with
        a warning instead.
        """
        self._inf_stop.set()
        # Work on a local reference so a concurrent start cannot swap the
        # attribute between the liveness check and the join.
        thread = self._inf_thread
        if thread is None or not thread.is_alive():
            logger.debug("stop_inference: no inference loop is running")
            return
        thread.join(timeout=5.0)
        if thread.is_alive():
            logger.warning("Inference thread did not exit within 5 s")
            return
        logger.info("Inference stopped")
    def _queue_sdr_config(self, cmd: dict) -> None:
        """Queue SDR parameter changes for the inference loop.

        Every entry of *cmd* except the ``"command"`` key and entries whose
        value is ``None`` is merged into ``_inf_pending_config`` while holding
        ``_inf_lock``.  The inference loop drains that dict before each
        capture, so the changes take effect without a restart.
        """
        updates = {}
        for key, value in cmd.items():
            if key == "command" or value is None:
                continue
            updates[key] = value
        with self._inf_lock:
            self._inf_pending_config.update(updates)
        logger.debug("SDR reconfiguration queued: %s", updates)
    # ------------------------------------------------------------------
    # Inference — main loop
    # ------------------------------------------------------------------
    def _inference_loop(
        self,
        device_type: str,
        center_freq: float,
        sample_rate: float,
        gain: float | str,
    ) -> None:
        """Continuous SDR capture → ONNX inference → POST events to hub.

        Mirrors the two-stage pipeline in the hub's ``_inference_loop``:
        an optional protocol-detector gates the fingerprint model so the
        fingerprint model only runs when an active transmission is detected.

        The loop runs until ``_inf_stop`` is set or an unexpected exception
        escapes the loop body; the SDR is closed in both cases.

        Args:
            device_type: Identifier handed to ``get_sdr_device``.
            center_freq: Initial ``center_freq`` set on the SDR.
            sample_rate: Initial ``sample_rate`` set on the SDR.
            gain: Initial ``gain`` set on the SDR.
        """
        try:
            from ria_toolkit_oss.sdr import get_sdr_device
        except ImportError as exc:
            logger.error("inference_loop: ria_toolkit_oss not installed: %s", exc)
            return
        try:
            sdr = get_sdr_device(device_type)
            _apply_sdr_config(sdr, {"center_freq": center_freq, "sample_rate": sample_rate, "gain": gain})
        except Exception as exc:
            logger.error("SDR initialisation failed: %s", exc)
            return
        try:
            import numpy as np

            # SNR estimation is optional: without the helper every event
            # reports an SNR of 0.0.
            try:
                from ria_toolkit_oss.orchestration.qa import estimate_snr_db
            except ImportError:
                estimate_snr_db = None
            # Snapshot model state once at loop start.  If the hub sends a
            # new load_model command while the loop is running, the new session
            # will be picked up on the next loop restart (stop + start).
            with self._inf_lock:
                session = self._inf_session
                index_to_label = dict(self._inf_index_to_label)
                det_session = self._inf_detector_session
                det_threshold = self._inf_detector_threshold
            input_name = session.get_inputs()[0].name
            det_input_name = det_session.get_inputs()[0].name if det_session else None
            while not self._inf_stop.is_set():
                # Apply any queued SDR configuration changes.  Copy-and-clear
                # happens under the lock; the SDR itself is touched outside it.
                with self._inf_lock:
                    pending = self._inf_pending_config.copy()
                    self._inf_pending_config.clear()
                if pending:
                    _apply_sdr_config(sdr, pending)
                try:
                    samples = sdr.rx(_CAPTURE_SAMPLES)
                except Exception as exc:
                    logger.warning("SDR capture error: %s", exc)
                    # Avoid a tight spin when the SDR is in a persistent error
                    # state (e.g. physically disconnected).
                    self._inf_stop.wait(timeout=0.5)
                    continue
                samples = np.array(samples, dtype=np.complex64)
                snr_db = float(estimate_snr_db(samples)) if estimate_snr_db is not None else 0.0
                # Two rows: row 0 holds the real parts, row 1 the imaginary parts.
                iq = np.stack([samples.real, samples.imag], axis=0).astype(np.float32)
                # Stage 1: protocol detector gate (optional).
                # NOTE(review): the gate looks only at the highest detector
                # probability; the detector's label map is never consulted, so
                # a confident "idle"-type class also passes — confirm this
                # matches the hub pipeline.
                if det_session is not None:
                    det_out = _run_onnx_session(det_session, det_input_name, iq)
                    det_probs = _softmax(det_out[0][0])
                    det_confidence = float(det_probs.max())
                    if det_confidence < det_threshold:
                        # No active protocol detected — report idle and skip
                        # the fingerprint model for this window.
                        self._post_event(device_id=None, confidence=det_confidence, snr_db=snr_db)
                        continue
                # Stage 2: fingerprint model.
                out = _run_onnx_session(session, input_name, iq)
                probs = _softmax(out[0][0])
                pred_idx = int(probs.argmax())
                confidence = float(probs[pred_idx])
                device_id = index_to_label.get(pred_idx)
                # An index without a label, or a label in _IDLE_LABELS, is
                # reported as "no device" (device_id=None).
                idle = (device_id in _IDLE_LABELS) if device_id else True
                self._post_event(
                    device_id=None if idle else device_id,
                    confidence=confidence,
                    snr_db=snr_db,
                )
        except Exception as exc:
            logger.exception("Inference loop terminated unexpectedly: %s", exc)
        finally:
            try:
                sdr.close()
            except Exception as exc:
                # Best-effort cleanup: a close failure must not mask why the
                # loop ended, but it should still leave a trace in the log.
                logger.debug("SDR close failed: %s", exc)
            logger.info("Inference loop exited")
    def _post_event(self, device_id: str | None, confidence: float, snr_db: float) -> None:
        """POST a single detection event to ``POST /orchestrator/nodes/{id}/events``.
        Failures are logged at DEBUG level and silently swallowed so that a
        transient network blip does not crash the inference loop.
        """
        from datetime import datetime, timezone
        payload = {
            "type": "detection",
            "device_id": device_id,
            "confidence": round(confidence, 6),
            "snr_db": round(snr_db, 2),
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        try:
            resp = self._post(
                f"/orchestrator/nodes/{self.node_id}/events",
                json=payload,
                timeout=5,
            )
            if resp.status_code not in (200, 204):
                logger.debug("Event POST returned HTTP %d", resp.status_code)
        except Exception as exc:
            logger.debug("Event POST failed (will retry next inference cycle): %s", exc)
    # ------------------------------------------------------------------
    # Recording upload (chunked for large files)
    # ------------------------------------------------------------------
@ -244,7 +577,7 @@ class NodeAgent:
        repo_owner, repo_name = output_repo.split("/", 1)
        base_url = f"{self.hub_url}/datasets/upload"
-        steps = getattr(result, "steps", None) or []
+        steps = (result.get("steps") if isinstance(result, dict) else getattr(result, "steps", None)) or []
        for step in steps:
            output_path: str | None = getattr(step, "output_path", None)
@ -304,7 +637,6 @@ class NodeAgent:
        headers = {"X-API-Key": self.api_key}
        verify = not self.insecure
        # Small files: single POST (unchanged endpoint, no assembly needed server-side).
        if size <= _DIRECT_THRESHOLD:
            with open(file_path, "rb") as fh:
                resp = _requests.post(
@ -318,7 +650,6 @@ class NodeAgent:
            resp.raise_for_status()
            return resp.json()
        # Large files: chunked upload — each request is ≤ 50 MB.
        total_chunks = math.ceil(size / _CHUNK_SIZE)
        upload_id = str(uuid.uuid4())
        chunk_url = base_url + "/chunk"
@ -339,18 +670,13 @@ class NodeAgent:
                    chunk_url,
                    headers=headers,
                    files={"file": (filename, chunk, "application/octet-stream")},
-                    data={
+                    data={**metadata, "upload_id": upload_id, "chunk_index": i, "total_chunks": total_chunks},
                        **metadata,
                        "upload_id": upload_id,
                        "chunk_index": i,
                        "total_chunks": total_chunks,
                    },
                    timeout=120,
                    verify=verify,
                )
                if not resp.ok:
                    raise RuntimeError(
-                        f"Chunk {i + 1}/{total_chunks} failed: " f"HTTP {resp.status_code}: {resp.text[:300]}"
+                        f"Chunk {i + 1}/{total_chunks} failed: HTTP {resp.status_code}: {resp.text[:300]}"
                    )
                resp_data = resp.json()
                logger.debug("Chunk %d/%d uploaded", i + 1, total_chunks)
@ -393,10 +719,41 @@ class NodeAgent:
 # ---------------------------------------------------------------------------
-# Helpers
+# Module-level helpers (shared by NodeAgent._inference_loop)
 # ---------------------------------------------------------------------------
 def _run_onnx_session(session: Any, input_name: str, iq: Any) -> list:
    """Run an ONNX session on an IQ array (2, N).
    Tries channel-first layout (1, 2, N) first; falls back to interleaved flat
    (1, 2*N) when the model expects a flattened input.
    """
    import numpy as np
    x = iq[np.newaxis]  # (1, 2, N)
    try:
        return session.run(None, {input_name: x})
    except Exception:
        return session.run(None, {input_name: iq.flatten()[np.newaxis]})
 def _softmax(x: Any) -> Any:
    import numpy as np
    e = np.exp(x - x.max())
    return e / e.sum()
 def _apply_sdr_config(sdr: Any, cfg: dict) -> None:
    for attr in ("center_freq", "sample_rate", "gain"):
        if attr in cfg:
            try:
                setattr(sdr, attr, cfg[attr])
            except Exception as exc:
                logger.warning("SDR config %s=%r failed: %s", attr, cfg[attr], exc)
 def _sigmf_files(data_path: str) -> list[str]:
    """Return paths to both SigMF files (.sigmf-data and .sigmf-meta) for a recording."""
    candidates = [data_path]
@ -405,6 +762,29 @@ def _sigmf_files(data_path: str) -> list[str]:
    return [p for p in candidates if os.path.exists(p)]
 # ---------------------------------------------------------------------------
 # Config file helpers
 # ---------------------------------------------------------------------------
 _DEFAULT_CONFIG_PATH = os.path.join(
    os.environ.get("XDG_CONFIG_HOME", os.path.expanduser("~/.config")),
    "ria-agent",
    "config.json",
 )
 def _load_config(path: str) -> dict:
    """Load a JSON config file, returning an empty dict if it does not exist."""
    try:
        with open(path) as fh:
            return json.load(fh)
    except FileNotFoundError:
        return {}
    except Exception as exc:
        logger.warning("Could not read config file %s: %s", path, exc)
        return {}
 # ---------------------------------------------------------------------------
 # CLI entry point
 # ---------------------------------------------------------------------------
@ -420,67 +800,94 @@ def main() -> None:
            "campaigns / inference on local SDR hardware."
        ),
    )
    parser.add_argument(
        "--config",
        default=None,
        metavar="PATH",
        help=(
            f"Path to a JSON config file (default: {_DEFAULT_CONFIG_PATH}). "
            "CLI arguments override config file values."
        ),
    )
    parser.add_argument(
        "--hub",
-        required=True,
+        default=None,
        metavar="URL",
        help="RIA Hub base URL, e.g. https://riahub.company.com",
    )
    parser.add_argument(
        "--key",
-        required=True,
+        default=None,
        metavar="API_KEY",
        help="Shared API key (must match [wac] API_KEY in the hub's app.ini)",
    )
    parser.add_argument(
        "--name",
-        required=True,
+        default=None,
        metavar="NAME",
        help='Human-readable name shown in the Target Node dropdown, e.g. "lab-bench-1"',
    )
    parser.add_argument(
        "--device",
-        default="unknown",
+        default=None,
        metavar="SDR",
        help=(
-            "SDR device type reported to the hub (informational only). "
+            "SDR device type reported to the hub and used for inference. "
            "Examples: plutosdr, usrp_b210, rtlsdr, mock. Default: unknown"
        ),
    )
    parser.add_argument(
        "--insecure",
        action="store_true",
        default=None,
        help="Disable TLS certificate verification (dev/self-signed certs only)",
    )
    parser.add_argument(
        "--log-level",
-        default="INFO",
+        default=None,
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging verbosity (default: INFO)",
    )
    args = parser.parse_args()
    # Merge: config file → CLI args (CLI wins).
    config_path = args.config or _DEFAULT_CONFIG_PATH
    cfg = _load_config(config_path)
    hub = args.hub or cfg.get("hub")
    key = args.key or cfg.get("key")
    name = args.name or cfg.get("name")
    device = args.device or cfg.get("device", "unknown")
    insecure = args.insecure if args.insecure is not None else cfg.get("insecure", False)
    log_level = args.log_level or cfg.get("log_level", "INFO")
    if not hub:
        parser.error("--hub is required (or set 'hub' in the config file)")
    if not key:
        parser.error("--key is required (or set 'key' in the config file)")
    if not name:
        parser.error("--name is required (or set 'name' in the config file)")
    logging.basicConfig(
-        level=getattr(logging, args.log_level),
+        level=getattr(logging, log_level),
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        stream=sys.stderr,
    )
-    # Warn loudly if --insecure is used outside of development.
+    if insecure:
    if args.insecure:
        logger.warning(
            "--insecure disables TLS certificate verification. "
            "Only use this for local development with self-signed certs."
        )
    agent = NodeAgent(
-        hub_url=args.hub,
+        hub_url=hub,
-        api_key=args.key,
+        api_key=key,
-        name=args.name,
+        name=name,
-        sdr_device=args.device,
+        sdr_device=device,
-        insecure=args.insecure,
+        insecure=insecure,
    )
    agent.run()
Author	SHA1	Message	Date
benchinnery	5718e109b5	Merge pull request 'Agent Error fix' (#21 ) from zfp-oss into main Some checks failed Build Sphinx Docs Set / Build Docs (push) Successful in 21s Details Build Project / Build Project (3.10) (push) Successful in 5m34s Details Test with tox / Test with tox (3.11) (push) Successful in 4m57s Details Build Project / Build Project (3.11) (push) Successful in 7m24s Details Build Project / Build Project (3.12) (push) Successful in 10m19s Details Test with tox / Test with tox (3.10) (push) Failing after 17m4s Details Test with tox / Test with tox (3.12) (push) Successful in 15m16s Details Reviewed-on: #21 Reviewed-by: jonny <jonny@noreply.localhost>	2026-04-13 09:11:41 -04:00
benchinnery	d81c61c3cf	Merge branch 'main' into zfp-oss Some checks failed Test with tox / Test with tox (3.11) (pull_request) Successful in 10m40s Details Build Project / Build Project (3.10) (pull_request) Successful in 10m54s Details Test with tox / Test with tox (3.10) (pull_request) Failing after 10m53s Details Build Sphinx Docs Set / Build Docs (pull_request) Successful in 11m4s Details Build Project / Build Project (3.11) (pull_request) Successful in 11m31s Details Build Project / Build Project (3.12) (pull_request) Successful in 11m32s Details Test with tox / Test with tox (3.12) (pull_request) Successful in 2m26s Details	2026-04-13 09:11:09 -04:00
ben	54b9bd4fc8	Agent Error fix Some checks failed Build Sphinx Docs Set / Build Docs (pull_request) Successful in 18s Details Build Project / Build Project (3.10) (pull_request) Successful in 1m15s Details Build Project / Build Project (3.11) (pull_request) Successful in 1m13s Details Build Project / Build Project (3.12) (pull_request) Successful in 1m23s Details Test with tox / Test with tox (3.11) (pull_request) Successful in 2m25s Details Test with tox / Test with tox (3.12) (pull_request) Successful in 2m18s Details Test with tox / Test with tox (3.10) (pull_request) Failing after 1m12s Details	2026-04-10 16:39:11 -04:00