Skip to main content

Monitoring

Protocol health monitoring for Seesaw infrastructure.

Overview

Effective monitoring ensures the protocol operates smoothly and issues are detected early.

Key Metrics

Market Lifecycle Metrics

| Metric | Description | Alert Threshold |
| --- | --- | --- |
| markets_created | Markets created per hour | < 3 |
| snapshot_latency_seconds | Time from boundary to snapshot | > 60s |
| resolution_latency_seconds | Time from snapshots to resolution | > 30s |
| settlement_backlog | Unsettled positions count | > 1000 |
| markets_stuck | Markets not progressing | > 0 |

Infrastructure Metrics

| Metric | Description | Alert Threshold |
| --- | --- | --- |
| treasury_balance_sol | SOL balance for rewards | < 10 SOL |
| crank_success_rate | % of successful crank ops | < 99% |
| rpc_latency_ms | RPC response time | > 1000ms |
| crank_uptime | Crank service availability | < 99% |

Oracle Metrics

| Metric | Description | Alert Threshold |
| --- | --- | --- |
| pyth_price_age_seconds | Age of latest Pyth price | > 30s |
| pyth_confidence_ratio | Price confidence / price | > 0.01 |
| oracle_errors | Oracle-related failures | > 5/hour |

Prometheus Metrics

Exporter Setup

import { Registry, Counter, Gauge, Histogram } from 'prom-client';

// Dedicated registry so only Seesaw metrics are exposed by the exporter.
const register = new Registry();

// Market metrics

/** Monotonic count of markets created. */
const marketsCreated = new Counter({
  name: 'seesaw_markets_created_total',
  help: 'Total markets created',
  registers: [register],
});

/** Monotonic count of resolved markets, partitioned by the `outcome` label. */
const marketsResolved = new Counter({
  name: 'seesaw_markets_resolved_total',
  help: 'Total markets resolved',
  labelNames: ['outcome'],
  registers: [register],
});

/**
 * Delay between an epoch boundary and the snapshot capture, in seconds.
 * Buckets run from 1s to 300s.
 * NOTE(review): the allowed values of the `type` label are not visible here — confirm.
 */
const snapshotLatency = new Histogram({
  name: 'seesaw_snapshot_latency_seconds',
  help: 'Time from epoch boundary to snapshot capture',
  labelNames: ['type'],
  buckets: [1, 5, 10, 30, 60, 120, 300],
  registers: [register],
});

/**
 * Number of markets that are not progressing through their lifecycle.
 * The MarketStuck alert rule queries this series, so it has to be registered
 * here; the crank must call `marketsStuck.set(n)` for the alert to be meaningful.
 */
const marketsStuck = new Gauge({
  name: 'seesaw_markets_stuck',
  help: 'Number of markets not progressing through their lifecycle',
  registers: [register],
});

// Position metrics

/** Current number of positions that have not been settled yet. */
const unsettledPositions = new Gauge({
  name: 'seesaw_unsettled_positions',
  help: 'Number of unsettled positions',
  registers: [register],
});

// Treasury metrics

/** Treasury balance in lamports (dashboards divide by 1e9 to display SOL). */
const treasuryBalance = new Gauge({
  name: 'seesaw_treasury_balance_lamports',
  help: 'Treasury balance in lamports',
  registers: [register],
});

// Oracle metrics

/**
 * Age of the most recent Pyth price, in seconds.
 * The OracleStale alert rule queries this series, so it has to be registered
 * here; the oracle poller must call `pythPriceAge.set(ageSeconds)`.
 */
const pythPriceAge = new Gauge({
  name: 'seesaw_pyth_price_age_seconds',
  help: 'Age of the latest Pyth price in seconds',
  registers: [register],
});

// Crank metrics

/**
 * Monotonic count of crank operations, partitioned by `operation` and `status`.
 * Dashboards and alerts select `status='success'` to compute the success rate.
 */
const crankOperations = new Counter({
  name: 'seesaw_crank_operations_total',
  help: 'Total crank operations',
  labelNames: ['operation', 'status'],
  registers: [register],
});

/** RPC round-trip latency in milliseconds; buckets run from 10ms to 5000ms. */
const rpcLatency = new Histogram({
  name: 'seesaw_rpc_latency_ms',
  help: 'RPC request latency in milliseconds',
  buckets: [10, 50, 100, 250, 500, 1000, 2500, 5000],
  registers: [register],
});

Metrics Endpoint

import express from 'express';

const app = express();

// Prometheus scrape endpoint: serialises every metric held by `register`.
app.get('/metrics', async (_req, res) => {
  try {
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (err) {
    // Express 4 does not catch rejections from async handlers. Answer
    // explicitly so the scraper records a failed scrape instead of timing out.
    res.status(500).end(err instanceof Error ? err.message : String(err));
  }
});

// NOTE(review): 9090 is also the default port of the Prometheus server itself;
// pick another port if both run on the same host.
app.listen(9090, () => {
  console.log('Metrics server listening on port 9090');
});

Grafana Dashboards

Market Health Dashboard

{
  "title": "Seesaw Market Health",
  "panels": [
    {
      "title": "Markets Created (24h)",
      "type": "stat",
      "targets": [
        {
          "expr": "increase(seesaw_markets_created_total[24h])"
        }
      ]
    },
    {
      "title": "Snapshot Latency",
      "type": "graph",
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum by (le) (rate(seesaw_snapshot_latency_seconds_bucket[5m])))",
          "legendFormat": "p95"
        },
        {
          "expr": "histogram_quantile(0.50, sum by (le) (rate(seesaw_snapshot_latency_seconds_bucket[5m])))",
          "legendFormat": "p50"
        }
      ]
    },
    {
      "title": "Unsettled Positions",
      "type": "graph",
      "targets": [
        {
          "expr": "seesaw_unsettled_positions"
        }
      ]
    },
    {
      "title": "Crank Success Rate",
      "type": "gauge",
      "targets": [
        {
          "expr": "sum(rate(seesaw_crank_operations_total{status='success'}[5m])) / sum(rate(seesaw_crank_operations_total[5m])) * 100"
        }
      ]
    }
  ]
}

Infrastructure Dashboard

{
  "title": "Seesaw Infrastructure",
  "panels": [
    {
      "title": "Treasury Balance (SOL)",
      "type": "stat",
      "targets": [
        {
          "expr": "seesaw_treasury_balance_lamports / 1e9"
        }
      ]
    },
    {
      "title": "RPC Latency",
      "type": "graph",
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum by (le) (rate(seesaw_rpc_latency_ms_bucket[5m])))"
        }
      ]
    },
    {
      "title": "Crank Operations/min",
      "type": "graph",
      "targets": [
        {
          "expr": "sum by (operation) (rate(seesaw_crank_operations_total[1m])) * 60",
          "legendFormat": "{{operation}}"
        }
      ]
    }
  ]
}

Health Checks

Service Health

/** Point-in-time view of the crank service used to decide whether it is healthy. */
interface CrankHealth {
  /** Time of the last market creation, in the same unit as `currentTime` (the /health endpoint passes Unix seconds). */
  lastCreateAt: number;
  /** Time of the last snapshot capture, same unit as `lastCreateAt`. */
  lastSnapshotAt: number;
  /** Time of the last market resolution, same unit as `lastCreateAt`. */
  lastResolveAt: number;
  /** Number of positions still waiting to be settled. */
  pendingSettlements: number;
  /** Treasury balance in lamports (10_000_000_000 lamports = 10 SOL). */
  treasuryBalance: bigint;
  /** Most recent RPC latency measurement, in milliseconds. */
  rpcLatencyMs: number;
}

/** Limits applied by `isHealthy`; every field can be overridden per call. */
interface HealthThresholds {
  /** Length of one epoch, in seconds. */
  epochDurationSeconds: number;
  /** How many epochs a lifecycle step may lag before it counts as stalled. */
  maxMissedEpochs: number;
  /** Backlog size at or above which the service is unhealthy. */
  maxPendingSettlements: number;
  /** Treasury balance (lamports) at or below which the service is unhealthy. */
  minTreasuryLamports: bigint;
  /** RPC latency (ms) at or above which the service is unhealthy. */
  maxRpcLatencyMs: number;
}

const DEFAULT_HEALTH_THRESHOLDS: HealthThresholds = {
  epochDurationSeconds: 900,
  maxMissedEpochs: 2,
  maxPendingSettlements: 1000,
  minTreasuryLamports: 10_000_000_000n, // 10 SOL
  maxRpcLatencyMs: 1000,
};

/**
 * Decides whether the crank service is healthy.
 *
 * The service is healthy when market creation, snapshotting and resolution
 * have all happened within the staleness window (2 epochs by default), the
 * settlement backlog is below its limit, the treasury holds more than the
 * minimum balance, and RPC latency is below its limit. All comparisons are
 * strict, so a value exactly on a limit is unhealthy.
 *
 * @param health - Current service measurements.
 * @param currentTime - "Now", in the same unit as the `last*At` fields.
 * @param thresholds - Optional overrides for the default limits.
 * @returns `true` when every check passes.
 */
function isHealthy(
  health: CrankHealth,
  currentTime: number,
  thresholds: Partial<HealthThresholds> = {},
): boolean {
  const limits: HealthThresholds = { ...DEFAULT_HEALTH_THRESHOLDS, ...thresholds };
  const staleBefore = currentTime - limits.epochDurationSeconds * limits.maxMissedEpochs;

  return (
    health.lastCreateAt > staleBefore &&
    health.lastSnapshotAt > staleBefore &&
    // Resolution is a lifecycle step too: without this check a stalled
    // resolver would still report healthy.
    health.lastResolveAt > staleBefore &&
    health.pendingSettlements < limits.maxPendingSettlements &&
    health.treasuryBalance > limits.minTreasuryLamports &&
    health.rpcLatencyMs < limits.maxRpcLatencyMs
  );
}

Health Endpoint

// Health probe: 200 with the collected measurements when `isHealthy` passes,
// 503 otherwise (including when the measurements cannot be collected).
app.get('/health', async (_req, res) => {
  try {
    const health = await collectHealth();
    const healthy = isHealthy(health, Date.now() / 1000);

    // JSON.stringify throws a TypeError on bigint values, so the treasury
    // balance is sent as a decimal string.
    const body = { ...health, treasuryBalance: health.treasuryBalance.toString() };

    if (healthy) {
      res.status(200).json({ status: 'healthy', ...body });
    } else {
      res.status(503).json({ status: 'unhealthy', ...body });
    }
  } catch (err) {
    // Express 4 does not catch rejections from async handlers; without this
    // the probe would hang until the caller times out.
    res.status(503).json({
      status: 'unhealthy',
      error: err instanceof Error ? err.message : String(err),
    });
  }
});

// Readiness probe: 200 once `checkConnections()` reports the service can
// accept traffic, 503 otherwise.
app.get('/ready', async (_req, res) => {
  let ready = false;
  try {
    ready = await checkConnections();
  } catch {
    // A failing connectivity check means "not ready". It must not leave the
    // request unanswered (Express 4 does not catch async rejections).
    ready = false;
  }

  if (ready) {
    res.status(200).json({ status: 'ready' });
  } else {
    res.status(503).json({ status: 'not ready' });
  }
});

Alerting

Alert Definitions

groups:
  - name: seesaw_alerts
    rules:
      # Treasury thresholds are expressed in lamports (1 SOL = 1e9 lamports).
      - alert: TreasuryLow
        expr: seesaw_treasury_balance_lamports < 5e9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Treasury balance below 5 SOL
          description: Treasury balance is {{ $value | humanize }} lamports

      - alert: TreasuryCritical
        expr: seesaw_treasury_balance_lamports < 1e9
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Treasury balance below 1 SOL
          description: Crank rewards may fail soon

      - alert: MarketStuck
        expr: seesaw_markets_stuck > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Market stuck in lifecycle
          description: '{{ $value }} markets are not progressing'

      # Histogram buckets are cumulative counters, so they are wrapped in rate()
      # to get the quantile over the last 5 minutes rather than since process
      # start. Grouping by `type` keeps one p95 per snapshot type.
      - alert: HighSnapshotLatency
        expr: histogram_quantile(0.95, sum by (le, type) (rate(seesaw_snapshot_latency_seconds_bucket[5m]))) > 60
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: High snapshot latency
          description: p95 snapshot latency is {{ $value }}s

      # Both sides are summed before dividing. Without sum() the division is
      # matched series by series, so each success series is divided by itself,
      # the ratio is always 1 and the alert can never fire.
      - alert: CrankSuccessRateLow
        expr: sum(rate(seesaw_crank_operations_total{status='success'}[5m])) / sum(rate(seesaw_crank_operations_total[5m])) < 0.95
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Crank success rate below 95%
          description: Crank success rate is {{ $value | humanizePercentage }}

      - alert: OracleStale
        expr: seesaw_pyth_price_age_seconds > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Pyth oracle price is stale
          description: Price age is {{ $value }}s

      - alert: SettlementBacklog
        expr: seesaw_unsettled_positions > 1000
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: Large settlement backlog
          description: '{{ $value }} positions awaiting settlement'

Slack Integration

import { WebClient } from '@slack/web-api';

// Slack client authenticated with the token from the SLACK_TOKEN environment variable.
const slack = new WebClient(process.env.SLACK_TOKEN);

/**
 * Posts an alert to the #seesaw-alerts channel as a single colour-coded
 * attachment (red for `critical`, amber for every other severity).
 *
 * @param alert - Alert to publish; `summary`, `description` and `severity` are read.
 * @returns Resolves once Slack has accepted the message; rejects if the API call fails.
 */
async function sendAlert(alert: Alert) {
  const color = alert.severity === 'critical' ? '#FF3366' : '#FFB800';

  await slack.chat.postMessage({
    channel: '#seesaw-alerts',
    attachments: [
      {
        // Plain-text summary for clients that cannot render the attachment
        // (push notifications, screen readers). Without it those surfaces
        // show no content for the alert.
        fallback: `[${alert.severity}] ${alert.summary}`,
        color,
        title: alert.summary,
        text: alert.description,
        fields: [
          { title: 'Severity', value: alert.severity, short: true },
          { title: 'Time', value: new Date().toISOString(), short: true },
        ],
      },
    ],
  });
}

Log Aggregation

Structured Logging

import pino from 'pino';

// Shared structured logger. The minimum level is taken from LOG_LEVEL and
// falls back to 'info' when the variable is unset or empty.
const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Write the level under the `level` key using its text label.
    level(label) {
      return { level: label };
    },
  },
});

// Example record for a completed crank operation
const crankOperationEntry = {
  event: 'crank_operation',
  operation: 'snapshot_start',
  marketId: '1234567',
  latencyMs: 150,
  status: 'success',
};
logger.info(crankOperationEntry);

// Example record for a failure, carrying the error message and stack as context
const crankErrorEntry = {
  event: 'crank_error',
  operation: 'resolve_market',
  marketId: '1234567',
  error: error.message,
  stack: error.stack,
};
logger.error(crankErrorEntry);

Log Queries (Loki)

# All crank operations: line filter keeping entries of the seesaw-crank app
# whose raw text contains "crank_operation"
{app="seesaw-crank"} |= "crank_operation"

# Failed operations: parse each line as JSON, then keep entries whose
# extracted `status` field equals "error"
{app="seesaw-crank"} | json | status="error"

# Specific market: parse as JSON and match on the `marketId` field
{app="seesaw-crank"} | json | marketId="1234567"

# High latency operations: parse as JSON and keep entries whose `latencyMs`
# field is greater than 1000
{app="seesaw-crank"} | json | latencyMs > 1000

On-Chain Monitoring

Account Watcher

import { Connection, PublicKey } from '@solana/web3.js';

const connection = new Connection(rpcUrl, 'confirmed');

// Balance (lamports) below which a `treasury_low` warning is logged: 5 SOL.
const TREASURY_WARN_LAMPORTS = 5_000_000_000;

// Set once a change notification has been processed, so that a slower
// initial fetch cannot overwrite a fresher value.
let treasuryUpdateSeen = false;

/** Publishes the treasury balance to the gauge and warns when it is low. */
function recordTreasuryBalance(balance: number): void {
  treasuryBalance.set(balance);

  if (balance < TREASURY_WARN_LAMPORTS) {
    logger.warn({ event: 'treasury_low', balance });
  }
}

// Watch treasury account
connection.onAccountChange(
  treasuryPubkey,
  (accountInfo) => {
    treasuryUpdateSeen = true;
    recordTreasuryBalance(accountInfo.lamports);
  },
  'confirmed'
);

// Seed the gauge with the current balance. The subscription only fires when
// the account changes, so without this the gauge would keep its initial value
// of 0 and trip the treasury alerts until the first change arrives.
connection
  .getBalance(treasuryPubkey, 'confirmed')
  .then((balance) => {
    if (!treasuryUpdateSeen) {
      recordTreasuryBalance(balance);
    }
  })
  .catch((err: unknown) => {
    logger.error({
      event: 'treasury_balance_fetch_failed',
      error: err instanceof Error ? err.message : String(err),
    });
  });

/**
 * Subscribes to account changes for every currently active market and feeds
 * each decoded update into `updateMarketMetrics`.
 *
 * @returns The subscription ids returned by `onAccountChange`, so the caller
 *   can later release them with `connection.removeAccountChangeListener`.
 *   Callers that ignore the result keep working as before.
 */
async function watchMarkets(): Promise<number[]> {
  const markets = await fetchActiveMarkets();
  const subscriptionIds: number[] = [];

  for (const market of markets) {
    const subscriptionId = connection.onAccountChange(
      market.pubkey,
      (accountInfo) => {
        try {
          const data = parseMarketAccount(accountInfo.data);
          updateMarketMetrics(data);
        } catch (err) {
          // One undecodable account update must not escape the subscription
          // callback; record it and keep watching.
          logger.error({
            event: 'market_account_parse_failed',
            market: market.pubkey.toBase58(),
            error: err instanceof Error ? err.message : String(err),
          });
        }
      },
      'confirmed'
    );
    subscriptionIds.push(subscriptionId);
  }

  return subscriptionIds;
}

Event Indexing

// Subscribe to program logs and turn lifecycle events into metric updates.
connection.onLogs(
  programId,
  (logs) => {
    // Failed transactions still deliver the log lines emitted before the
    // failure, but their state changes are rolled back. Counting them would
    // inflate the counters and push the unsettled gauge too low.
    if (logs.err != null) {
      return;
    }

    for (const log of logs.logs) {
      if (log.includes('MarketCreated')) {
        marketsCreated.inc();
      } else if (log.includes('MarketResolved')) {
        const outcome = parseOutcome(log);
        marketsResolved.inc({ outcome });
      } else if (log.includes('PositionSettled')) {
        unsettledPositions.dec();
      }
    }
  },
  'confirmed'
);

Runbook Integration

Auto-Remediation

/**
 * Runs the remediation step that matches the alert's name.
 *
 * - `TreasuryLow`: notifies the treasury manager.
 * - `MarketStuck`: extracts the market id from the alert and cranks it manually.
 * - `OracleStale`: checks the Pyth status and sends it along with the alert.
 * - Any other alert: forwards the alert through `sendAlert`.
 *
 * @param alert - Alert to act on; dispatch is based on `alert.name`.
 */
async function autoRemediate(alert: Alert) {
  const alertName = alert.name;

  if (alertName === 'TreasuryLow') {
    await notifyTreasuryManager(alert);
    return;
  }

  if (alertName === 'MarketStuck') {
    const stuckMarketId = parseMarketId(alert);
    await manualCrank(stuckMarketId);
    return;
  }

  if (alertName === 'OracleStale') {
    const status = await checkPythStatus();
    await notifyWithStatus(alert, status);
    return;
  }

  // No dedicated remediation: fall back to a plain notification.
  await sendAlert(alert);
}

Best Practices

Monitoring Checklist

  • Prometheus metrics exposed on /metrics
  • Grafana dashboards for market and infrastructure health
  • Alert rules for critical conditions
  • Slack/PagerDuty integration for alerts
  • Structured logging with correlation IDs
  • Health check endpoints for load balancers
  • On-chain event indexing
  • Auto-remediation for common issues

SLO Targets

| Metric | Target | Measurement |
| --- | --- | --- |
| Market creation | 99.9% | Markets created within 5 min of epoch start |
| Snapshot capture | 99.5% | Snapshots captured within 60s of boundary |
| Resolution | 99.5% | Markets resolved within 5 min of both snapshots |
| Settlement | 99% | Positions settled within 1 hour of resolution |

Next Steps