# Monitoring Template
# Standardized template for comprehensive system monitoring and observability across the AI Agentic Data Stack Framework

# Catalog metadata identifying this template.
# Every value is a literal string; version and created_date are quoted so a
# YAML parser does not reinterpret them as a number or a date.
metadata:
  template_id: 'monitoring-tmpl'
  name: 'Monitoring Template'
  version: '1.0.0'
  description: 'Comprehensive template for system monitoring, alerting, and observability implementation'
  category: 'operations-maintenance'
  tags:
    - 'monitoring'
    - 'observability'
    - 'alerting'
    - 'metrics'
    - 'logging'
  created_by: 'AI Agentic Data Stack Framework'
  created_date: '2025-01-23'

# Short descriptor of the template; name and version repeat the values
# given under `metadata`.
template:
  name: 'Monitoring Template'
  description: 'Template for comprehensive system monitoring and observability'
  version: '1.0.0'

# Monitoring Configuration
# Identifies the monitoring framework instance, the system it observes, its
# scope and strategy, and its owner.
# Every value here is a quoted "${...}" placeholder, so each one is a string
# both before and after substitution.
monitoring_config:
  monitoring_framework_id: "${monitoring_framework_id}"
  framework_name: "${monitoring_framework_name}"
  monitored_system: "${monitored_system_name}"
  monitoring_scope: "${monitoring_scope}" # application, infrastructure, business, security
  monitoring_strategy: "${monitoring_strategy}" # reactive, proactive, predictive
  monitoring_owner: "${monitoring_owner}"

# Metrics Collection
# Declares the metrics backend and three groups of metrics: system
# (infrastructure), application, and business.
# Quoted "${...}" placeholders are string values. Bare ${...} placeholders
# (intervals and *_enabled toggles) parse as plain strings while unsubstituted.
# NOTE(review): presumably the renderer replaces ${...} textually before the
# YAML is parsed, so bare placeholders can become numbers/booleans - confirm.
metrics_collection:
  # Metrics Platform
  metrics_platform: "${metrics_platform}" # prometheus, datadog, new_relic, cloudwatch
  
  # System Metrics
  system_metrics:
    # Infrastructure Metrics
    # Each list entry describes one metric. The cpu/memory/storage/network
    # toggle groups below are keys of that same entry (they sit at the
    # indentation of metric_name), not siblings of the list.
    infrastructure_metrics:
      - metric_name: "${infrastructure_metric_name}"
        metric_type: "${infrastructure_metric_type}" # counter, gauge, histogram, summary
        metric_description: "${infrastructure_metric_description}"
        # Bare placeholder; its name indicates a value in seconds.
        collection_interval: ${infrastructure_metric_interval_seconds}
        retention_period: "${infrastructure_metric_retention}"
        
        # CPU Metrics
        cpu_metrics:
          cpu_utilization: ${cpu_utilization_enabled}
          cpu_load_average: ${cpu_load_average_enabled}
          cpu_cores: ${cpu_cores_monitoring_enabled}
          
        # Memory Metrics
        memory_metrics:
          memory_usage: ${memory_usage_enabled}
          heap_usage: ${heap_usage_enabled}
          garbage_collection: ${gc_monitoring_enabled}
          
        # Storage Metrics
        storage_metrics:
          disk_usage: ${disk_usage_enabled}
          disk_io: ${disk_io_enabled}
          inode_usage: ${inode_usage_enabled}
          
        # Network Metrics
        network_metrics:
          network_throughput: ${network_throughput_enabled}
          network_latency: ${network_latency_enabled}
          connection_count: ${connection_count_enabled}
          
  # Application Metrics
  application_metrics:
    # Performance Metrics
    # Each list entry names one metric source; response_time, throughput and
    # error_metrics are keys of that same entry.
    performance_metrics:
      - metric_name: "${app_performance_metric_name}"
        metric_endpoint: "${app_metric_endpoint}"
        collection_method: "${app_metric_collection_method}" # pull, push, scrape
        
        # Response Time Metrics
        response_time:
          average_response_time: ${avg_response_time_enabled}
          # NOTE(review): a single quoted placeholder inside [...] yields a
          # one-element list; confirm how several percentiles are expanded.
          percentile_response_times: ["${response_time_percentiles}"] # p50, p95, p99
          max_response_time: ${max_response_time_enabled}
          
        # Throughput Metrics
        throughput:
          requests_per_second: ${requests_per_second_enabled}
          transactions_per_second: ${transactions_per_second_enabled}
          concurrent_users: ${concurrent_users_enabled}
          
        # Error Metrics
        error_metrics:
          error_rate: ${error_rate_enabled}
          error_count: ${error_count_enabled}
          error_types: ["${monitored_error_types}"]
          
  # Business Metrics
  # One list entry per business metric; all fields are quoted string placeholders.
  business_metrics:
    - metric_name: "${business_metric_name}"
      metric_description: "${business_metric_description}"
      business_value: "${business_metric_value}"
      calculation_method: "${business_metric_calculation}"
      data_source: "${business_metric_data_source}"
      update_frequency: "${business_metric_update_frequency}"

# Logging Strategy
# Covers where logs are aggregated, which log sources are configured or
# toggled, structured-logging options, and log analysis settings.
# Bare ${..._enabled} placeholders parse as plain strings until substituted.
logging_strategy:
  # Log Aggregation
  log_aggregation:
    log_platform: "${log_aggregation_platform}" # elk, splunk, fluentd, loki
    log_forwarding: "${log_forwarding_method}" # agent, sidecar, direct
    
  # Log Configuration
  log_config:
    # Application Logs
    application_logs:
      log_level: "${application_log_level}" # trace, debug, info, warn, error, fatal
      log_format: "${application_log_format}" # json, logfmt, plain
      log_rotation: "${application_log_rotation}"
      # Same placeholder as data_retention.log_retention.application_log_retention,
      # so both keys always receive the same value.
      log_retention: "${application_log_retention_period}"
      
    # System Logs
    # Per-source toggles for system-level log collection.
    system_logs:
      system_log_collection: ${system_log_collection_enabled}
      kernel_logs: ${kernel_logs_enabled}
      audit_logs: ${audit_logs_enabled}
      security_logs: ${security_logs_enabled}
      
    # Access Logs
    access_logs:
      web_server_logs: ${web_server_logs_enabled}
      api_access_logs: ${api_access_logs_enabled}
      database_access_logs: ${database_access_logs_enabled}
      
  # Structured Logging
  structured_logging:
    structured_format: "${structured_log_format}" # json, logstash
    correlation_ids: ${correlation_ids_enabled}
    context_propagation: ${log_context_propagation_enabled}
    
  # Log Analysis
  log_analysis:
    log_parsing: "${log_parsing_rules}"
    log_enrichment: ${log_enrichment_enabled}
    log_indexing: "${log_indexing_strategy}"
    # One-element list holding a single quoted placeholder.
    search_capabilities: ["${log_search_capabilities}"]

# Distributed Tracing
# Groups tracing settings into four parts: platform/sampling configuration,
# trace collection, service mapping, and performance analysis toggles.
distributed_tracing:
  # Tracing Configuration
  tracing_config:
    tracing_enabled: ${distributed_tracing_enabled}
    tracing_platform: "${tracing_platform}" # jaeger, zipkin, x_ray, datadog_apm
    sampling_strategy: "${tracing_sampling_strategy}" # probabilistic, rate_limiting, adaptive
    # Bare placeholder; parses as a plain string until substituted.
    # NOTE(review): expected range/units of the rate are not shown here - confirm.
    sampling_rate: ${tracing_sampling_rate}
    
  # Trace Collection
  # exporters/processors are one-element lists holding a single quoted placeholder.
  trace_collection:
    instrumentation_method: "${instrumentation_method}" # auto, manual, hybrid
    trace_exporters: ["${trace_exporters}"]
    trace_processors: ["${trace_processors}"]
    
  # Service Mapping
  service_mapping:
    service_discovery: "${service_discovery_method}"
    dependency_mapping: ${dependency_mapping_enabled}
    service_graph_visualization: ${service_graph_enabled}
    
  # Performance Analysis
  performance_analysis:
    bottleneck_detection: ${bottleneck_detection_enabled}
    latency_analysis: ${latency_analysis_enabled}
    error_correlation: ${error_correlation_enabled}

# Alerting Configuration
# Declares the alerting backend, alert rules grouped by infrastructure /
# application / business, and how notifications are delivered, escalated,
# and suppressed.
# Bare ${...} placeholders (thresholds, intervals, toggles) parse as plain
# strings until substituted.
alerting_config:
  # Alerting Platform
  alerting_platform: "${alerting_platform}" # prometheus_alertmanager, pagerduty, opsgenie
  
  # Alert Rules
  alert_rules:
    # Infrastructure Alerts
    # Each list entry defines one alert. The cpu/memory/storage alert groups
    # below are keys of that same entry, alongside alert_name.
    infrastructure_alerts:
      - alert_name: "${infrastructure_alert_name}"
        alert_condition: "${infrastructure_alert_condition}"
        severity: "${infrastructure_alert_severity}" # critical, warning, info
        threshold_value: ${infrastructure_alert_threshold}
        evaluation_interval: ${infrastructure_alert_evaluation_interval}
        
        # CPU Alerts
        # threshold placeholder is named as a percentage; duration is a quoted string.
        cpu_alerts:
          high_cpu_usage:
            threshold: ${high_cpu_threshold_percentage}
            duration: "${high_cpu_duration}"
            
        # Memory Alerts
        memory_alerts:
          high_memory_usage:
            threshold: ${high_memory_threshold_percentage}
            duration: "${high_memory_duration}"
            
        # Storage Alerts
        storage_alerts:
          disk_space_low:
            threshold: ${low_disk_space_threshold_percentage}
            duration: "${low_disk_space_duration}"
            
    # Application Alerts
    # Each list entry defines one alert; performance_alerts and error_alerts
    # are keys of that same entry.
    application_alerts:
      - alert_name: "${app_alert_name}"
        alert_description: "${app_alert_description}"
        
        # Performance Alerts
        performance_alerts:
          high_response_time:
            # Placeholder name indicates milliseconds.
            threshold: ${high_response_time_threshold_ms}
            percentile: "${response_time_percentile}" # p95, p99
            
          low_throughput:
            threshold: ${low_throughput_threshold}
            measurement_window: "${throughput_measurement_window}"
            
        # Error Alerts
        error_alerts:
          high_error_rate:
            threshold: ${high_error_rate_threshold_percentage}
            measurement_window: "${error_rate_measurement_window}"
            
    # Business Alerts
    business_alerts:
      - alert_name: "${business_alert_name}"
        business_impact: "${business_alert_impact}"
        alert_condition: "${business_alert_condition}"
        
  # Notification Configuration
  notification_config:
    # Notification Channels
    # One list entry per delivery channel.
    notification_channels:
      - channel_name: "${notification_channel_name}"
        channel_type: "${notification_channel_type}" # email, sms, slack, webhook
        channel_endpoint: "${notification_channel_endpoint}"
        channel_priority: "${notification_channel_priority}"
        
    # Escalation Policies
    # Each policy holds a list of levels; escalation_time's placeholder is
    # named in minutes.
    escalation_policies:
      - policy_name: "${escalation_policy_name}"
        escalation_levels:
          - level: ${escalation_level}
            escalation_time: ${escalation_time_minutes}
            notification_targets: ["${escalation_notification_targets}"]
            
    # Alert Suppression
    alert_suppression:
      suppression_rules: ["${alert_suppression_rules}"]
      maintenance_windows: ["${maintenance_windows}"]
      alert_correlation: ${alert_correlation_enabled}

# Dashboard Configuration
# Declares the dashboard backend, dashboards grouped by infrastructure /
# application / business, and access-control settings.
dashboard_config:
  # Dashboard Platform
  dashboard_platform: "${dashboard_platform}" # grafana, kibana, datadog, new_relic
  
  # Dashboard Categories
  dashboard_categories:
    # Infrastructure Dashboards
    # Each dashboard entry carries its own list of panels.
    infrastructure_dashboards:
      - dashboard_name: "${infrastructure_dashboard_name}"
        dashboard_description: "${infrastructure_dashboard_description}"
        refresh_interval: "${infrastructure_dashboard_refresh}"
        
        # Dashboard Panels
        panels:
          - panel_name: "${infrastructure_panel_name}"
            panel_type: "${infrastructure_panel_type}" # graph, table, heatmap, stat
            data_source: "${infrastructure_panel_data_source}"
            query: "${infrastructure_panel_query}"
            
    # Application Dashboards
    application_dashboards:
      - dashboard_name: "${app_dashboard_name}"
        dashboard_scope: "${app_dashboard_scope}" # service, component, endpoint
        
        # Performance Panels
        # Unlike the infrastructure `panels` list above, this is a mapping of
        # three fixed panel keys, each holding one quoted string placeholder.
        performance_panels:
          response_time_panel: "${response_time_panel_config}"
          throughput_panel: "${throughput_panel_config}"
          error_rate_panel: "${error_rate_panel_config}"
          
    # Business Dashboards
    business_dashboards:
      - dashboard_name: "${business_dashboard_name}"
        stakeholder_audience: ["${business_dashboard_audience}"]
        update_frequency: "${business_dashboard_update_frequency}"
        
  # Dashboard Access Control
  dashboard_access:
    authentication_required: ${dashboard_authentication_required}
    role_based_access: ${dashboard_rbac_enabled}
    # One-element list holding a single quoted placeholder.
    public_dashboards: ["${public_dashboards}"]

# Health Checks
# Global health-check settings followed by per-service, per-infrastructure-
# component, and per-dependency check definitions (one list entry each).
health_checks:
  # Health Check Configuration
  health_check_config:
    health_check_enabled: ${health_checks_enabled}
    health_check_endpoint: "${health_check_endpoint}"
    # Bare placeholder; its name indicates a value in seconds.
    health_check_interval: ${health_check_interval_seconds}
    
  # Application Health Checks
  application_health:
    - service_name: "${health_check_service_name}"
      health_check_type: "${health_check_type}" # http, tcp, command, database
      endpoint_url: "${health_check_endpoint_url}"
      # Bare placeholder; its name indicates a value in seconds.
      timeout: ${health_check_timeout_seconds}
      success_criteria: ["${health_check_success_criteria}"]
      
  # Infrastructure Health Checks
  infrastructure_health:
    - component_name: "${infrastructure_component_name}"
      component_type: "${infrastructure_component_type}" # server, database, network
      health_indicators: ["${component_health_indicators}"]
      
  # Dependency Health Checks
  dependency_health:
    - dependency_name: "${dependency_name}"
      dependency_type: "${dependency_type}" # external_api, database, queue
      health_check_method: "${dependency_health_check_method}"
      circuit_breaker_enabled: ${dependency_circuit_breaker_enabled}

# Performance Monitoring
# Defines per-metric baselines, performance-testing integration toggles, and
# capacity-planning inputs.
performance_monitoring:
  # Performance Baselines
  # One list entry per baselined metric; value and threshold are bare
  # placeholders, the period is a quoted string.
  performance_baselines:
    - metric_name: "${baseline_metric_name}"
      baseline_value: ${baseline_metric_value}
      baseline_period: "${baseline_measurement_period}"
      deviation_threshold: ${baseline_deviation_threshold}
      
  # Performance Testing Integration
  # NOTE(review): the last two placeholders lack the `_enabled` suffix used by
  # load_testing_integration_enabled. Renaming them would change the variable
  # names callers must supply, so they are left as-is - confirm intent.
  performance_testing:
    load_testing_integration: ${load_testing_integration_enabled}
    performance_regression_detection: ${performance_regression_detection}
    automated_performance_alerts: ${automated_performance_alerts}
    
  # Capacity Planning
  # Each key is a one-element list holding a single quoted placeholder.
  capacity_planning:
    capacity_metrics: ["${capacity_planning_metrics}"]
    growth_projections: ["${capacity_growth_projections}"]
    scaling_recommendations: ["${scaling_recommendations}"]

# Incident Management Integration
# Three stages of incident handling: detection, response, and post-incident
# analysis. Apart from the two bare detection toggles, every value is a quoted
# string placeholder or a one-element list containing one.
incident_management:
  # Incident Detection
  incident_detection:
    automated_incident_creation: ${automated_incident_creation_enabled}
    incident_correlation: ${incident_correlation_enabled}
    incident_prioritization: "${incident_prioritization_method}"
    
  # Incident Response
  incident_response:
    response_team_notification: ["${incident_response_team}"]
    incident_escalation: "${incident_escalation_procedure}"
    incident_documentation: "${incident_documentation_template}"
    
  # Post-Incident Analysis
  post_incident_analysis:
    root_cause_analysis: "${root_cause_analysis_process}"
    lessons_learned: "${lessons_learned_process}"
    improvement_actions: ["${post_incident_improvement_actions}"]

# Compliance Monitoring
# Per-regulation monitoring requirements, audit-trail settings, and security
# monitoring settings.
compliance_monitoring:
  # Regulatory Compliance
  # One list entry per regulation.
  regulatory_compliance:
    - regulation: "${regulation_name}" # gdpr, hipaa, sox, pci_dss
      compliance_metrics: ["${regulation_compliance_metrics}"]
      monitoring_requirements: ["${regulation_monitoring_requirements}"]
      reporting_frequency: "${regulation_reporting_frequency}"
      
  # Audit Trail
  # ${audit_logging_enabled} is a different placeholder from the
  # ${audit_logs_enabled} toggle under logging_strategy.log_config.system_logs.
  audit_trail:
    audit_logging_enabled: ${audit_logging_enabled}
    audit_log_retention: "${audit_log_retention_period}"
    audit_log_integrity: ${audit_log_integrity_protection}
    
  # Security Monitoring
  security_monitoring:
    security_events: ["${monitored_security_events}"]
    threat_detection: ${threat_detection_enabled}
    security_dashboards: ["${security_dashboards}"]

# Data Retention and Archival
# Retention settings for metrics and logs, plus hot/warm/cold lifecycle
# periods and a deletion policy. All periods are quoted string placeholders.
data_retention:
  # Metrics Retention
  metrics_retention:
    short_term_retention: "${metrics_short_term_retention}"
    long_term_retention: "${metrics_long_term_retention}"
    data_compression: ${metrics_data_compression_enabled}
    
  # Log Retention
  log_retention:
    # Same placeholder as logging_strategy.log_config.application_logs.log_retention,
    # so both keys always receive the same value.
    application_log_retention: "${application_log_retention_period}"
    system_log_retention: "${system_log_retention_period}"
    archive_strategy: "${log_archive_strategy}"
    
  # Data Lifecycle Management
  data_lifecycle:
    hot_data_period: "${hot_data_retention_period}"
    warm_data_period: "${warm_data_retention_period}"
    cold_data_period: "${cold_data_retention_period}"
    data_deletion_policy: "${data_deletion_policy}"

# Integration Configuration
# Connections from monitoring to other systems: generic external
# integrations, an ITSM platform, and a chat platform.
integration_config:
  # External Integrations
  # One list entry per integration.
  external_integrations:
    - integration_name: "${monitoring_integration_name}"
      integration_type: "${monitoring_integration_type}" # webhook, api, message_queue
      endpoint_url: "${monitoring_integration_endpoint}"
      # NOTE(review): unclear whether this holds an auth method name or a
      # credential. If it is a credential, supply it from a secret store
      # rather than committing the rendered value - confirm.
      authentication: "${monitoring_integration_auth}"
      
  # ITSM Integration
  itsm_integration:
    itsm_platform: "${itsm_platform}" # servicenow, jira_service_desk, remedy
    ticket_creation: ${automated_ticket_creation_enabled}
    ticket_correlation: ${ticket_correlation_enabled}
    
  # Communication Platform Integration
  communication_integration:
    chat_platform: "${monitoring_chat_platform}" # slack, teams, discord
    notification_formatting: "${chat_notification_format}"
    # One-element list holding a single quoted placeholder.
    channel_routing: ["${chat_channel_routing_rules}"]

# Validation Rules
# Lists the field names that must be present in a rendered document.
# NOTE(review): the first three names are keys nested under `monitoring_config`,
# while the remaining four are top-level keys of this file. Confirm that the
# validator resolves nested names; otherwise the first three may never match.
validation_rules:
  required_fields:
    - monitoring_framework_id
    - framework_name
    - monitored_system
    - metrics_collection
    - logging_strategy
    - alerting_config
    - dashboard_config

# Template Metadata
# Authorship and maintenance details. last_updated is quoted so it remains a
# string instead of being parsed as a date.
template_metadata:
  author: 'AI Agentic Data Stack Framework'
  maintainer: 'Site Reliability Engineer'
  last_updated: '2025-01-23'

# Document sections and whether each one is mandatory.
# NOTE(review): these section names do not match any top-level key in this
# file (e.g. `metrics` vs `metrics_collection`); confirm how the consumer
# maps a section name to its content.
sections:
  - name: 'monitoring_overview'
    description: 'Monitoring strategy and objectives'
    required: true
  - name: 'metrics'
    description: 'Key metrics and KPIs'
    required: true
  - name: 'alerting'
    description: 'Alerting rules and notifications'
    required: true
  - name: 'dashboards'
    description: 'Monitoring dashboards'
    required: false
