# Data Pipeline Template
# Standardized template for data pipeline design and implementation across the AI Agentic Data Stack Framework

# Identity and descriptive fields of the template itself.
# version and created_date are quoted so they load as strings.
metadata:
  template_id: "data-pipeline-tmpl"
  name: "Data Pipeline Template"
  version: "1.0.0"
  description: "Comprehensive template for building scalable and reliable data pipelines"
  category: "data-engineering"
  tags:
    - "pipeline"
    - "etl"
    - "data-flow"
    - "orchestration"
    - "automation"
  created_by: "AI Agentic Data Stack Framework"
  created_date: "2025-01-23"

# Pipeline Configuration
# Identity of the pipeline being described. Every value here is a quoted
# placeholder.
pipeline_config:
  pipeline_id: "${data_pipeline_id}"
  pipeline_name: "${data_pipeline_name}"
  description: "${pipeline_description}"
  version: "${pipeline_version}"
  # Types listed by the template: batch, streaming, hybrid, micro_batch
  pipeline_type: "${pipeline_type}"
  business_domain: "${business_domain}"

# Data Sources
# List of pipeline inputs; the template carries one placeholder entry with
# its connection and extraction settings.
data_sources:
  - source_id: "${source_id}"
    source_name: "${source_name}"
    # Types listed by the template: database, file, api, stream, queue
    source_type: "${source_type}"
    connection_config:
      connection_string: "${connection_string}"
      authentication: "${source_authentication}"
      # Unquoted placeholder - presumably rendered as a number; units are not
      # stated in this file (TODO confirm).
      timeout: ${connection_timeout}
    extraction_config:
      # Methods listed by the template: full, incremental, cdc
      extraction_method: "${extraction_method}"
      schedule: "${extraction_schedule}"
      # Unquoted placeholder - presumably rendered as a number (TODO confirm).
      batch_size: ${extraction_batch_size}

# Data Targets
# List of pipeline outputs; the template carries one placeholder entry with
# its loading strategy and connection settings.
data_targets:
  - target_id: "${target_id}"
    target_name: "${target_name}"
    # Types listed by the template: database, file, api, stream
    target_type: "${target_type}"
    # Strategies listed by the template: full_refresh, append, upsert, merge
    loading_strategy: "${loading_strategy}"
    connection_config:
      connection_string: "${target_connection_string}"
      authentication: "${target_authentication}"

# Pipeline Stages
# Three named stages; stage_order numbers them 1 (extraction),
# 2 (transformation) and 3 (loading).
pipeline_stages:
  # Stage 1 - Data Extraction
  extraction_stage:
    stage_name: "Data Extraction"
    stage_order: 1
    execution_config:
      parallel_execution: ${extraction_parallel_execution}
      timeout: ${extraction_timeout}
      retry_policy: "${extraction_retry_policy}"
    quality_checks:
      - check_name: "Source Data Availability"
        check_type: "availability"

  # Stage 2 - Data Transformation
  transformation_stage:
    stage_name: "Data Transformation"
    stage_order: 2
    transformations:
      - transformation_id: "${transformation_id}"
        transformation_name: "${transformation_name}"
        # Types listed by the template: cleansing, enrichment, aggregation
        transformation_type: "${transformation_type}"
        # One-element list wrapping a single placeholder.
        business_rules: ["${business_rules}"]

  # Stage 3 - Data Loading
  loading_stage:
    stage_name: "Data Loading"
    stage_order: 3
    loading_config:
      loading_method: "${loading_method}"
      conflict_resolution: "${conflict_resolution_strategy}"
      post_load_validation: ${post_load_validation_enabled}

# Orchestration Configuration
# Orchestration tool, schedule, pipeline dependencies and execution limits.
orchestration_config:
  # Tools listed by the template: airflow, prefect, dagster, azure_data_factory
  orchestration_tool: "${orchestration_tool}"
  scheduling:
    # Types listed by the template: cron, event_driven, manual
    schedule_type: "${schedule_type}"
    schedule_expression: "${schedule_expression}"
    timezone: "${schedule_timezone}"

  dependencies:
    # Each list wraps a single placeholder.
    upstream_dependencies: ["${upstream_dependencies}"]
    downstream_dependencies: ["${downstream_dependencies}"]

  execution_config:
    max_parallel_tasks: ${max_parallel_tasks}
    # Placeholder name suggests the unit is minutes (TODO confirm).
    task_timeout: ${task_timeout_minutes}
    retry_attempts: ${pipeline_retry_attempts}

# Data Quality Framework
# Quality gates (each holding its own list of rules) plus data-profiling
# settings.
data_quality:
  quality_gates:
    - gate_id: "${quality_gate_id}"
      gate_name: "${quality_gate_name}"
      # Gate types listed by the template: pre_processing, post_processing, final
      gate_type: "${gate_type}"
      quality_rules:
        - rule_name: "${quality_rule_name}"
          rule_type: "${quality_rule_type}"
          # Unquoted placeholder - presumably numeric; scale is not stated
          # in this file (TODO confirm).
          threshold: ${quality_rule_threshold}
          # Actions listed by the template: stop, warn, continue
          action_on_failure: "${failure_action}"

  data_profiling:
    profiling_enabled: ${data_profiling_enabled}
    profiling_frequency: "${profiling_frequency}"
    profile_storage: "${profile_storage_location}"

# Monitoring and Alerting
# Monitoring switches, alert rule definitions and SLA targets.
monitoring_alerting:
  monitoring_config:
    metrics_collection: ${metrics_collection_enabled}
    log_aggregation: ${log_aggregation_enabled}
    performance_tracking: ${performance_tracking_enabled}

  alerting_rules:
    - alert_name: "${alert_name}"
      alert_condition: "${alert_condition}"
      # Severities listed by the template: low, medium, high, critical
      alert_severity: "${alert_severity}"
      # One-element list wrapping a single placeholder.
      notification_channels: ["${notification_channels}"]

  # Units below are taken from the placeholder names (minutes, hours,
  # percentage) - TODO confirm with the renderer.
  sla_configuration:
    execution_time_sla: ${execution_time_sla_minutes}
    data_freshness_sla: ${data_freshness_sla_hours}
    success_rate_sla: ${success_rate_sla_percentage}

# Error Handling and Recovery
# Overall error strategy, per-error-type recovery procedures and backup
# settings.
error_handling:
  error_strategy: "${error_handling_strategy}"
  recovery_procedures:
    - error_type: "${error_type}"
      recovery_action: "${recovery_action}"
      # Unquoted placeholder - presumably a boolean (TODO confirm).
      escalation_required: ${escalation_required}

  backup_and_restore:
    backup_enabled: ${backup_enabled}
    backup_frequency: "${backup_frequency}"
    retention_period: "${backup_retention_period}"

# Security Configuration
# Encryption settings and access-control settings for the pipeline.
security_config:
  data_encryption:
    # Unquoted placeholders - presumably booleans (TODO confirm).
    encryption_at_rest: ${encryption_at_rest}
    encryption_in_transit: ${encryption_in_transit}
    key_management: "${key_management_service}"

  access_control:
    authentication_method: "${pipeline_authentication}"
    authorization_model: "${pipeline_authorization}"
    audit_logging: ${pipeline_audit_logging}

# Performance Optimization
# Resource allocation figures and optimization technique choices.
performance_config:
  # Unquoted placeholders; the key names carry the units (cores, GB).
  resource_allocation:
    cpu_cores: ${pipeline_cpu_cores}
    memory_gb: ${pipeline_memory_gb}
    storage_gb: ${pipeline_storage_gb}

  optimization_techniques:
    parallel_processing: ${parallel_processing_enabled}
    caching: "${caching_strategy}"
    compression: "${compression_algorithm}"
    partitioning: "${partitioning_strategy}"

# Testing Configuration
# Settings for unit, integration and data-level testing of the pipeline.
testing_config:
  unit_testing:
    # Unquoted placeholder - presumably numeric; scale is not stated in this
    # file (TODO confirm).
    test_coverage_target: ${test_coverage_target}
    test_data_location: "${test_data_location}"

  integration_testing:
    end_to_end_testing: ${e2e_testing_enabled}
    performance_testing: ${performance_testing_enabled}

  data_testing:
    data_validation_tests: ${data_validation_tests_enabled}
    schema_evolution_tests: ${schema_evolution_tests_enabled}

# Validation Rules
# Fields this template marks as required. Every entry is named after a key
# defined in this file: pipeline_id, pipeline_name and pipeline_type sit under
# pipeline_config; the remaining three are top-level sections.
validation_rules:
  required_fields:
    # Key name as defined under pipeline_config (previously listed under the
    # placeholder's name, which matched no key in this file).
    - pipeline_id
    - pipeline_name
    - pipeline_type
    - data_sources
    - data_targets
    - pipeline_stages

# Template summary: name, description and version.
# NOTE(review): name and version repeat the values under `metadata`, while the
# description wording differs from the one there - confirm which block is
# authoritative.
template:
  name: "Data Pipeline Template"
  description: "Comprehensive template for building and managing data pipelines"
  version: "1.0.0"

# Section catalogue: each entry names a section, describes it and marks it as
# required. Every entry listed here has required: true.
# NOTE(review): `pipeline_metadata` and `quality_assurance` do not match any
# top-level key in this file (closest keys: `pipeline_config` / `metadata` and
# `data_quality`) - confirm whether these are logical section names or should
# mirror the keys.
sections:
  - name: "pipeline_metadata"
    description: "Pipeline identification and metadata"
    required: true
  - name: "data_sources"
    description: "Input data source configurations"
    required: true
  - name: "data_targets"
    description: "Output data target configurations"
    required: true
  - name: "pipeline_stages"
    description: "Transformation and processing stages"
    required: true
  - name: "quality_assurance"
    description: "Data quality checks and monitoring"
    required: true
  - name: "validation_rules"
    description: "Template validation and quality rules"
    required: true

# Template Metadata
# Authorship and maintenance details for this template.
# NOTE(review): author and last_updated carry the same values as created_by
# and created_date under `metadata` - confirm both copies are needed.
template_metadata:
  author: "AI Agentic Data Stack Framework"
  maintainer: "Data Engineer"
  last_updated: "2025-01-23"
