# Data Profiling Template
# Standardized template for comprehensive data profiling across the AI Agentic Data Stack Framework

metadata:
  template_id: "data-profiling-tmpl"
  name: "Data Profiling Template"
  version: "1.0.0"
  description: "Comprehensive template for systematic data profiling and quality assessment"
  category: "data-quality"
  tags: ["profiling", "quality", "analysis", "assessment", "validation"]
  created_by: "AI Agentic Data Stack Framework"
  created_date: "2025-01-23"

template:
  name: "Data Profiling Template"
  description: "Template for comprehensive data profiling and assessment"
  version: "1.0.0"

sections:
  - name: "profiling_overview"
    description: "Profiling objectives and scope definition"
    required: true
  - name: "data_source_analysis"
    description: "Data source characteristics and metadata"
    required: true
  - name: "structural_profiling"
    description: "Data structure and schema analysis"
    required: true
  - name: "content_profiling"
    description: "Data content and value analysis"
    required: true
  - name: "quality_assessment"
    description: "Data quality metrics and issues"
    required: true
  - name: "relationship_analysis"
    description: "Data relationships and dependencies"
    required: true
  - name: "profiling_recommendations"
    description: "Findings and improvement recommendations"
    required: true
  - name: "validation_rules"
    description: "Profiling validation requirements"
    required: true

# Profiling Overview
profiling_overview:
  # Basic Information
  profiling_id: "${profiling_id}"
  profiling_name: "${profiling_name}"
  description: "${profiling_description}"
  version: "${profiling_version}"
  
  # Profiling Objectives
  objectives:
    primary_objective: "${primary_objective}"
    secondary_objectives: ["${secondary_objectives}"]
    business_questions: ["${business_questions}"]
    quality_focus_areas: ["${quality_focus_areas}"]
    
  # Scope Definition
  scope:
    data_sources: ["${data_sources_to_profile}"]
    tables_views: ["${tables_views_to_profile}"]
    columns_fields: ["${specific_columns}"]
    time_period: "${profiling_time_period}"
    
  # Profiling Context
  context:
    business_context: "${business_context}"
    data_usage_context: "${data_usage_context}"
    regulatory_requirements: ["${regulatory_requirements}"]
    compliance_standards: ["${compliance_standards}"]
    
  # Stakeholders
  stakeholders:
    profiling_owner: "${profiling_owner}"
    data_steward: "${data_steward}"
    business_users: ["${business_users}"]
    technical_reviewers: ["${technical_reviewers}"]

# Data Source Analysis
data_source_analysis:
  # Source Characteristics
  data_sources:
    - source_id: "${data_source_id}"
      source_name: "${data_source_name}"
      source_type: "${source_type}" # database, file, api, data_lake, data_warehouse
      
      # Technical Details
      technical_details:
        system_type: "${system_type}"
        version: "${system_version}"
        location: "${data_location}"
        access_method: "${access_method}"
        
      # Size and Volume
      size_metrics:
        total_size_gb: ${total_size_gb}
        record_count: ${total_record_count}
        table_count: ${table_count}
        column_count: ${total_column_count}
        
      # Update Patterns
      update_characteristics:
        update_frequency: "${update_frequency}"
        last_updated: "${last_updated_timestamp}"
        growth_rate: "${data_growth_rate}"
        update_pattern: "${update_pattern}" # batch, real_time, mixed
        
  # System Metadata
  system_metadata:
    database_schema: "${database_schema}"
    catalog_information: ["${catalog_info}"]
    data_dictionary_available: ${data_dictionary_exists}
    documentation_completeness: "${documentation_completeness_score}"
    
  # Access Patterns
  access_patterns:
    primary_users: ["${primary_data_users}"]
    usage_frequency: "${usage_frequency}"
    peak_usage_times: ["${peak_usage_times}"]
    access_methods: ["${access_methods}"]

# Structural Profiling
structural_profiling:
  # Schema Analysis
  schema_analysis:
    # Tables/Entities
    tables:
      - table_id: "${table_id}"
        table_name: "${table_name}"
        table_type: "${table_type}" # fact, dimension, staging, reference
        
        # Table Characteristics
        characteristics:
          row_count: ${table_row_count}
          column_count: ${table_column_count}
          table_size_mb: ${table_size_mb}
          creation_date: "${table_creation_date}"
          
        # Columns/Fields
        columns:
          - column_id: "${column_id}"
            column_name: "${column_name}"
            
            # Data Type Information
            data_type_info:
              data_type: "${column_data_type}"
              max_length: ${max_column_length}
              precision: ${numeric_precision}
              scale: ${numeric_scale}
              nullable: ${is_nullable}
              
            # Constraints
            constraints:
              primary_key: ${is_primary_key}
              foreign_key: ${is_foreign_key}
              unique_constraint: ${has_unique_constraint}
              check_constraints: ["${check_constraints}"]
              default_value: "${default_value}"
              
  # Key Relationships
  key_relationships:
    primary_keys:
      - table_name: "${pk_table_name}"
        primary_key_columns: ["${pk_columns}"]
        uniqueness_verified: ${pk_uniqueness_verified}
        
    foreign_keys:
      - relationship_id: "${fk_relationship_id}"
        source_table: "${fk_source_table}"
        source_columns: ["${fk_source_columns}"]
        target_table: "${fk_target_table}"
        target_columns: ["${fk_target_columns}"]
        referential_integrity: ${referential_integrity_score}
        
  # Index Analysis
  index_analysis:
    - index_name: "${index_name}"
      index_type: "${index_type}" # clustered, non_clustered, unique, partial
      indexed_columns: ["${indexed_columns}"]
      index_usage_stats: "${index_usage_frequency}"

# Content Profiling
content_profiling:
  # Column-Level Analysis
  column_analysis:
    # Numeric Columns
    numeric_columns:
      - column_name: "${numeric_column_name}"
        
        # Basic Statistics
        basic_stats:
          count: ${value_count}
          null_count: ${null_count}
          null_percentage: ${null_percentage}
          distinct_count: ${distinct_value_count}
          distinct_percentage: ${distinct_percentage}
          
        # Descriptive Statistics
        descriptive_stats:
          mean: ${column_mean}
          median: ${column_median}
          mode: ${column_mode}
          standard_deviation: ${column_std_dev}
          variance: ${column_variance}
          
        # Distribution Analysis
        distribution:
          minimum: ${column_minimum}
          maximum: ${column_maximum}
          range: ${column_range}
          quartile_1: ${q1}
          quartile_3: ${q3}
          interquartile_range: ${iqr}
          skewness: ${distribution_skewness}
          kurtosis: ${distribution_kurtosis}
          
        # Outlier Detection
        outliers:
          outlier_count: ${outlier_count}
          outlier_percentage: ${outlier_percentage}
          outlier_detection_method: "${outlier_method}"
          outlier_threshold: ${outlier_threshold}
          
    # Categorical Columns
    categorical_columns:
      - column_name: "${categorical_column_name}"
        
        # Basic Statistics
        basic_stats:
          count: ${category_count}
          null_count: ${category_null_count}
          null_percentage: ${category_null_percentage}
          distinct_count: ${category_distinct_count}
          cardinality: "${cardinality_level}" # low, medium, high
          
        # Value Distribution
        value_distribution:
          most_frequent_value: "${most_frequent_value}"
          most_frequent_count: ${most_frequent_count}
          most_frequent_percentage: ${most_frequent_percentage}
          least_frequent_value: "${least_frequent_value}"
          value_frequency_distribution: ["${frequency_distribution}"]
          
        # Pattern Analysis
        patterns:
          common_patterns: ["${common_value_patterns}"]
          irregular_values: ["${irregular_values}"]
          format_consistency: ${format_consistency_score}
          
    # Date/Time Columns
    datetime_columns:
      - column_name: "${datetime_column_name}"
        
        # Basic Statistics
        basic_stats:
          count: ${datetime_count}
          null_count: ${datetime_null_count}
          null_percentage: ${datetime_null_percentage}
          distinct_count: ${datetime_distinct_count}
          
        # Temporal Analysis
        temporal_analysis:
          earliest_date: "${earliest_date}"
          latest_date: "${latest_date}"
          date_range_days: ${date_range_days}
          most_common_date: "${most_common_date}"
          
        # Format Analysis
        format_analysis:
          date_formats: ["${detected_date_formats}"]
          format_consistency: ${datetime_format_consistency}
          timezone_information: "${timezone_info}"
          
    # Text Columns
    text_columns:
      - column_name: "${text_column_name}"
        
        # Basic Statistics
        basic_stats:
          count: ${text_count}
          null_count: ${text_null_count}
          null_percentage: ${text_null_percentage}
          distinct_count: ${text_distinct_count}
          
        # Text Characteristics
        text_characteristics:
          min_length: ${min_text_length}
          max_length: ${max_text_length}
          avg_length: ${avg_text_length}
          empty_string_count: ${empty_string_count}
          whitespace_only_count: ${whitespace_only_count}
          
        # Content Analysis
        content_analysis:
          character_set: "${detected_character_set}"
          language_detected: "${detected_language}"
          common_words: ["${common_words}"]
          special_characters: ["${special_characters}"]

# Quality Assessment
quality_assessment:
  # Completeness Assessment
  completeness:
    # Overall Completeness
    overall_completeness: ${overall_completeness_percentage}
    
    # Column-Level Completeness
    column_completeness:
      - column_name: "${completeness_column_name}"
        completeness_percentage: ${column_completeness_percentage}
        missing_value_count: ${missing_count}
        completeness_rating: "${completeness_rating}" # excellent, good, fair, poor
        
  # Accuracy Assessment
  accuracy:
    # Data Type Accuracy
    data_type_accuracy:
      - column_name: "${accuracy_column_name}"
        expected_data_type: "${expected_data_type}"
        actual_data_type: "${actual_data_type}"
        type_conformity_percentage: ${type_conformity_percentage}
        type_violations: ${type_violation_count}
        
    # Business Rule Accuracy
    business_rule_accuracy:
      - rule_name: "${business_rule_name}"
        rule_description: "${rule_description}"
        conformity_percentage: ${rule_conformity_percentage}
        violation_count: ${rule_violation_count}
        
  # Consistency Assessment
  consistency:
    # Format Consistency
    format_consistency:
      - column_name: "${consistency_column_name}"
        expected_format: "${expected_format}"
        format_compliance_percentage: ${format_compliance_percentage}
        format_variations: ["${format_variations}"]
        
    # Cross-Table Consistency
    cross_table_consistency:
      - relationship_name: "${consistency_relationship}"
        table_1: "${consistency_table_1}"
        table_2: "${consistency_table_2}"
        consistency_percentage: ${cross_table_consistency_percentage}
        inconsistency_count: ${inconsistency_count}
        
  # Validity Assessment
  validity:
    # Domain Validity
    domain_validity:
      - column_name: "${validity_column_name}"
        valid_domain: ["${valid_domain_values}"]
        domain_compliance_percentage: ${domain_compliance_percentage}
        invalid_values: ["${invalid_values}"]
        
    # Range Validity
    range_validity:
      - column_name: "${range_column_name}"
        valid_range_min: ${valid_range_min}
        valid_range_max: ${valid_range_max}
        range_compliance_percentage: ${range_compliance_percentage}
        out_of_range_count: ${out_of_range_count}
        
  # Uniqueness Assessment
  uniqueness:
    # Column Uniqueness
    column_uniqueness:
      - column_name: "${uniqueness_column_name}"
        expected_uniqueness: ${expected_uniqueness}
        actual_uniqueness_percentage: ${actual_uniqueness_percentage}
        duplicate_count: ${duplicate_count}
        
  # Timeliness Assessment
  timeliness:
    data_freshness: "${data_freshness_status}"
    last_update_lag: "${update_lag_hours}"
    timeliness_rating: "${timeliness_rating}" # excellent, good, fair, poor

# Relationship Analysis
relationship_analysis:
  # Intra-Table Relationships
  intra_table_relationships:
    # Column Correlations
    correlations:
      - column_pair: ["${correlation_column_1}", "${correlation_column_2}"]
        correlation_coefficient: ${correlation_coefficient}
        correlation_strength: "${correlation_strength}" # weak, moderate, strong
        correlation_type: "${correlation_type}" # positive, negative
        
    # Functional Dependencies
    functional_dependencies:
      - dependency_id: "${dependency_id}"
        determinant_columns: ["${determinant_columns}"]
        dependent_columns: ["${dependent_columns}"]
        dependency_strength: ${dependency_strength}
        
  # Inter-Table Relationships
  inter_table_relationships:
    # Join Analysis
    join_analysis:
      - join_relationship: "${join_relationship_name}"
        left_table: "${left_table}"
        right_table: "${right_table}"
        join_columns: ["${join_columns}"]
        join_success_rate: ${join_success_rate}
        unmatched_records: ${unmatched_record_count}
        
    # Data Lineage
    data_lineage:
      - lineage_path: ["${lineage_path}"]
        source_system: "${lineage_source}"
        target_system: "${lineage_target}"
        transformation_steps: ["${transformation_steps}"]

# Profiling Recommendations
profiling_recommendations:
  # Data Quality Improvements
  quality_improvements:
    - issue_category: "${quality_issue_category}"
      issue_description: "${quality_issue_description}"
      severity: "${issue_severity}" # critical, high, medium, low
      affected_columns: ["${affected_columns}"]
      
      # Recommended Actions
      recommendations:
        immediate_actions: ["${immediate_actions}"]
        long_term_improvements: ["${long_term_improvements}"]
        estimated_effort: "${estimated_effort}"
        priority: "${recommendation_priority}"
        
  # Performance Optimizations
  performance_optimizations:
    - optimization_area: "${optimization_area}"
      current_performance: "${current_performance_metric}"
      target_performance: "${target_performance_metric}"
      optimization_strategies: ["${optimization_strategies}"]
      
  # Monitoring Recommendations
  monitoring_recommendations:
    ongoing_monitoring: ["${ongoing_monitoring_needs}"]
    quality_alerts: ["${recommended_quality_alerts}"]
    review_frequency: "${recommended_review_frequency}"

# Validation Rules
validation_rules:
  required_fields:
    - profiling_id
    - profiling_name
    - data_sources
    - structural_profiling
    - content_profiling
    - quality_assessment
    
  quality_thresholds:
    min_completeness_percentage: 90
    max_null_percentage: 10
    min_accuracy_percentage: 95
    max_duplicate_percentage: 5
    
  profiling_standards:
    - sample_size: "Minimum 10% of data or 10,000 records"
    - statistical_confidence: "95% confidence level required"
    - profiling_coverage: "All critical columns must be profiled"
    - documentation_completeness: "All findings must be documented"

# Template Metadata
template_metadata:
  author: "AI Agentic Data Stack Framework"
  maintainer: "Data Quality Engineer"
  last_updated: "2025-01-23"
  changelog:
    - version: "1.0.0"
      date: "2025-01-23"
      changes: "Initial template creation with comprehensive data profiling configuration"