#!/usr/bin/env python3
"""
Upwork API Documentation Parser - Level 1 Chunking
Parses the massive HTML file and organizes it into logical chunks.
"""

import os
import re
from pathlib import Path
from bs4 import BeautifulSoup

def clean_text(text):
    """Normalize whitespace in a piece of text.

    Falsy input (``None`` or an empty string) yields ``""``. Otherwise the
    text is trimmed at both ends and every run of whitespace is replaced
    by a single space.
    """
    if text:
        trimmed = text.strip()
        # One space for each whitespace run (spaces, tabs, newlines, ...)
        return re.sub(r'\s+', ' ', trimmed)
    return ""

def safe_filename(text):
    """Convert text to a safe, lowercase, hyphen-separated filename stem.

    HTML tags are removed, then every character other than word
    characters (letters, digits, underscore), whitespace and hyphens is
    dropped. Runs of whitespace and/or hyphens collapse to one hyphen, so
    ``"a - b"`` becomes ``"a-b"`` rather than ``"a---b"``. The result is
    capped at 50 characters and never starts or ends with a hyphen.

    Returns ``"untitled"`` when nothing usable remains, including for
    falsy input such as ``None`` or ``""``.
    """
    if not text:
        return "untitled"
    # Remove HTML tags if any
    text = re.sub(r'<[^>]+>', '', text)
    # Keep only word characters, whitespace and hyphens
    text = re.sub(r'[^\w\s-]', '', text)
    # Collapse whitespace/hyphen runs into a single hyphen, then lowercase
    text = re.sub(r'[\s-]+', '-', text).strip('-').lower()
    # Limit length; strip again because the cut can land right after a hyphen
    text = text[:50].rstrip('-')
    return text or "untitled"

def parse_html_structure(html_file):
    """Load an HTML document and print a quick survey of its structure.

    Reports the file size in characters, the number of navigation-like
    elements, the count of each heading level that occurs, and the number
    of content-like containers.

    Returns a ``(soup, headings)`` tuple, where ``headings`` maps
    ``'h1'``..``'h6'`` to their counts; levels with no tags are omitted.
    """
    print(f"📖 Reading HTML file: {html_file}")

    with open(html_file, 'r', encoding='utf-8') as handle:
        markup = handle.read()

    print(f"📏 File size: {len(markup):,} characters")

    soup = BeautifulSoup(markup, 'html.parser')

    print("\n🔍 LEVEL 1 ANALYSIS:")
    print("=" * 50)

    # Navigation-like elements, selected by class name
    nav_pattern = re.compile(r'nav|menu|toc|sidebar', re.I)
    nav_count = len(soup.find_all(['nav', 'ul', 'ol'], class_=nav_pattern))
    print(f"Navigation elements found: {nav_count}")

    # Tally every heading level that actually appears
    headings = {}
    for level in range(1, 7):
        tag_name = f'h{level}'
        count = len(soup.find_all(tag_name))
        if count == 0:
            continue
        headings[tag_name] = count
        print(f"H{level} headings: {count}")

    # Content-like containers, selected by class name
    content_pattern = re.compile(r'content|main|docs|api', re.I)
    area_count = len(
        soup.find_all(['main', 'article', 'section', 'div'], class_=content_pattern)
    )
    print(f"Main content areas: {area_count}")

    return soup, headings

def extract_sections_by_h1(soup):
    """Extract main sections based on H1 tags, falling back to H2.

    Each H1 starts a section. When the document has no H1 at all, the
    first 10 H2 tags are used instead. A section's content is the text of
    the heading's following siblings up to, but not including, the next
    heading of the same level. Only direct siblings are scanned.

    Returns a list of dicts with the keys ``'title'`` (cleaned heading
    text), ``'filename'`` (slug derived from the title), ``'content'``
    (whitespace-normalized text) and ``'length'`` (``len(content)``).
    """
    print("\n📝 EXTRACTING BY H1 SECTIONS:")
    print("=" * 50)

    heading_tags = soup.find_all('h1')

    if not heading_tags:
        print("❌ No H1 tags found, trying H2...")
        heading_tags = soup.find_all('h2')[:10]  # Limit to first 10 H2s

    print(f"Found {len(heading_tags)} main sections")

    sections = []
    for number, heading in enumerate(heading_tags, start=1):
        section_title = clean_text(heading.get_text())
        print(f"  {number}. {section_title[:80]}...")

        # Stop at the next heading of the SAME level. Comparing against
        # heading.name instead of a literal 'h1' keeps the H2 fallback
        # from swallowing every later H2 section.
        parts = []
        current = heading.next_sibling
        # Test against None explicitly: an empty text node is falsy and
        # must not end the walk early.
        while current is not None:
            if getattr(current, 'name', None) == heading.name:
                break
            if hasattr(current, 'get_text'):
                text = current.get_text()
                if text.strip():
                    parts.append(text)
            current = current.next_sibling

        # clean_text collapses the joining newlines into single spaces
        content = clean_text("\n".join(parts))

        sections.append({
            'title': section_title,
            'filename': safe_filename(section_title),
            'content': content,
            'length': len(content)
        })

    return sections

def save_sections(sections, output_dir):
    """Write every non-empty section to its own numbered text file.

    Creates ``output_dir`` if needed. Each file is named
    ``NN-<filename>.txt``, where ``NN`` is the section's 1-based position
    in ``sections``; skipped sections still consume a number. A file holds
    a ``# <title>`` header, a blank line, then the content. Sections whose
    content is blank are reported and skipped.
    """
    print(f"\n💾 SAVING SECTIONS TO: {output_dir}")
    print("=" * 50)

    os.makedirs(output_dir, exist_ok=True)

    written = 0
    for number, section in enumerate(sections, start=1):
        body = section['content']

        if body.strip():
            filename = f"{number:02d}-{section['filename']}.txt"
            target = os.path.join(output_dir, filename)

            with open(target, 'w', encoding='utf-8') as out:
                out.write(f"# {section['title']}\n\n{body}")

            print(f"  ✅ {filename} ({section['length']:,} chars)")
            written += 1
        else:
            print(f"  ⚠️  Skipping empty section: {section['title'][:50]}")

    print(f"\n🎉 Saved {written} sections!")

def main():
    """Main parsing function.

    Looks for 'API Documentation.html' in the current working directory,
    prints a structural analysis, splits the document into sections,
    writes them to the 'chunks' directory and prints a summary. When the
    HTML file is missing, prints a hint and returns without doing anything
    else.
    """
    print("🚀 UPWORK API DOCS PARSER - LEVEL 1")
    print("=" * 50)

    # File paths
    html_file = "API Documentation.html"
    output_dir = "chunks"

    # Check if HTML file exists
    if not os.path.exists(html_file):
        print(f"❌ HTML file not found: {html_file}")
        print("📁 Please place 'API Documentation.html' in this directory")
        return

    # Parse HTML structure; only the soup is needed from here on
    soup, _ = parse_html_structure(html_file)

    # Extract sections
    sections = extract_sections_by_h1(soup)

    # Save sections
    save_sections(sections, output_dir)

    # Summary
    print("\n📊 SUMMARY:")
    print("=" * 30)
    total_chars = sum(s['length'] for s in sections)
    # A document without any H1/H2 headings yields no sections; guard the
    # average so the summary does not die with ZeroDivisionError.
    average = total_chars // len(sections) if sections else 0
    print(f"Total sections: {len(sections)}")
    print(f"Total content: {total_chars:,} characters")
    print(f"Average section: {average:,} characters")

    print("\n📂 Next steps:")
    print(f"1. Check the '{output_dir}/' folder")
    print("2. Review the section files")
    print("3. Refine chunking if needed")

# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()