"""
This script generates the iptc-mapping.php file by scraping the list of IPTC tags
at the ExifTool library website located at:

http://owl.phy.queensu.ca/~phil/exiftool/TagNames/IPTC.html
"""

import urllib2
import re
import pprint

print('start')

# page on the ExifTool website that lists the IPTC tags
url = 'http://owl.phy.queensu.ca/~phil/exiftool/TagNames/IPTC.html'

# Load the page. The response is closed explicitly so the connection is
# released even if read() raises part-way through the download.
response = urllib2.urlopen(url)
try:
    page = response.read()
finally:
    response.close()

# Get the IPTC record sections table: a <table ...> ... </table> span that
# contains a <th> header cell whose text is "Record".
match = re.search(r"""
<table.*?>                     # find table
    .*?
    <th.*?>\s*Record\s*</\s*th>    # with this tag inside
    .*?
</\s*table>
""", page, re.X | re.S)

if match is None:
    # Stop here with a readable message; carrying None forward would only
    # surface later as a TypeError inside re.findall().
    raise SystemExit('could not find the IPTC record sections table in the page')

record_section_table = match.group()

# Parse the record sections table and collect the section IDs: for every
# table row, capture the content of the first <td> whose opening tag
# mentions "title".
record_section_IDs = re.findall(r"""
<tr.*?>              # find all table rows
    .*?
    <td.*?title.*?>  # which contain cells with title
        (.*?)        # capture section ID
    </\s*td>
    .*?
</\s*tr>             # consume the remainder of the row
""", record_section_table, re.X | re.S)

# report what was found
print('\nrecord section IDs %d' % len(record_section_IDs))
pprint.pprint(record_section_IDs)


# Cut the page into chunks, each starting at a <table> tag that mentions
# "inner" and running up to the next <h2> (or the closing </html>).
section_chunk_re = re.compile(r"""
<table.*?inner.*?>   # match from beginning of a table
    .*?
(?:<h2>|</html>)           # to first h2 tag or end of html tag
""", re.X | re.S)

# The first chunk is the record sections table itself, so it is dropped.
record_section_tables = section_chunk_re.findall(page)[1:]

# Pair every section ID with the table found at the same position.
# NOTE(review): zip() stops at the shorter list, so a count mismatch between
# IDs and tables goes unnoticed -- confirm the two always line up.
record_sections = zip(record_section_IDs, record_section_tables)

print('\nfound tags:\n')


def _php_escape(text):
    """Escape text for use inside a single-quoted PHP string literal."""
    # Backslashes first, so the ones added for quotes are not doubled.
    return text.replace('\\', '\\\\').replace("'", "\\'")


# One match per table row: the ID cell (its opening tag mentions "title")
# immediately followed by the name cell. Compiled once, used for every table.
tag_row_re = re.compile(r"""
<tr.*?>              # find all table rows
    .*?
    <td.*?title.*?>  # which contain cells with title
        (.*?)        # capture tag ID
    </\s*td>
    \s*
    <td[^>]*>        # name cell, with or without attributes
        (.*?)        # capture tag Name
    </\s*td>
    .*?
</\s*tr>
""", re.X | re.S)

# generate php file
with open('iptc-mapping.php', 'w') as f:
    f.write('<?php\n')
    f.write('// This file was generated by collectiptc.py\n\n')
    f.write('$this->IPTC_MAPPING = array(\n')

    for record_id, table in record_sections:
        for tag_id, name in tag_row_re.findall(table):
            # key format: <record id>#<tag id zero-padded to three digits>
            key = '{}#{:03d}'.format(record_id, int(tag_id))
            f.write("    '{}' => '{}',\n".format(_php_escape(key),
                                                 _php_escape(name)))
            print('%s %s' % (key, name))

    f.write(');\n')
    f.write('?>')

print('\ndone')
