/******************************************************************************
 * $Id: generate_encoding_table.c 340b059d097242e67d5869f851d4c8d5978fe7c4 2016-11-22 23:34:00Z Even Rouault $
 *
 * Project:  OGR
 * Purpose:  Generate a mapping table from a 1-byte encoding to unicode,
 *           for ogr_expat.cpp
 * Author:   Even Rouault, even dot rouault at mines dash paris dot org
 *
 ******************************************************************************
 * Copyright (c) 2012, Even Rouault <even dot rouault at mines-paris dot org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/

#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/**
 * Decode the UTF-8 sequence starting at p into a Unicode code point.
 *
 * @param p    First byte of the sequence. p[0] is read unconditionally, so it
 *             must be readable even when p == end.
 * @param end  One past the last byte that may be consumed. Continuation bytes
 *             at or beyond end are never read.
 * @param len  Receives the number of bytes consumed: 1 to 4 on success, and
 *             always 1 on failure so that a caller can resynchronise.
 *
 * @return The decoded code point. For a malformed, overlong or truncated
 *         sequence, or a value above 0x10ffff, returns 0xfffd (or the raw
 *         lead byte when ERRORS_TO_ISO8859_1 is set).
 *
 * Compile-time switches (each is off when undefined):
 *  - ERRORS_TO_CP1252: bytes 0x80..0x9f are looked up in a table named
 *    cp1252, which is not defined in the part of the file visible here.
 *  - STRICT_RFC3629: additionally rejects surrogates and code points whose
 *    low 16 bits are 0xfffe or 0xffff.
 *  - ERRORS_TO_ISO8859_1: failures return the lead byte instead of 0xfffd.
 */
static unsigned utf8decode(const char* p, const char* end, int* len)
{
  /* Unsigned view of the input: avoids sign-extension surprises and the
   * repeated const-discarding casts. */
  const unsigned char* s = (const unsigned char*)p;
  const unsigned char c = s[0];

  if (c < 0x80) {
    *len = 1;
    return c;
  }
#if ERRORS_TO_CP1252
  if (c < 0xa0) {
    *len = 1;
    return cp1252[c-0x80];
  }
#endif

  /* 0x80..0xc1 are stray continuation bytes or overlong 2-byte leads;
   * 0xf5..0xff can never start a valid sequence. */
  if (c < 0xc2 || c > 0xf4) goto FAIL;

  /* Every remaining form needs at least one continuation byte. Lengths are
   * compared as (end - p) so no pointer beyond the buffer is ever formed. */
  if (end - p < 2 || (s[1] & 0xc0) != 0x80) goto FAIL;

  if (c < 0xe0) {
    *len = 2;
    return
      ((c & 0x1f) << 6) +
      ((s[1] & 0x3f));
  }

  if (c < 0xf0) {
    /* 0xe0 followed by a byte below 0xa0 is an overlong encoding. */
    if (c == 0xe0 && s[1] < 0xa0) goto FAIL;
#if STRICT_RFC3629
    /* RFC 3629 says surrogate chars are illegal. */
    if (c == 0xed && s[1] >= 0xa0) goto FAIL;
    /* 0xfffe and 0xffff are also illegal characters. The third byte is only
     * inspected once it is known to lie before end. */
    if (c == 0xef && s[1] == 0xbf && end - p >= 3 && s[2] >= 0xbe) goto FAIL;
#endif
    if (end - p < 3 || (s[2] & 0xc0) != 0x80) goto FAIL;
    *len = 3;
    return
      ((c & 0x0f) << 12) +
      ((s[1] & 0x3f) << 6) +
      ((s[2] & 0x3f));
  }

  /* Lead bytes 0xf0..0xf4: four-byte sequences. */
  if (c == 0xf0 && s[1] < 0x90) goto FAIL; /* overlong encoding */
  if (c == 0xf4 && s[1] > 0x8f) goto FAIL; /* after 0x10ffff */
  if (end - p < 4 || (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80) goto FAIL;
#if STRICT_RFC3629
  /* RFC 3629 says all codes ending in fffe or ffff are illegal. */
  if ((s[1] & 0x0f) == 0x0f && s[2] == 0xbf && s[3] >= 0xbe) goto FAIL;
#endif
  *len = 4;
  return
    ((c & 0x07) << 18) +
    ((s[1] & 0x3f) << 12) +
    ((s[2] & 0x3f) << 6) +
    ((s[3] & 0x3f));

FAIL:
  *len = 1;
#if ERRORS_TO_ISO8859_1
  return c;
#else
  return 0xfffd; /* Unicode REPLACEMENT CHARACTER */
#endif
}

/**
 * Print the statements for a pending run of identity mappings.
 *
 * @param nFirst  First byte value of the run (inclusive).
 * @param nEnd    Byte value just past the run (exclusive).
 *
 * A run of length one is written as a single assignment; longer runs are
 * written as a for loop over [nFirst, nEnd).
 */
static void EmitIdentityRun(int nFirst, int nEnd)
{
    if( nFirst + 1 == nEnd )
        printf("info->map[0x%02X] = 0x%02X;\n", nFirst, nFirst);
    else
    {
        printf("for(i = 0x%02X; i < 0x%02X; i++)\n", nFirst, nEnd);
        printf("    info->map[i] = i;\n");
    }
}

/**
 * Entry point: generate the byte -> Unicode mapping for a 1-byte encoding.
 *
 * Usage: generate_encoding_table encoding_name
 *
 * Each byte value 0..255 is converted on its own from the encoding named in
 * argv[1] to UTF-8 with iconv, decoded to a code point, and written to stdout
 * as an "info->map[...] = ...;" statement. Consecutive bytes that map to
 * themselves are collapsed into one loop. Bytes with no mapping are written
 * as -1.
 *
 * @return 0 on success, 1 on a usage error or any iconv failure other than
 *         EILSEQ (diagnostics go to stderr).
 */
int main(int argc, char* argv[])
{
    iconv_t sConv;
    const char* pszSrcEncoding;
    const char* pszDstEncoding = "UTF-8";
    int i;
    int nLastIdentical = -1;   /* start of the current identity run, or -1 */

    if( argc != 2 )
    {
        fprintf(stderr, "Usage: generate_encoding_table encoding_name\n");
        return 1;
    }

    pszSrcEncoding = argv[1];

    sConv = iconv_open( pszDstEncoding, pszSrcEncoding );

    if ( sConv == (iconv_t)-1 )
    {
        fprintf(stderr,
                  "Recode from %s to %s failed with the error: \"%s\".\n",
                  pszSrcEncoding, pszDstEncoding, strerror(errno) );
        return 1;
    }

    for(i = 0; i < 256; i++)
    {
        /* Exactly one input byte. The length is explicit rather than taken
         * from strlen(), which would be 0 for byte 0x00 and would leave
         * that byte unconverted. */
        char acSrcBuf[1] = {(char)i};
        char szDstBuf[5] = {0,0,0,0,0};
        char *pszSrcBuf = acSrcBuf;
        char *pszDstBuf = szDstBuf;
        size_t  nSrcLen = sizeof(acSrcBuf);
        size_t  nDstLen = sizeof(szDstBuf);
        size_t  nConverted;
        int nUnicode = -1;

        /* Bytes are converted independently: put the descriptor back in its
         * initial state so a previous failure cannot influence this one. */
        iconv( sConv, NULL, NULL, NULL, NULL );

        nConverted =
            iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );

        if( nConverted == (size_t)-1 )
        {
            if ( errno == EILSEQ )
            {
                /* The byte is not valid in the source encoding: leave
                 * nUnicode at -1 so it is reported as unmapped. */
            }
            else if ( errno == E2BIG )
            {
                fprintf(stderr, "E2BIG for %d\n", i);
                iconv_close( sConv );
                return 1;
            }
            else
            {
                fprintf(stderr, "other error for %d\n", i);
                iconv_close( sConv );
                return 1;
            }
        }
        else if( pszDstBuf > szDstBuf )
        {
            /* iconv advanced pszDstBuf past the bytes it wrote; decode just
             * those. If nothing was written, the byte stays unmapped. */
            int nSeqLen;
            nUnicode = (int)utf8decode(szDstBuf, pszDstBuf, &nSeqLen);
            if( nUnicode == 0xfffd )
                nUnicode = -1;
        }

        /* The identity run ends at the first byte not mapping to itself. */
        if( nLastIdentical >= 0 && i != nUnicode )
        {
            EmitIdentityRun( nLastIdentical, i );
            nLastIdentical = -1;
        }

        if( nUnicode < 0 )
            printf("info->map[0x%02X] = -1;\n", i);
        else if (nUnicode <= 0xFF )
        {
            if( i == nUnicode )
            {
                /* Deferred: emitted when the run ends. */
                if( nLastIdentical < 0 )
                    nLastIdentical = i;
            }
            else
                printf("info->map[0x%02X] = 0x%02X;\n", i, nUnicode);
        }
        else if (nUnicode <= 0xFFFF )
            printf("info->map[0x%02X] = 0x%04X;\n", i, nUnicode);
        else if (nUnicode <= 0xFFFFFF )
            printf("info->map[0x%02X] = 0x%06X;\n", i, nUnicode);
        else
            printf("info->map[0x%02X] = 0x%08X;\n", i, nUnicode);
    }

    /* Flush a run that extends to the last byte value (i is 256 here). */
    if( nLastIdentical >= 0 )
        EmitIdentityRun( nLastIdentical, i );

    iconv_close( sConv );

    return 0;
}
