#!/usr/bin/env python3

import argparse, os, re, subprocess, sys
from common_functions import filter_if_fast

def report_illegal_comment(file_name, line_num, line, multiline):
    """Print a one-line diagnostic for an offending comment.

    The location is printed as ``line_num - 1``, so callers must account for
    that offset in the number they pass. ``multiline`` is inserted verbatim
    between "Illegal " and "comment" (the callers in this file pass either ""
    or "multiline "). Newline characters are removed from both ends of
    ``line`` before it is printed; other whitespace is kept.
    """
    location = f"{file_name}:{line_num - 1}"
    text = line.strip('\n')
    print(f"Illegal {multiline}comment in {location} {text}")

def check_c_comments(file_name):
    """Report every line of a C file that contains a // style comment.

    Each offending line is printed through report_illegal_comment().

    Returns the number of offending lines found.
    """
    count = 0
    with open(file_name) as f:
        # Number lines from 1 so the report matches what an editor shows.
        for line_num, line in enumerate(f, start=1):
            if has_single_line_comment(line.strip()):
                # report_illegal_comment() prints the number it is given minus
                # one, so pass one more than the real line number. Passing the
                # 0-based index, as before, reported line 1 as "-1".
                report_illegal_comment(file_name, line_num + 1, line, "")
                count += 1
    return count

# URLs contain "//" without being comments. Any line containing one is treated
# as a false positive and ignored.
_URL_PATTERN = re.compile(r'https?:\/\/')

# // style comments. Everything before the "//" must be made up of ordinary
# characters, backslash-escaped characters, or complete double-quoted string
# literals (which may themselves contain escaped characters). A "//" that sits
# inside a string literal therefore does not count as a comment. The "?:"
# notation marks non-capturing groups, which avoids the cost of saving captures
# that are never used.
_SINGLE_LINE_COMMENT_PATTERN = re.compile(r'^(?:[^"\\]|"(?:[^"\\]|\\.)*"|\\.)*//(?:.*)$')


def has_single_line_comment(stripped_line):
    """Return True if the line contains a // comment outside a string literal.

    stripped_line must already have surrounding whitespace removed; the
    patterns do not allow for it. Lines containing an http:// or https:// URL
    are never reported, even if they also contain a real comment.

    Always returns a bool.
    """
    if _SINGLE_LINE_COMMENT_PATTERN.match(stripped_line) is None:
        return False
    return _URL_PATTERN.search(stripped_line) is None

def check_cpp_comments(file_name):
    """Report multi-line comments written with // in a C++ file.

    A "run" is a sequence of consecutive lines that each contain a // comment.
    Runs cannot simply be counted, because trailing comments on adjacent
    statements are contiguous without forming one multi-line comment, e.g.:
        int height; // the height of our object
        int width; // the width of our object
    So the third line of a run is reported only when nothing precedes the
    comment on that line. Note that the earlier lines of the run are counted
    whether or not they have text before the comment.

    Returns the number of comments reported.
    """
    # Matches any text sitting in front of the first '/' on a line.
    leading_text = re.compile(r'^[^\/\n]+')

    violations = 0
    run_length = 0
    with open(file_name) as source:
        for index, raw_line in enumerate(source):
            stripped = raw_line.strip()
            if not has_single_line_comment(stripped):
                # The run of comment lines is broken; start over.
                run_length = 0
                continue

            # Only the third line of a run can trigger a report. The run length
            # keeps growing afterwards, so each run is reported at most once.
            if run_length == 2 and not leading_text.match(stripped):
                report_illegal_comment(file_name, index, raw_line, "multiline ")
                violations += 1
            run_length += 1
    return violations

def file_is_cpp(name):
    """Return True if the named file should be checked as C++, False for C.

    Files ending in ".cpp" or ".hpp" are C++; files ending in ".c", ".i" or
    ".in" are C. Anything else is classified by running the "file" utility and
    looking for "C++" in its output.
    """
    # Compare literal suffixes. The previous regular expressions left the dots
    # unescaped, so they matched any character in that position.
    if name.endswith(('.cpp', '.hpp')):
        return True
    if name.endswith(('.c', '.i', '.in')):
        return False

    # House style is that C++ header files use ".h", which unfortunately makes
    # this sort of code rather difficult. Luckily, libmagic can identify C/C++
    # based on content. Don't import it because Python packaging is a disaster
    # and this script needs to run reliably.
    try:
        # Pass the path as its own argument rather than through a shell, so
        # spaces and shell metacharacters in the name are handled correctly.
        result = subprocess.run(["file", name], capture_output=True, text=True).stdout
    except OSError:
        # "file" could not be run. Treat the file as C, which is what the
        # earlier shell-based invocation amounted to in that situation.
        return False
    return "C++" in result

# A collection of cases we handle, run with -t/--test. Each entry is
# (line, expected): validate() strips the line and checks that
# has_single_line_comment() reports a comment exactly when expected is True.
tests = [
    # A comment on its own, with and without leading whitespace.
    (r'// hello world', True),
    (r'     // hello world', True),
    # No comment at all.
    (r'hello world', False),
    # A trailing comment after code that contains a string literal.
    (r'printf("Hello, World!\n"); // prints hello world', True),
    # A URL inside a string literal is not a comment.
    (r'String url = "http://www.example.com"', False),
    # Same input as the first case.
    (r'// hello world', True),
    # Backslashes and quotes inside the comment text itself.
    (r'//\\', True),
    (r'// "some comment"', True),
    (r'new URI("http://www.google.com")', False),
    # An escaped quote inside a string literal that precedes the comment.
    (r'printf("Escaped quote\""); // Comment', True),
    # A URL outside any string literal is still ignored.
    (r' * http://www.google.com', False)
]

def validate(line, expect_match):
    """Run one self-test case and print whether it passed.

    The line is stripped before being handed to has_single_line_comment(), and
    the case passes when the detector's answer agrees with expect_match.

    Returns True if the case passed, False otherwise.
    """
    matched = bool(has_single_line_comment(line.strip()))
    passed = matched == bool(expect_match)
    if passed:
        print("Test success")
    else:
        print("Test failed:" + line)
    return passed

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--test', default=False,
                        help='Run self tests', action='store_true')
    parser.add_argument('-F', '--fast', default=False,
                        help='Use fast mode by only checking files changed in git',
                        action='store_true')
    args = parser.parse_args()

    if args.test:
        # Run every case so that all failures are printed, then report the
        # overall outcome through the exit status. Previously the exit status
        # was 0 even when a case failed.
        failures = 0
        for line, expected in tests:
            if not validate(line, expected):
                failures += 1
        sys.exit(1 if failures else 0)

    # Move up to root dir.
    os.chdir("..")

    # Some files aren't expected to comply with WiredTiger style. Ignore them.
    ignore_files = [
        'src/support/mtx_rw.c',
    ]

    # Collect C sources and headers, ".in" files other than Makefile.in, C++
    # sources and ".i" files from the listed directories.
    command = "find bench examples ext src test -name \"*.[ch]\" \
        -o \\( -name \"*.in\" ! -name \"Makefile.in\" \\) \
        -o -name \"*.cpp\" -o -name \"*.i\" "

    result = subprocess.run(command, shell=True, capture_output=True, text=True).stdout.splitlines()
    count = 0
    if result:
        for file_name in filter_if_fast(result, fast=args.fast, prefix=""):
            if file_name in ignore_files:
                continue

            # C++ files may use // for single-line comments; C files may not
            # use // at all.
            if file_is_cpp(file_name):
                count += check_cpp_comments(file_name)
            else:
                count += check_c_comments(file_name)

    if count != 0:
        print(f'Detected {count} comment format issues!')
        sys.exit(1)