#!/usr/bin/env perl
##############################################################################
#   DEiXTo Perl Executor for Windows - Version 1.4.0 - 26 January 2014       #
#----------------------------------------------------------------------------#
#   Executes DEiXTo generated wrapper project files (.wpf) and produces      #
#   user specified output (tab delimited, XML, HTML, RSS, CSV, Excel, ODS).  #
#   For more information about DEiXTo: http://deixto.com/                    #
#----------------------------------------------------------------------------#
#   Copyright 2007-2014, Kostas Ntonas <kntonas@gmail.com>                   #
#----------------------------------------------------------------------------#
#   This program is free software: you can redistribute it and/or modify     #
#   it under the terms of the GNU General Public License as published by     #
#   the Free Software Foundation, either version 3 of the License, or        #
#   (at your option) any later version.                                      #
#                                                                            #
#   This program is distributed in the hope that it will be useful,          #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#   GNU General Public License for more details.                             #
#                                                                            #
#   You should have received a copy of the GNU General Public License        #
#   along with this program. If not, see <http://www.gnu.org/licenses/>.     #
##############################################################################

use strict;
use warnings;
use utf8;

###########################################################################
# Library Modules                                                         #
###########################################################################

use URI;
use Encode;
use Getopt::Long;
use XML::LibXML;
use LWP::ConnCache;
use DEiXToBot;
use Tree::Fast;
use UTF8BOM;
use Regexp::Common qw /URI/;
use List::MoreUtils qw(any uniq);
use Fcntl qw(:flock);

###########################################################################
# Constants                                                               #
###########################################################################

# Release number printed by version_message(). It is a numeric literal,
# so Perl stringifies it without the trailing zero.
use constant version => 1.40;  # Version number
# Passed to the agent's timeout() method in CreateAgent().
use constant timeout => 60;    # Agent's timeout in seconds

# States a pattern tree node can be in. ProcessPatternNode() maps the
# 'stateIndex' attribute of a wpf <Node> to one of these values; note that
# it does not accept 'unchecked' as an attribute value.
use constant {
    unchecked       => 1, # don't care about this node
    checked         => 2, # required - extract its text content
    grayed          => 3, # required to be there
    checked_implied => 4, # optional - extract its content if there
    grayed_implied  => 5, # optional
    checked_source  => 6, # required - extract its HTML source code
};

###########################################################################
# Global variables                                                        #
###########################################################################

my $agent;  # DEiXToBot agent, created by CreateAgent()
my $utf8_flag; # 1 if $system_charset is utf8 or utf-8-strict, 0 otherwise
my $system_charset; # operating system's character set (-charset option)
my $pagenc; # character set of the target pages, when forced by the user
my $NavigationFailed; # 1 if a navigation fails, 0 otherwise
my $PatternRoot;      # the real root (first node) of the pattern tree
my $VRoot; # the virtual root of the pattern tree (node flagged 'IsRoot')
# The following eight variables get values from the command line or the
# wpf file (the command line overrides the wpf file)
###########################################################################
my $Keyword;   # word(s) - search term for auto fill and submit form
my $OutFile;   # output filename
my $Mode;      # output file write mode (overwrite or append or prepend)
my $InFile;    # input filename
my $OutFormat; # output file format
my $MaxHits;   # max number of hits
my $MaxCrawlDepth;  # max crawling depth
my $ExtractPageURL; # override the wpf defined ExtractPageURL flag
#--------------------------------------------------------------------------
my @targets;     # list of target URLs specified via command line
my $imageURL;    # image URL used in RSS files if -image command line option entered
my $delay;       # sleep time in seconds between http requests
my $nice;        # respect robots.txt, by default 'nice' mode is enabled.
my $mimic;       # mimic Mozilla if -mimic command line option entered
my $proxy;       # use a proxy server
my $style;       # XSL stylesheet to use (for XML output)
my $credentials; # basic credentials to be used for HTTP Basic authentication
# the space separated values entered by the user are used as username
# and password
my $dbconfig;    # name of the dbconfig file
my $postconfig;  # name of the postprocessing config file
my $timestamp;   # flag to signal that we wish to add a timestamp to records
my $PrintToScreen; # print results to screen if -print option is entered
my @urlList;     # list of target urls
my $Project;     # the wrapper project file supplied from command line
my %rsshash;     # hash with rss channel sub-elements
my %ProjectInfo; # hash with project (scalar) information
# Keys of %ProjectInfo, as filled in by ProcessXMLNode():
# InputFile:   filename with target urls
# OutputFile:  output filename
# OutFileMode: output mode: overwrite or append or prepend
# OutFormat:   tab delimited text, xml, rss, excel, csv, html
# FollowLinkText: text contained by link to follow
# MaxCrawlDepth:  max crawling depth, 0 => don't follow any links,
# -1 => follow any number of links
# SubmitForm: 1, if a form should be submitted
# FormName: form name to be submitted, optional
# InputName: input name of the form to be submitted
# TermToSearch: search keyword, for auto submit, optional
# MultiplePage: multiple page navigation mode, 1 if enabled
# MaxHits: maximum number of hits (defined in wpf)
# ExtractPageURL: extract item's page url as well

###########################################################################
# Main                                                                    #
###########################################################################
MAIN:
{
    # Defaults that the command line may override.
    # 'iso-8859-1' fits an English Windows; a Greek Windows would use
    # -charset 'cp737' ('iso-8859-7'). The -charset option overrides the
    # operating system's character set; Encode::Supported lists the
    # encodings that are available and their code names.
    $system_charset = 'iso-8859-1';
    $nice           = 1; # check robots.txt by default

    # set-up phase: options, charset, agent, project file, run parameters
    parse_options();
    resolve_charset();
    CreateAgent();
    Initialize();
    OpenProject();
    SetRunParameters();

    $agent->set_pattern($PatternRoot, $VRoot);

    # extraction and export
    Executor();

    $agent->print_results() if $PrintToScreen;

    print "\nExecution finished: ", $agent->hits, " total hits.\n";
}

###########################################################################
# Subroutines                                                             #
###########################################################################

# Run the extraction and export whatever records were collected.
#
# Depending on $ProjectInfo{'SubmitForm'} the records are gathered either by
# submitting a search form or by visiting the list of target urls. If any
# records exist they are written to $OutFile in the format named by
# $OutFormat; with no $OutFile, only a 'DBI' format triggers a database
# insert. Export errors are reported on STDOUT, they do not abort the run.
# Takes no arguments and returns nothing meaningful.
sub Executor {
    if ($ProjectInfo{'SubmitForm'} == 1) { # auto fill and submit form mode
        SubmitFetchResults();
    }
    else { # navigate the list of target urls and extract data
        NavigateTargetsExtract();
    }

    return if !@{$agent->records}; # no results

    # normalise once, so that an unset format or mode cannot raise
    # "uninitialized value" warnings in the comparisons below
    my $format = defined $OutFormat ? $OutFormat : q{};
    my $mode   = defined $Mode      ? $Mode      : q{};

    if (!$OutFile) { # no output file: only a plain DB insert is possible
        if ($format eq 'DBI') {
            $agent->db_insert($dbconfig);
        }
        return;
    }

    # The best strategy for safe file locking is to use semaphore files,
    # which are files that are locked outside of the data resource. This
    # completely separates the data resource from the task of protecting
    # it: a program has to obtain a lock on the semaphore file before it
    # may touch the resource.
    my $semaphore = $OutFile . ".lock";
    # an exclusive lock isn't always granted when a file is opened for
    # just reading. We want to be sure we get the lock, so we open it
    # for writing.
    open(my $lock, ">", $semaphore)
        or die "Couldn't open '$semaphore': $!\n";

    # lock the semaphore file; keep going without the lock rather than
    # throwing the extracted records away, but tell the user about it
    flock($lock, LOCK_EX)
        or warn "Couldn't lock '$semaphore': $!\n";

    # formats that are written through results_to_file()
    my %extension_for = (
        'TabDelimited' => 'txt',
        'XML'          => 'xml',
        'HTML'         => 'html',
        'CSV'          => 'csv',
        'Excel'        => 'xls',
        'ODS'          => 'ods',
    );

    eval {
        if (exists $extension_for{$format}) {
            my %write_mode_for = (
                'overwrite' => DEiXToBot::OVERWRITE(),
                'append'    => DEiXToBot::APPEND(),
                'prepend'   => DEiXToBot::PREPEND(),
            );
            if (!exists $write_mode_for{$mode}) {
                die "Unknown output file mode '$mode'.\n";
            }
            my @arguments = ($OutFile, $extension_for{$format},
                             $write_mode_for{$mode}, $timestamp, $postconfig);
            # only XML output takes an XSL stylesheet
            push @arguments, $style if $format eq 'XML';
            $agent->results_to_file(@arguments);
        }
        elsif ($format eq 'RSS') { # RSS output
            if ($mode eq 'overwrite') {
                $agent->write_rss($OutFile,\%rsshash,$postconfig,$imageURL);
            }
            else {
                $agent->append_rss($OutFile,\%rsshash,$Mode,$postconfig,$imageURL);
            }
        }
        elsif ($format eq 'DBI') { # DB support - create and execute SQL statements
            $agent->db_insert($dbconfig, $OutFile,$timestamp,$postconfig);
        }
        else {
            die "Unknown output format '$format'.\n";
        }
    };
    # copy $@ right away, the calls below could overwrite it
    my $export_error = $@;

    if ($export_error) {
        if (utf8::is_utf8($OutFile)) {
            $OutFile = encode($system_charset,$OutFile);
        }
        print "\nFailed to export results to $OutFile.\n";
        print "$export_error";
    }

    # closing the semaphore file releases the lock on it
    close $lock or die "Couldn't close '$semaphore': $!\n";
    # delete the semaphore file
    unlink $semaphore or die "Couldn't unlink '$semaphore': $!\n";

    return;
}

# Fetch the home page of the web site, fill in and submit its search form,
# then extract the records of the first results page and follow the 'next'
# links if multiple page mode is enabled.
#
# Uses the first entry of @urlList as target, $Keyword as search term and
# the optional 'FormName' / 'InputName' entries of %ProjectInfo to pick the
# form and the input field. Sets $NavigationFailed to 1 when a page cannot
# be fetched. Takes no arguments and returns nothing meaningful.
sub SubmitFetchResults {
    my $url = $urlList[0]; # auto submit mode has just one single target

    if (!defined $url) {
        print "No target url found for the search form.\n";
        $NavigationFailed = 1;
        return;
    }

    my $response = $agent->get($url); # get home page of the target website
    if (!$response->is_success) {     # failure to get page
        print "Failed to fetch page $url: ",
        $agent->response->status_line,"\n";
        $NavigationFailed = 1;
        return;
    }
    print "Fetched page containing the search form.\n";

    my $form; # the selected form, the first if not specified
    if ($ProjectInfo{'FormName'}) {
        $form = $agent->form_name($ProjectInfo{'FormName'});
        # an HTML::Form object is returned by the form_name method
        if (!$form) {
            print "Could not find form with name:",
                    $ProjectInfo{'FormName'}, "\n";
            return;
        }
    }
    else {
        $form = $agent->form_number(1);
        if (!$form) {
            print "No form found on the page!\n";
            return;
        }
    }

    my $keyword = encode($agent->get_charset,$Keyword);

    # set value to the appropriate input field of the current form
    # select the first input if not specified
    if ($ProjectInfo{'InputName'}) {
        $agent->field($ProjectInfo{'InputName'}, $keyword );
    }
    else {
        my @inputs = $form->inputs;
        if (!@inputs) {
            print "The form has no input to fill in!\n";
            return;
        }
        $inputs[0]->value( $keyword );
    }

    $response = $agent->click(); # submit form

    if (!$response->is_success) {
        print "Failed to submit form: ",$agent->response->status_line,"\n";
        $NavigationFailed = 1;
        return;
    }
    print "Form submitted successfully.\n";
    print "Fetched first results page: ",$agent->uri,"\n";

    $agent->build_dom();
    $agent->extract_content(); # extract content from the first page returned
    print "Found ". $agent->hits ." records on page.\n";

    # Enough hits already: stop here. This has to be 'return' and not
    # 'last': there is no loop in this sub, so 'last' would leave the
    # caller's enclosing block and the results would never be exported.
    if (defined $MaxHits and $MaxHits > 0 and $agent->hits >= $MaxHits) {
        return;
    }

    FollowLinks(); # follow 'next' links if needed and keep extracting..

    return;
}

# Visit every url of @urlList in turn, extract the records of each page and
# follow its 'next' links. Targets that cannot be fetched are skipped; the
# walk stops as soon as a positive $MaxHits limit has been reached.
sub NavigateTargetsExtract {
    TARGET:
    for my $target (@urlList) {
        Navigate($target);
        next TARGET if $NavigationFailed == 1;

        $agent->build_dom();

        # remember the running total to report the hits of this page only
        my $hits_before = $agent->hits;
        $agent->extract_content();
        print "Found ", $agent->hits - $hits_before ," records on page.\n";

        my $limit_reached = defined($MaxHits)
                         && $MaxHits > 0
                         && $agent->hits >= $MaxHits;
        last TARGET if $limit_reached;

        FollowLinks();
    }
}

# Multiple page navigation mode: starting from the page the agent currently
# holds, keep following the link whose text or name matches
# $ProjectInfo{'FollowLinkText'} and extract the records of every page
# reached. Does nothing unless multiple page mode is enabled and
# $MaxCrawlDepth is non-zero; a positive $MaxCrawlDepth limits the number of
# links followed, a positive $MaxHits limits the number of records.
sub FollowLinks {
    return if !($ProjectInfo{'MultiplePage'} == 1 && $MaxCrawlDepth);

    my $depth = 1;

    PAGE:
    while (1) {
        last PAGE if $MaxCrawlDepth > 0 && $depth > $MaxCrawlDepth;

        my $raw_pattern     = $ProjectInfo{'FollowLinkText'};
        my $encoded_pattern = encode($agent->get_charset,$raw_pattern);

        # Try the pattern encoded in the page charset first, against the
        # link text and then the link name; fall back to the raw pattern
        # for special cases where encoding related issues arise.
        my $link;
        ATTEMPT:
        for my $attempt (
            [ 'text_regex', $encoded_pattern ],
            [ 'name_regex', $encoded_pattern ],
            [ 'text_regex', $raw_pattern ],
            [ 'name_regex', $raw_pattern ],
        ) {
            my ($criterion, $pattern) = @{$attempt};
            $link = $agent->find_link( $criterion => qr/$pattern/ );
            last ATTEMPT if $link;
        }

        if (!$link) {
            print "Could not find link to follow..\n";
            last PAGE;
        }

        Navigate($link->url);

        if ($NavigationFailed == 1) {
            print "Navigation failed.\n";
            last PAGE;
        }

        $agent->build_dom();

        # remember the running total to report the hits of this page only
        my $hits_before = $agent->hits;
        $agent->extract_content();
        print "Found ", $agent->hits - $hits_before ," records on page.\n";

        last PAGE
            if defined($MaxHits) && $MaxHits > 0 && $agent->hits >= $MaxHits;

        $depth++;
    }
}

# Fetch the page at the given url with the agent.
# Resets $NavigationFailed to 0 before the request and sets it to 1 when the
# request fails. Returns 1 on success and nothing on failure.
sub Navigate {
    my ($url) = @_;

    $NavigationFailed = 0;

    my $response = $agent->get($url); # HTTP::Response object returned

    if ($response->is_success) {
        print "\nFetched ",URI->new_abs($url,$agent->base)->as_string,"\n";
        return 1;
    }

    # failure to get page
    print "Failed to fetch page $url: ",
    $agent->response->status_line,"\n";
    $NavigationFailed = 1;
    return;
}

############# OPEN PROJECT #############

# Parse the wpf project file named by $Project (validating it against its
# DTD) and hand every child node of the <Project> element to
# ProcessXMLNode().
# Dies if the file does not exist, cannot be parsed, or if its document
# element is not 'Project'.
sub OpenProject {
    die "Could not open $Project for reading: $!\n" unless (-e $Project);

    my $xmlparser = XML::LibXML->new();
    $xmlparser->validation(1);

    my $xmltree;
    eval { $xmltree = $xmlparser->parse_file($Project); };
    if ($@) {
        print "Could not parse the wrapper project file: $Project\n";
        print "Make sure wpf.dtd is in the same directory.\n";
        die "OpenProject: $@\n";
    }

    my $XMLDocElm = $xmltree->getDocumentElement;
    if (!$XMLDocElm) {
        die "OpenProject: could not get Document Element.\n";
    }

    if ($XMLDocElm->nodeName ne 'Project') {
        # string concatenation ('.'): a numeric '+' here would reduce the
        # whole message to a number
        die $XMLDocElm->nodeName() . ": syntax error! 'Project' expected.";
    }

    my $iNode = $XMLDocElm->firstChild;

    while ($iNode) {
        ProcessXMLNode($iNode);
        $iNode = $iNode->nextSibling;
    }
}

# Process one child node of the wpf <Project> element: store its settings in
# %ProjectInfo / %rsshash, collect target urls into @urlList, pass ignored
# tags to the agent, or build the pattern tree for <ExtractionPattern>.
# Takes the XML node as its only argument.
# Dies on an unknown element name, on a non-'URL' child of <TargetUrls> and
# when the input file of urls cannot be read.
sub ProcessXMLNode {
    my $node = shift;
    my $name = $node->nodeName;

    if ($name eq 'ExtractionPattern') {
        # build the pattern-rule tree
        ProcessPatternNode($node->firstChild);
    }
    elsif ($name eq 'InputFile') {
        $ProjectInfo{'InputFile'} = $node->getAttribute('Filename');

        # an input file given on the command line overrides the wpf one
        if ( $ProjectInfo{'InputFile'} and !defined $InFile ) {
            $InFile = $ProjectInfo{'InputFile'};
        }

        # fetch urls contained in the input file
        if ($InFile) {
            if (utf8::is_utf8($InFile) && !$utf8_flag) {
                $InFile = encode($system_charset,$InFile);
            }

            die "$InFile is not a valid filename.\n" unless -e $InFile;

            UTF8BOM->remove_from_file($InFile);

            open(my $fh, '<', $InFile) or
                die "Couldn't open '$InFile': $!\n";

            while (my $http = <$fh>) {
                chomp $http;
                if ($http =~ $RE{URI}{HTTP}{-scheme => qr/https?/}{-keep}){
                    push @urlList, $1;
                }
            }
            close($fh) or die "Couldn't close '$InFile': $!\n";
        }
    }
    elsif ($name eq 'OutputFile') {
        $ProjectInfo{'OutputFile'} = $node->getAttribute('Filename');
        $ProjectInfo{'OutFileMode'} = $node->getAttribute('FileMode');
        $ProjectInfo{'OutFormat'} = $node->getAttribute('Format');
    }
    elsif ($name eq 'MultiplePage') {
        if ($node->getAttribute('Enabled') eq 'true') {
            $ProjectInfo{'MultiplePage'} = 1;
            $ProjectInfo{'FollowLinkText'} = $node->getAttribute('ContainsText');
            $ProjectInfo{'MaxCrawlDepth'} =  $node->getAttribute('MaxCrawlDepth');
        }
    }
    elsif ($name eq 'TargetUrls') {
        # urls already read from the input file take precedence over the
        # ones listed in the wpf
        if ($InFile && @urlList && !@targets) { return 1; }
        if (@targets) {
            # override target urls with those given via -target options
            # merge with those contained in $InFile (if given)
            @targets = split(',',join(',',@targets));
            push @urlList, @targets;
            return 1;
        }
        my $cNode = $node->firstChild;
        while ($cNode) {
            if ($cNode->nodeName ne 'URL') {
                die $cNode->nodeName,": wrong tagname! 'URL' expected";
            }
            if ($cNode->getAttribute('Address')
                =~ $RE{URI}{HTTP}{-scheme => qr/https?/}{-keep}) {
                push @urlList, $1;
            }
            $cNode = $cNode->nextSibling;
        }
    }
    elsif ($name eq 'IgnoredTagsList') {
        my $cNode = $node->firstChild;
        my @tags;
        while ($cNode) {
            my $label = $cNode->getAttribute('Label');
            # use the capture only after a successful match: $1 keeps its
            # previous value when a match fails
            if (defined $label && $label =~ m/<(.*)>/) {
                push @tags, $1;
            }
            $cNode = $cNode->nextSibling;
        }
        $agent->ignore_tags(\@tags);
    }
    elsif ($name eq 'SubmitForm') {
        if ($node->getAttribute('Enabled') eq 'true') {
            $ProjectInfo{'SubmitForm'} = 1;
            $ProjectInfo{'FormName'} = $node->getAttribute('FormName');
            $ProjectInfo{'InputName'} = $node->getAttribute('InputName');
            $ProjectInfo{'TermToSearch'} = $node->getAttribute('Term');
        }
    }
    elsif ($name eq 'MaxHits') {
        $ProjectInfo{'MaxHits'} = $node->getAttribute('Value');
    }
    elsif ($name eq 'ExtractPageURL') {
        $ProjectInfo{'ExtractPageURL'} = 1;
    }
    elsif ($name eq 'RssChannel') {
        my $cNode = $node->firstChild;
        while ($cNode) {
            # a <ChannelElement> carries its key in the 'Name' attribute,
            # any other child is keyed by its own element name
            if ($cNode->nodeName ne 'ChannelElement') {
                $rsshash{$cNode->nodeName} = $cNode->getAttribute('Value');
            }
            else {
                $rsshash{$cNode->getAttribute('Name')}
                  = $cNode->getAttribute('Value');
            }
            $cNode = $cNode->nextSibling;
        }
    }
    else {
        die $name,": wrong element name!";
    }
}

# Parse the XML 'extraction pattern' subtree rooted at the given <Node> and
# build the matching pattern tree of Tree::Fast nodes, recursively.
#
# Arguments: the XML node, and the Tree::Fast parent the new tree node is
# attached to (omitted for the first node, which becomes $PatternRoot).
# The node flagged with the 'IsRoot' attribute becomes $VRoot.
# Dies if the element is not 'Node', if it has no 'stateIndex' attribute or
# if the value of that attribute is not one of the accepted state names.
sub ProcessPatternNode {
    my ($node, $parent) = @_;

    if ($node->nodeName ne 'Node') {
        # string concatenation ('.'): a numeric '+' here would reduce the
        # whole message to a number
        die $node->nodeName() . ": syntax error! 'Node' expected\n";
    }

    my %info;

    # the 'tag' attribute has the form name:label
    my @taglabel = split q{:}, $node->getAttribute('tag');
    $info{'Name'} = $taglabel[0];
    $info{'Label'} = $taglabel[1];

    if ($parent) {
        $info{'SiblingOrder'} = scalar($parent->children);
    }

    if ($node->getAttribute('IsRoot')) {
        $info{'IsRoot'} = 1;
    }

    if ($node->getAttribute('regexpr')) {
        $info{'RegExpr'} = $node->getAttribute('regexpr');
        if ( $node->getAttribute('inverse') ) {
            $info{'inverse'} = 1;
        }
    }
    else {
        $info{'RegExpr'} = undef;
        $info{'inverse'} = undef;
    }

    if ($node->getAttribute('fson')) {
        $info{'fson'} = $node->getAttribute('fson');
    }
    else { $info{'fson'} = 0; }

    if ( $node->getAttribute('CareAboutSO') ) {
        $info{'CareAboutSO'} = 1;
        $info{'so_start'} = $node->getAttribute('so_start');
        $info{'so_step'} = $node->getAttribute('so_step');
    }
    else { $info{'CareAboutSO'} = 0; }

    # translate the state name into its constant; 'unchecked' is
    # deliberately left out, it is rejected like any unknown name
    my %state_for = (
        'checked'         => checked(),
        'grayed'          => grayed(),
        'grayed_implied'  => grayed_implied(),
        'checked_implied' => checked_implied(),
        'checked_source'  => checked_source(),
    );
    my $st = $node->getAttribute('stateIndex');
    if (!$st) {die "$taglabel[0] does not have a stateIndex attribute!\n";}
    if (!exists $state_for{$st}) { die $st,": invalid state index!\n"; }
    $info{'State'} = $state_for{$st};

    my $treenode = Tree::Fast->new(\%info);
    if (defined $parent) {
        $parent->add_child({},$treenode);
    }
    else {
        $PatternRoot = $treenode;
    }

    if (exists $info{'IsRoot'}) { $VRoot = $treenode; }

    my $cNode = $node->firstChild;
    while ($cNode) {
        ProcessPatternNode($cNode,$treenode);
        $cNode = $cNode->nextSibling;
    }
}

#============ OPEN PROJECT =============

# Create and configure the global DEiXToBot agent from the settings read
# from the command line.
#
# DEiXToBot is a subclass of WWW::Mechanize::Sleepy. WWW::Mechanize, or Mech
# for short, helps you automate interaction with a website. LWP::UserAgent
# and the rest of the LWP suite provide powerful tools for accessing and
# downloading web content, while Mech can automate many of the tasks you'd
# normally have to code.
sub CreateAgent {
    $delay = 0 if !defined $delay; # don't sleep

    # create the 'agent'
    my %option = (
        'nice'       => $nice,
        'os_charset' => $system_charset,
        'sleep'      => $delay,
    );
    $option{'pagenc'} = $pagenc if $pagenc;
    $agent = DEiXToBot->new(%option);

    $agent->parse_head(0); # don't initialize response headers

    # set the depth of the page stack to 1, to avoid eating up memory
    $agent->stack_depth(1);

    $agent->quiet(1); # turn off warnings

    # a request is aborted if no activity on the connection
    # to the server is observed for timeout seconds
    $agent->timeout(timeout);

    if ($mimic) {
        # only Windows and Linux have an alias, other systems keep the
        # agent's own identification
        my %alias_for = (
            'MSWin32' => 'Windows Mozilla',
            'linux'   => 'Linux Mozilla',
        );
        $agent->agent_alias($alias_for{$^O}) if exists $alias_for{$^O};
    }

    $agent->proxy('http', $proxy) if $proxy;

    if ($credentials) {
        my ($username,$password) = split(/ +/, $credentials);
        $agent->credentials($username,$password);
    }

    # to support HTTP Keep-Alive, give the agent a connection cache object
    $agent->conn_cache(LWP::ConnCache->new( ));

    # the newly created connection cache object will cache all
    # connections (no limits)
    $agent->conn_cache->total_capacity(undef);

    # settings forced on the command line
    $agent->extract_url($ExtractPageURL) if defined $ExtractPageURL;
    $agent->max_hits($MaxHits)           if defined $MaxHits;
}

# Reset the run state before a project is opened: no target urls, form
# submission and multiple page mode switched off, no failed navigation.
sub Initialize {
    @urlList = ();
    @ProjectInfo{'SubmitForm', 'MultiplePage'} = (0, 0);
    $NavigationFailed = 0;
}

# Fill in every run parameter that was not given on the command line with
# the value found in the wpf file (%ProjectInfo), then remove duplicate
# entries from @urlList. Values set on the command line are left untouched.
sub SetRunParameters {
    if (!defined($MaxHits) && $ProjectInfo{'MaxHits'}) {
        $MaxHits = $ProjectInfo{'MaxHits'};
        $agent->max_hits($MaxHits);
    }

    if (!defined($Keyword) && $ProjectInfo{'TermToSearch'}) {
        $Keyword = $ProjectInfo{'TermToSearch'};
    }

    if (!defined($OutFile) && $ProjectInfo{'OutputFile'}) {
        # the filename goes to the file system in the system charset
        $OutFile = $utf8_flag
                 ? $ProjectInfo{'OutputFile'}
                 : encode($system_charset, $ProjectInfo{'OutputFile'});
    }

    if (!defined($OutFormat) && $ProjectInfo{'OutFormat'}) {
        $OutFormat = $ProjectInfo{'OutFormat'};
    }

    if (!defined($Mode) && $ProjectInfo{'OutFileMode'}) {
        $Mode = lc $ProjectInfo{'OutFileMode'};
    }

    if (!defined($ExtractPageURL) && exists $ProjectInfo{'ExtractPageURL'}) {
        $ExtractPageURL = $ProjectInfo{'ExtractPageURL'};
        $agent->extract_url($ExtractPageURL);
    }

    if (!defined($MaxCrawlDepth) && exists $ProjectInfo{'MaxCrawlDepth'}) {
        $MaxCrawlDepth = $ProjectInfo{'MaxCrawlDepth'};
    }

    @urlList = uniq @urlList;
}

# Check that $system_charset names an encoding known to Encode and set
# $utf8_flag to 1 when it resolves to 'utf8' or 'utf-8-strict', to 0
# otherwise. Dies if the charset is undefined or not supported.
sub resolve_charset {
    defined $system_charset
        or die "charset cannot be undef!\n";

    my $encoding = find_encoding($system_charset);
    ref $encoding
        or die "$system_charset is not a supported encoding.\n";

    # compare the canonical name, so that aliases are recognised too
    my %is_utf8_name = map { $_ => 1 } ('utf8', 'utf-8-strict');
    $utf8_flag = $is_utf8_name{ $encoding->name } ? 1 : 0;
}

# Return a copy of the given string without its leading and trailing
# whitespace; whitespace inside the string is kept.
sub trim {
    my ($text) = @_;
    $text =~ s{\A\s+}{};
    $text =~ s{\s+\z}{};
    return $text;
}

############# PARSE COMMAND LINE PARAMETERS #############

# Parse the command line into the global option variables.
# Prints the usage message and exits if the options cannot be parsed or if
# no wrapper project file was given.
sub parse_options {
    # Several option callbacks (SetKeyword, SetOutFormat, ...) decode their
    # argument according to $system_charset and $utf8_flag. Pick up a
    # -charset option first, wherever it appears on the command line, and
    # resolve it, so that decoding does not depend on the option order.
    {
        local @ARGV = @ARGV;               # leave the real @ARGV untouched
        local $SIG{__WARN__} = sub { };    # errors are reported by the real parse below
        Getopt::Long::Configure('pass_through');
        GetOptions( "charset=s" => \$system_charset );
        Getopt::Long::Configure('no_pass_through');
    }
    resolve_charset();

    my $options_okay = GetOptions (
        '<>'                   => \&SetProject,
        "print|p"              => \$PrintToScreen,
        "search=s"             => \&SetKeyword,
        "usage"                => sub { usage_message(); exit; },
        "help|?"               => sub { help_message(); exit; },
        "version"              => sub { version_message(); exit; },
        "charset=s"            => \$system_charset,
        "out=s"                => \$OutFile,
        "in=s"                 => \$InFile,
        "target=s"             => \@targets,
        "native_url=i"         => \&SetNative,
        "append=i"             => \&SetMode,
        "format=s"             => \&SetOutFormat,
        "delay=s"              => \&SetDelay,
        "max=i"                => \&SetMaxHits,
        "nice=i"               => \&SetNice,
        "depth=i"              => \&SetCrawlDepth,
        "mimic|pretend"        => \$mimic,
        "proxy=s"              => \$proxy,
        "style=s"              => \$style,
        "credentials=s"        => \$credentials,
        "image=s"              => \$imageURL,
        "dbconfig=s"           => \$dbconfig,
        "postprocess=s"        => \$postconfig,
        "timestamp"            => \$timestamp,
        "pagenc=s"             => \$pagenc,
    );

    if (!$options_okay) {
        print "Failed to get options!\n";
        usage_message();
        exit;
    }

    if ( !$Project ) {
        print "Please enter a wrapper project file name.\n";
        usage_message();
        exit;
    }
}

# Handler for the non-option command line argument: remember the wrapper
# project file to execute in $Project.
# Exits (after printing help) when no name is given or when a project file
# was already entered, and exits when the file does not exist.
sub SetProject {
    my ($file) = @_;

    unless (defined $file) {
        help_message();
        exit;
    }

    if ($Project) { # only a single wpf is accepted
        print "\nYou have already entered a wrapper project file.\n";
        print "Only one wpf must be supplied.\n\n";
        help_message();
        exit;
    }

    unless (-e $file) { # check if the file exists
        print "Invalid project file: ", $file, "\n";
        exit;
    }

    $Project = $file;
}

# Handler of the -search option: store the search term in $Keyword,
# decoded from the system charset unless that charset is utf8.
# Prints help and exits when no term is given.
sub SetKeyword {
    my (undef, $term) = @_;

    unless (defined $term) {
        help_message();
        exit;
    }

    $Keyword = $utf8_flag ? $term : decode($system_charset,$term);
}

# Handler of the -append option: translate its numeric argument into the
# output file mode kept in $Mode (0 => overwrite, 1 => append,
# 2 => prepend). Prints help and exits on a missing or any other value.
sub SetMode {
    my ($option,$arg) = @_;
    if (!defined $arg) { help_message(); exit; }

    my @mode_names = ('overwrite', 'append', 'prepend');

    # numeric comparison, the index of a mode name is its option value
    my ($code) = grep { $arg == $_ } 0 .. $#mode_names;
    if (!defined $code) { help_message(); exit; }

    $Mode = $mode_names[$code];
}

# Handler of the -nice option: set $nice to 0 or 1 according to the numeric
# argument. Prints help and exits on a missing or any other value.
sub SetNice {
    my ($option,$arg) = @_;
    if (!defined $arg) { help_message(); exit; }

    my ($flag) = grep { $arg == $_ } (0, 1);
    if (!defined $flag) { help_message(); exit; }

    $nice = $flag;
}

# Option handler: sets $ExtractPageURL to 0 or 1.
#   first argument  - option name (unused)
#   second argument - 0 or 1
# Shows the help text and exits for a missing value or any other number.
sub SetNative {
    my (undef, $flag) = @_;

    if (defined $flag) {
        return $ExtractPageURL = 0 if $flag == 0;
        return $ExtractPageURL = 1 if $flag == 1;
    }

    # Missing or out-of-range value.
    help_message();
    exit;
}

# Option handler: maps a format keyword onto the value stored in $OutFormat.
#   first argument  - option name (unused)
#   second argument - one of txt, xml, html, xls, ods, csv, rss, dbi
#                     (matched case-insensitively)
# Unless $utf8_flag is set, the keyword is decoded from $system_charset
# before matching. Shows the help text and exits for a missing or unknown
# keyword.
sub SetOutFormat {
    my (undef, $requested) = @_;

    unless (defined $requested) {
        help_message();
        exit;
    }

    $requested = decode($system_charset, $requested) unless $utf8_flag;

    # Ordered (pattern, format name) pairs; the first matching pattern wins.
    my @format_table = (
        [ qr/^txt$/i,  'TabDelimited' ],
        [ qr/^xml$/i,  'XML'          ],
        [ qr/^html$/i, 'HTML'         ],
        [ qr/^xls$/i,  'Excel'        ],
        [ qr/^ods$/i,  'ODS'          ],
        [ qr/^csv$/i,  'CSV'          ],
        [ qr/^rss$/i,  'RSS'          ],
        [ qr/^dbi$/i,  'DBI'          ],
    );

    for my $entry (@format_table) {
        my ($pattern, $format_name) = @{$entry};
        return $OutFormat = $format_name if $requested =~ $pattern;
    }

    help_message();
    exit;
}

# Handler for the "depth=i" option: stores the maximum crawl depth in
# $MaxCrawlDepth.
#   first argument  - option name (unused)
#   second argument - depth; must be >= -1
#                     (0 => follow no links, -1 => follow any number of links)
# Shows the help text and exits when the value is missing or below -1.
sub SetCrawlDepth {
    my (undef, $depth) = @_;

    unless (defined $depth) {
        help_message();
        exit;
    }

    $depth = decode($system_charset, $depth) unless $utf8_flag;

    my $below_minimum = $depth < -1;
    if ($below_minimum) {
        for my $line (
            "Max crawl depth must be greater than or equal to -1.\n",
            "0 => don't follow any links, -1 => follow any number of links\n",
        ) {
            print $line;
        }
        help_message();
        exit;
    }

    $MaxCrawlDepth = $depth;
}

# Handler for the "max=i" option: stores the maximum number of hits in
# $MaxHits.
#   $option - option name passed by the option parser (unused)
#   $arg    - requested maximum; must be >= 0
# Shows the help text and exits when the value is missing or negative.
sub SetMaxHits {
    my ($option,$arg) = @_;
    # Guard against a missing value, as the sibling option handlers do;
    # without it undef would reach the numeric comparison below and end up
    # stored in $MaxHits.
    if (!defined $arg) { help_message(); exit; }
    if (!$utf8_flag) {
        $arg = decode($system_charset,$arg);
    }
    if ($arg < 0){
        print "Max hits must be greater than or equal to 0.\n";
        help_message();
        exit;
    }
    $MaxHits = $arg;
}

# Option handler: stores the supplied delay value in $delay.
#   first argument  - option name (unused)
#   second argument - delay value, stored as given (no range validation)
# Unless $utf8_flag is set, the value is decoded from $system_charset first.
# Shows the help text and exits when no value is supplied.
sub SetDelay {
    my (undef, $pause) = @_;

    unless (defined $pause) {
        help_message();
        exit;
    }

    $delay = $utf8_flag ? $pause : decode($system_charset, $pause);
}

# Prints a one-line banner containing the executor's version.
sub version_message {
    my $banner = "DEiXTo Perl Executor: Version " . version . ".\n";
    print $banner;
}

# Prints the command-line synopsis (program name, positional PROJECT
# argument and every supported switch) to standard output.
sub usage_message {
    my $synopsis = <<"END_USAGE";
Usage: deixto_executor.exe PROJECT
       [-out path] [-in path] [-target url(s)] [-append 0|1|2]
       [-format txt|csv|xls|ods|xml|html|rss|dbi] [-delay time]
       [-max \#hits] [-nice 0|1] [-mimic] [-search keyword]
       [-proxy http://your.proxy.server:port] [-image url]
       [-credentials "username password"] [-style xsl]
       [-depth \#pages] [-charset encoding] [-dbconfig path]
       [-postprocess path] [-native_url 0|1] [-timestamp] [-pagenc encoding]
       [-print] [-help] [-usage] [-version]
# Command line parameters override those defined in a wpf.
END_USAGE
    print $synopsis;
}

# Prints the full help text to standard output: the version banner, the
# usage synopsis, and then a one-line description of every option.
sub help_message {
    version_message();
    usage_message();
    print <<"END_HELP";
Options:
 -help|?                    Displays this message.
 -usage                     Usage summary.
 -version                   Version number.
 -charset encoding          Operating system\'s character set
 -out filepath              Output filename.
 -in filepath               Input filename with target URLs.
 -target url(s)             Override target URL(s).
 -append 0|1|2              Output write mode (overwrite or append or prepend).
 -native_url 0|1            Extract record\'s native URL.
 -format filetype           Output file format.
 -max hits                  Max number of hits.
 -search keyword            Keyword for auto_fill_and_submit_form.
 -depth pages               Max crawling depth when following \'next\' links.
 -delay time                Sleep time between agent\'s http requests.
 -print                     Print results to screen.
 -nice 0|1                  Respect robots.txt file on target website's root folder.
 -mimic                     Mimic Mozilla browser.
 -proxy                     Proxy server to be used.
 -style xsl                 XSL Stylesheet to be used (for XML output).
 -credentials               Credentials for HTTP Basic authentication.
 -image url                 Image URL used in RSS output files.
 -dbconfig filepath         Dbconfig file.
 -postprocess filepath      Enable the postprocessing mechanism via a config file.
 -timestamp                 Add a timestamp at the records found.
 -pagenc encoding           Force use of a certain encoding instead of that specified in the target page source.
END_HELP
}

#============ PARSE COMMAND LINE PARAMETERS =============
