#!/usr/bin/perl
##############################################################################
#   DEiXToBot - Version 1.4.0 - 26 January 2014                              #
#----------------------------------------------------------------------------#
#   A Sleepy Mechanize Agent that can execute GUI DEiXTo generated tree      #
#   based extraction rules and produce user specified formatted output.      #
#   For more information about DEiXTo: http://deixto.com/                    #
#----------------------------------------------------------------------------#
#   Copyright 2007-2014, Kostas Ntonas <kntonas@gmail.com>                   #
#----------------------------------------------------------------------------#
#   This program is free software: you can redistribute it and/or modify     #
#   it under the terms of the GNU General Public License as published by     #
#   the Free Software Foundation, either version 3 of the License, or        #
#   (at your option) any later version.                                      #
#                                                                            #
#   This program is distributed in the hope that it will be useful,          #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#   GNU General Public License for more details.                             #
#                                                                            #
#   You should have received a copy of the GNU General Public License        #
#   along with this program. If not, see <http://www.gnu.org/licenses/>.     #
##############################################################################

package DEiXToBot;

use version;
our $VERSION = qv('1.4.0');

use strict;
use warnings;
use utf8;

use base qw( WWW::Mechanize::Sleepy );

###########################################################################
# Library Modules                                                         #
###########################################################################

use URI;
use Encode;
use Encode::Guess;
use Tree::Fast;
use XML::LibXML;
use HTML::Strip;
use HTML::Entities;
use IO::File;
use XML::Writer;
use HTML::Template;
use File::stat;
use XML::RSS;
use Config::General;
use DBI;
use Spreadsheet::Write;
use Spreadsheet::Read;
use OpenOffice::OODoc;
use WWW::RobotRules;
use List::Util qw ( max );
use List::MoreUtils qw ( any firstidx );
use Scalar::Util qw( looks_like_number reftype );
use Carp qw ( croak );
use UNIVERSAL::isa;
use Class::Std::Utils;
use DateTime;

###########################################################################
# Constants                                                               #
###########################################################################

# define states of pattern treenodes
use constant {
    unchecked       => 1, # don't care about this node
    checked         => 2, # required - extract its text content
    grayed          => 3, # required to be there
    checked_implied => 4, # optional - extract its content if there
    grayed_implied  => 5, # optional
    checked_source  => 6, # required - extract its HTML source code
};

# output file write modes (overwrite or append or prepend)
use constant OVERWRITE => 0;
use constant APPEND => 1;
use constant PREPEND => 2;

###########################################################################
# Global variables                                                        #
###########################################################################

my $droot;      # the root treenode of the fetched page's DOM tree
my $proot;      # the real root (first node) of the pattern tree
my $vroot;      # the virtual root of the pattern tree
my @candidates; # treenodes of the DOM tree of the fetched web page
# that have the same tag name as the virtual root node.
# Only these will be checked against the pattern.

# Objects of this class have the following attributes...
my %os_charset_of;  # Character set of the operating system
my %pagenc_of;  # Encoding of the target page (force the use of a different encoding than the one specified in the source)
my %extract_url_of; # Extract a record's page url as well
my %nice_of; # Respect robots.txt flag
my %hits_of; # Counter of records found on a page
my %max_hits_of; # Max number of hits
my %ignore_tags_of;  # HTML tags that should be ignored  when building DOM
my %parser_of;  # XML::LibXML parser for parsing HTML pages!
my %rules_of;   # a WWW::RobotRules object for robots.txt files
my %labels_of;  # Labels of the extracted fields of a record instance
my %records_of; # Records found
my %checked_robots_of; # Hash with those authorities whose "robots.txt"
# files we 've gotten and parsed

my $DOWARN = 1; # flag used to enable/disable compile-time or run-time warnings (enabled by default)

# wipe out all compile-time or run-time warnings
BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } }

sub new {
    my $class = shift;
    my %option = @_;

    my $nice = 1;
    if ( exists($option{nice}) ) {
        $nice = $option{nice}  ? 1 : 0;
        delete $option{nice};
    }

    my $os_charset = 'utf8';
    if ( exists($option{os_charset}) ) {
        $os_charset = $option{os_charset};
        croak "new: os_charset can't be undef.\n"
            unless defined $os_charset;
        my $obj = find_encoding($os_charset);
        croak "new: $os_charset is not a supported encoding.\n"
            unless ref $obj;
        delete $option{os_charset};
    }
    
    my $pagenc;
    if ( exists($option{pagenc}) ) {
        $pagenc = $option{pagenc};
        croak "new: pagenc can't be undef.\n"
            unless defined $pagenc;
        my $obj = find_encoding($pagenc);
        croak "new: $pagenc is not a supported encoding.\n"
            unless ref $obj;
        delete $option{pagenc};
    }    

    my $self = $class->SUPER::new( %option );

    $self->agent( 'DEiXToBot/' . $VERSION );

    # Convert an object reference into a unique ID number
    my $id_num = ident $self;

    $nice_of{$id_num} = $nice;
    $os_charset_of{$id_num} = $os_charset;
    $pagenc_of{$id_num} = $pagenc;
    $extract_url_of{$id_num} = 0;
    $hits_of{$id_num} = 0;
    $records_of{$id_num} = [qw ()];
    $labels_of{$id_num} = [qw ()];
    $ignore_tags_of{$id_num} = [qw ()];

    # sometimes using the wrong tool for the right reasons can be
    # pretty useful! That's why we parse HTML using an XML parser!
    # Set up the parser, and set it to recover
    # from errors so that it can handle broken HTML
    my $parser = XML::LibXML->new();

    # turn the parser warnings off and turn the parser's recover mode on
    $parser->recover_silently(1);

    # do not maintain white-space in the document
    $parser->keep_blanks(0);

    $parser_of{$id_num} = $parser;

    # create a database of robots.txt-derived permissions
    my $rules = WWW::RobotRules->new($self->agent);
    $rules_of{$id_num} = $rules;

    return $self;
}

sub DESTROY {
    my $self = shift;

    my $id_num = ident $self;

    delete $os_charset_of{$id_num};
    delete $pagenc_of{$id_num};    
    delete $extract_url_of{$id_num};
    delete $nice_of{$id_num};
    delete $hits_of{$id_num};
    delete $max_hits_of{$id_num};
    delete $ignore_tags_of{$id_num};
    delete $labels_of{$id_num};
    delete $records_of{$id_num};
    delete $parser_of{$id_num};
    delete $rules_of{$id_num};
    delete $checked_robots_of{$id_num};
}

sub _check_robotstxt {
    my ($self, $url) = @_;

    my $uri = eval { URI->new($url) };
    $@ and do {
        croak "Error: $@\n";
    };

    my $authority = $uri->authority;

    if (! exists $checked_robots_of{ident $self}{$authority}) {
        $checked_robots_of{ident $self}{$authority} = 1;
        my $robots_url = $uri->scheme.'://'.$authority.'/robots.txt';
        my $response = $self->get($robots_url);
        return 1
            unless $response->is_success; # ok if robots.txt does not exist
        my $robots_txt =  $self->content;
        $rules_of{ident $self}->parse($robots_url, $robots_txt);
    }

    my $name = $self->agent;

    # Now we can check the URL under interest against the "robots.txt"
    croak "Access forbidden for $name.\n"
        unless $rules_of{ident $self}->allowed($url);

    return 1;
}

sub request {
    my $self = shift;
    if ($nice_of{ident $self} && ! ($_[0]->uri =~/robots\.txt$/)) {
        $self->_check_robotstxt($_[0]->uri);
    }
    $self->SUPER::request(@_);
}

#========== SETTERS - GETTERS ==========

# set extract record's page URL flag - default is off
sub extract_url {
    croak "extract_url: you must supply an argument.\n" if @_ != 2;
    my ($self, $param) = @_;
    ref $self or croak "Instance variable needed.\n";
    $extract_url_of{ident $self} = $param ? 1 : 0;
}

# get number of hits found
sub hits {
    my $self = shift;
    ref $self or croak "Instance variable needed.\n";
    $hits_of{ident $self};
}

# set/get max number of hits to find on a page
sub max_hits {
    my $self = shift;

    ref $self or croak "Instance variable needed.\n";

    if (@_) { # are there any more parameters?
        # yes, it's a setter
        my $param = shift;

        croak "Error: max_hits should be an integer.\n"
            unless looks_like_number $param;

        croak "Error: max_hits should be greater than or equal to 0.\n"
            unless $param >= 0;

        $max_hits_of{ident $self} = $param;
    }
    else {
        # no, it's a getter:
        $max_hits_of{ident $self};
    }
}

# get records found - returns reference to an array of array references
sub records {
    my $self = shift;
    ref $self or croak "Instance variable needed.\n";
    #my @records = @{$records_of{ident $self}};
    if (wantarray) { # list context
        return @{$records_of{ident $self}};#@records;
    }
    # scalar context
    return \@{$records_of{ident $self}};#@records;
}

# set/get the operating system's character set
sub os_charset {
    my ($self) = shift;

    ref $self or croak "Instance variable needed.\n";

    if (@_) {
        my $param = shift;
        my $obj = find_encoding($param);
        croak "os_charset: supplied encoding not supported.\n"
            unless ref $obj;
        $os_charset_of{ident $self} = $param;
    }
    else {
        $os_charset_of{ident $self};
    }
}

sub pagenc {
    my ($self) = shift;

    ref $self or croak "Instance variable needed.\n";

    if (@_) {
        my $param = shift;
        my $obj = find_encoding($param);
        croak "pagenc: supplied encoding not supported.\n"
            unless ref $obj;
        $pagenc_of{ident $self} = $param;
    }
    else {
        $pagenc_of{ident $self};
    }
}

# set/get the nice option (check or not the robots.txt files)
sub nice {
    my ($self) = shift;

    ref $self or croak "Instance variable needed.\n";

    if (@_) {
        my $param = shift;
        $nice_of{ident $self} = $param ? 1 : 0;
    }
    else {
        $nice_of{ident $self};
    }
}

# get labels of extracted fields.
sub labels {
    my $self = shift;
    ref $self or croak "Instance variable needed.\n";
    my @labels = @{$labels_of{ident $self}};
    if (wantarray) { # list context
        return @labels;
    }
    # scalar context
    return \@labels;
}

# set/get tags to ignore when building a page's DOM tree. The second
# parameter should be a reference to an array containing HTML tags
sub ignore_tags {
    my $self = shift;

    ref $self or croak "Instance variable needed.\n";

    if (@_) {
        my $param = shift;
        croak "ignore_tags: the parameter should be an array reference.\n"
            unless _is_array_ref($param);
        $ignore_tags_of{ident $self} = $param;
    }
    else {
        my @tags = @{$ignore_tags_of{ident $self}};
        if (wantarray) { # list context
            return @tags;
        }
        # scalar context
        return \@tags;
    }
}

sub load_pattern {
    my ($self, $file) = @_;

    ref $self or croak "Instance variable needed.\n";

    croak "load_pattern: a filename must be supplied." if !defined $file;

    croak "Could not open $file for reading: $!\n"
        unless -e $file;

    my $xmlparser = XML::LibXML->new();
    #$xmlparser->validation(1); pattern.dtd..

    my $xmltree;
    eval { $xmltree = $xmlparser->parse_file($file); };
    if ($@) {
        print "Could not parse the pattern file: $file.\n";
        croak "$@\n";
    }

    my $doc_elm = $xmltree->getDocumentElement;
    if (!$doc_elm) {
        croak "load_pattern: could not get Document Element.\n";
    }

    if ($doc_elm->nodeName ne 'Pattern') {
        croak $doc_elm->nodeName.": syntax error! 'Pattern' expected.\n";
    }

    my $node = $doc_elm->firstChild;

    $labels_of{ident $self} = [qw ()]; # initialize labels

    $self->_process_pattern_node($node);

    # initialize hits and the records structure
    $self->discard_records;

    return 1;
}

# set pattern - supply proot, vroot nodes
sub set_pattern {
    my ($self, $pattern_root, $virtual_root) = @_;

    ref $self or croak "Instance variable needed.\n";

    croak "set_pattern: the supplied pattern_root argument must be a "
          ."reference to a Tree::Fast object.\n"
        if (ref $pattern_root ne 'Tree::Fast');

    croak "set_pattern: the supplied virtual_root argument must be a "
          ."reference to a Tree::Fast object.\n"
        if (ref $virtual_root ne 'Tree::Fast');

    $proot = $pattern_root;
    $vroot = $virtual_root;

    $labels_of{ident $self} = [qw ()]; # initialize labels

    # find labels of the fields under interest
    $self->_find_labels();

    # initialize hits and the records structure
    $self->discard_records;

    return 1;
}

# check if a variable is an array reference
sub _is_array_ref {
    my ($ref) = @_;

    # throw out non-references early.
    return 0 unless ref $ref;

    # throw out non array references
    return 0 unless reftype $ref eq 'ARRAY';

    # it's an array reference
    return 1;
}

# check if a variable is a hash reference
sub _is_hash_ref {
    my ($ref) = @_;

    # throw out non-references early.
    return 0 unless ref $ref;

    # throw out non hash references
    return 0 unless reftype $ref eq 'HASH';

    # it's a hash reference
    return 1;
}

#---------- SETTERS - GETTERS ----------

sub clone {
    my $self  = shift;
    my $clone = $self->SUPER::clone();

    $clone->cookie_jar( $self->cookie_jar );

    my $id_num1 = ident $self;
    my $id_num2 = ident $clone;

    $nice_of{$id_num2} = $nice_of{$id_num1};
    $os_charset_of{$id_num2} = $os_charset_of{$id_num1};
    $pagenc_of{$id_num2} = $pagenc_of{$id_num1};    
    $extract_url_of{$id_num2} = 0;
    $hits_of{$id_num2} = 0;
    $records_of{$id_num2} = [qw ()];
    $labels_of{$id_num2} = [qw ()];
    $ignore_tags_of{$id_num2} = [qw ()];

    my $parser = XML::LibXML->new();
    $parser->recover_silently(1);
    $parser->keep_blanks(0);
    $parser_of{$id_num2} = $parser;

    my $rules = WWW::RobotRules->new($clone->agent);
    $rules_of{$id_num2} = $rules;

    return $clone;
}

sub discard_records {
    my $self = shift;
    ref $self or croak "Instance variable needed.\n";
    $hits_of{ident $self} = 0;
    $records_of{ident $self} = [qw ()];
    return 1;
}

sub results_to_file {
    my ($self, $file, $format, $mode, $timestamp, $postconfig, $style) = @_;

    ref $self or croak "Instance variable needed.\n";

    croak "results_to_file: all five (or six) arguments must be supplied.\n"
        if (@_ != 6 && @_ != 7);

    croak "results_to_file: all three arguments must be supplied.\n"
        if any {!defined $_} $file, $format, $mode;

    if ($mode) { # append/prepend mode
        if ($format eq 'txt') {
            if ($mode == 1) {
                $self->_append_txt($file,$timestamp,$postconfig);
            }
            else {
                $self->_prepend_txt($file,$timestamp,$postconfig);
            }
        }
        elsif ($format eq 'xml') {
            $self->_append_xml($file,$timestamp,$postconfig,$style,$mode);
        }
        elsif ($format eq 'html') {
            $self->_append_html($file,$mode,$timestamp,$postconfig);
        }

        elsif ($format eq 'xls') {
            $self->_append_xls($file,$mode,$timestamp,$postconfig);
        }
        elsif ($format eq 'csv') {
            $self->_append_csv($file,$mode,$timestamp,$postconfig);
        }
        elsif ($format eq 'ods') {
            $self->_append_ods($file,$mode,$timestamp,$postconfig);
        }
        else {
            croak "results_to_file: not supported format.\n";
        }
    }
    else { # overwrite mode
        if ($format eq 'txt') {
            $self->_write_txt($file,$timestamp,$postconfig);
        }
        elsif ($format eq 'xml') {
            $self->_write_xml($file,$timestamp,$postconfig,$style);
        }
        elsif ($format eq 'html') {
            $self->_write_html($file,$timestamp,$postconfig);
        }
        elsif ($format eq 'xls') {
            $self->_write_xls($file,$timestamp,$postconfig);
        }
        elsif ($format eq 'csv') {
            $self->_write_csv($file,$timestamp,$postconfig);
        }
        elsif ($format eq 'ods') {
            $self->_write_ods($file,$timestamp,$postconfig);
        }
        else {
            croak "results_to_file: not supported format.\n";
        }
    }

    return 1;
}

sub _write_ods {
    my ($self, $filename, $timestamp, $postconfig) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! defined $filename) {
        croak "_write_ods: filename required.\n";
    }

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    # print results to file
    my $doc = odfDocument(file => $filename, create => 'spreadsheet', template_path => '.');

    my @headers = @{$labels_of{$id_num}};
    if ( $extract_url_of{$id_num} ) {
        unshift @headers, 'native_url';
    }

    if ($timestamp) {
        push @headers, 'timestamp';
    }

    my $first_sheet = $doc->getTable(0);
    my $rows = scalar(@{$records_of{$id_num}});
    my $columns = scalar(@headers);

    $doc->expandTable($first_sheet, $rows+1, $columns);

    my $col = 0;
    foreach my $label (@headers) {
        $doc->updateCell($first_sheet, 0, $col++, $label);
    }

    my $row = 1;
    foreach my $record (@{$records_of{$id_num}}) {
        my $col = 0;
        for my $field (@$record) {
            if ($field) {
                $field =~s#\s+# #g;
                $doc->updateCell($first_sheet, $row, $col++, encode_utf8($field));
            }
            else {
                $doc->updateCell($first_sheet, $row, $col++, '');
            }
        }
        if ($timestamp) {
            my $now = DateTime->now( time_zone => 'local' );
            $doc->updateCell($first_sheet, $row, $col++, $now);
        }
        $row++;
    }
    $doc->renameTable(0, 'DEiXTO Extracted Content');
    $doc->save;

    return 1;
}

sub _append_ods {
    my ($self, $filename, $mode, $timestamp, $postconfig) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! defined $filename) {
        croak "_append_ods: filename required.\n";
    }

    if (! -e $filename) {
        return $self->_write_ods($filename,$timestamp,$postconfig);
    }

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    # read previous contents
    my $doc = odfDocument(file => $filename, template_path => '.');
    if (!$doc) {
        croak "_append_ods: could not get access to the content of the file.\n";
    }

    my $first_sheet = $doc->getTable(0);

    my ($rows_old, $columns_old) = $doc->getTableSize($first_sheet);
    if (! $rows_old) {
        return $self->_write_ods($filename);
    }
    my $prev_rows=$rows_old-1;

    my @prev_contents; # get previous contents into an array
    for my $row (1..$rows_old-1) { # skip headers - row 0
        my @record;
        for my $col (0..$columns_old-1) {
            push @record, $doc->getCellValue($first_sheet, $row, $col);
        }
        push @prev_contents, \@record;
    }

    # print results to  a newly created file
    $doc = odfDocument(file => $filename, create => 'spreadsheet', template_path => '.');

    my @headers = @{$labels_of{$id_num}};
    if ( $extract_url_of{$id_num} ) {
        unshift @headers, 'native_url';
    }

    if ($timestamp) {
        push @headers, 'timestamp';
    }

    $first_sheet = $doc->getTable(0);
    my $rows = scalar(@{$records_of{$id_num}});
    my $columns = scalar(@headers);

    $doc->expandTable($first_sheet, $rows+$rows_old, max($columns,$columns_old));

    my $col = 0;
    foreach my $label (@headers) {
        $doc->updateCell($first_sheet, 0, $col++, $label);
    }

    if ($mode == 1) {
        for (my $i=0; $i < $prev_rows; $i++) { # add previous contents
            for(my $j=0; $j < $columns_old; $j++) {
                my $cell = ${$prev_contents[$i]}[$j];
                $doc->updateCell($first_sheet, $i+1, $j, $cell);
            }
        }

        my $row = $prev_rows+1;
        foreach my $record (@{$records_of{$id_num}}) { # add new records
            my $col = 0;
            for my $field (@$record) {
                if ($field) {
                    $field =~s#\s+# #g;
                    $doc->updateCell($first_sheet, $row, $col++, encode_utf8($field));
                }
                else {
                    $doc->updateCell($first_sheet, $row, $col++, '');
                }
            }
            if ($timestamp) {
                my $now = DateTime->now( time_zone => 'local' );
                $doc->updateCell($first_sheet, $row, $col++, $now);
            }
            $row++;
        }
    }
    else { # mode 2
        my $row = 1;
        foreach my $record (@{$records_of{$id_num}}) { # add new records
            my $col = 0;
            for my $field (@$record) {
                if ($field) {
                    $field =~s#\s+# #g;
                    $doc->updateCell($first_sheet, $row, $col++, encode_utf8($field));
                }
                else {
                    $doc->updateCell($first_sheet, $row, $col++, '');
                }
            }
            if ($timestamp) {
                my $now = DateTime->now( time_zone => 'local' );
                $doc->updateCell($first_sheet, $row, $col++, $now);
            }
            $row++;
        }

        $row = $rows+1;
        for (my $i=0; $i < $rows_old; $i++) { # add previous contents
            for(my $j=0; $j < $columns_old; $j++) {
                my $cell = ${$prev_contents[$i]}[$j];
                $doc->updateCell($first_sheet, $row, $j, $cell);
            }
            $row++;
        }
    }

    $doc->save;

    return 1;
}

sub _write_csv {
    my ($self, $filename, $timestamp, $postconfig) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! defined $filename) {
        croak "_write_csv: filename required.\n";
    }

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    my $h = Spreadsheet::Write->new(
        'file'      => $filename,
        'format'  	=> 'csv',
        'encoding'  => 'UTF-8',
    );
    croak $h->error() if $h->error;

    # print results to file
    foreach my $record (@{$records_of{$id_num}}) {
        for my $field (@$record) {
            $field =~s#\s+# #g;
        }
        if ($timestamp) {
            my $now = DateTime->now( time_zone => 'local' );
            push @$record, $now;
        }
        $h->addrow( @$record );
    }
    return 1;
}

sub _append_csv {
    my ($self, $filename,$mode,$timestamp,$postconfig) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! defined $filename) {
        croak "_append_csv: filename required.\n";
    }

    if (! -e $filename) {
        return $self->_write_csv($filename,$timestamp,$postconfig);
    }

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    my $ref = Spreadsheet::Read::ReadData($filename);
    if ( not defined $ref ) {
        return $self->_write_csv($filename);
    }

    my @rows = Spreadsheet::Read::rows ($ref->[1]);
    my $last_row = $ref->[1]{maxrow};
    my $last_col = $ref->[1]{maxcol};

    my $h = Spreadsheet::Write->new(
        'file'     => $filename,
        'format'   => 'csv',
        'encoding' => 'UTF-8',
    );

    croak $h->error() if $h->error;

    if ($mode == 1) {
        for (my $i=0; $i < $last_row; $i++) { # add previous contents
            my @prev_contents;
            for(my $j=0; $j < $last_col; $j++) {
                my $cell = $rows[$i][$j];
                my $utf8;
                if ($cell) {
                    my $enc = guess_encoding($cell);
                    if ( ref($enc) ) {
                        $utf8 = $enc->decode($cell);
                    }
                    else {
                        $utf8 = decode("UTF-16BE", $cell);
                    }
                }
                push @prev_contents, $utf8;
            }
            $h->addrow( @prev_contents );
        }

        foreach my $record (@{$records_of{$id_num}}) { # add new records
            for my $field (@$record) {
                if ($field) {
                    $field =~s#\s+# #g;
                }
            }
            if ($timestamp) {
                my $now = DateTime->now( time_zone => 'local' );
                push @$record, $now;
            }
            $h->addrow( @$record );
        }
    }
    else { # mode 2
        foreach my $record (@{$records_of{$id_num}}) { # add new records
            for my $field (@$record) {
                if ($field) {
                    $field =~s#\s+# #g;
                }
            }
            if ($timestamp) {
                my $now = DateTime->now( time_zone => 'local' );
                push @$record, $now;
            }
            $h->addrow( @$record );
        }
        for (my $i=0; $i < $last_row; $i++) { # add previous contents
            my @prev_contents;
            for(my $j=0; $j < $last_col; $j++) {
                my $cell = $rows[$i][$j];
                my $utf8;
                if ($cell) {
                    my $enc = guess_encoding($cell);
                    if ( ref($enc) ) {
                        $utf8 = $enc->decode($cell);
                    }
                    else {
                        $utf8 = decode("UTF-16BE", $cell);
                    }
                }
                push @prev_contents, $utf8;
            }
            $h->addrow( @prev_contents );
        }
    }

    return 1;
}

sub _write_xls {
    my ($self, $filename, $timestamp, $postconfig) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! defined $filename) {
        croak "_write_xls: filename required.\n";
    }

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    my $h = Spreadsheet::Write->new(
        'file'     => $filename,
        'format'   => 'xls',
        'sheet'    => 'DEiXTO Extracted Content',
        'styles'   => {
            header => { font_weight => 'bold' },
        },
    );

    croak $h->error() if $h->error;

    my $headers = _excel_headers($self);
    if ($timestamp) {
        push @$headers, 'timestamp';
    }
    $h->addrow({ style => 'header', content => $headers } );

    # print results to file
    foreach my $record (@{$records_of{$id_num}}) {
        for my $field (@$record) {
            if (! defined $field) { next; }
            $field =~s#\s+# #g;
        }

        if ($timestamp) {
            my $now = DateTime->now( time_zone => 'local' );
            push @$record, $now;
        }
        $h->addrow( @$record );
    }

    return 1;
}

sub _append_xls {
    my ($self, $filename, $mode, $timestamp, $postconfig) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! -e $filename) {
        return $self->_write_xls($filename,$timestamp,$postconfig);
    }

    my $ref = Spreadsheet::Read::ReadData($filename);
    if ( not defined $ref ) {
        return $self->_write_xls($filename);
    }

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    my @rows = Spreadsheet::Read::rows ($ref->[1]);
    my $last_row = $ref->[1]{maxrow};
    my $last_col = $ref->[1]{maxcol};

    my $h = Spreadsheet::Write->new(
        'file'     => $filename,
        'format'   => 'xls',
        'sheet'    => 'DEiXTO Extracted Content',
        'styles'   => {
            header => { font_weight => 'bold' },
        },
    );
    croak $h->error() if $h->error;

    my $headers = _excel_headers($self);
    if ($timestamp) {
        push @$headers, 'timestamp';
    }

    $h->addrow({ style => 'header', content => $headers } );

    if ($mode == 1) {
        for (my $i=1; $i < $last_row; $i++) { # add previous contents
            my @prev_contents;
            for(my $j=0; $j < $last_col; $j++) {
                my $cell = $rows[$i][$j];
                my $utf8;
                if ($cell) {
                    my $enc = guess_encoding($cell);
                    if ( ref($enc) ) {
                        $utf8 = $enc->decode($cell);
                    }
                    else { $utf8 = decode("UTF-16BE", $cell); }
                }
                push @prev_contents, $utf8;
            }
            $h->addrow( @prev_contents );
        }

        foreach my $record (@{$records_of{$id_num}}) { # add new records
            for my $field (@$record) {
                if ($field) {
                    $field =~s#\s+# #g;
                }
            }
            if ($timestamp) {
                my $now = DateTime->now( time_zone => 'local' );
                push @$record, $now;
            }
            $h->addrow( @$record );
        }
    }
    else { # mode 2
        foreach my $record (@{$records_of{$id_num}}) { # add new records
            for my $field (@$record) {
                if ($field) {
                    $field =~s#\s+# #g;
                }
            }
            if ($timestamp) {
                my $now = DateTime->now( time_zone => 'local' );
                push @$record, $now;
            }
            $h->addrow( @$record );
        }
        for (my $i=1; $i < $last_row; $i++) { # add previous contents
            my @prev_contents;
            for(my $j=0; $j < $last_col; $j++) {
                my $cell = $rows[$i][$j];
                my $utf8;
                if ($cell) {
                    my $enc = guess_encoding($cell);
                    if ( ref($enc) ) {
                        $utf8 = $enc->decode($cell);
                    }
                    else { $utf8 = decode("UTF-16BE", $cell); }
                }
                push @prev_contents, $utf8;
            }
            $h->addrow( @prev_contents );
        }
    }

    return 1;
}

sub _excel_headers {
    my $self = shift;

    my @array;
    my $id_num = ident $self;
    if ( $extract_url_of{$id_num} ) {
        push @array, 'native_url';
    }

    foreach my $label (@{$labels_of{$id_num}}) {
        push @array, $label;
    }

    return(\@array);
}

sub write_rss {
    my ($self, $filename, $rsshash_ref, $postconfig, $imgURL) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! defined $filename) {
        croak "write_rss: filename required.\n";
    }

    if (! defined $rsshash_ref) {
        croak "write_rss: a hash reference is required.\n";
    }

    croak "write_rss: a hash reference is required.\n"
        unless _is_hash_ref($rsshash_ref);

    unless (any {$_ eq 'RssTitle'} $self->labels or
            any {$_ eq 'RssDescription'} $self->labels) {
      croak 'All elements of an item are optional, however at least one of '
            .'title (RssTitle label) or description (RssDescription label) '
            .'must be present.';
    }

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    my %rsshash = %$rsshash_ref;

    my $rss = XML::RSS->new(version => '2.0');

    my $gm = gmtime();
    my $curtime = _RFC_822_date($gm);
    my $lastBuildDate = $curtime;
    my $pubDate;
    if ( ! -e $filename) {
        $pubDate = $curtime;
    }
    else {
        my $t = stat($filename)->ctime;
        $t = gmtime($t);
        $pubDate = _RFC_822_date($t);
    }

    my $title;
    if ( exists $rsshash{'title'} ) {
        $title = $rsshash{'title'};
    }
    elsif ($self->title) {
        eval {$title = decode($self->get_charset,$self->title)};
        if ($@) {
            $title = "DEiXTo: ".$self->title;
        }
        else {
            $title = "DEiXTo: ".$title;
        }
    }
    else {
        $title = "DEiXTo: No channel title!";
    }

    my $description;
    if ( exists $rsshash{'description'} ) {
        $description = $rsshash{'description'};
    }
    else {
        $description = 'DEiXTo generated RSS feed';
    }

    my $link;
    if ( exists $rsshash{'link'} ) {
        $link = $rsshash{'link'};
    }
    else {
        $link = $self->uri;
    }

    my $copyright;
    if ( exists $rsshash{'copyright'} ) {
        $copyright = $rsshash{'copyright'};
    }
    else {
        $copyright = 'Copyright, 2007-2010';
    }

    my $language;
    if ( exists $rsshash{'language'} ) {
        $language = $rsshash{'language'};
    }
    else {
        $language = 'en-us';
    }

    my $managingEditor;
    if ( exists $rsshash{'managingEditor'} ) {
        $managingEditor = $rsshash{'managingEditor'};
    }
    else {
        $managingEditor = 'myaddress@mydomain.com';
    }

    my $webMaster;
    if ( exists $rsshash{'webMaster'} ) {
        $webMaster = $rsshash{'webMaster'};
    }
    else {
        $webMaster = 'myaddress@mydomain.com';
    }

    # create the channel object.
    $rss->channel(
        title          => $title,
        link           => $link,
        description    => $description,
        language       => $language,
        copyright      => $copyright,
        pubDate        => $pubDate,
        lastBuildDate  => $lastBuildDate,
        managingEditor => $managingEditor,
        webMaster      => $webMaster,
        generator      => 'DEiXTo',
    );

    $rss->image(
       title  => 'Go to source!',
       url    => $imgURL ? $imgURL : "http://deixto.com/images/deixto.gif",
       link   => $link,
    );

    $self->_add_rss_items($rss,'overwrite'); # add items of extracted data

    $rss->save($filename); # Save it

    return 1;
}

sub append_rss {
    my ($self, $filename, $rsshash_ref, $mode, $postconfig, $imgURL) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! defined $filename) {
        croak "append_rss: filename required.\n";
    }

    if (! -e $filename) {
        return $self->write_rss($filename, $rsshash_ref, $postconfig, $imgURL);
    }

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    my $rss = new XML::RSS();

    $rss->parsefile($filename);

    my $gm = gmtime();
    my $curtime = _RFC_822_date($gm);
    my $lastBuildDate = $curtime;

    $rss->channel(lastBuildDate => $lastBuildDate);

    $self->_add_rss_items($rss,$mode); # add items of extracted data

    $rss->save($filename); # Save it

    return 1;
}

sub _add_rss_items {
    my ($self,$rss,$mode) = @_;

    my $id_num = ident $self;

    my $i=0;  # counter
    my %hash; # store label-field index pairs
    foreach my $label (@{$labels_of{$id_num}}) {
        $hash{$label}= $i;
        $i++;
    }

    my @records;
    if ($mode eq 'prepend') {
        @records = reverse @{$records_of{$id_num}};
        $mode = 'insert';
    }
    else {
        @records = @{$records_of{$id_num}};
        $mode = 'append';
    }

    foreach my $result (@records) {
        my @record = @{$result};

        my ($title,$link,$description,$author,$pubDate);

        if (exists $hash{'RssTitle'}) {
            $title = $record[$hash{'RssTitle'}];
        }

        if (exists $hash{'RssLink'}) {
            $link = $record[$hash{'RssLink'}];
        }
        else {
            $link = $self->uri;
        }

        if (exists $hash{'RssDescription'} ) {
            $description = $record[$hash{'RssDescription'}];
        }
        if (exists $hash{'RssAuthor'} ) {
            $author =$record[$hash{'RssAuthor'}];
        }
        if (exists $hash{'RssPubDate'} ) {
            $pubDate = $record[$hash{'RssPubDate'}];
        }

        $rss->add_item(
            'title'       => $title,
            'link'        => $link,
            'description' => $description,
            'author'      => $author,
            'pubDate'     => $pubDate,
            'mode'        => $mode,
        );
    }
    return 1;
}

sub _RFC_822_date {
    # return the date in RFC_822 format (compliant to the rss specification)
    my $c_time = shift;
    my ($dw, $mo, $da, $ti, $yr) =
    ($c_time =~ /(\w{3}) +(\w{3}) +(\d{1,2}) +(\d{2}:\d{2}:\d{2}) +(\d{4})$/);

    $da = sprintf("%02d", $da);

    return $dw.", ".$da." ".$mo." ".$yr." ".$ti." GMT";
}

sub _write_html {
    my ($self, $filename, $timestamp, $postconfig) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! defined $filename) {
        croak "_write_html: filename required.\n";
    }

    if (! -e 'template.tmpl') {
        croak "template.tmpl not found..\nPlease make sure it is in the same directory as the executor.\n";
    }

    if ($postconfig) {
        postprocess($self, $postconfig);
    }

    #my $tpl = HTML::Template->new(filename => 'template.tmpl');
    open my $tmplh, '<:utf8', 'template.tmpl' or
        croak "Couldn't open 'template.tmpl': $!\n";;
    my $tpl = HTML::Template->new(filehandle => $tmplh);
    close $tmplh or croak "Couldn't close 'template.tmpl': $!\n";

    my @labels = $self->labels;
    if ( $extract_url_of{$id_num} ) {
        unshift @labels, 'native_url';
    }
    if ($timestamp) {
        push @labels, 'timestamp';
    }

    my @loop_data = ();  # initialize an array to hold your loop
    foreach my $record (@{$records_of{$id_num}}) {
        my @templabels=@labels;
        my %row_data;  # get a fresh hash for the row data
        while (@templabels and @$record) {
            my $label=shift @templabels;
            my $field = shift @$record;
            if (!$field){$field='&nbsp;';}
            $row_data{$label} = $field;
        }

        # the crucial step - push a reference to this row into the loop!
        push(@loop_data, \%row_data);
    }
    # finally, assign the loop data to the loop param, again with a reference:
    $tpl->param(RECORDS => \@loop_data);

    my $file;
    if (utf8::is_utf8($filename)) {
        $file = encode($os_charset_of{$id_num},$filename);
    }
    else { $file = $filename; }

    open my $fh, '>:utf8', $filename or
        croak "Couldn't open '$file': $!\n";

    # process and print parsed template
    print {$fh} $tpl->output or croak "Couldn't write '$file': $!\n";

    close $fh or croak "Couldn't close '$file': $!\n";
}

sub _append_html {
    my ($self, $filename, $mode, $timestamp, $postconfig) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! -e $filename) {
        return $self->_write_html($filename);
    }

    if (! -e 'template.tmpl') {
        croak "template.tmpl not found..\nPlease make sure it is in the same directory as the executor.\n";
    }

    if ($postconfig) {
        postprocess($self, $postconfig);
    }

    #my $tpl = HTML::Template->new(filename => 'template.tmpl');
    open my $tmplh, '<:utf8', 'template.tmpl' or
        croak "Couldn't open 'template.tmpl': $!\n";;
    my $tpl = HTML::Template->new(filehandle => $tmplh);
    close $tmplh or croak "Couldn't close 'template.tmpl': $!\n";;

    my @labels = $self->labels;
    if ( $extract_url_of{$id_num} ) {
        unshift @labels, 'native_url';
    }
    if ($timestamp) {
        push @labels, 'timestamp';
    }

    my @loop_data = ();  # initialize an array to hold your loop
    foreach my $record (@{$records_of{$id_num}}) {
        my @templabels=@labels;
        my %row_data;  # get a fresh hash for the row data
        while (@templabels and @$record) {
            my $label=shift @templabels;
            my $field = shift @$record;
            if (!$field){$field='&nbsp;';}
            $row_data{$label} = $field;
        }

        # the crucial step - push a reference to this row into the loop!
        push(@loop_data, \%row_data);
    }
    # finally, assign the loop data to the loop param, again with a reference:
    $tpl->param(RECORDS => \@loop_data);

    my $html = $tpl->output;
    $html =~m#.*<!-- from now on -->(.*)<!-- till here -->.*#is;
    my $body = $1;

    my $file;
    if (utf8::is_utf8($filename)) {
        $file = encode($os_charset_of{$id_num},$filename);
    }

    open my $fh, '<:utf8', $filename or
        croak "Couldn't open '$file': $!\n";

    undef $/;
    my $content = <$fh>;
    close $fh or croak "Couldn't close '$file': $!\n";

    if ($mode == 1) {
        $content =~s#<!-- till here -->#$body<!-- till here -->#is;
    }
    else { # mode 2
        $content =~s#<!-- from now on -->#<!-- from now on -->$body#is;
    }

    open $fh, '>:utf8', $filename or
        croak "Couldn't open '$file': $!\n";

    print {$fh} $content or croak "Couldn't write '$file': $!\n";

    close $fh or croak "Couldn't close '$file': $!\n";
}

sub _write_xml {
    my ($self, $filename, $timestamp, $postconfig, $style) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! defined $filename) {
        croak "_write_xml: filename required.\n";
    }

    if ($postconfig) {
        postprocess($self, $postconfig);
    }

    my $output = new IO::File("> $filename");
    my $writer = new XML::Writer(OUTPUT => $output,
                                 DATA_MODE => 1,
                                 DATA_INDENT => 4,
                                 ENCODING => 'utf-8');

    $writer->xmlDecl('UTF-8', 'yes');

    if (defined $style) {
        $writer->pi('xml-stylesheet', 'href="'.$style.'" type="text/xsl"');
    }

    $writer->startTag('items');

    # add items of extracted data
    foreach my $record (@{$records_of{$id_num}}) {
        $writer->startTag('item');

        if ( $extract_url_of{$id_num} ) {
            my $url = shift @{$record};
            $writer->dataElement('native_url', $url);
        }

        my $j = 0;
        foreach my $item (@{$record}) {
            $writer->dataElement($labels_of{$id_num}->[$j++], $item);
        }

        if ($timestamp) {
            my $now = DateTime->now( time_zone => 'local' );
            $writer->dataElement('timestamp',$now);
        }

        $writer->endTag('item');
    }

    $writer->endTag('items');

    $writer->end();
}

sub _append_xml {
    my ($self, $filename, $timestamp, $postconfig, $style, $mode) = @_;

    ref $self or croak "Instance variable needed.\n";

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if (! -e $filename) {
        return $self->_write_xml($filename, $timestamp, $postconfig, $style);
    }

    if ($postconfig) {
        postprocess($self, $postconfig);
    }

    my $xmlparser = XML::LibXML->new();
    $xmlparser->keep_blanks(0);
    my $doc;
    eval { $doc = $xmlparser->parse_file($filename); };
    if ($@) {
        if (utf8::is_utf8($filename)) {
            $filename = encode($os_charset_of{$id_num},$filename);
        }
        print "Could not parse $filename.\n";
        croak "_append_xml: $@\n";
    }

    my $XMLDocElm = $doc->getDocumentElement;
    if ( !$XMLDocElm ) {
        croak "_append_xml: could not get Document Element.\n";
    }

    if ($mode == 1) {
        my $comment = XML::LibXML::Comment->new( " Another round " );
        $XMLDocElm->appendChild($comment);
    }

    my @records;
    if ($mode == 2) {
        my $comment = XML::LibXML::Comment->new( " Another round " );
        $XMLDocElm->insertBefore( $comment, $XMLDocElm->firstChild );
        @records = reverse @{$records_of{$id_num}};
    }
    else { @records = @{$records_of{$id_num}}; }

    # add items of extracted data
    foreach my $record (@records) {
        my $itemnode = $doc->createElement('item');

        if ($mode == 1) {
            $XMLDocElm->appendChild($itemnode);
        }
        else {
            $XMLDocElm->insertBefore( $itemnode, $XMLDocElm->firstChild );
        }

        if ( $extract_url_of{$id_num} ) {
            my $url = shift @{$record};
            my $pageurlnode = $doc->createElement('native_url');
            my $textnode = $doc->createTextNode( $url );
            $itemnode->appendChild($pageurlnode);
            $pageurlnode->appendChild($textnode);
        }

        my $j = 0;
        foreach my $item (@{$record}) {
            my $childnode = $doc->createElement($labels_of{$id_num}->[$j++]);
            $itemnode->appendChild($childnode);
            if ($item){
                my $textnode = $doc->createTextNode( $item );
                $childnode->appendChild($textnode);
            }
        }

        if ($timestamp) {
            my $childnode = $doc->createElement('timestamp');
            my $now = DateTime->now( time_zone => 'local' );
            my $textnode = $doc->createTextNode( $now );
            $itemnode->appendChild($childnode);
            $childnode->appendChild($textnode);
        }
    }

    $doc->toFile($filename,1);

    return 1;
}

# export results to a tab delimited UTF-8 text file
sub _results2txt {
    my ($self, $filename, $mode, $timestamp, $postconfig) = @_;

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    croak "_results2txt: filename required.\n" if ! defined $filename;
    croak "_results2txt: mode required.\n" if ! defined $mode;

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    my $file;
    if (utf8::is_utf8($filename)) {
        $file = encode($os_charset_of{$id_num},$filename);
    }
    open my $fh, $mode, $filename or
        croak "Couldn't open '$file': $!\n";

    # print results to file
    foreach my $record (@{$records_of{$id_num}}) {
        for my $field (@$record) {
            if ($field) {
                # substitute white characters with a space
                $field =~s#\s+# #g;
            }
            elsif (! defined $field) { $field = ''; }
        }
        if ($timestamp) {
            my $now = DateTime->now( time_zone => 'local' );
            push @$record, $now;
        }
        print {$fh} join("\t", @$record), "\n" or
            croak "Couldn't write '$file': $!\n";
    }

    close $fh or croak "Couldn't close '$file': $!\n";

    return 1;
}

# write results to a tab delimited UTF-8 text file
sub _write_txt {
    my ($self, $filename, $timestamp, $postconfig) = @_;

    ref $self or croak "Instance variable needed.\n";

    if (! defined $filename) {
        croak "_write_txt: filename required.\n";
    }

    $self->_results2txt($filename, '>:utf8', $timestamp, $postconfig);

    return 1;
}

# append results to a tab delimited UTF-8 text file
sub _append_txt {
    my ($self, $filename, $timestamp, $postconfig) = @_;

    ref $self or croak "Instance variable needed.\n";

    if (! defined $filename) {
        croak "_append_txt: filename required.\n";
    }

    $self->_results2txt($filename, '>>:utf8', $timestamp, $postconfig);

    return 1;
}

# prepend results to a tab delimited UTF-8 text file
sub _prepend_txt {
    my ($self, $filename, $timestamp, $postconfig) = @_;

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    croak "_prepend_txt: filename required.\n" if ! defined $filename;

    if (! -e $filename) {
        return $self->_write_txt($filename, $timestamp, $postconfig);
    }

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    my $file;
    if (utf8::is_utf8($filename)) {
        $file = encode($os_charset_of{$id_num},$filename);
    }

    open my $fh, '<:utf8', $filename or
        croak "Couldn't open '$file': $!\n";

    undef $/;
    my $prevcontent = <$fh>;
    close $fh or croak "Couldn't close '$file': $!\n";

    open $fh, ">:utf8", $filename or
        croak "Couldn't open '$file': $!\n";

    # print results to file
    foreach my $record (@{$records_of{$id_num}}) {
        for my $field (@$record) {
            if ($field) {
                # substitute white characters with a space
                $field =~s#\s+# #g;
            }
            elsif (! defined $field) { $field = ''; }
        }
        if ($timestamp) {
            my $now = DateTime->now( time_zone => 'local' );
            push @$record, $now;
        }

        print {$fh} join("\t", @$record), "\n" or
            croak "Couldn't write '$file': $!\n";
    }

    print {$fh} $prevcontent,"\n";

    close $fh or croak "Couldn't close '$file': $!\n";
    return 1;
}

sub db_insert {
    my ($self, $dbconfig, $filename, $timestamp, $postconfig) = @_;

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});

    if ($postconfig) {
        postprocess($self,$postconfig);
    }

    if (!$dbconfig) { $dbconfig = 'dbconfig' }
    my $conf = new Config::General($dbconfig);
    my %config = $conf->getall;

    my $server = $config{server};
    my $driver = $config{driver};
    my $db = $config{db};
    my $mode = $config{mode};
    my $user = $config{user};
    my $passwd = $config{passwd};

    # Connect to the database
    my $dbh = DBI->connect("DBI:$driver:database=$db;host=$server",$user, $passwd,{'RaiseError' => 1});

    if (($mode ne 'update') and ($mode ne 'insert')) {
        croak "db_insert: invalid mode. Choose insert or update.\n";
    }

    my %tablehash;

    my $i=0;
    if ( $extract_url_of{$id_num} ) {
        my $table = $config{native_url}{table};
        my $field = $config{native_url}{field};
        if (! $table or ! $field) {
            croak "db_insert: did not found a native_url element in the config file.\n";
        }
        $tablehash{$table}{$field} = 0;
        $i=1;
    }

    my $key; # the label / field name of the node which is going to be used as a primary key
    my $tablekey; # the name of the table which contains the key field

    for my $label ($self->labels) {
        my $table = $config{$label}{table};
        my $field = $config{$label}{field};
        if (! $table or ! $field) { # this variable is not mapped  to a table field
            $i++; # probably due to postprocessing, a calculated one is used instead
            next; # go on with the next variables
        }
        if ($config{$label}{key}) {
            $key = $label;
            $tablekey = $table;
        }
        $tablehash{$table}{$field} = $i++;
    }

    if (! $key) {
        die "You have not specified the primary key of the database. Please edit the dbconfig file.\n";
    }

    if ($timestamp) {
        my $table = $config{timestamp}{table};
        my $field = $config{timestamp}{field};
        if (! $table or ! $field) {
            croak "db_insert: it seems you forgot to map the timestamp variable to a table field.\n";
        }
        $tablehash{$table}{timestamp} = $i++;
    }

    my @tablenames = keys %tablehash;
    my $index = firstidx { $_ eq $tablekey } @tablenames;
    splice @tablenames, $index, 1;
    unshift @tablenames, $tablekey;

    if ($filename) {
        open my $fh,">:utf8",$filename;
        close $fh;
    }

    $dbh->do("set character set utf8");

    my $statement;

    if ($mode eq 'update') {

        for my $record (@{$records_of{$id_num}}) {
            if ($timestamp) {
                my $now = DateTime->now( time_zone => 'local' );
                push @$record, $now;
            }

            my $keyvalue = $$record[$tablehash{$tablekey}{$key}];
            $keyvalue =~s#'#''#g;

            my $sth = $dbh->prepare("SELECT $key FROM $tablekey WHERE $key='$keyvalue'") or die $dbh->errstr;

            $sth->execute() or die $sth->errstr;

            if ( $sth->fetchrow_array ) { # row exists, just update it

                $sth->finish();

                for my $table (@tablenames) {
                    my %fields = %{$tablehash{$table}};
                    my @columns = keys %fields;
                    my $cols = join ',', @columns;

                    $statement = "UPDATE $table SET";
                    my @array;
                    my $value;
                    for my $i (0..scalar(@columns)-1) {
                        my $col = $columns[$i];
                        if (($key eq $col) and ($table eq $tablekey)) { next; }
                        $value = $$record[$tablehash{$table}{$col}];
                        if ($value) {
                            $value =~s#\s+# #g;
                            $value =~s#'#''#g;
                            push @array, " $col='$value' ";
                        }
                        else {
                            push @array, " $col='' ";
                        }
                    }

                    $statement .= join ',', @array;
                    $statement .= " WHERE $key='$keyvalue';";
                    if ($filename) {
                        open my $fh,">>:utf8",$filename;
                        print $fh $statement,"\n";
                        close $fh;
                    }
                    eval { $dbh->do($statement) };
                    if ($@) {
                        print "Error: $DBI::errstr\n";
                        next;
                    }
                }
            }

            else { # insert a new row

                $sth->finish();

                for my $table (@tablenames) {

                    my %fields = %{$tablehash{$table}};
                    my @columns = keys %fields;
                    if ($table ne $tablekey) {
                        unshift @columns, $key;
                    }
                    my $cols = join ',', @columns;

                    $statement = "INSERT INTO $table($cols) VALUES (";

                    my @array;
                    if ($table ne $tablekey) {
                        push @array, "'$keyvalue'";
                    }
                    my $j = 0;
                    if ($table ne $tablekey) { $j = 1; }
                    for my $i ($j..scalar(@columns)-1) {
                        my $value = $$record[$tablehash{$table}{$columns[$i]}];
                        $value =~s#\s+# #g;
                        $value =~s#'#''#g;
                        push @array, "'$value'";
                    }

                    $statement .= join ',', @array;
                    $statement .= ');';
                    if ($filename) {
                        open my $fh,">>:utf8",$filename;
                        print $fh $statement,"\n";
                        close $fh;
                    }
                    eval { $dbh->do($statement) };
                    if ($@) {
                        print "Error: $DBI::errstr\n";
                        next;
                    }
                }
            }
        }
    }

    else { # insert mode - insert new rows and delete the previous ones with the same keys (if already in database)

        for my $record (@{$records_of{$id_num}}) {
            if ($timestamp) {
                my $now = DateTime->now( time_zone => 'local' );
                push @$record, $now;
            }

            my $keyvalue = $$record[$tablehash{$tablekey}{$key}];
            $keyvalue =~s#\s+# #g;
            $keyvalue =~s#'#''#g;

            $dbh->do("DELETE FROM $tablekey WHERE $key='$keyvalue'") or die $dbh->errstr;

            for my $table (@tablenames) {
                my %fields = %{$tablehash{$table}};
                my @columns = keys %fields;
                if ($table ne $tablekey) {
                    unshift @columns, $key;
                }
                my $cols = join ',', @columns;

                $statement = "INSERT INTO $table($cols) VALUES (";

                my @array;
                if ($table ne $tablekey) {
                    push @array, "'$keyvalue'";
                }
                my $j = 0;
                if ($table ne $tablekey) { $j = 1; }
                for my $i ($j..scalar(@columns)-1) {
                    my $value = $$record[$tablehash{$table}{$columns[$i]}];
                        $value =~s#\s+# #g;
                        $value =~s#'#''#g;
                        push @array, "'$value'";
                }

                $statement .= join ',', @array;
                $statement .= ');';
                if ($filename) {
                    open my $fh,">>:utf8",$filename;
                    print $fh $statement,"\n";
                    close $fh;
                }
                eval { $dbh->do($statement) };
                if ($@) {
                    print "Error: $DBI::errstr\n";
                    next;
                }
            }
        }
    }

    $dbh->disconnect(); # Disconnect from the database
}

sub postprocess {
    my ($self, $postprocess) = @_;

    my $id_num = ident $self;

    return if (!@{$records_of{$id_num}});
    my @labels = @{$labels_of{$id_num}};

    if ( $extract_url_of{$id_num} ) {
        unshift @labels, 'native_url';
    }

    my $conf = new Config::General($postprocess);
    my %config = $conf->getall;

    my $post_ref = $config{postprocess};

    for my $key (keys %$post_ref) {
        if ($key eq 'merge') {
            my ($new, $dlm, @fields);
            for my $subkey (keys %{$$post_ref{$key}}) {
                if ($subkey eq 'field') {
                    for my $subkey2 (keys %{$$post_ref{$key}{$subkey}}) {
                        push @fields, _trim($subkey2);
                    }
                }
                elsif ($subkey eq 'new') {
                    $new = _trim($$post_ref{$key}{$subkey});
                }

                elsif ($subkey eq 'delimiter') {
                    $dlm = $$post_ref{$key}{$subkey};
                }
                else {
                    die "$subkey is not valid.\n";
                }
            }

            if (!$dlm){
                croak "You shoud provide a delimiter element with a value of one or more characters.\n";
            }
            if (!$new){
                croak "You shoud provide a new element with a variable name.\n";
            }
            if (scalar(@fields) < 2) {
                croak "You shoud provide at least 2 field elements to merge.\n";
            }

            my $index = 0;
            foreach my $record (@{$records_of{$id_num}}) {
                my @values;
                for (my $j=0; $j < scalar(@labels); $j++) {
                    if (any {$labels[$j] eq $_} @fields) {
                        push @values, @$record[$j];
                    }
                }
                my $merged_value = join $dlm, @values;
                push @{@{$records_of{$id_num}}[$index]}, $merged_value;
                $index++;
            }
            push @{$labels_of{$id_num}}, $new;
        }
        elsif ($key eq 'split') {
            my (@new_fields, $regexp, $field);
            for my $subkey (keys %{$$post_ref{$key}}) {
                if ($subkey eq 'field') {
                    $field = _trim($$post_ref{$key}{$subkey});
                }
                elsif ($subkey eq 'new') {
                    for my $subkey2 (keys %{$$post_ref{$key}{$subkey}}) {
                        push @new_fields, _trim($subkey2);
                    }
                }

                elsif ($subkey eq 'regexp') {
                    $regexp = $$post_ref{$key}{$subkey};
                }
                else {
                    die "$subkey is not valid.\n";
                }
            }

            if (!$regexp){
                croak "You shoud provide a regexp element with a regular expression.\n";
            }
            if (!$field){
                croak "You shoud provide a field element with a variable name.\n";
            }
            if (scalar(@new_fields) < 2) {
                croak "You shoud provide at least 2 new field elements to merge.\n";
            }

            my $index = 0;
            foreach my $record (@{$records_of{$id_num}}) {
                my @values;
                for (my $j=0; $j < scalar(@labels); $j++) {
                    if ($labels[$j] eq $field) {
                        @values = (@$record[$j] =~m#$regexp#);
                    }
                }
                if (! @values) {
                    for (my $i=0;$i<scalar(@new_fields);$i++) {
                        push @values, undef;
                    }
                }
                push @{@{$records_of{$id_num}}[$index]}, @values;
                $index++;
            }
            for my $new (@new_fields) {
                push @{$labels_of{$id_num}}, $new;
            }
        }
        else {
            die "$key is not valid.\n";
        }
    }

    # perform necessary transformations by evaluating subroutines on the fly!
    for (my $j=0; $j < scalar(@labels); $j++) {
        if ($config{$labels[$j]}) {
            my $body = $config{$labels[$j]}{func};
            if (!$body){
                croak "You shoud provide a func element with a subroutine.\n";
            }
            $body =~m#sub\s+(.+?)\s*{#;
            my $name = $1;
            eval $body;
            if ($@) { croak $@; }
            my $ref = \&$name;

            my $index=0;
            foreach my $record (@{$records_of{$id_num}}) {
                ${$records_of{$id_num}[$index]}[$j] = &$ref(@$record[$j]);
                $index++;
            }
        }
    }
}

# print results to screen
sub print_results {
    my $self = shift;

    ref $self or croak "Instance variable needed.\n";

    _print_number_of_hits($self);

    my $id_num = ident $self;
    my $j = 1;
    foreach my $record (@{$records_of{$id_num}}) {
        my $i=0;
        print "\n---------- Record $j ----------\n";
        if ( $extract_url_of{$id_num} ) {
            print "PageURL: ",shift @{$record},"\n";
        }
        foreach my $item (@{$record}) {
            my $label = @{$labels_of{$id_num}}[$i++];
            my $string = defined $item  ? "$label: $item\n"
                                        : "$label:\n";
            if ( utf8::is_utf8($string) ) {
                $string = encode($os_charset_of{$id_num}, $string);
            }
            print $string;
        }
        $j++;
    }
    return 1;
}

sub _print_number_of_hits {
    my $self = shift;

    my $s = "* ".$self->hits." results! *";
    my $n = length($s);
    print "\n","*"x$n,"\n";
    print "$s\n";
    print "*"x$n,"\n";
}

# remove leading and trailing spaces
sub _trim {
    my $string = shift;
    $string =~ s{\A \s* | \s* \z}{}gxm;
    return $string;
}

# remove all white space (used for checked_source - html segments)
sub _remove_white {
    my $string = shift;
    $string =~ s/\s+/ /mg;
    return $string;
}

# parse the XML 'extraction pattern' subtree and build the pattern tree
sub _process_pattern_node {
    my $self = shift;
    my $node = shift;
    my $parent = shift;

    if ($node->nodeName ne 'Node') {
        croak $node->nodeName.": syntax error! 'Node' expected\n";
    }

    my %info;

    my @taglabel = split(/:/,$node->getAttribute('tag'));
    $info{'Name'} = $taglabel[0];
    $info{'Label'} = $taglabel[1];

    if ($parent) {
        $info{'SiblingOrder'} = scalar($parent->children);
    }

    if ($node->getAttribute('IsRoot')) {
        $info{'IsRoot'} = 1;
    }

    if ($node->getAttribute('regexpr')) {
        $info{'RegExpr'} = $node->getAttribute('regexpr');
        if ( $node->getAttribute('inverse') ) {
            $info{'inverse'} = 1;
        }
    }
    else {
        $info{'RegExpr'} = undef;
        $info{'inverse'} = undef;
    }

    if ($node->getAttribute('fson')) {
        $info{'fson'} = $node->getAttribute('fson');
    }
    else { $info{'fson'} = 0; }

    if ( $node->getAttribute('CareAboutSO') ) {
        $info{'CareAboutSO'} = 1;
        $info{'so_start'} = $node->getAttribute('so_start');
        $info{'so_step'} = $node->getAttribute('so_step');
    }
    else { $info{'CareAboutSO'} = 0; }

    my $st = $node->getAttribute('stateIndex');
    if (!$st) {
        croak "$taglabel[0] does not have a stateIndex attribute!\n";
    }
    if ($st eq 'checked') {$info{'State'} = checked;}
    elsif ($st eq 'grayed') {$info{'State'} = grayed;}
    elsif ($st eq 'grayed_implied') {$info{'State'} = grayed_implied;}
    elsif ($st eq 'checked_implied') {$info{'State'} = checked_implied;}
    elsif ($st eq 'checked_source') {$info{'State'} = checked_source;}
    else { croak $st.": invalid state index!\n"; }

    if ($st eq 'checked' or $st eq 'checked_implied'
        or $st eq 'checked_source') {
        if ($info{'Label'}) {
            push @{$labels_of{ident $self}}, $info{'Label'};
        }
        else {
            my $index = scalar(@{$labels_of{ident $self}}) + 1;
            push @{$labels_of{ident $self}}, "VAR".$index;
        }
    }

    my $treenode = Tree::Fast->new(\%info);
    if (defined $parent) {
        $parent->add_child({},$treenode);
    }
    else {
        $proot = $treenode;
    }

    if (exists $info{'IsRoot'}) {
        $vroot = $treenode;
    }

    my $cNode = $node->firstChild;
    while ($cNode) {
        _process_pattern_node($self,$cNode,$treenode);
        $cNode = $cNode->nextSibling;
    }
    return 1;
}

# function that traverses the pattern tree, finds the labels of the
# extracted fields and pushes them to the labels array
sub _find_labels {
    my $self = shift;
    my @nodelist = $proot->traverse();
    my $i = 1;
    foreach my $Node (@nodelist) {
        my $state = ${$Node->value}{'State'};
        if ($state == checked or $state == checked_implied or $state == checked_source) {
            if (${$Node->value}{'Label'}) {
                push @{$labels_of{ident $self}}, ${$Node->value}{'Label'};
            }
            else {
                push @{$labels_of{ident $self}}, "VAR$i";
                $i++;
            }
        }
    }
    return 1;
}

sub build_dom {
    my $self = shift;

    my $page = $self->content;
    if (!$page) {
        croak "build_dom: no page fetched!\n";
    }

    eval { $page = decode($self->get_charset, $page); };


    #if ($@) {
        #print $@;
    #}

    decode_entities($page);

    my $doc; # XML::LibXML::Document object, which is a DOM object

    $DOWARN = 0; # no compile-time or run-time warnings after here

    eval { $doc = $parser_of{ident $self}->parse_html_string($page) };

    if ($@) { # probably "Malformed UTF-8 character (fatal)" error
        # encode only non-plain ascii
        my $encoded = encode_entities($page,'^\n\x20-\x25\x27-\x7e');
        eval { $doc = $parser_of{ident $self}->parse_html_string($encoded) };
        if ($@) { # could not recover
            croak "build_dom: XML::LibXML parser error: $@\n";
        }
    }

    $DOWARN = 1; # compile-time and run-time warnings enabled after here

    # get the root element of the fetched HTML document
    my $root_elm = $doc->getDocumentElement;
    if (!$root_elm) {
        croak "build_dom: Could not get Document Element.\n";
    }

    @candidates = ();

    if (exists $ignore_tags_of{ident $self} and
        defined $ignore_tags_of{ident $self} and
        @{$ignore_tags_of{ident $self}}) {
        _dom2tree2($self,$root_elm);
    }
    else {
        _dom2tree($self,$root_elm);
    }
    return 1;
}

# _dom2tree: builds myDOM tree representation
# _dom2tree2: builds myDOM tree representation ignoring specified nodes
# =====================================================================
# Foreach node, I store Name, Content and SiblingOrder
# Content depends on node type (A, IMG, TEXT, HTML element). That is
# a string for text nodes, but a pointer to a DOM node otherwise.
# nodeType: this property holds a numeric value that corresponds to
# the type of node. Elements have a nodeType value of 1, while text
# nodes have a nodeType value of 3.
# nodeName: this property holds a string that, like nodeType,
# corresponds to the type of node. All text nodes have the string
# "#text" as the value of nodeName. For elements, nodeName contains
# the name of the element tag. Thus, an HTML image tag would have
# a nodeName of "IMG."
# nodeValue: this property holds the value of the node, if any.
# Elements have a nodeValue of null. Text nodes have a nodeValue that
# is the actual string of text within that node.
sub _dom2tree2 {
    my $self = shift;
    my $node = shift; # current element examined
    my $parent = shift; # parent of the treenode to be inserted

    my $type = $node->nodeType;

    if ($type != 1 and $type != 3) { # not html element nor text node
        return;
    }

    if ($type == 3 and _trim($node->nodeValue) eq '') { # empty text node
        return;
    }

    my $treenode;

    my $name;
    if ($type == 3) { # nodeName is #TEXT
        $name = 'TEXT';
    }
    else {
        $name = uc($node->nodeName);
    }

    if (! any {uc($_) eq $name} @{$ignore_tags_of{ident $self}} ) {
        my %info;

        if ($parent) {
            $info{'SiblingOrder'} = scalar($parent->children);
        }

        if ($type == 3) { # text node
            # merge neighbour text nodes, if needed
            my $LastNode = $parent->children(-1);
            if ($LastNode and ${$LastNode->value}{'Name'} eq 'TEXT') {
                my $string = _trim($node->nodeValue);
                $string =~ s/ +/ /;
                ${$LastNode->value}{'Content'} .= " ".$string;
                return;
            }
            else {
                $info{'Name'} = 'TEXT';
                $info{'Content'} = _trim($node->nodeValue);
                $info{'Content'} =~ s/ +/ /;
            }
        }

        else {
            $info{'Name'} = $name;
            $info{'Content'} = $node;
        }

        $treenode = Tree::Fast->new(\%info);

        if (defined $parent) {
            $parent->add_child({},$treenode);
            push @candidates, $treenode;
            if ($name eq 'TABLE') {
                $treenode = _add_tbody($treenode);
            }
        }
        else {
            $droot = $treenode;
        }
    }
    else {
        $treenode = $parent;
    }

    if ($type == 1) {
        my @children = $node->getChildNodes();
        foreach my $child (@children) {
            _dom2tree2($self,$child,$treenode);
        }
    }
    return 1;
}

sub _dom2tree {
    my $self = shift;
    my $node = shift; # current element examined
    my $parent = shift; # parent of the treenode to be inserted

    my $type = $node->nodeType;

    if ($type != 1 and $type != 3) { # not html element nor text node
        return;
    }

    if ($type == 3 and _trim($node->nodeValue) eq '') { # empty text node
        return;
    }

    my $treenode;
    my %info;

    if ($parent) {
        $info{'SiblingOrder'} = scalar($parent->children);
    }

    if ($type == 3) { # text node
        $info{'Name'} = 'TEXT';
        $info{'Content'} = _trim($node->nodeValue);
        $info{'Content'} =~ s/ +/ /;
    }

    else {
        $info{'Name'} = uc($node->nodeName);
        $info{'Content'} = $node;
    }

    my $name = $info{'Name'};

    $treenode = Tree::Fast->new(\%info);

    if (defined $parent) {
        $parent->add_child({},$treenode);
        push @candidates, $treenode;
        if ($name eq 'TABLE') {
            $treenode = _add_tbody($treenode);
        }
    }
    else {
        $droot = $treenode;
    }

    if ($type == 1) {
        my @children = $node->getChildNodes();
        foreach my $child (@children) {
            _dom2tree($self,$child,$treenode);
        }
    }
    return 1;
}

# add TBODY nodes manually
sub _add_tbody {
    my $treenode = shift;

    my %tbodyinfo;
    $tbodyinfo{'Name'} = 'TBODY';
    $tbodyinfo{'SiblingOrder'} = 0;
    #$tbodyinfo{'Content'} = _trim($node->textContent);
    my $tbody = Tree::Fast->new(\%tbodyinfo);
    $treenode->add_child({},$tbody);

    return $tbody;
}

sub _resolve_content {
    my $self = shift;
    my $node = shift; # myDOM node examined

    my $name = ${$node->value}{'Name'};
    my $data = ${$node->value}{'Content'};

    if ($name eq 'A') { # hyperlink
        my $href_attr = $data->getAttributeNode("href");
        if ($href_attr) {
            my $href_val = $href_attr->getValue;
            # convert relative URLs to absolute ones
            # if $href_val is actually an absolute URL, base is ignored
            if ($href_val and _trim($href_val) ne '') {
                my $abs_url = URI->new_abs($href_val,$self->base)->as_string;
                return($abs_url);
            }
        }
    }

    elsif ($name eq 'IMG') { # image
        my $src_attr = $data->getAttributeNode("src");
        if ($src_attr) {
            my $src_val = $src_attr->getValue;
            my $abs_img_src = URI->new_abs($src_val,$self->base)->as_string;
            return($abs_img_src);
        }
    }

    elsif ($name eq 'FORM' || $name eq 'INPUT') { # form or input element
        my $name_attr = $data->getAttributeNode("name");
        if ($name_attr) {
            return($name_attr->value);
        }
    }

    elsif ($name eq 'TEXT') { # text node
        return( $data );
    }

    elsif ($name ne 'TBODY') {
        # html element, but not A, IMG, FORM, INPUT or TBODY
        my $hs = HTML::Strip->new();
        # HTML::Strip will only attempt decoding of HTML entities if
        # HTML::Entities is installed.
        my $clean_text = $hs->parse($data->toString(1));
        if (!utf8::is_utf8($clean_text)) {
            $clean_text = decode_utf8($clean_text);
        }
        $hs->eof;
        return (_trim($clean_text));
    }

    return q{};
}

# find the character set of a web page (response of a mech request)
sub get_charset {
    my ($self) = shift;

    ref $self or croak "Instance variable needed.\n";

    croak "get_charset: no fetched page!\n"
        if ! $self->content;

    my $id_num = ident $self;
    if ($pagenc_of{$id_num}) { return $pagenc_of{$id_num}; }
    
    my $pcharset  = _getCharset($self);
    if (! $pcharset) {
        my $enc = guess_encoding($self->content);
        if ( ref($enc) ) {
            $pcharset = $enc->name;
        }
        else {
            croak "Could not find charset for ",$self->uri,"\n";
        }
    }
    return $pcharset;

}

# extract the useful content under interest from a web page.
# parses all the candidate DOM nodes and locates matches with the pattern.
# the passed parameter "discard" determines whether the previous
# contents of the records array will be discared or not. By default
# it keeps them and continues extracting and counting hits
sub extract_content {
    my ($self, $discard) = @_;

    ref $self or croak "Instance variable needed.\n";

    croak "Error: you probably forgot to load pattern.\n" unless ($proot);

    my $id_num = ident $self;

    if ($discard) {
        # initialize hits and the records structure
        $self->discard_records;
    }

    if ($max_hits_of{$id_num}) {
        if ($hits_of{$id_num} >= $max_hits_of{$id_num}) {
            print "The number of hits is greater than or equal "
                  ."to max_hits.\n";
            return;
        }
    }

    foreach my $node (@candidates) {
        if ( _checkmatch($self, $vroot, $node) ) {
            if ( _checkupper($self,$vroot->parent,$node->parent) ) {
                # successful match - hit
                $hits_of{$id_num}++;
                $self->_get_record_data();

                if ($max_hits_of{$id_num}) {
                    if ($hits_of{$id_num} >= $max_hits_of{$id_num}) {
                        last;
                    }
                }
            }
        }
    }

    return 1;
}

# function that visits all the pattern treenodes
# and harvests the extracted information into OutputListView
sub _get_record_data {
    my $self = shift;
    my @record;

    if ( $extract_url_of{ident $self} ) {
        push @record, $self->uri();
    }
    my @nodelist = $proot->traverse();
    foreach my $Node (@nodelist) {
        my $state = ${$Node->value}{'State'};
        if ($state == checked or $state == checked_implied or $state == checked_source) {
            push @record, ${$Node->value}{'Content'};
        }
    }
    push @{$records_of{ident $self}}, [@record];
    return 1;
}

# extract a record from a web page under interest.
# parses the candidate DOM nodes until it locates a match with the pattern.
# shifts the candidate that yielded the hit
sub extract_record {
    my ($self, $discard) = @_;

    ref $self or croak "Instance variable needed.\n";

    croak "Error: you probably forgot to load pattern.\n" unless ($proot);

    my $id_num = ident $self;

    if ($max_hits_of{$id_num}) {
        if ($hits_of{$id_num} >= $max_hits_of{$id_num}) {
            print "The number of hits is greater than or equal "
                  ."to max_hits.\n";
            return;
        }
    }

    while (my $node = shift @candidates) {
        if ( _checkmatch($self, $vroot, $node) ) {
            if ( _checkupper($self,$vroot->parent,$node->parent) ) {
                # successful match - hit
                $hits_of{$id_num}++;
                $self->_get_record_data();
                return $records_of{ident $self}->[-1];
            }
        }
    }
    return;
}

# pattern matching algorithm - find record instances in page under interest
sub _checkmatch {
    my $self = shift;
    my $RuleNode = shift;
    my $DomNode = shift;

    my $tag = ${$RuleNode->value}{'Name'};

    if ($tag eq ${$DomNode->value}{'Name'}) {
        if ( ${$RuleNode->value}{'CareAboutSO'} == 1) {
            my $index = ${$DomNode->value}{'SiblingOrder'};
            if ( ${$RuleNode->value}{'so_step'} ) {  # i*K + C
                my $dist = ($index - ${$RuleNode->value}{'so_start'});
                if ($dist < 0) { return(0); }
                if ( $dist % ${$RuleNode->value}{'so_step'} != 0 ) {
                    return(0);
                }
            }
            else { # sibling order is constant, C
                if ( ${$RuleNode->value}{'so_start'} != $index ) {
                    return(0);
                }
            }
        }

        if ( ${$RuleNode->value}{'RegExpr'} ) {  # node with regular expression
            my $reg = ${$RuleNode->value}{'RegExpr'};

            my @subexprs = _resolve_content($self,$DomNode) =~m/$reg/s;
            if ( @subexprs > 0 and !${$RuleNode->value}{'inverse'} ) { # match
                if (@subexprs == 1) {
                    if (!$1) { ${$RuleNode->value}{'Content'} = _trim($&); }
                    else { ${$RuleNode->value}{'Content'} = _trim($1); }
                }
                elsif (@subexprs > 1) {
                    ${$RuleNode->value}{'Content'} = join '', @subexprs;
                }
            }
            elsif ( @subexprs == 0 and ${$RuleNode->value}{'inverse'} ) { # match
            # parentheses in regular expression do not matter due to inverse evaluation
                ${$RuleNode->value}{'Content'} = _resolve_content($self,$DomNode);
            }
            else { # no match
                ${$RuleNode->value}{'Content'} = '';
                return(0);
            }
        }
        else {
            my $state = ${$RuleNode->value}{'State'};
            if ($state == checked or $state == checked_implied) {
                ${$RuleNode->value}{'Content'} = _resolve_content($self,$DomNode);
            }
            elsif ($state == checked_source) { # want outer HTML
                ${$RuleNode->value}{'Content'} =
                _remove_white(${$DomNode->value}{'Content'}->toString(1));
            }
        }

        my @domChildren = $DomNode->children;
        my @ruleChildren = $RuleNode->children;

        my $order = 0;
        my $childDOMNode = $domChildren[$order];
        my $childRuleNode = shift @ruleChildren;

        my $TempNode;
        my $res;

        while ($childRuleNode) {
            $TempNode = $order;

            # found a (required or optional) checked or grayed child rule node
            while ($childDOMNode) {
                if ( _checkmatch($self,$childRuleNode,$childDOMNode) ) {
                    last;
                }
                $childDOMNode = $domChildren[++$order];
            }

            if (! $childDOMNode) { # failure to match this node
                if (${$childRuleNode->value}{'State'} == checked or
                    ${$childRuleNode->value}{'State'} == checked_source or
                    ${$childRuleNode->value}{'State'} == grayed)
                {
                # required node
                    ${$RuleNode->value}{'Content'} = '';
                    return(0);
                }
                else { # optional
                    $childDOMNode = $domChildren[$TempNode];
                    $order = $TempNode;

                    my $fson = ${$childRuleNode->value}{'fson'};
                    my $i = 0;
                    while (($childRuleNode) and ($i <= $fson) and
                        (${$childRuleNode->value}{'State'} == checked_implied or
                        ${$childRuleNode->value}{'State'} == grayed_implied)) {
                        _discard_node_content($childRuleNode);
                        $childRuleNode = shift @ruleChildren;
                        $i++;
                    }
                    next;
                }
            }
            else {
                $childDOMNode = $domChildren[++$order];
            }
            $childRuleNode = shift @ruleChildren;
        }

        return(1);
    }
    else {
        return(0);
    }
}

# function that checks the ancestors of the pattern (virtual) root.
# These 'upper' nodes are like constraints. The pattern becomes stricter.
sub _checkupper {
    my $self = shift;
    my $RuleNode = shift;
    my $DomNode = shift;

    my $ParentRuleNode;
    my $ParentDomNode;
    my $DomSibling;
    my $RuleSibling;
    my $tag;

    if (!$RuleNode) { return(1); }
    if (!$DomNode) { return(0); }

    $tag = ${$RuleNode->value}{'Name'};
    if ($tag eq ${$DomNode->value}{'Name'}) {

        if ( ${$RuleNode->value}{'CareAboutSO'} == 1) {
            my $index = ${$DomNode->value}{'SiblingOrder'};
            if ( ${$RuleNode->value}{'so_step'} ) {  # i*K + C
                my $dist = ($index - ${$RuleNode->value}{'so_start'});
                if ($dist < 0) { return(0); }
                if ( $dist % ${$RuleNode->value}{'so_step'} != 0 ) {
                    return(0);
                }
            }
            else { # sibling order is constant, C
                if ( ${$RuleNode->value}{'so_start'} != $index ) {
                    return(0);
                }
            }
        }


        if ( ${$RuleNode->value}{'RegExpr'} ) {  # node with regular expression
            my $reg = ${$RuleNode->value}{'RegExpr'};

            my @subexprs = _resolve_content($self,$DomNode) =~m/$reg/;

            if ( @subexprs > 0 and !${$RuleNode->value}{'inverse'} ) { # match

                if (@subexprs == 1) {
                    if (!$1) { ${$RuleNode->value}{'Content'} = _trim($&); }
                    else { ${$RuleNode->value}{'Content'} = _trim($1); }
                }
                elsif (@subexprs > 1) {
                    ${$RuleNode->value}{'Content'} = join '',@subexprs;
                }
            }
            elsif ( @subexprs == 0 and ${$RuleNode->value}{'inverse'} ) { # match
                ${$RuleNode->value}{'Content'} = _resolve_content($self,$DomNode);
            }
            else { # no match
                ${$RuleNode->value}{'Content'} = '';
                return(0);
            }
        }
        else {
            my $state = ${$RuleNode->value}{'State'};
            if ($state == checked or $state == checked_implied) {
                ${$RuleNode->value}{'Content'} = _resolve_content($self,$DomNode);
            }
            elsif ($state == checked_source) { # want outer HTML
                ${$RuleNode->value}{'Content'} =
                    _remove_white(${$DomNode->value}{'Content'}->toString(1));
            }

        }

        $RuleSibling = _next_sibling($RuleNode);
        $DomSibling = _next_sibling($DomNode);
        my $lastmatched = $DomSibling;
        while ($RuleSibling) {
            if (!$DomSibling) {
                if (${$RuleSibling->value}{'State'} == checked or
                    ${$RuleSibling->value}{'State'} == grayed or
                    ${$RuleSibling->value}{'State'} == checked_source) {
                        ${$RuleSibling->value}{'Content'} = '';
                        return(0);
                }
                else {
                    ${$RuleSibling->value}{'Content'} = '';
                    $RuleSibling = _next_sibling($RuleSibling);
                    $DomSibling = $lastmatched;
                    next;
                }
            }

            if ( !_checkmatch($self,$RuleSibling,$DomSibling) ) {
                $DomSibling = _next_sibling($DomSibling);
                next;
            }
            else {
                $lastmatched = $DomSibling;
            }

            $RuleSibling = _next_sibling($RuleSibling);
            $DomSibling = _next_sibling($DomSibling);
        }

        $RuleSibling = _prev_sibling($RuleNode);
        $DomSibling = _prev_sibling($DomNode);
        $lastmatched = $DomSibling;
        while ($RuleSibling) {
            if (!$DomSibling) {
                if (${$RuleSibling->value}{'State'} == checked or
                    ${$RuleSibling->value}{'State'} == grayed or
                    ${$RuleSibling->value}{'State'} == checked_source) {
                        ${$RuleSibling->value}{'Content'} = '';
                        return(0);
                }
                else {
                    ${$RuleSibling->value}{'Content'} = '';
                    $RuleSibling = _prev_sibling($RuleSibling);
                    $DomSibling = $lastmatched;
                    next;
                }
            }

            if ( !_checkmatch($self,$RuleSibling,$DomSibling) ) {
                $DomSibling = _prev_sibling($DomSibling);
                next;
            }
            else {
                $lastmatched = $DomSibling;
            }

            $RuleSibling = _prev_sibling($RuleSibling);
            $DomSibling = _prev_sibling($DomSibling);
        }

        $ParentRuleNode = $RuleNode->parent;
        if (!$ParentRuleNode) { return(1); }

        $ParentDomNode = $DomNode->parent;
        if (!$ParentDomNode) {
            ${$RuleNode->value}{'Content'} = '';
            return(0);
        }

        if (${$ParentDomNode->value}{'Name'} eq ${$ParentRuleNode->value}{'Name'}) {
            if (_checkupper($self,$ParentRuleNode,$ParentDomNode)) {
                return(1);
            }
            else {
                ${$RuleNode->value}{'Content'} = '';
                return(0);
            }
        }

        else {
            ${$RuleNode->value}{'Content'} = '';
            return(0);
        }
    }

    else {
        return(0);
    }
}

# function that returns the next sibling of a treenode
sub _next_sibling {
    my $node = shift;
    my $next;

    my $s = ${$node->value}{'SiblingOrder'};
    if (! $s) { $s = 0 }
    if ($s + 1 < scalar($node->parent->children)) {
        $next = $node->parent->children(${$node->value}{'SiblingOrder'}+1);
    }
    return($next);
}

# function that returns the previous sibling of a treenode
sub _prev_sibling {
    my $node = shift;
    my $prev;

    my $s = ${$node->value}{'SiblingOrder'};
    if (! $s) { $s = 0 }
    if ($s - 1 >= 0) {
        $prev = $node->parent->children(${$node->value}{'SiblingOrder'}-1);
    }
    return($prev);
}

# recursive function that visits all nodes of a tree  rooted at the
# 'node' parameter (that failed _checkmatch) and discards their contents
sub _discard_node_content {
    my $node = shift;

    my $hash = $node->value;
    $$hash{'Content'} = '';

    my @children = $node->children;
    my $childnode = shift @children;
    while ($childnode) {
        _discard_node_content($childnode);
        $childnode = shift @children;
    }
    return 1;
}

# LWP::Charset no more available from CPAN?
sub _getCharset {
    my ($self) = @_;
    return _getCharsetFromHeader($self->response) ||
        _getCharsetFromMeta($self);
    # we use $self instead of $response because sometimes $response
    # for certain sites has garbage..
}

sub _getCharsetFromHeader {
    my ($response) = @_;

    my $headers = $response->headers();
    my $cth;
    if (UNIVERSAL::isa($headers->{'content-type'}, "ARRAY")) {
        $cth = $headers->{'content-type'}->[1];
    } else {
        $cth = $headers->{'content-type'};
    }
    my ($charset) = $cth =~ /charset=([^\s\";]*)/;
    if (! $charset) { return 'utf8'; }
    return lc($charset);
}

sub _getCharsetFromMeta {
    my ($response) = @_;
    _getCharsetFromMetaString($response->content);
}

sub _getCharsetFromMetaString {
    my ($string) = @_;
    while ($string =~ /(<meta.*?>)/gis) {
        my $meta = $1;
        if (my ($charset) = $meta =~ /charset=([^\s\";]*)/i) {
            return lc($charset);
        }
    }
    return;
}

1;
