# **DOMUtil** provides utility functions for working with DOM trees.
#
# DOMUtil is designed to work with the DOM structure generated by
# [Chris Winberry's node-htmlparser](https://github.com/tautologistics/node-htmlparser).
# DOMUtil doesn't have a strict dependency on node-htmlparser, but it
# expects DOM structures compatible with those generated by node-htmlparser.
# See [the htmlparser documentation](https://github.com/tautologistics/node-htmlparser#example-output)
# for more information about that format.
#
# (Currently DOMUtil should work with any DOM format that supports the `type`,
# `name`, `children`, `attribs` and `raw` attributes as used in htmlparser, but
# that may change in future versions.)
#
#
# Method names that start with `_` are subject to change without notice. Other methods may be considered a part of the public API.
class DOMUtil

  # **The DOMUtil constructor.**
  #
  # While the DOMUtil methods are essentially stateless
  # (and hence thread-safe, i.e., calls to the same DOMUtil instance can be safely
  # interleaved), DOMUtil is implemented as an instantiable class to allow for
  # alternative configurations.
  #
  # The constructor accepts an optional `params` map. Currently one parameter key
  # is supported:
  #
  #  - The value `params.decode` (optionally) specifies a function to use
  #    when converting HTML text nodes into "plain text" (in the `to_text` and
  #    `inner_text` functions). This function can be used, for example,
  #    to decode HTML entities into their text equivalents.
  #    By default no conversion is made; the text nodes are output in exactly
  #    the same format as they are found in the DOM.
  #
  constructor:(params = {})->
    @decode = params.decode ? (str)->str

  # **parse_html** is a convenience function that parses a given HTML string
  # into one or more DOM trees using the `htmlparser` library (if present).
  # If `htmlparser` is not available, an error will be passed to the `callback`
  # function.
  #
  #  - The `html` parameter must be a string containing one or more HTML/XML trees.
  #
  #  - The `options` parameter is optional, and may contain a map of
  #      [options to pass to htmlparser](https://github.com/tautologistics/node-htmlparser/#defaulthandler-options).
  #      The same map is handed to both the `DefaultHandler` (which is where
  #      the documented handler options take effect) and the `Parser`.
  #
  #  - The `callback` parameter should contain a function with the signature
  #      `callback(err,dom)`, where:
  #
  #       - The `err` argument will be a non-`null` value if an error occurs
  #           during the parsing.
  #
  #       - Otherwise the `dom` argument will contain a single DOM object
  #           (when there is a single root tag in the given `html` string) or
  #           an array of DOM objects (when there is more than one HTML/XML
  #           structure in the given `html` string). When the parser yields an
  #           empty array, `dom` will be `undefined`.
  #
  parse_html:(html,options,callback)->
    # The `options` parameter is optional, so swap `options` and `callback` if necessary.
    if typeof options is 'function' and typeof callback isnt 'function'
      [ options, callback ] = [ callback, options ]
    # If we haven't yet loaded `htmlparser`, do so now.
    unless @htmlparser?
      try
        @htmlparser = require 'htmlparser'
      catch err
        callback(err,null)
    if @htmlparser?
      # Now create a simple handler that invokes the given `callback`...
      on_parsed = (err,domset)->
        if err?
          callback(err,null)
        else if Array.isArray(domset) and domset.length <= 1
          # Unwrap a single-rooted document so callers get the node itself.
          callback(null,domset[0])
        else
          callback(null,domset)
      # ...passing `options` to the handler too, since that is the object
      # that interprets the documented DefaultHandler options...
      handler = new @htmlparser.DefaultHandler(on_parsed,options)
      # ...create the parser...
      parser = new @htmlparser.Parser(handler,options)
      # ...and parse the HTML.
      parser.parseComplete(html)

  # **as_node** returns `nodeset[0]` if `nodeset` is an array, `nodeset` otherwise.
  as_node: (nodeset)->
    if Array.isArray(nodeset)
      return nodeset[0]
    else
      return nodeset

  # **as_nodeset** returns `node` if `node` is an array, `[ node ]` if `node`
  # is any other non-null value, and `[]` if `node` is `null` or `undefined`.
  as_nodeset: (node)->
    if Array.isArray(node)
      return node
    else if node?
      return [node]
    else
      return []

  # **_kt** returns `true`. It's the default filter for `to_text`.
  _kt: ()->true

  # **to_text** returns a concatenation of all text nodes found
  # within the given DOM `elt`.
  #
  # An optional `filter` parameter may contain a function with
  # the signature `filter(node)` that returns `true` if the
  # text found in or beneath the given `node` should be included
  # in the concatenation or `false` if the text found at or
  # below the given `node` should be excluded.
  # (The filter is also passed the `node_metadata` and `all_metadata`
  # arguments described under `walk_dom`.)
  #
  # E.g., the function:
  #
  # ```javascript
  # var skip_em = function(node) { return node.name != 'em' };
  # ```
  #
  # will cause `to_text` to exclude any text found within an
  # `<em>` tag.
  #
  to_text:(elt,filter = @_kt)->
    buffer = ''
    @walk_dom elt, visit:(node,node_metadata,all_metadata)=>
      # If `node` is acceptable to `filter`, then append any text, and visit its children.
      if(filter(node,node_metadata,all_metadata))
        buffer += @decode(node.raw) if node?.type is 'text' and node?.raw?
        return {'continue':true,'visit_children':true}
      else
        # If `node` is *not* acceptable to `filter`, then skip it and its children.
        return {'continue':true,'visit_children':false}
    return buffer

  # **inner_text** is an alias for `to_text` (which see).
  inner_text:(elt,filter)->@to_text(elt,filter)

  # **to_html** returns an HTML string representation of
  # the given `elt` and its children (if any).
  #
  # Attribute values are written inside double quotes, so any `"`
  # character found in a value is emitted as `&quot;` to keep the
  # generated markup well-formed.
  #
  # (Currently only `text` and `tag` node types are converted,
  # but that may change in the future.)
  to_html:(elt)->
    buffer = ''
    @walk_dom elt, {
      # When `visit`ing a node...
      visit:(node)->
        switch node.type
          # ...concat the value of `text` nodes.
          when 'text'
            buffer += node.raw
          # ...concat the name and attributes of `tag` nodes.
          when 'tag'
            buffer += "<#{node.name}"
            if node.attribs?
              for name,value of node.attribs
                # Escape the delimiter so the value cannot terminate the attribute early.
                escaped = String(value).replace(/"/g,'&quot;')
                buffer += " #{name}=\"#{escaped}\""
            buffer += ">"
        return true
      # `after_visit`ing a node...
      after_visit:(node)->
        switch node.type
          # ...concat the "end tag" for `tag` nodes.
          when 'tag'
            buffer += "</#{node.name}>"
        return true
    }
    return buffer

  # **inner_html** returns an HTML string representation of
  # the children (if any) of the given `elt`.
  #
  # When `elt` is an array the result is always a string (possibly empty).
  # When `elt` is a single node without a `children` attribute (or is
  # `null`/`undefined`), the result is `null`.
  #
  # (Otherwise it behaves just like `to_html`, which see.)
  inner_html:(elt)->
    buffer = null
    # If `elt` is an array, invoke `to_html` on the children of each element in the array.
    if Array.isArray(elt)
      buffer = ''
      for node in elt
        if node.children?
          buffer += @to_html(node.children)
    # Otherwise invoke `to_html` on the children of `elt`.
    else if elt?.children?
      buffer = @to_html elt.children
    return buffer


  # **walk_dom** performs a depth-first walk of the given DOM tree (or trees),
  # invoking a specified "visit" function for each node.
  #
  # * The `dom` parameter is either a single DOM node or an array of DOM nodes.
  #
  # * The `callbacks` parameter is a map that contains (at minimum) an
  #    attribute named `visit` containing a function with the signature:
  #
  #       visit(node,node_metadata,all_metadata)
  #
  #    where:
  #
  #      - `node` is the DOM node currently being visited,
  #      - `node_metadata` is a map containing `parent`, `path`, `siblings`
  #          and `sib_index` keys, and
  #      - `all_metadata` is an array of `node_metadata` values
  #          for each previously visited node, indexed by the value
  #          stored at `node._stew_node_id`.
  #
  #    The map may also contain an `after_visit` function with the same
  #    signature, invoked after a node's children have been processed.
  #
  # * The `callbacks.visit` function should return a map containing
  #    `continue` and `visit_children` attributes.
  #
  #     - When `visit_children` is `true`, the children of
  #        `node` (if any) will be visited next. When `false`,
  #        the `node`'s children will be skipped, but processing
  #        will continue with `node`'s siblings (or `node`'s
  #        parent's siblings, etc.)
  #
  #     - When `continue` is `false`, all subsequent processing
  #        will be aborted and the `walk_dom` method will exit
  #        as soon as possible.
  #
  #     - If the value returned by `visit` is a boolean, that
  #        value will be used for both `continue` and `visit_children`.
  #
  #     - A missing attribute (or a `null`/`undefined` return value)
  #        is treated as `true`.
  #
  # * If `callbacks` is a function (rather than a map) it will be
  #    used as the `visit` function.
  #
  # Note that each visited node is tagged with a `_stew_node_id` attribute.
  #
  walk_dom:(dom,callbacks)->
    # Fiddle with the input parameters if needed.
    if typeof callbacks is 'function'
      callbacks = { visit:callbacks }
    nodes = @as_nodeset(dom)
    # Create a container for all the node metadata.
    dom_metadata = []
    for node, sib_index in nodes
      # Create the metadata for this node...
      node_metadata = { parent:null, path:[], siblings:nodes, sib_index: sib_index }
      node._stew_node_id = dom_metadata.length
      # ...add it to the container...
      dom_metadata.push node_metadata
      # ...visit the node...
      should_continue = @_unguarded_walk_dom(node,node_metadata,dom_metadata,callbacks)
      # ...and exit if needed.
      if not should_continue
        break

  # **_response_flag** interprets the value returned by a `visit` or
  # `after_visit` callback.
  #
  # * A boolean `response` is used directly, whichever `key` is asked for.
  # * Otherwise the flag is `true` when `response[key]` is exactly `true`
  #   or is absent (`null`/`undefined`), and `false` for any other value.
  _response_flag:(response,key)->
    return response if typeof response is 'boolean'
    value = response?[key]
    return value is true or not value?

  # **_unguarded_walk_dom** is the "inner" implementation of `walk_dom`.
  # See `walk_dom` for more information.
  #
  # * `node` is the current DOM node to visit.
  # * `node_metadata` is a map containing:
  #     - `parent` - the parent of this node, if any
  #     - `path` - an array of this node's ancestors (from "root" to parent)
  #     - `siblings` - an array of this node's parent's children
  #     - `sib_index` - the index of this node in the `siblings` array
  # * `dom_metadata` is an array of `node_metadata` objects, indexed by
  #     `node._stew_node_id`.  Only the already visited nodes are contained
  #     in this array.
  # * `callbacks` is the map of callbacks passed to `walk_dom`, which see.
  #
  # `_unguarded_walk_dom` will return `true` if processing should continue
  # (typically with `node`'s next sibling), or `false` if processing is
  # complete and no more nodes should be visited.
  #
  _unguarded_walk_dom:(node,node_metadata,dom_metadata,callbacks)->
    # Visit the current node (with no `visit` callback, behave as if it said "carry on").
    response = {'continue':true,'visit_children':true}
    if callbacks.visit?
      response = callbacks.visit(node,node_metadata,dom_metadata)
    # If processing should not continue, stop right here.
    return false unless @_response_flag(response,'continue')
    # If this node's children should be processed...
    if node.children? and @_response_flag(response,'visit_children')
      # ...create the `path` to this `node`'s children
      # (one array, shared by all of the children)...
      new_path = [].concat(node_metadata.path)
      new_path.push(node)
      # ...and recursively visit each child in turn...
      for child,index in node.children
        new_node_metadata = { parent:node, path:new_path, siblings:node.children, sib_index: index }
        child._stew_node_id = dom_metadata.length
        dom_metadata.push new_node_metadata
        should_continue = @_unguarded_walk_dom(child,new_node_metadata,dom_metadata,callbacks)
        # ...aborting further processing if needed.
        if not should_continue
          return false
    # Finally invoke the post-visit callback, if any,
    # letting it abort further processing if needed.
    if callbacks['after_visit']?
      after_response = callbacks.after_visit(node,node_metadata,dom_metadata)
      return @_response_flag(after_response,'continue')
    return true
# The DOMUtil class is exported under the name `DOMUtil`.
# Use the `exports` object when one is defined; otherwise fall back to `this`
# and attach `DOMUtil` to that instead.
# NOTE(review): presumably `this` is the global object when loaded outside a
# CommonJS environment - confirm against how this file is compiled and loaded.
exports = exports ? this
exports.DOMUtil = DOMUtil