<?xml version="1.0" encoding="UTF-8"?><api:function-page xml:base="/apidoc/8.0/cts.train.xml" generated="2015-10-07T16:36:00.016766-07:00" mode="javascript" xmlns:api="http://marklogic.com/rundmc/api"><api:function-name>cts.train</api:function-name><api:suggest>cts.train</api:suggest><api:suggest>cts</api:suggest><api:suggest>train</api:suggest><api:function-link mode="xquery" fullname="cts:train">/apidoc/8.0/cts:train.xml</api:function-link><api:function mode="javascript" name="train" type="builtin" lib="cts" category="Classifier" hidden="false" bucket="MarkLogic Built-In Functions" prefix="cts" namespace="http://marklogic.com/cts" fullname="cts.train"><api:summary>
  Produces a set of classifiers from a list of
  labeled training documents.
</api:summary><api:params><api:param class="javascript" name="trainingNodes" type="Array"><api:param-description>
    The array of training nodes.  These are nodes that represent
    members of the classes.
  </api:param-description><api:param-name>trainingNodes</api:param-name><api:param-type>Array</api:param-type></api:param><api:param name="labels" type="Array" class="javascript"><api:param-description>
    An array of labels for the training nodes, in the order corresponding
    to the training nodes.
  </api:param-description><api:param-name>labels</api:param-name><api:param-type>Array</api:param-type></api:param><api:param name="options" type="(element()|map:map)?" optional="true"><api:param-description>
    <span class="javascript" xmlns="http://www.w3.org/1999/xhtml">A JavaScript</span>
    representation of the options for defining the training
    parameters.  The following is a sample options
    <span class="javascript" xmlns="http://www.w3.org/1999/xhtml">object</span>:<p xmlns="http://www.w3.org/1999/xhtml">
    
    <pre class="javascript" xml:space="preserve">

    {
      "classifierType": "supports",
      "kernel": "geodesic"
    }
    </pre>
  </p>
    <p xmlns="http://www.w3.org/1999/xhtml">The 
    <span class="javascript"><code>cts.train</code></span> options include:</p>
    <dl xmlns="http://www.w3.org/1999/xhtml">

    <dt><p>
    <span class="javascript"><code>classifierType</code></span></p></dt>
    <dd>A string defining the kind of classifier to produce, either
    <code>weights</code> or <code>supports</code>. The default is
    <code>weights</code>.</dd>
    <dt><p>
    <span class="javascript"><code>kernel</code></span>
    </p></dt>
    <dd>A string defining which function to use for comparing documents.
    The default is <code>sqrt</code>. Normalization (the values that end in
    <code>-normalized</code>)
    brings document vectors into the unit sphere,
    which may improve the mathematical properties of the calculations.
    Possible values are:
      <dl>
      <dt><p><code>simple</code></p></dt>
      <dd>Model documents as 1 or 0 for presence or absence of each term.</dd>
      <dt><p><code>simple-normalized</code></p></dt>
      <dd>Like <code>simple</code>, but normalized by the square root of
      the document length.</dd>
      <dt><p><code>sqrt</code></p></dt>
      <dd>Model documents using the square root of the term frequencies.</dd>
      <dt><p><code>sqrt-normalized</code></p></dt>
      <dd>Like <code>sqrt</code>, but normalized by the sum of the term
      frequencies.</dd>
      <dt><p><code>linear-normalized</code></p></dt>
      <dd>Model documents as the term frequencies normalized by the
      square root of the sum of the squares of the term frequencies.</dd>
      <dt><p><code>gaussian</code></p></dt>
      <dd>Compare documents using the Gaussian of the term frequencies.
      Requires a 
      <span class="javascript"><code>classifierType</code></span> of
      <code>supports</code>.</dd>
      <dt><p><code>geodesic</code></p></dt>
      <dd>Compare documents using the Riemann geodesic distance over
      term frequencies. Requires a 
      
      <span class="javascript"><code>classifierType</code></span> of
      <code>supports</code>.</dd>
      </dl>
    </dd>
    <dt><p>
    <span class="javascript"><code>maxTerms</code></span></p></dt>
    <dd>An integer defining the maximum number of terms to use to
    represent each document. If a positive number M is given, then the
    M most discriminating terms are used; other terms are dropped.  The
    default is 0 (unlimited), but for larger documents a value in the
    500 to 1000 range will produce much better results.</dd>
    <dt><p>
    <span class="javascript"><code>maxSupport</code></span></p></dt>
    <dd>A double specifying the maximum influence a single training node
    can have.  This parameter has a strong influence on performance.
    The default value of 1.0 should work well in most cases. Larger
    values mean greater sensitivity and may improve accuracy on small
    datasets, but give longer running times.  Smaller values mean less
    sensitivity and better resistance to mis-classified documents, and
    shorter running times.</dd>
    <dt><p>
    <span class="javascript"><code>minWeight</code></span></p></dt>
    <dd>A double specifying the minimum weight a term can have and still
    be considered for inclusion in the term vector. This parameter only applies
    to the term weight form of the classifier. Smaller values mean longer
    term vectors and as a consequence longer running times and greater memory
    consumption during classification, but may also improve accuracy. The
    initial value may be adjusted downwards during training if a class 
    would otherwise have no terms in its output vector.
    The default is 0.01.</dd>
    <dt><p>
    <span class="javascript"><code>tolerance</code></span></p></dt>
    <dd>How close the final solutions to the constraint equations must be.
    Smaller values lead to a greater number of iterations and longer
    running times.  Larger values lead to less precise classification.
    The default is 0.01.</dd>
    <dt><p>
    <span class="javascript"><code>epsilon</code></span></p></dt>
    <dd>How close a value must be to 0 to be counted as equal to 0.
    Since double arithmetic is not precise, setting this value to exactly
    0 will likely lead to non-convergence of the algorithm.  Smaller
    values lead to a greater number of iterations and longer running
    times.  Larger values lead to less precise classification. The initial
    value may be adjusted downwards during execution if it is too large to
    be useful. In general the higher the dimensionality (larger documents, 
    larger limits on the number of terms), the smaller this should be.
    The default is 0.01.</dd>
    <dt><p>
    <span class="javascript"><code>maxIterations</code></span></p></dt>
    <dd>The maximum number of iterations of the constraint satisfaction
    algorithm to run. The algorithm usually converges very quickly,
    so this parameter usually has no effect unless it is set very low.
    The default is 500.</dd>
    <dt><p>
    <span class="javascript"><code>defaultThreshold</code>, 
    <code>classThresholds</code></span></p></dt>
    <dd>A definition of the thresholds to use in classification.
    
    You can specify both a default value and per-class values
    (as computed from 
    <span class="javascript"><code>cts.thresholds</code></span>).
    The default value will apply to any classes for which a per-class value
    is not specified.  For example:<p>
    
    <pre class="javascript" xml:space="preserve">
    {
        ...
        defaultThreshold: -1.0,
        classThresholds: {"Example 1": -2.42, "Example 2": 0.41}
        ...
    }
    </pre>
  </p>
    <p>For the initial tuning phase of training your data, leave the value
    of this parameter at its default value, which is a very large negative
    number (-1.0e30).  This will allow you to accurately compute the
    threshold values when you run 
    
    <span class="javascript"><code>cts.thresholds</code></span> on the initial
    training data.  Then you can use the calculated threshold values
    when you run the secondary pass through the second part of your training
    data.</p>
    </dd>
    <dt><p>
    <span class="javascript"><code>useDbConfig</code></span></p></dt>
    <dd>A boolean value indicating whether to use the current DB configuration
    for determining which terms to use.  The default is <code>false</code>,
    which means that only the indexing options in the options object will be
    used for calculating the classifier.
    </dd>

    </dl>
    <p xmlns="http://www.w3.org/1999/xhtml">The options  <span class="javascript">object also includes 
    database indexing options.</span>
    These control which terms to use. Note that the use of certain
    options, such as 
    
    <span class="javascript"><code>fastCaseSensitiveSearches</code></span>, 
    will not impact final results unless the term vector size is limited with
    the 
    <span class="javascript"><code>maxTerms</code></span> option.  Other 
    options, such as 
     
    <span class="javascript"><code>phraseThroughs</code></span>, will only
    generate terms if some other option is also enabled (in this case
    
    <span class="javascript"><code>fastPhraseSearches</code></span>).
    </p>
    <p xmlns="http://www.w3.org/1999/xhtml">The database options are the same as the database options shown for
    <a href="/cts:distinctive-terms#db-term-options">
    
    <span class="javascript"><code>cts.distinctiveTerms</code></span></a>.</p>

  </api:param-description><api:param-name>options</api:param-name><api:param-type>Object?</api:param-type></api:param></api:params><api:return class="javascript">Object</api:return><api:usage>
<p xmlns="http://www.w3.org/1999/xhtml">
The elements in the labels array should match one for one with the nodes
in the training nodes array. The first label describes the first node
in the training nodes array, the second label describes the second
node in the training nodes array, and so on.
If there are more labels than training nodes or more training nodes
than labels, an error is raised.
</p>
<p xmlns="http://www.w3.org/1999/xhtml">
The format of each label 
<span class="javascript">object</span> is:
</p>

<pre class="javascript" xml:space="preserve" xmlns="http://www.w3.org/1999/xhtml">
{
    "name": "apple doc",
    "classes": [
        {
            "name": "fruit class",
            "val": 1
        },
        {
            "name": "animal class",
            "val": -1
        }
    ]
}
</pre>
<p xmlns="http://www.w3.org/1999/xhtml">Each class listed indicates whether the corresponding node in the training
array is in the given class. Examples are taken to be positive examples
unless specified otherwise (with a <code>val</code> property of -1).
The document is assumed to be a negative example of any classes that are
not explicitly listed.
The name 
<span class="javascript">property in the label object</span> 
is an optional name for the labelled node. It is purely for human 
consumption to help in tuning the classification parameters.
</p>

<a id="outputformats" xmlns="http://www.w3.org/1999/xhtml"><b>Output Formats</b></a>

<p xmlns="http://www.w3.org/1999/xhtml">A linear classifier is defined by a weight vector w on terms, and
an offset value b. The  
<span class="javascript">weights property</span> encodes
the weight vector directly. Its children are the classes, and each
class includes a list of terms. The term node uses an internal id to
identify the term and a term weight:
</p>


<pre class="javascript" xml:space="preserve" xmlns="http://www.w3.org/1999/xhtml">
"weights":[
  {
    "name":"animal class",
    "offset":0.9609375,
    "terms":[
      {
        "id":"3701029877487003077",
        "val":-0.132582515478134
      },
      {
        "id":"8051590956710175434",
        "val":0.353553384542465
      },
      :            :
    ]
  },
  :                :
]
</pre>
<p xmlns="http://www.w3.org/1999/xhtml">
The weight vector w is a linear combination of the documents
themselves, and it may be more convenient to express the classifier in
this way. For instance, if the number of terms is not limited, the

<span class="javascript">weights property</span> will be extremely large. 
The weight vector form may not be used if the classifier kernel is
non-linear, that is, with the Gaussian or geodesic kernel.
</p>
<p xmlns="http://www.w3.org/1999/xhtml">The support vector representation of the classifier includes a

<span class="javascript">supports property that has class children</span> 
for each class. Here the class 
<span class="javascript">objects</span> contain a list of doc elements 
which identify the specific training nodes using an internal key.
This internal key is valid across queries only for nodes in the
database.  Each doc  <span class="javascript">object</span> has  <span class="javascript">a property</span> encoding
the weight of that document and an error  <span class="javascript">property</span> 
which shows how well the document fit the classifier. Large positive 
or negative errors (greater than about 1.5) are potentially 
mis-classified documents.</p>


<pre class="javascript" xml:space="preserve" xmlns="http://www.w3.org/1999/xhtml">
"supports":[
  {
    "name":"animal",
    "offset":0.9609375,
    "docs":[
      {
        "id":"10529665449293922777",
        "name":"apple doc",
        "val":-0.3125,
        "err":0
      },
      {
        "id":"95824053707766723",
        "name":"banana doc",
        "val":-0.375,
        "err":0.0078125
      },
      :            :
    ]
  },
  :                :
]
</pre>
<p xmlns="http://www.w3.org/1999/xhtml">Each class is identified by a unique name.</p>
</api:usage><api:example class="javascript"><a id="trainEx1" xmlns="http://www.w3.org/1999/xhtml"></a>
<pre xml:space="preserve" xmlns="http://www.w3.org/1999/xhtml">
var firsthalf = fn.subsequence(xdmp.directory("/shakespeare/plays/", "1"), 1, 19);
var plays = firsthalf.clone();
var labels = [];
for (var x of firsthalf) {
  var singleClass = [{"name": xdmp.documentProperties(xdmp.nodeUri(x)).next().
                                value.xpath("//playtype/fn:string()")
                     }];
  labels.push({"classes": singleClass});
};
cts.train(plays.toArray(), labels, 
          {"classifierType": "supports", 
           epsilon: 0.00001}
         );
  =&gt;
{
  "options": {
    "kernel": "sqrt",
    "classifierType": "supports",
    "minWeight": 0.01,
    "maxTerms": 0,
    "maxIterations": 500,
    "maxSupport": 1,
    "tolerance": 0.01,
    "epsilon": 0.00001,
    "defaultThreshold": -1e+30,
    "classThresholds": {}
  },
  "supports": [
    {
      "name": "HISTORY",
      "offset": 0.679854154586792,
      "docs": [
        { "id": "12231438930115319131",
          "val": -0.0000109664215415251,
          "err": 0.00122268195264041
        },
        { "id": "15339507384182411064",
          "val": 0.0000208658457268029,
          "err": -0.00875759869813919
        },
          ...
      ]
    },
    {
      "name": "COMEDY",
      "offset": 0.502409636974335,
      "docs":
      [
        { "id": "12231438930115319131",
          "val": -0.0000158612419909332,
          "err": 0.000878061284311116
        },
        { "id": "17774930858870475928",
          "val": 0.0000244826205744175,
          "err": 0.00316164619289339
        },
          ...
      ]
    },
    {
      "name": "TRAGEDY",
      "offset": -0.179147496819496,
      "docs":
      [
        { "id": "8900580694384751574",
          "val": 0.0000163165386766195,
          "err": 0.00214929808862507
        },
        { "id": "12231438930115319131",
          "val": 0.000026724021154223,
          "err": 0.00388686032965779
        },
          ...
      ]
    }
  ]
}


</pre></api:example><api:example class="javascript"><a id="trainEx2" xmlns="http://www.w3.org/1999/xhtml"></a>
<pre xml:space="preserve" xmlns="http://www.w3.org/1999/xhtml">
// This example is the same as the first, except that it uses the 
// useDbConfig option.

var firsthalf = fn.subsequence(xdmp.directory("/shakespeare/plays/", "1"), 1, 19);
var plays = firsthalf.clone();
var labels = [];
for (var x of firsthalf) {
  var singleClass = [{"name": xdmp.documentProperties(xdmp.nodeUri(x)).next().value.
                      xpath("//playtype/fn:string()")
                     }];
  labels.push({"classes": singleClass});
};
cts.train(plays.toArray(), labels, 
          {"classifierType": "supports",
           "useDbConfig": true,
           "epsilon": 0.00001
          }
         );
=&gt;
{
  "options": {
    "kernel": "sqrt",
    "classifierType": "supports",
    "minWeight": 0.01,
    "maxTerms": 0,
    "maxIterations": 500,
    "maxSupport": 1,
    "tolerance": 0.01,
    "epsilon": 0.00001,
    "defaultThreshold": -1e+30,
    "classThresholds": {
    },
    "useDbConfig": true
  },
  "supports": [
    {
      "name": "HISTORY",
      "offset": 0.616991937160492,
      "docs": [
        { "id": "11719886725627889310",
          "val": 0.000012535679161374,
          "err": 0.00515030510723591
        },
        { "id": "703569506516702025",
          "val": 0.0000126165068650153,
          "err": 3.86468634872017e-13
        },
          ...
      ]
    },
    {
      "name": "COMEDY",
      "offset": 0.444232106208801,
      "docs": [
        { "id": "347003984347788586",
          "val": -0.0000104659011412878,
          "err": -0.00548016233369708
        },
        { "id": "15822004215638450994",
          "val": 0.0000148163953781477,
          "err": -0.00175983365625143
        },
          ...
      ]
    },
    {
      "name": "TRAGEDY",
      "offset": -0.0621433705091477,
      "docs": [
        { "id": "347003984347788586",
          "val": 0.0000174711658473825,
          "err": 0.000306207628455013
        },
        { "id": "15822004215638450994",
          "val": -0.0000100835841294611,
          "err": -0.000551707693375647
        },
          ...
      ]
    }
  ]
}

</pre></api:example></api:function></api:function-page>