<?xml version="1.0" encoding="UTF-8"?><api:function-page xml:base="/apidoc/8.0/cts.tokenize.xml" generated="2015-10-07T16:36:00.016766-07:00" mode="javascript" xmlns:api="http://marklogic.com/rundmc/api"><api:function-name>cts.tokenize</api:function-name><api:suggest>cts.tokenize</api:suggest><api:suggest>cts</api:suggest><api:suggest>tokenize</api:suggest><api:function-link mode="xquery" fullname="cts:tokenize">/apidoc/8.0/cts:tokenize.xml</api:function-link><api:function mode="javascript" name="tokenize" type="builtin" lib="cts" category="SearchBuiltins" subcategory="Search" hidden="false" bucket="MarkLogic Built-In Functions" prefix="cts" namespace="http://marklogic.com/cts" fullname="cts.tokenize"><api:summary>
  Tokenizes text into words, punctuation, and spaces.  Returns output in
  the type <code xmlns="http://www.w3.org/1999/xhtml">cts:token</code>, which has subtypes
  <code xmlns="http://www.w3.org/1999/xhtml">cts:word</code>, <code xmlns="http://www.w3.org/1999/xhtml">cts:punctuation</code>, and
  <code xmlns="http://www.w3.org/1999/xhtml">cts:space</code>, all of which are subtypes of
  <code xmlns="http://www.w3.org/1999/xhtml">xs:string</code>.
</api:summary><api:params><api:param name="text" type="xs:string"><api:param-description>
    A word or phrase to tokenize.
  </api:param-description><api:param-name>text</api:param-name><api:param-type>String</api:param-type></api:param><api:param name="language" type="xs:string?" optional="true"><api:param-description>
    A language to use for tokenization.  If not supplied, it uses the
    database default language.
  </api:param-description><api:param-name>language</api:param-name><api:param-type>String?</api:param-type></api:param><api:param name="field" type="xs:string?" optional="true"><api:param-description>
    A field to use for tokenization. If the field has custom tokenization rules,
    they will be used. If no field is supplied or the field has no custom
    tokenization rules, the default tokenization rules are used.
  </api:param-description><api:param-name>field</api:param-name><api:param-type>String?</api:param-type></api:param></api:params><api:return>ValueIterator</api:return><api:usage>
<p xmlns="http://www.w3.org/1999/xhtml"> When you tokenize a string with <code>cts.tokenize</code>, each word is
  represented by an instance of
  <code>cts:word</code>, each punctuation character
  is represented by an instance of <code>cts:punctuation</code>,
  each set of adjacent spaces is represented by an instance of
  <code>cts:space</code>, and each set of adjacent line breaks
  is represented by an instance of <code>cts:space</code>.</p>
<p xmlns="http://www.w3.org/1999/xhtml">
   Unlike the standard function <code>fn.tokenize</code>,
   <code>cts.tokenize</code> returns words, punctuation, and spaces
   as different types. You can therefore test the type of each token (for
   example, by comparing <code>sc.name(sc.type(token))</code> with the
   expected QName) and handle each type differently. For example, you can use
   <code>cts.tokenize</code> to remove all punctuation from a string, or
   create logic to test for the type and return different things for
   different types, as shown in the first two examples below.
</p><p xmlns="http://www.w3.org/1999/xhtml">
   You can use <code>xdmp.describe</code> to show how a given string will be
   tokenized. When run on the results of <code>cts.tokenize</code>, the
   <code>xdmp.describe</code> function returns the types and the values
   for each token. For a sample of this pattern, see the third example below.
</p>
</api:usage><api:example class="javascript"><pre xml:space="preserve" xmlns="http://www.w3.org/1999/xhtml">
// Remove all punctuation and normalize space by keeping only the
// word tokens and joining them with single spaces. Tokens of the
// other types (cts:punctuation and cts:space) are simply skipped.
var text = "The red, blue, green, and orange \
                balloons were launched!";
var wordType = fn.QName("http://marklogic.com/cts", "word");
var words = [];
for (var tok of cts.tokenize(text)) {
  // Compare the QName of the token's type with cts:word
  var isWord = fn.deepEqual(sc.name(sc.type(tok)), wordType);
  if (isWord) {
    words.push(tok);
  }
}
words.join(" ");

=&gt; The red blue green and orange balloons were launched
</pre></api:example><api:example class="javascript"><pre xml:space="preserve" xmlns="http://www.w3.org/1999/xhtml">
// Insert the string "XX" before and after
//   all punctuation tokens
var str = "The red, blue, green, and orange \
                 balloons were launched!" ;
var punctuationType = fn.QName("http://marklogic.com/cts", "punctuation");
var res = [];
for (var x of cts.tokenize(str)) {
  if (fn.deepEqual(sc.name(sc.type(x)), punctuationType)) {
    // Wrap each punctuation token in "XX" markers
    res.push(fn.concat("XX", x, "XX"));
  } else {
    // Words and spaces are passed through unchanged
    res.push(x);
  }
}
// Join with the empty string: the cts:space tokens kept in res already
// supply the whitespace between words, so a " " separator would add
// extra spaces around every punctuation token. fn.normalizeSpace then
// collapses the long run of spaces that the line continuation leaves
// inside the string literal.
fn.normalizeSpace(res.join(""));

=&gt; The redXX,XX blueXX,XX greenXX,XX and orange balloons were launchedXX!XX

</pre></api:example><api:example class="javascript"><pre xml:space="preserve" xmlns="http://www.w3.org/1999/xhtml">
// Show the types and tokens for a string by passing the
// cts.tokenize results to xdmp.describe
xdmp.describe(cts.tokenize("blue, green"), 20)

=&gt; *["blue", ",", " ", "green"]

// the same example, iterating over the ValueIterator results
// and collecting the type name of each token
var res = [];
for (var x of cts.tokenize("blue, green")) {
  // x is already a token, so read its type directly instead of
  // tokenizing it a second time
  res.push(sc.name(sc.type(x)));
}
res;

=&gt; ["cts:word","cts:punctuation","cts:space","cts:word"]
</pre></api:example></api:function></api:function-page>