Replaces each token with multiple replacement tokens formed by splitting it on a separator pattern, with the option of retaining the separator. This function effectively reverses the operation of tokens_compound().
tokens_split(
  x,
  separator = " ",
  valuetype = c("fixed", "regex"),
  remove_separator = TRUE
)
x: a tokens object

separator: a single-character pattern by which tokens are split

valuetype: the type of pattern matching: "fixed" for exact matching or "regex" for regular expressions (the only two types this function accepts). See valuetype for details.

remove_separator: if TRUE, remove the separator from the new tokens
# undo tokens_compound()
toks1 <- tokens("pork barrel is an idiomatic multi-word expression")
tokens_compound(toks1, phrase("pork barrel"))
#> Tokens consisting of 1 document.
#> text1 :
#> [1] "pork_barrel" "is" "an" "idiomatic" "multi-word"
#> [6] "expression"
#>
tokens_compound(toks1, phrase("pork barrel")) %>%
  tokens_split(separator = "_")
#> Tokens consisting of 1 document.
#> text1 :
#> [1] "pork" "barrel" "is" "an" "idiomatic"
#> [6] "multi-word" "expression"
#>
# similar to tokens(x, split_hyphens = TRUE) but post-tokenization
toks2 <- tokens("UK-EU negotiation is not going anywhere as of 2018-12-24.")
tokens_split(toks2, separator = "-", remove_separator = FALSE)
#> Tokens consisting of 1 document.
#> text1 :
#> [1] "UK" "-" "EU" "negotiation" "is"
#> [6] "not" "going" "anywhere" "as" "of"
#> [11] "2018" "-"
#> [ ... and 4 more ]
#>
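The examples above use fixed matching only. As a further sketch, not part of the original examples, valuetype = "regex" lets the separator be a regular expression, so a single character class can split on several different separators in one pass. This assumes quanteda's default tokenizer, which keeps hyphenated and underscore-joined words as single tokens.

# sketch: split on either "-" or "_" with a regular-expression separator
toks3 <- tokens("state-of-the-art methods and pork_barrel politics")
tokens_split(toks3, separator = "[-_]", valuetype = "regex")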