This function recompiles a serialized tokens object when the vocabulary has been changed in a way that makes some of its types identical, such as lowercasing when a lowercased version of the type already exists in the type table, or introduces gaps in the integer map of the types. It also re-indexes the types attribute to account for types that may have become duplicates, through a procedure such as stemming or lowercasing; or the addition of new tokens through compounding.
tokens_recompile(x, method = c("C++", "R"))
the tokens object to be recompiled
"C++"
for C++ implementation or "R"
for an older
R-based method
# lowercasing
toks1 <- tokens(c(one = "a b c d A B C D",
two = "A B C d"))
attr(toks1, "types") <- char_tolower(attr(toks1, "types"))
unclass(toks1)
#> $one
#> [1] 1 2 3 4 5 6 7 8
#>
#> $two
#> [1] 5 6 7 4
#>
#> attr(,"types")
#> [1] "a" "b" "c" "d" "a" "b" "c" "d"
#> attr(,"padding")
#> [1] TRUE
#> attr(,"docvars")
#> docname_ docid_ segid_
#> 1 one one 1
#> 2 two two 1
#> attr(,"meta")
#> attr(,"meta")$system
#> attr(,"meta")$system$`package-version`
#> [1] ‘4.3.0’
#>
#> attr(,"meta")$system$`r-version`
#> [1] ‘4.5.0’
#>
#> attr(,"meta")$system$system
#> sysname machine user
#> "Darwin" "arm64" "kbenoit"
#>
#> attr(,"meta")$system$directory
#> [1] "/Users/kbenoit/Dropbox/GitHub/quanteda/quanteda/docs/reference"
#>
#> attr(,"meta")$system$created
#> [1] "2025-05-21"
#>
#>
#> attr(,"meta")$object
#> attr(,"meta")$object$unit
#> [1] "documents"
#>
#> attr(,"meta")$object$what
#> [1] "word"
#>
#> attr(,"meta")$object$tokenizer
#> [1] "tokenize_word4"
#>
#> attr(,"meta")$object$ngram
#> [1] 1
#>
#> attr(,"meta")$object$skip
#> [1] 0
#>
#> attr(,"meta")$object$concatenator
#> [1] "_"
#>
#> attr(,"meta")$object$summary
#> attr(,"meta")$object$summary$hash
#> character(0)
#>
#> attr(,"meta")$object$summary$data
#> NULL
#>
#>
#>
#> attr(,"meta")$user
#> list()
#>
unclass(quanteda:::tokens_recompile(toks1))
#> $one
#> [1] 1 2 3 4 1 2 3 4
#>
#> $two
#> [1] 1 2 3 4
#>
#> attr(,"types")
#> [1] "a" "b" "c" "d"
#> attr(,"padding")
#> [1] TRUE
#> attr(,"docvars")
#> docname_ docid_ segid_
#> 1 one one 1
#> 2 two two 1
#> attr(,"meta")
#> attr(,"meta")$system
#> attr(,"meta")$system$`package-version`
#> [1] ‘4.3.0’
#>
#> attr(,"meta")$system$`r-version`
#> [1] ‘4.5.0’
#>
#> attr(,"meta")$system$system
#> sysname machine user
#> "Darwin" "arm64" "kbenoit"
#>
#> attr(,"meta")$system$directory
#> [1] "/Users/kbenoit/Dropbox/GitHub/quanteda/quanteda/docs/reference"
#>
#> attr(,"meta")$system$created
#> [1] "2025-05-21"
#>
#>
#> attr(,"meta")$object
#> attr(,"meta")$object$unit
#> [1] "documents"
#>
#> attr(,"meta")$object$what
#> [1] "word"
#>
#> attr(,"meta")$object$tokenizer
#> [1] "tokenize_word4"
#>
#> attr(,"meta")$object$ngram
#> [1] 1
#>
#> attr(,"meta")$object$skip
#> [1] 0
#>
#> attr(,"meta")$object$concatenator
#> [1] "_"
#>
#> attr(,"meta")$object$summary
#> attr(,"meta")$object$summary$hash
#> character(0)
#>
#> attr(,"meta")$object$summary$data
#> NULL
#>
#>
#>
#> attr(,"meta")$user
#> list()
#>
# stemming
toks2 <- tokens("Stemming stemmed many word stems.")
unclass(toks2)
#> $text1
#> [1] 1 2 3 4 5 6
#>
#> attr(,"types")
#> [1] "Stemming" "stemmed" "many" "word" "stems" "."
#> attr(,"padding")
#> [1] TRUE
#> attr(,"docvars")
#> docname_ docid_ segid_
#> 1 text1 text1 1
#> attr(,"meta")
#> attr(,"meta")$system
#> attr(,"meta")$system$`package-version`
#> [1] ‘4.3.0’
#>
#> attr(,"meta")$system$`r-version`
#> [1] ‘4.5.0’
#>
#> attr(,"meta")$system$system
#> sysname machine user
#> "Darwin" "arm64" "kbenoit"
#>
#> attr(,"meta")$system$directory
#> [1] "/Users/kbenoit/Dropbox/GitHub/quanteda/quanteda/docs/reference"
#>
#> attr(,"meta")$system$created
#> [1] "2025-05-21"
#>
#>
#> attr(,"meta")$object
#> attr(,"meta")$object$unit
#> [1] "documents"
#>
#> attr(,"meta")$object$what
#> [1] "word"
#>
#> attr(,"meta")$object$tokenizer
#> [1] "tokenize_word4"
#>
#> attr(,"meta")$object$ngram
#> [1] 1
#>
#> attr(,"meta")$object$skip
#> [1] 0
#>
#> attr(,"meta")$object$concatenator
#> [1] "_"
#>
#> attr(,"meta")$object$summary
#> attr(,"meta")$object$summary$hash
#> character(0)
#>
#> attr(,"meta")$object$summary$data
#> NULL
#>
#>
#>
#> attr(,"meta")$user
#> list()
#>
unclass(quanteda:::tokens_recompile(tokens_wordstem(toks2)))
#> $text1
#> [1] 1 2 3 4 2 5
#>
#> attr(,"types")
#> [1] "Stem" "stem" "mani" "word" "."
#> attr(,"padding")
#> [1] TRUE
#> attr(,"docvars")
#> docname_ docid_ segid_
#> 1 text1 text1 1
#> attr(,"meta")
#> attr(,"meta")$system
#> attr(,"meta")$system$`package-version`
#> [1] ‘4.3.0’
#>
#> attr(,"meta")$system$`r-version`
#> [1] ‘4.5.0’
#>
#> attr(,"meta")$system$system
#> sysname machine user
#> "Darwin" "arm64" "kbenoit"
#>
#> attr(,"meta")$system$directory
#> [1] "/Users/kbenoit/Dropbox/GitHub/quanteda/quanteda/docs/reference"
#>
#> attr(,"meta")$system$created
#> [1] "2025-05-21"
#>
#>
#> attr(,"meta")$object
#> attr(,"meta")$object$unit
#> [1] "documents"
#>
#> attr(,"meta")$object$what
#> [1] "word"
#>
#> attr(,"meta")$object$tokenizer
#> [1] "tokenize_word4"
#>
#> attr(,"meta")$object$ngram
#> [1] 1
#>
#> attr(,"meta")$object$skip
#> [1] 0
#>
#> attr(,"meta")$object$concatenator
#> [1] "_"
#>
#> attr(,"meta")$object$summary
#> attr(,"meta")$object$summary$hash
#> character(0)
#>
#> attr(,"meta")$object$summary$data
#> NULL
#>
#>
#>
#> attr(,"meta")$user
#> list()
#>
# compounding
toks3 <- tokens("One two three four.")
unclass(toks3)
#> $text1
#> [1] 1 2 3 4 5
#>
#> attr(,"types")
#> [1] "One" "two" "three" "four" "."
#> attr(,"padding")
#> [1] TRUE
#> attr(,"docvars")
#> docname_ docid_ segid_
#> 1 text1 text1 1
#> attr(,"meta")
#> attr(,"meta")$system
#> attr(,"meta")$system$`package-version`
#> [1] ‘4.3.0’
#>
#> attr(,"meta")$system$`r-version`
#> [1] ‘4.5.0’
#>
#> attr(,"meta")$system$system
#> sysname machine user
#> "Darwin" "arm64" "kbenoit"
#>
#> attr(,"meta")$system$directory
#> [1] "/Users/kbenoit/Dropbox/GitHub/quanteda/quanteda/docs/reference"
#>
#> attr(,"meta")$system$created
#> [1] "2025-05-21"
#>
#>
#> attr(,"meta")$object
#> attr(,"meta")$object$unit
#> [1] "documents"
#>
#> attr(,"meta")$object$what
#> [1] "word"
#>
#> attr(,"meta")$object$tokenizer
#> [1] "tokenize_word4"
#>
#> attr(,"meta")$object$ngram
#> [1] 1
#>
#> attr(,"meta")$object$skip
#> [1] 0
#>
#> attr(,"meta")$object$concatenator
#> [1] "_"
#>
#> attr(,"meta")$object$summary
#> attr(,"meta")$object$summary$hash
#> character(0)
#>
#> attr(,"meta")$object$summary$data
#> NULL
#>
#>
#>
#> attr(,"meta")$user
#> list()
#>
unclass(tokens_compound(toks3, "two three"))
#> $text1
#> [1] 1 2 3 4 5
#>
#> attr(,"types")
#> [1] "One" "two" "three" "four" "."
#> attr(,"padding")
#> [1] TRUE
#> attr(,"docvars")
#> docname_ docid_ segid_
#> 1 text1 text1 1
#> attr(,"meta")
#> attr(,"meta")$system
#> attr(,"meta")$system$`package-version`
#> [1] ‘4.3.0’
#>
#> attr(,"meta")$system$`r-version`
#> [1] ‘4.5.0’
#>
#> attr(,"meta")$system$system
#> sysname machine user
#> "Darwin" "arm64" "kbenoit"
#>
#> attr(,"meta")$system$directory
#> [1] "/Users/kbenoit/Dropbox/GitHub/quanteda/quanteda/docs/reference"
#>
#> attr(,"meta")$system$created
#> [1] "2025-05-21"
#>
#>
#> attr(,"meta")$object
#> attr(,"meta")$object$unit
#> [1] "documents"
#>
#> attr(,"meta")$object$what
#> [1] "word"
#>
#> attr(,"meta")$object$tokenizer
#> [1] "tokenize_word4"
#>
#> attr(,"meta")$object$ngram
#> [1] 1
#>
#> attr(,"meta")$object$skip
#> [1] 0
#>
#> attr(,"meta")$object$concatenator
#> [1] "_"
#>
#> attr(,"meta")$object$summary
#> attr(,"meta")$object$summary$hash
#> character(0)
#>
#> attr(,"meta")$object$summary$data
#> NULL
#>
#>
#>
#> attr(,"meta")$user
#> list()
#>
# lookup
dict <- dictionary(list(test = c("one", "three")))
unclass(tokens_lookup(toks3, dict))
#> $text1
#> [1] 1 1
#>
#> attr(,"types")
#> [1] "test"
#> attr(,"padding")
#> [1] TRUE
#> attr(,"docvars")
#> docname_ docid_ segid_
#> 1 text1 text1 1
#> attr(,"meta")
#> attr(,"meta")$system
#> attr(,"meta")$system$`package-version`
#> [1] ‘4.3.0’
#>
#> attr(,"meta")$system$`r-version`
#> [1] ‘4.5.0’
#>
#> attr(,"meta")$system$system
#> sysname machine user
#> "Darwin" "arm64" "kbenoit"
#>
#> attr(,"meta")$system$directory
#> [1] "/Users/kbenoit/Dropbox/GitHub/quanteda/quanteda/docs/reference"
#>
#> attr(,"meta")$system$created
#> [1] "2025-05-21"
#>
#>
#> attr(,"meta")$object
#> attr(,"meta")$object$unit
#> [1] "documents"
#>
#> attr(,"meta")$object$what
#> [1] "dictionary"
#>
#> attr(,"meta")$object$tokenizer
#> [1] "tokenize_word4"
#>
#> attr(,"meta")$object$ngram
#> [1] 1
#>
#> attr(,"meta")$object$skip
#> [1] 0
#>
#> attr(,"meta")$object$concatenator
#> [1] "_"
#>
#> attr(,"meta")$object$summary
#> attr(,"meta")$object$summary$hash
#> character(0)
#>
#> attr(,"meta")$object$summary$data
#> NULL
#>
#>
#>
#> attr(,"meta")$user
#> list()
#>
# empty pads
unclass(tokens_select(toks3, dict))
#> $text1
#> [1] 1 2
#>
#> attr(,"types")
#> [1] "One" "three"
#> attr(,"padding")
#> [1] TRUE
#> attr(,"docvars")
#> docname_ docid_ segid_
#> 1 text1 text1 1
#> attr(,"meta")
#> attr(,"meta")$system
#> attr(,"meta")$system$`package-version`
#> [1] ‘4.3.0’
#>
#> attr(,"meta")$system$`r-version`
#> [1] ‘4.5.0’
#>
#> attr(,"meta")$system$system
#> sysname machine user
#> "Darwin" "arm64" "kbenoit"
#>
#> attr(,"meta")$system$directory
#> [1] "/Users/kbenoit/Dropbox/GitHub/quanteda/quanteda/docs/reference"
#>
#> attr(,"meta")$system$created
#> [1] "2025-05-21"
#>
#>
#> attr(,"meta")$object
#> attr(,"meta")$object$unit
#> [1] "documents"
#>
#> attr(,"meta")$object$what
#> [1] "word"
#>
#> attr(,"meta")$object$tokenizer
#> [1] "tokenize_word4"
#>
#> attr(,"meta")$object$ngram
#> [1] 1
#>
#> attr(,"meta")$object$skip
#> [1] 0
#>
#> attr(,"meta")$object$concatenator
#> [1] "_"
#>
#> attr(,"meta")$object$summary
#> attr(,"meta")$object$summary$hash
#> character(0)
#>
#> attr(,"meta")$object$summary$data
#> NULL
#>
#>
#>
#> attr(,"meta")$user
#> list()
#>
unclass(tokens_select(toks3, dict, padding = TRUE))
#> $text1
#> [1] 1 0 2 0 0
#>
#> attr(,"types")
#> [1] "One" "three"
#> attr(,"padding")
#> [1] TRUE
#> attr(,"docvars")
#> docname_ docid_ segid_
#> 1 text1 text1 1
#> attr(,"meta")
#> attr(,"meta")$system
#> attr(,"meta")$system$`package-version`
#> [1] ‘4.3.0’
#>
#> attr(,"meta")$system$`r-version`
#> [1] ‘4.5.0’
#>
#> attr(,"meta")$system$system
#> sysname machine user
#> "Darwin" "arm64" "kbenoit"
#>
#> attr(,"meta")$system$directory
#> [1] "/Users/kbenoit/Dropbox/GitHub/quanteda/quanteda/docs/reference"
#>
#> attr(,"meta")$system$created
#> [1] "2025-05-21"
#>
#>
#> attr(,"meta")$object
#> attr(,"meta")$object$unit
#> [1] "documents"
#>
#> attr(,"meta")$object$what
#> [1] "word"
#>
#> attr(,"meta")$object$tokenizer
#> [1] "tokenize_word4"
#>
#> attr(,"meta")$object$ngram
#> [1] 1
#>
#> attr(,"meta")$object$skip
#> [1] 0
#>
#> attr(,"meta")$object$concatenator
#> [1] "_"
#>
#> attr(,"meta")$object$summary
#> attr(,"meta")$object$summary$hash
#> character(0)
#>
#> attr(,"meta")$object$summary$data
#> NULL
#>
#>
#>
#> attr(,"meta")$user
#> list()
#>
# ngrams
unclass(tokens_ngrams(toks3, n = 2:3))
#> $text1
#> [1] 1 2 3 4 5 6 7
#>
#> attr(,"types")
#> [1] "One_two" "two_three" "three_four" "four_."
#> [5] "One_two_three" "two_three_four" "three_four_."
#> attr(,"padding")
#> [1] TRUE
#> attr(,"docvars")
#> docname_ docid_ segid_
#> 1 text1 text1 1
#> attr(,"meta")
#> attr(,"meta")$system
#> attr(,"meta")$system$`package-version`
#> [1] ‘4.3.0’
#>
#> attr(,"meta")$system$`r-version`
#> [1] ‘4.5.0’
#>
#> attr(,"meta")$system$system
#> sysname machine user
#> "Darwin" "arm64" "kbenoit"
#>
#> attr(,"meta")$system$directory
#> [1] "/Users/kbenoit/Dropbox/GitHub/quanteda/quanteda/docs/reference"
#>
#> attr(,"meta")$system$created
#> [1] "2025-05-21"
#>
#>
#> attr(,"meta")$object
#> attr(,"meta")$object$unit
#> [1] "documents"
#>
#> attr(,"meta")$object$what
#> [1] "word"
#>
#> attr(,"meta")$object$tokenizer
#> [1] "tokenize_word4"
#>
#> attr(,"meta")$object$ngram
#> [1] 2 3
#>
#> attr(,"meta")$object$skip
#> [1] 0
#>
#> attr(,"meta")$object$concatenator
#> [1] "_"
#>
#> attr(,"meta")$object$summary
#> attr(,"meta")$object$summary$hash
#> character(0)
#>
#> attr(,"meta")$object$summary$data
#> NULL
#>
#>
#>
#> attr(,"meta")$user
#> list()
#>