Segment tokens into new documents containing an equal number of tokens, optionally with overlap between consecutive chunks.

tokens_chunk(x, size, overlap = 0, use_docvars = TRUE)

Arguments

x

tokens object whose token elements will be segmented into chunks

size

integer; the token length of the chunks

overlap

integer; the number of tokens at the start of each chunk that are repeated from the end of the preceding chunk

use_docvars

if TRUE, repeat the docvar values for each chunk; if FALSE, drop the docvars in the chunked tokens

Value

A tokens object whose documents have been split into chunks of length size; the final chunk(s) of each document may be shorter when too few tokens remain.

See also

Examples

txts <- c(doc1 = "Fellow citizens, I am again called upon by the voice of my country to execute the functions of its Chief Magistrate.",
          doc2 = "When the occasion proper for it shall arrive, I shall endeavor to express the high sense I entertain of this distinguished honor.")
toks <- tokens(txts)
tokens_chunk(toks, size = 5)
#> tokens from 10 documents. #> doc1.1 : #> [1] "Fellow" "citizens" "," "I" "am" #> #> doc1.2 : #> [1] "again" "called" "upon" "by" "the" #> #> doc1.3 : #> [1] "voice" "of" "my" "country" "to" #> #> doc1.4 : #> [1] "execute" "the" "functions" "of" "its" #> #> doc1.5 : #> [1] "Chief" "Magistrate" "." #> #> doc2.1 : #> [1] "When" "the" "occasion" "proper" "for" #> #> doc2.2 : #> [1] "it" "shall" "arrive" "," "I" #> #> doc2.3 : #> [1] "shall" "endeavor" "to" "express" "the" #> #> doc2.4 : #> [1] "high" "sense" "I" "entertain" "of" #> #> doc2.5 : #> [1] "this" "distinguished" "honor" "." #>
tokens_chunk(toks, size = 5, overlap = 4)
#> tokens from 47 documents. #> doc1.1 : #> [1] "Fellow" "citizens" "," "I" "am" #> #> doc1.2 : #> [1] "citizens" "," "I" "am" "again" #> #> doc1.3 : #> [1] "," "I" "am" "again" "called" #> #> doc1.4 : #> [1] "I" "am" "again" "called" "upon" #> #> doc1.5 : #> [1] "am" "again" "called" "upon" "by" #> #> doc1.6 : #> [1] "again" "called" "upon" "by" "the" #> #> doc1.7 : #> [1] "called" "upon" "by" "the" "voice" #> #> doc1.8 : #> [1] "upon" "by" "the" "voice" "of" #> #> doc1.9 : #> [1] "by" "the" "voice" "of" "my" #> #> doc1.10 : #> [1] "the" "voice" "of" "my" "country" #> #> doc1.11 : #> [1] "voice" "of" "my" "country" "to" #> #> doc1.12 : #> [1] "of" "my" "country" "to" "execute" #> #> doc1.13 : #> [1] "my" "country" "to" "execute" "the" #> #> doc1.14 : #> [1] "country" "to" "execute" "the" "functions" #> #> doc1.15 : #> [1] "to" "execute" "the" "functions" "of" #> #> doc1.16 : #> [1] "execute" "the" "functions" "of" "its" #> #> doc1.17 : #> [1] "the" "functions" "of" "its" "Chief" #> #> doc1.18 : #> [1] "functions" "of" "its" "Chief" "Magistrate" #> #> doc1.19 : #> [1] "of" "its" "Chief" "Magistrate" "." #> #> doc1.20 : #> [1] "its" "Chief" "Magistrate" "." #> #> doc1.21 : #> [1] "Chief" "Magistrate" "." #> #> doc1.22 : #> [1] "Magistrate" "." #> #> doc1.23 : #> [1] "." 
#> #> doc2.1 : #> [1] "When" "the" "occasion" "proper" "for" #> #> doc2.2 : #> [1] "the" "occasion" "proper" "for" "it" #> #> doc2.3 : #> [1] "occasion" "proper" "for" "it" "shall" #> #> doc2.4 : #> [1] "proper" "for" "it" "shall" "arrive" #> #> doc2.5 : #> [1] "for" "it" "shall" "arrive" "," #> #> doc2.6 : #> [1] "it" "shall" "arrive" "," "I" #> #> doc2.7 : #> [1] "shall" "arrive" "," "I" "shall" #> #> doc2.8 : #> [1] "arrive" "," "I" "shall" "endeavor" #> #> doc2.9 : #> [1] "," "I" "shall" "endeavor" "to" #> #> doc2.10 : #> [1] "I" "shall" "endeavor" "to" "express" #> #> doc2.11 : #> [1] "shall" "endeavor" "to" "express" "the" #> #> doc2.12 : #> [1] "endeavor" "to" "express" "the" "high" #> #> doc2.13 : #> [1] "to" "express" "the" "high" "sense" #> #> doc2.14 : #> [1] "express" "the" "high" "sense" "I" #> #> doc2.15 : #> [1] "the" "high" "sense" "I" "entertain" #> #> doc2.16 : #> [1] "high" "sense" "I" "entertain" "of" #> #> doc2.17 : #> [1] "sense" "I" "entertain" "of" "this" #> #> doc2.18 : #> [1] "I" "entertain" "of" "this" #> [5] "distinguished" #> #> doc2.19 : #> [1] "entertain" "of" "this" "distinguished" #> [5] "honor" #> #> doc2.20 : #> [1] "of" "this" "distinguished" "honor" #> [5] "." #> #> doc2.21 : #> [1] "this" "distinguished" "honor" "." #> #> doc2.22 : #> [1] "distinguished" "honor" "." #> #> doc2.23 : #> [1] "honor" "." #> #> doc2.24 : #> [1] "." #>