Convert a quanteda dfm or corpus object to a format usable by other
packages. The general function `convert` provides easy conversion from a dfm
to the document-term representations used by the other text analysis packages
for which conversions are defined. For corpus objects, `convert` provides
an easy way to make a corpus and its document variables into a data.frame
(or JSON).
convert(x, to, ...)

# S3 method for dfm
convert(
  x,
  to = c("lda", "tm", "stm", "austin", "topicmodels", "lsa", "matrix", "data.frame", "tripletlist"),
  docvars = NULL,
  omit_empty = TRUE,
  docid_field = "doc_id",
  ...
)

# S3 method for corpus
convert(x, to = c("data.frame", "json"), pretty = FALSE, ...)
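Argument | Description |
---|---|
x | a dfm or corpus to be converted |
to | target conversion format. For a dfm, one of: `"lda"`, `"tm"`, `"stm"`, `"austin"`, `"topicmodels"`, `"lsa"`, `"matrix"`, `"data.frame"`, `"tripletlist"`. For a corpus, one of: `"data.frame"`, `"json"`. |
... | unused directly |
docvars | optional data.frame of document variables used as the `meta` information in conversion to the stm package format |
omit_empty | logical; if `TRUE`, omit empty documents and features from the converted dfm (some target formats, such as stm, do not accept empty documents) |
docid_field | character; the name of the column containing document names used when `to = "data.frame"` |
pretty | adds indentation whitespace to JSON output. Can be TRUE/FALSE or a number specifying the number of spaces to indent. See `jsonlite::prettify()` |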
A converted object determined by the value of `to` (see above).
See conversion target package documentation for more detailed descriptions
of the return formats.
## convert a dfm
toks <- corpus_subset(data_corpus_inaugural, Year > 1970) %>%
    tokens()
dfmat1 <- dfm(toks)

# austin's wfm format
identical(dim(dfmat1), dim(convert(dfmat1, to = "austin")))
#> [1] TRUE

# stm package format
stmmat <- convert(dfmat1, to = "stm")
str(stmmat)
#> List of 3
#> $ documents:List of 13
#> ..$ 1973-Nixon : int [1:2, 1:515] 1 17 2 96 3 1 4 5 6 3 ...
#> ..$ 1977-Carter : int [1:2, 1:501] 1 11 2 65 3 7 4 4 7 52 ...
#> ..$ 1981-Reagan : int [1:2, 1:850] 1 9 2 174 3 7 4 3 6 5 ...
#> ..$ 1985-Reagan : int [1:2, 1:876] 1 12 2 177 3 13 4 7 6 3 ...
#> ..$ 1989-Bush : int [1:2, 1:756] 1 7 2 166 3 14 4 16 6 5 ...
#> ..$ 1993-Clinton: int [1:2, 1:605] 2 139 3 6 4 5 7 81 10 4 ...
#> ..$ 1997-Clinton: int [1:2, 1:726] 1 13 2 131 3 13 4 7 6 3 ...
#> ..$ 2001-Bush : int [1:2, 1:592] 1 2 2 110 3 4 4 7 6 1 ...
#> ..$ 2005-Bush : int [1:2, 1:734] 1 2 2 120 3 3 4 8 6 2 ...
#> ..$ 2009-Obama : int [1:2, 1:900] 1 22 2 130 3 22 4 4 5 1 ...
#> ..$ 2013-Obama : int [1:2, 1:786] 1 13 2 99 3 14 4 5 7 89 ...
#> ..$ 2017-Trump : int [1:2, 1:547] 1 11 2 96 3 9 4 8 7 88 ...
#> ..$ 2021-Biden : int [1:2, 1:743] 1 9 2 147 4 10 6 6 7 210 ...
#> $ vocab : chr [1:3616] "-" "," ";" ":" ...
#> $ meta :'data.frame': 13 obs. of 4 variables:
#> ..$ Year : int [1:13] 1973 1977 1981 1985 1989 1993 1997 2001 2005 2009 ...
#> ..$ President: chr [1:13] "Nixon" "Carter" "Reagan" "Reagan" ...
#> ..$ FirstName: chr [1:13] "Richard Milhous" "Jimmy" "Ronald" "Ronald" ...
#> ..$ Party : Factor w/ 6 levels "Democratic","Democratic-Republican",..: 5 1 5 5 5 1 1 5 5 1 ...

# triplet
tripletmat <- convert(dfmat1, to = "tripletlist")
str(tripletmat)
#> List of 3
#> $ document : chr [1:9131] "1973-Nixon" "1981-Reagan" "1989-Bush" "2005-Bush" ...
#> $ feature : chr [1:9131] "mr" "mr" "mr" "mr" ...
#> $ frequency: num [1:9131] 3 3 6 1 1 69 52 130 124 142 ...
if (FALSE) {
# tm's DocumentTermMatrix format
tmdfm <- convert(dfmat1, to = "tm")
str(tmdfm)

# topicmodels package format
str(convert(dfmat1, to = "topicmodels"))

# lda package format
str(convert(dfmat1, to = "lda"))
}

## convert a corpus into a data.frame
corp <- corpus(c(d1 = "Text one.", d2 = "Text two."),
               docvars = data.frame(dvar1 = 1:2, dvar2 = c("one", "two"),
                                    stringsAsFactors = FALSE))
convert(corp, to = "data.frame")
#> doc_id text dvar1 dvar2
#> 1 d1 Text one. 1 one
#> 2 d2 Text two. 2 two

convert(corp, to = "json")
#> [{"doc_id":"d1","text":"Text one.","dvar1":1,"dvar2":"one"},{"doc_id":"d2","text":"Text two.","dvar1":2,"dvar2":"two"}]