Download corpus

Download a corpus constructed from the Reports on the Work of the Government published by the Premier of the State Council between 1954 and 2017. You can download the corpus using the quanteda.corpora package.

# Install quanteda.corpora from GitHub only when it is not already available,
# so that re-running this script does not reinstall the package each time.
if (!requireNamespace("quanteda.corpora", quietly = TRUE)) {
  devtools::install_github("quanteda/quanteda.corpora")
}

# Attach quanteda explicitly: the chunks further down call tokens(), dfm(),
# fcm(), corpus_subset() and topfeatures() without a namespace prefix.
library(quanteda)
library(quanteda.corpora)

# Download the corpus object (an .rds file) from the given URL.
corp <- quanteda.corpora::download(url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1")

Tokenization

# Stopword list for Chinese ("zh"), taken from the "misc" source.
ch_stop <- stopwords("zh", source = "misc")

# Tokenize the corpus without punctuation, then drop the stopwords.
ch_toks <- tokens_remove(
  tokens(corp, remove_punct = TRUE),
  pattern = ch_stop
)

# Build a document-feature matrix and print its most frequent features.
ch_dfm <- dfm(ch_toks)
topfeatures(ch_dfm)
## 发展 经济 社会 建设 改革 人民 
## 5627 5036 4255 4248 2931 2897 
## 主义 工作 企业 国家 
## 2817 2642 2627 2595

Analysis

Word cloud

# Plot a word cloud of the features in ch_dfm.
library(quanteda.textplots)

# Fix the RNG seed before plotting.
set.seed(100)

# The font argument sets the font correctly for macOS: "SimHei" is requested
# when the system name is "Darwin", otherwise NULL is passed.
textplot_wordcloud(
  ch_dfm,
  min_count = 500,
  max_words = 100,
  random_order = FALSE,
  rotation = 0.25,
  min_size = 0.5,
  max_size = 2.8,
  font = if (Sys.info()[["sysname"]] == "Darwin") "SimHei" else NULL,
  color = RColorBrewer::brewer.pal(8, "Dark2")
)

Feature co-occurrence matrix

# Feature co-occurrence matrix for the documents whose Year is "2017",
# counted within a window of 5 tokens.
ch17_corp <- corpus_subset(corp, Year == "2017")
ch17_toks <- tokens_remove(tokens(ch17_corp, remove_punct = TRUE), ch_stop)
ch_fcm <- fcm(ch17_toks, context = "window", window = 5L)

# Top features in the "改革" row of the co-occurrence matrix.
topfeatures(ch_fcm["改革", ])
## 推进 制度   性 体制 完善   试 
##   23   19   11   11   10    8 
##   点 供给   侧 结构 
##    8    7    7    7

Unsupervised document scaling

library(quanteda.textmodels)

# Fit a Wordfish scaling model on the document-feature matrix.
wf <- textmodel_wordfish(ch_dfm)

# Years for the x-axis: 1954-2017 without 1961-1963, 1965-1974 and 1976-1977.
# NOTE(review): presumably these are the years with a report in the corpus,
# in document order -- confirm against the corpus.
y <- setdiff(1954:2017, c(1961:1963, 1965:1974, 1976:1977))

# Plot the estimated document positions (theta) against the years.
plot(y, wf$theta, xlab = "Year", ylab = "Position")

Collocations

# Bigram collocations across the whole dataset, keeping those with a
# minimum count of 20; show the first 10 rows as a table.
library(quanteda.textstats)
ch_col <- textstat_collocations(ch_toks, min_count = 20, size = 2)
knitr::kable(head(ch_col, n = 10))
collocation count count_nested length lambda z
社会 主义 1787 0 2 5.681516 129.08061
1 9 678 0 2 6.550767 95.51231
亿 元 689 0 2 7.465391 93.26674
0 0 491 0 2 5.737061 85.65216
现代 化 632 0 2 6.970625 83.79442
体制 改革 504 0 2 5.213290 77.68085
9 5 350 0 2 5.747953 75.33618
五年 计划 341 0 2 5.379229 71.91712
各级 政府 306 0 2 6.130740 66.85440
增长 百分 300 0 2 5.540923 66.12134
# Bigram collocations in the 2017 report only; show the first 10 rows.
ch17_col <- textstat_collocations(ch17_toks, size = 2)
knitr::kable(head(ch17_col, n = 10))
collocation count count_nested length lambda z
人民 群众 12 0 2 5.406843 12.89491
亿 元 14 0 2 8.302839 12.62184
调 控 11 0 2 7.593829 12.41301
政府 工作 9 0 2 4.710228 11.07990
深入 实施 8 0 2 5.018592 10.92455
党 中央 7 0 2 5.747235 10.90905
体制 改革 11 0 2 5.317394 10.53589
国内 生产 6 0 2 6.166877 10.48876
现代 化 8 0 2 5.706046 10.43500
基础 设施 7 0 2 7.549629 10.42514