library(quanteda)

Download corpus

Download the corpus constructed from the Reports on the Work of the Government, published by the Premier of the State Council between 1954 and 2017. You can download the corpus using the quanteda.corpora package.

# Install the quanteda.corpora helper package from GitHub, but only when it is
# not already available, so re-running this script does not reinstall it.
if (!requireNamespace("quanteda.corpora", quietly = TRUE)) {
    devtools::install_github("quanteda/quanteda.corpora")
}
# NOTE(review): nothing in the visible script assigns `corp`, which the
# tokenization step reads. Load the corpus here (presumably with
# quanteda.corpora::download(); confirm the exact source) before continuing.

Tokenization

# Chinese stopword list, taken from the "misc" source
ch_stop <- stopwords("zh", source = "misc")

# Tokenize the corpus with punctuation dropped, then strip the stopwords.
# Written as two plain calls, so no pipe operator is required.
ch_toks <- tokens(corp, remove_punct = TRUE)
ch_toks <- tokens_remove(ch_toks, pattern = ch_stop)

# Build the document-feature matrix from the cleaned tokens and
# list its ten most frequent features (recorded output shown below)
ch_dfm <- dfm(ch_toks)
topfeatures(ch_dfm, n = 10)
## 发展 经济 社会 建设 改革 人民 主义 工作 企业 国家 
## 5627 5036 4255 4248 2931 2897 2817 2642 2627 2595

Analysis

Feature co-occurrence matrix

# Keep only the documents whose Year is 2017 and tokenize them the same way
# as the full corpus: punctuation dropped, stopwords removed
ch17_corp <- corpus_subset(corp, Year == "2017")
ch17_toks <- tokens(ch17_corp, remove_punct = TRUE)
ch17_toks <- tokens_remove(ch17_toks, ch_stop)

# Feature co-occurrence matrix counted within a window of 5 tokens
ch_fcm <- fcm(ch17_toks, context = "window", window = 5L)

# Top co-occurring features for the selected row (recorded output shown below)
topfeatures(ch_fcm["改革", ])
## 推进 制度   性 体制 完善   试   点 供给   侧 结构 
##   23   19   11   11   10    8    8    7    7    7

Unsupervised document scaling

# Fit a Wordfish scaling model to the full document-feature matrix.
# NOTE(review): in recent quanteda releases textmodel_wordfish() is provided
# by the quanteda.textmodels package; confirm it is attached before this runs.
wf <- textmodel_wordfish(ch_dfm)

# Take each document's year from the `Year` document variable instead of a
# hand-maintained list of years, so the x values always line up one-to-one,
# and in the same order, with the estimated positions in wf$theta.
y <- as.integer(as.character(docvars(ch_dfm, "Year")))
plot(y, wf$theta, xlab = "Year", ylab = "Position")

Collocations

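| collocation | count | count_nested | length | lambda | z |
|---|---|---|---|---|---|
| 社会 主义 | 1787 | 0 | 2 | 5.681248 | 129.07450 |
| 1 9 | 678 | 0 | 2 | 6.550501 | 95.50843 |
| 亿 元 | 689 | 0 | 2 | 7.465125 | 93.26342 |
| 0 0 | 491 | 0 | 2 | 5.736795 | 85.64818 |
| 现代 化 | 632 | 0 | 2 | 6.970359 | 83.79122 |
| 体制 改革 | 504 | 0 | 2 | 5.213023 | 77.67687 |
| 9 5 | 350 | 0 | 2 | 5.747687 | 75.33269 |
| 五年 计划 | 341 | 0 | 2 | 5.378963 | 71.91355 |
| 各级 政府 | 306 | 0 | 2 | 6.130473 | 66.85150 |
| 增长 百分 | 300 | 0 | 2 | 5.540657 | 66.11816 |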
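| collocation | count | count_nested | length | lambda | z |
|---|---|---|---|---|---|
| 人民 群众 | 12 | 0 | 2 | 5.406485 | 12.89405 |
| 亿 元 | 14 | 0 | 2 | 8.302483 | 12.62130 |
| 调 控 | 11 | 0 | 2 | 7.593472 | 12.41243 |
| 政府 工作 | 9 | 0 | 2 | 4.709869 | 11.07905 |
| 深入 实施 | 8 | 0 | 2 | 5.018234 | 10.92377 |
| 党 中央 | 7 | 0 | 2 | 5.746878 | 10.90837 |
| 体制 改革 | 11 | 0 | 2 | 5.317035 | 10.53518 |
| 国内 生产 | 6 | 0 | 2 | 6.166520 | 10.48816 |
| 现代 化 | 8 | 0 | 2 | 5.705688 | 10.43435 |
| 基础 设施 | 7 | 0 | 2 | 7.549273 | 10.42465 |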