Class affinity maximum likelihood text scaling model

textmodel_affinity implements the maximum likelihood supervised text scaling method described in Perry and Benoit (2017).

textmodel_affinity(
  x,
  y,
  exclude = NULL,
  smooth = 0.5,
  ref_smooth = 0.5,
  verbose = quanteda_options("verbose")
)

Arguments

x	the dfm or bootstrap_dfm object on which the model will be fit. Does not need to contain only the training documents, since the index of these will be matched automatically.
y	vector of training classes/scores associated with each document in `x`; documents that are not part of the training set are given as `NA` (as in the example below)
exclude	a set of words to exclude from the model
smooth	a smoothing parameter for class affinities; defaults to 0.5 (Jeffreys prior). A plausible alternative would be 1.0 (Laplace prior).
ref_smooth	a smoothing parameter for token distributions; defaults to 0.5
verbose	logical; if `TRUE`, print diagnostic information during fitting.

References

Perry, P.O. & Benoit, K.R. (2017). Scaling Text with the Class Affinity Model. arXiv:1710.08963 [stat.ML].

Examples

(af <- textmodel_affinity(data_dfm_lbgexample, y = c("L", NA, NA, NA, "R", NA)))
#> Call:
#> textmodel_affinity.dfm(x = data_dfm_lbgexample, y = c("L", NA, 
#>     NA, NA, "R", NA))
#> 
#> Training documents per class:L: 1, R: 1; total training features: 37
predict(af)
#> $coefficients
#>               L            R
#> R1 0.9994964426 0.0005035574
#> R2 0.9994114959 0.0005885041
#> R3 0.5000000000 0.5000000000
#> R4 0.0005885041 0.9994114959
#> R5 0.0005035574 0.9994964426
#> V1 0.9986670961 0.0013329039
#> 
#> $se
#>               L            R
#> R1 0.0007119597 0.0007119597
#> R2 0.0008319957 0.0008319957
#> R3 0.0273371117 0.0273371117
#> R4 0.0008319957 0.0008319957
#> R5 0.0007119597 0.0007119597
#> V1 0.0018655335 0.0018655335
#> 
#> $cov
#> , , R1
#> 
#>               L             R
#> L  5.068867e-07 -5.068867e-07
#> R -5.068867e-07  5.068867e-07
#> 
#> , , R2
#> 
#>               L             R
#> L  6.922169e-07 -6.922169e-07
#> R -6.922169e-07  6.922169e-07
#> 
#> , , R3
#> 
#>               L             R
#> L  0.0007473177 -0.0007473177
#> R -0.0007473177  0.0007473177
#> 
#> , , R4
#> 
#>               L             R
#> L  6.922169e-07 -6.922169e-07
#> R -6.922169e-07  6.922169e-07
#> 
#> , , R5
#> 
#>               L             R
#> L  5.068867e-07 -5.068867e-07
#> R -5.068867e-07  5.068867e-07
#> 
#> , , V1
#> 
#>               L             R
#> L  3.480215e-06 -3.480215e-06
#> R -3.480215e-06  3.480215e-06
#> 
#> 
#> $smooth
#> [1] 0.5 0.5
#> 
#> $newdata
#> Document-feature matrix of: 6 documents, 37 features (54.1% sparse).
#> 
#> $train
#> [1]  TRUE FALSE FALSE FALSE  TRUE FALSE
#> 
#> $level
#> [1] 0.95
#> 
#> $p
#> 37 x 2 sparse Matrix of class "dgCMatrix"
#>         docs
#> features            L            R
#>       A  0.0024582104 0.0004916421
#>       B  0.0034414946 0.0004916421
#>       C  0.0103244838 0.0004916421
#>       D  0.0221238938 0.0004916421
#>       E  0.0447394297 0.0004916421
#>       F  0.0771878073 0.0004916421
#>       G  0.1135693215 0.0004916421
#>       H  0.1440511308 0.0004916421
#>       I  0.1558505408 0.0004916421
#>       J  0.1440511308 0.0004916421
#>       K  0.1135693215 0.0004916421
#>       L  0.0771878073 0.0004916421
#>       M  0.0447394297 0.0004916421
#>       N  0.0221238938 0.0004916421
#>       O  0.0103244838 0.0004916421
#>       P  0.0034414946 0.0004916421
#>       Q  0.0024582104 0.0004916421
#>       R  .            .           
#>       S  .            .           
#>       T  .            .           
#>       U  0.0004916421 0.0024582104
#>       V  0.0004916421 0.0034414946
#>       W  0.0004916421 0.0103244838
#>       X  0.0004916421 0.0221238938
#>       Y  0.0004916421 0.0447394297
#>       Z  0.0004916421 0.0771878073
#>       ZA 0.0004916421 0.1135693215
#>       ZB 0.0004916421 0.1440511308
#>       ZC 0.0004916421 0.1558505408
#>       ZD 0.0004916421 0.1440511308
#>       ZE 0.0004916421 0.1135693215
#>       ZF 0.0004916421 0.0771878073
#>       ZG 0.0004916421 0.0447394297
#>       ZH 0.0004916421 0.0221238938
#>       ZI 0.0004916421 0.0103244838
#>       ZJ 0.0004916421 0.0034414946
#>       ZK 0.0004916421 0.0024582104
#> 
#> $support
#>     A     B     C     D     E     F     G     H     I     J     K     L     M 
#>  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#>     N     O     P     Q     R     S     T     U     V     W     X     Y     Z 
#>  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#>    ZA    ZB    ZC    ZD    ZE    ZF    ZG    ZH    ZI    ZJ    ZK 
#>  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#> 
predict(af, newdata = data_dfm_lbgexample[6, ])
#> $coefficients
#>            L           R
#> V1 0.9986671 0.001332904
#> 
#> $se
#>              L           R
#> V1 0.001865533 0.001865533
#> 
#> $cov
#> , , V1
#> 
#>               L             R
#> L  3.480215e-06 -3.480215e-06
#> R -3.480215e-06  3.480215e-06
#> 
#> 
#> $smooth
#> [1] 0.5 0.5
#> 
#> $newdata
#> Document-feature matrix of: 1 document, 37 features (54.1% sparse).
#> 
#> $train
#> [1] FALSE
#> 
#> $level
#> [1] 0.95
#> 
#> $p
#> 37 x 2 sparse Matrix of class "dgCMatrix"
#>         docs
#> features            L            R
#>       A  0.0024582104 0.0004916421
#>       B  0.0034414946 0.0004916421
#>       C  0.0103244838 0.0004916421
#>       D  0.0221238938 0.0004916421
#>       E  0.0447394297 0.0004916421
#>       F  0.0771878073 0.0004916421
#>       G  0.1135693215 0.0004916421
#>       H  0.1440511308 0.0004916421
#>       I  0.1558505408 0.0004916421
#>       J  0.1440511308 0.0004916421
#>       K  0.1135693215 0.0004916421
#>       L  0.0771878073 0.0004916421
#>       M  0.0447394297 0.0004916421
#>       N  0.0221238938 0.0004916421
#>       O  0.0103244838 0.0004916421
#>       P  0.0034414946 0.0004916421
#>       Q  0.0024582104 0.0004916421
#>       R  .            .           
#>       S  .            .           
#>       T  .            .           
#>       U  0.0004916421 0.0024582104
#>       V  0.0004916421 0.0034414946
#>       W  0.0004916421 0.0103244838
#>       X  0.0004916421 0.0221238938
#>       Y  0.0004916421 0.0447394297
#>       Z  0.0004916421 0.0771878073
#>       ZA 0.0004916421 0.1135693215
#>       ZB 0.0004916421 0.1440511308
#>       ZC 0.0004916421 0.1558505408
#>       ZD 0.0004916421 0.1440511308
#>       ZE 0.0004916421 0.1135693215
#>       ZF 0.0004916421 0.0771878073
#>       ZG 0.0004916421 0.0447394297
#>       ZH 0.0004916421 0.0221238938
#>       ZI 0.0004916421 0.0103244838
#>       ZJ 0.0004916421 0.0034414946
#>       ZK 0.0004916421 0.0024582104
#> 
#> $support
#>     A     B     C     D     E     F     G     H     I     J     K     L     M 
#>  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#>     N     O     P     Q     R     S     T     U     V     W     X     Y     Z 
#>  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#>    ZA    ZB    ZC    ZD    ZE    ZF    ZG    ZH    ZI    ZJ    ZK 
#>  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#> 

if (FALSE) {
# compute bootstrapped SEs
dfmat <- bootstrap_dfm(data_corpus_dailnoconf1991, n = 10, remove_punct = TRUE)
textmodel_affinity(dfmat, y = c("Govt", "Opp", "Opp", rep(NA, 55)))
}

Arguments

References

See also

Examples

Contents

Author