textmodel_affinity() implements the maximum likelihood supervised text scaling method described in Perry and Benoit (2017).

textmodel_affinity(
  x,
  y,
  exclude = NULL,
  smooth = 0.5,
  ref_smooth = 0.5,
  verbose = quanteda_options("verbose")
)

Arguments

x

the dfm or bootstrap_dfm object on which the model will be fit. Does not need to contain only the training documents, since the index of these will be matched automatically.

y

vector of training classes/scores associated with each document identified in x

exclude

a set of words to exclude from the model

smooth

a smoothing parameter for class affinities; defaults to 0.5 (Jeffreys prior). A plausible alternative would be 1.0 (Laplace prior).

ref_smooth

a smoothing parameter for token distributions; defaults to 0.5

verbose

logical; if TRUE print diagnostic information during fitting.

References

Perry, P.O. & Benoit, K.R. (2017). Scaling Text with the Class Affinity Model. arXiv:1710.08963 [stat.ML].

See also

predict.textmodel_affinity for methods of applying a fitted textmodel_affinity model object to predict quantities from (other) documents.

Examples

(af <- textmodel_affinity(data_dfm_lbgexample, y = c("L", NA, NA, NA, "R", NA)))
#> Call: #> textmodel_affinity.dfm(x = data_dfm_lbgexample, y = c("L", NA, #> NA, NA, "R", NA)) #> #> Training documents per class:L: 1, R: 1; total training features: 37
#> $coefficients #> L R #> R1 0.9994964426 0.0005035574 #> R2 0.9994114959 0.0005885041 #> R3 0.5000000000 0.5000000000 #> R4 0.0005885041 0.9994114959 #> R5 0.0005035574 0.9994964426 #> V1 0.9986670961 0.0013329039 #> #> $se #> L R #> R1 0.0007119597 0.0007119597 #> R2 0.0008319957 0.0008319957 #> R3 0.0273371117 0.0273371117 #> R4 0.0008319957 0.0008319957 #> R5 0.0007119597 0.0007119597 #> V1 0.0018655335 0.0018655335 #> #> $cov #> , , R1 #> #> L R #> L 5.068867e-07 -5.068867e-07 #> R -5.068867e-07 5.068867e-07 #> #> , , R2 #> #> L R #> L 6.922169e-07 -6.922169e-07 #> R -6.922169e-07 6.922169e-07 #> #> , , R3 #> #> L R #> L 0.0007473177 -0.0007473177 #> R -0.0007473177 0.0007473177 #> #> , , R4 #> #> L R #> L 6.922169e-07 -6.922169e-07 #> R -6.922169e-07 6.922169e-07 #> #> , , R5 #> #> L R #> L 5.068867e-07 -5.068867e-07 #> R -5.068867e-07 5.068867e-07 #> #> , , V1 #> #> L R #> L 3.480215e-06 -3.480215e-06 #> R -3.480215e-06 3.480215e-06 #> #> #> $smooth #> [1] 0.5 0.5 #> #> $newdata #> Document-feature matrix of: 6 documents, 37 features (54.1% sparse). #> #> $train #> [1] TRUE FALSE FALSE FALSE TRUE FALSE #> #> $level #> [1] 0.95 #> #> $p #> 37 x 2 sparse Matrix of class "dgCMatrix" #> docs #> features L R #> A 0.0024582104 0.0004916421 #> B 0.0034414946 0.0004916421 #> C 0.0103244838 0.0004916421 #> D 0.0221238938 0.0004916421 #> E 0.0447394297 0.0004916421 #> F 0.0771878073 0.0004916421 #> G 0.1135693215 0.0004916421 #> H 0.1440511308 0.0004916421 #> I 0.1558505408 0.0004916421 #> J 0.1440511308 0.0004916421 #> K 0.1135693215 0.0004916421 #> L 0.0771878073 0.0004916421 #> M 0.0447394297 0.0004916421 #> N 0.0221238938 0.0004916421 #> O 0.0103244838 0.0004916421 #> P 0.0034414946 0.0004916421 #> Q 0.0024582104 0.0004916421 #> R . . #> S . . #> T . . 
#> U 0.0004916421 0.0024582104 #> V 0.0004916421 0.0034414946 #> W 0.0004916421 0.0103244838 #> X 0.0004916421 0.0221238938 #> Y 0.0004916421 0.0447394297 #> Z 0.0004916421 0.0771878073 #> ZA 0.0004916421 0.1135693215 #> ZB 0.0004916421 0.1440511308 #> ZC 0.0004916421 0.1558505408 #> ZD 0.0004916421 0.1440511308 #> ZE 0.0004916421 0.1135693215 #> ZF 0.0004916421 0.0771878073 #> ZG 0.0004916421 0.0447394297 #> ZH 0.0004916421 0.0221238938 #> ZI 0.0004916421 0.0103244838 #> ZJ 0.0004916421 0.0034414946 #> ZK 0.0004916421 0.0024582104 #> #> $support #> A B C D E F G H I J K L M #> TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE #> N O P Q R S T U V W X Y Z #> TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE #> ZA ZB ZC ZD ZE ZF ZG ZH ZI ZJ ZK #> TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE #>
predict(af, newdata = data_dfm_lbgexample[6, ])
#> $coefficients #> L R #> V1 0.9986671 0.001332904 #> #> $se #> L R #> V1 0.001865533 0.001865533 #> #> $cov #> , , V1 #> #> L R #> L 3.480215e-06 -3.480215e-06 #> R -3.480215e-06 3.480215e-06 #> #> #> $smooth #> [1] 0.5 0.5 #> #> $newdata #> Document-feature matrix of: 1 document, 37 features (54.1% sparse). #> #> $train #> [1] FALSE #> #> $level #> [1] 0.95 #> #> $p #> 37 x 2 sparse Matrix of class "dgCMatrix" #> docs #> features L R #> A 0.0024582104 0.0004916421 #> B 0.0034414946 0.0004916421 #> C 0.0103244838 0.0004916421 #> D 0.0221238938 0.0004916421 #> E 0.0447394297 0.0004916421 #> F 0.0771878073 0.0004916421 #> G 0.1135693215 0.0004916421 #> H 0.1440511308 0.0004916421 #> I 0.1558505408 0.0004916421 #> J 0.1440511308 0.0004916421 #> K 0.1135693215 0.0004916421 #> L 0.0771878073 0.0004916421 #> M 0.0447394297 0.0004916421 #> N 0.0221238938 0.0004916421 #> O 0.0103244838 0.0004916421 #> P 0.0034414946 0.0004916421 #> Q 0.0024582104 0.0004916421 #> R . . #> S . . #> T . . #> U 0.0004916421 0.0024582104 #> V 0.0004916421 0.0034414946 #> W 0.0004916421 0.0103244838 #> X 0.0004916421 0.0221238938 #> Y 0.0004916421 0.0447394297 #> Z 0.0004916421 0.0771878073 #> ZA 0.0004916421 0.1135693215 #> ZB 0.0004916421 0.1440511308 #> ZC 0.0004916421 0.1558505408 #> ZD 0.0004916421 0.1440511308 #> ZE 0.0004916421 0.1135693215 #> ZF 0.0004916421 0.0771878073 #> ZG 0.0004916421 0.0447394297 #> ZH 0.0004916421 0.0221238938 #> ZI 0.0004916421 0.0103244838 #> ZJ 0.0004916421 0.0034414946 #> ZK 0.0004916421 0.0024582104 #> #> $support #> A B C D E F G H I J K L M #> TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE #> N O P Q R S T U V W X Y Z #> TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE #> ZA ZB ZC ZD ZE ZF ZG ZH ZI ZJ ZK #> TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE #>
if (FALSE) { # compute bootstrapped SEs dfmat <- bootstrap_dfm(data_corpus_dailnoconf1991, n = 10, remove_punct = TRUE) textmodel_affinity(dfmat, y = c("Govt", "Opp", "Opp", rep(NA, 55))) }