regex2fixed
converts regex and glob patterns to fixed patterns.
regex2id
converts regex or glob patterns to type IDs to allow C++
functions to perform fast searches in a tokens object. C++ functions use a
list of type IDs to construct a hash table, against which sub-vectors of
the tokens object are matched. This function constructs an index of glob patterns
for faster matching.
This is an internal function for regex2id()
that selects types using an
index of types by regular expressions.
An internal function for regex2id() that constructs an index of regex patterns (e.g. ^xxxx, xxxx$ and ^xxxx$) to avoid an expensive sequential search by stri_detect_regex.
Internal function for select_types()
to search the index using
fastmatch.
A simpler and faster version of expand.grid() in the base package.
Internal function for select_types() to check if a glob pattern is indexed by index_types().
regex2fixed(pattern, types = NULL, valuetype = NULL, case_insensitive = NULL, index = NULL) regex2id(pattern, types = NULL, valuetype = NULL, case_insensitive = NULL, index = NULL) search_glob(patterns, types_search, index) search_regex(patterns, types_search, case_insensitive) search_fixed(patterns, types_search, index) index_types(types, valuetype, case_insensitive, max_len = NULL) search_index(pattern, index) expand(elem) is_indexed(pattern)
pattern | a character vector, list of character vectors, dictionary, collocations, or dfm. See pattern for details. |
---|---|
types | unique types of tokens obtained by types() |
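valuetype | the type of pattern matching: "glob" for "glob"-style wildcard expressions; "regex" for regular expressions; or "fixed" for exact matching. See valuetype for details. |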
case_insensitive | ignore case when matching, if TRUE |
index | If TRUE, an index is constructed automatically. It also accepts an index constructed by index_types(). |
patterns | a list of regular expressions |
types_search | lowercased types when case_insensitive = TRUE |
max_len | maximum length of types to be indexed |
elem | list of elements to be combined |
case_insensitive | ignore case when matching, if TRUE |
index | index object created by index_types() |
types | types of tokens to index |
case_insensitive | ignore case when matching, if TRUE |
regex | a glob expression to search |
index | an index object created by index_types() |
x | a glob pattern to be tested |
regex2fixed
returns a list of character vectors containing
types
regex2id
returns a list of integer vectors containing type
IDs
a list of integer vectors containing type IDs with index keys as an attribute
index_types
pattern <- list(c('^a$', '^b'), c('c'), c('d')) types <- c('A', 'AA', 'B', 'BB', 'BBB', 'C', 'CC') quanteda:::regex2fixed(pattern, types, 'regex', case_insensitive = TRUE)#> [[1]] #> [1] "A" "B" #> #> [[2]] #> [1] "A" "BB" #> #> [[3]] #> [1] "A" "BBB" #> #> [[4]] #> [1] "C" #> #> [[5]] #> [1] "CC" #>index <- quanteda:::index_types(types, 'regex', case_insensitive = TRUE) quanteda:::regex2fixed(pattern, index = index)#> [[1]] #> [1] "A" "B" #> #> [[2]] #> [1] "A" "BB" #> #> [[3]] #> [1] "A" "BBB" #> #> [[4]] #> [1] "C" #> #> [[5]] #> [1] "CC" #>types <- c('A', 'AA', 'B', 'BB', 'BBB', 'C', 'CC') pats_regex <- list(c('^a$', '^b'), c('c'), c('d')) quanteda:::regex2id(pats_regex, types, 'regex', case_insensitive = TRUE)#> [[1]] #> [1] 1 3 #> #> [[2]] #> [1] 1 4 #> #> [[3]] #> [1] 1 5 #> #> [[4]] #> [1] 6 #> #> [[5]] #> [1] 7 #>pats_glob <- list(c('a*', 'b*'), c('c'), c('d')) quanteda:::regex2id(pats_glob, types, 'glob', case_insensitive = TRUE)#> [[1]] #> [1] 1 3 #> #> [[2]] #> [1] 2 3 #> #> [[3]] #> [1] 1 4 #> #> [[4]] #> [1] 2 4 #> #> [[5]] #> [1] 1 5 #> #> [[6]] #> [1] 2 5 #> #> [[7]] #> [1] 6 #>index <- quanteda:::index_types(c('xxx', 'yyyy', 'ZZZ'), 'glob', FALSE, 3) quanteda:::search_glob('yy*', attr(index, 'type_search'), index)#> [[1]] #> [1] 2 #>#> [[1]] #> [1] "a" "x" #> #> [[2]] #> [1] "b" "x" #> #> [[3]] #> [1] "c" "x" #> #> [[4]] #> [1] "a" "y" #> #> [[5]] #> [1] "b" "y" #> #> [[6]] #> [1] "c" "y" #>