data(regex_supplement)
A list with 24 elements
A dataset containing a list of supplemental, canned regular expressions. The
regular expressions in this data set are considered useful but have not been
included in a formal function (of the type rm_XXX). Users can utilize the rm_
function to generate functions that can sub/replace/extract as desired.
The following canned regular expressions are included:
"%s"
that is replaced by sprintf
and is not a valid regex on its own (user supplies (1) n before, (2) the point, & (3) n after)"%s"
that is replaced by sprintf
and is not a valid regex on its own (user supplies (1) n before, (2) the point, & (3) n after)"%s"
that is replaced by sprintf
and is not a valid regex on its own"%s"
that is replaced by sprintf
and is not a valid regex on its own"%s"
that is replaced by sprintf
and is not a valid regex on its own (user supplies the delimiter)[:punct:]
) with the ability to negate; note contains "%s"
that is replaced by sprintf
and is not a valid regex on its own"%s"
that is replaced by sprintf
and is not a valid regex on its own

Regexes from this data set can be added to the pattern argument of any
rm_XXX function via an at sign (@) followed by a regex name from this data
set (e.g., pattern = "@after_the"), provided the regular expression does not
contain non-regex elements such as the sprintf character string %s.
Note that regexes containing %s (a placeholder that is replaced by sprintf)
are not valid regexes on their own. The S function is useful for adding these
missing %s parameters.
time <- rm_(pattern="@time_12_hours")
time("I will go at 12:35 pm")
[1] "I will go at"

x <- "v6.0.156 for Windows 2000/2003/XP/Vista Server version 1.1.20 Client Manager version 1.1.24"
rm_default(x, pattern = "@version", extract=TRUE)
[[1]]
[1] "6.0.156" "1.1.20" "1.1.24"

rm_default(x, pattern = "@version2", extract=TRUE)
[[1]]
[1] "v6.0.156" "version 1.1.20" "version 1.1.24"

x <- "this is 1000000 big 4356. And little 123 number."
rm_default(x, pattern="@thousands_separator", replacement="\\1,")
[1] "this is 1,000,000 big 4,356. And little 123 number."

rm_default(x, pattern="@thousands_separator", replacement="\\1.")
[1] "this is 1.000.000 big 4.356. And little 123 number."

rm_default("I was,but it costs 10,000.", pattern="@white_after_comma", replacement=", ")
[1] "I was, but it costs 10,000."

x <- "I like; the donuts; a lot"
strsplit(x, ";")
[[1]]
[1] "I like" " the donuts" " a lot"

strsplit(x, S(grab("split_keep_delim"), ";"), perl=TRUE)
[[1]]
[1] "I like" "; the donuts" "; a lot"

stringi::stri_split_regex(x, S(grab("split_keep_delim"), ";"))
[[1]]
[1] "I like" "; the donuts" "; a lot"

stringi::stri_split_regex("I like; the donuts; a lot:cool", S(grab("split_keep_delim"), ";|:"))
[[1]]
[1] "I like" "; the donuts" "; a lot" ":cool"

## Grab words around a point
x <- c(
  "the magic word is e",
  "the dog is red and they are blue",
  "I am new but she is not new",
  "hello world",
  "why is it so cold? Perhaps it is Winter.",
  "It is not true the 7 is 8.",
  "Is that my drink?"
)
rm_default(x, pattern = S("@around_", 1, "is", 1), extract=TRUE)
[[1]]
[1] "word is e"
[[2]]
[1] "dog is red"
[[3]]
[1] "she is not"
[[4]]
[1] NA
[[5]]
[1] "why is it" "it is Winter"
[[6]]
[1] "It is not" "7 is 8"
[[7]]
[1] NA

rm_default(x, pattern = S("@around_", 2, "is", 2), extract=TRUE)
[[1]]
[1] "magic word is e"
[[2]]
[1] "the dog is red and"
[[3]]
[1] "but she is not new"
[[4]]
[1] NA
[[5]]
[1] "why is it so" "Perhaps it is Winter"
[[6]]
[1] "It is not true" "the 7 is 8"
[[7]]
[1] NA

rm_default(x, pattern = S("@around_", 1, "is|are|am", 1), extract=TRUE)
[[1]]
[1] "word is e"
[[2]]
[1] "dog is red" "they are blue"
[[3]]
[1] "I am new" "she is not"
[[4]]
[1] NA
[[5]]
[1] "why is it" "it is Winter"
[[6]]
[1] "It is not" "7 is 8"
[[7]]
[1] NA

rm_default(x, pattern = S("@around_", 1, "is not|is|are|am", 1), extract=TRUE)
[[1]]
[1] "word is e"
[[2]]
[1] "dog is red" "they are blue"
[[3]]
[1] "I am new" "she is not new"
[[4]]
[1] NA
[[5]]
[1] "why is it" "it is Winter"
[[6]]
[1] "It is not true" "7 is 8"
[[7]]
[1] NA

rm_default(x, pattern = S("@around_", 1, "is not|[Ii]s|[Aa]re|[Aa]m", 1), extract=TRUE)
[[1]]
[1] "word is e"
[[2]]
[1] "dog is red" "they are blue"
[[3]]
[1] "I am new" "she is not new"
[[4]]
[1] NA
[[5]]
[1] "why is it" "it is Winter"
[[6]]
[1] "It is not true" "7 is 8"
[[7]]
[1] "Is that"

x <- c(
  "hello world",
  "45",
  "45 & 5 makes 50",
  "x and y",
  "abc and def",
  "her him foo & bar for Jack and Jill then"
)
around_and <- rm_(pattern = S("@around_", 1, "and|\\&", 1), extract=TRUE)
around_and(x)
[[1]]
[1] NA
[[2]]
[1] NA
[[3]]
[1] "45 & 5"
[[4]]
[1] "x and y"
[[5]]
[1] "abc and def"
[[6]]
[1] "foo & bar" "Jack and Jill"

## Split runs into chunks
x <- "1111100000222000333300011110000111000"
strsplit(x, grab("@run_split"), per = TRUE)
[[1]]
[1] "11111" "00000" "222" "000" "3333" "000" "1111" "0000" "111" "000"

## Not run:
# library(qdap);library(ggplot2);library(reshape2)
#
# out <- setNames(lapply(c("@after_a", "@after_the"), function(x) {
#     o <- rm_default(stringi:::stri_trans_tolower(pres_debates2012$dialogue),
#         pattern = x, extract=TRUE)
#     m <- qdapTools::matrix2df(data.frame(freq=sort(table(unlist(o)), TRUE)), "word")
#     m[m$freq> 7, ]
# }), c("a", "the"))
#
#
# dat <- setNames(Reduce(function(x, y) {
#     merge(x, y, by = "word", all = TRUE)}, out), c("Word", "A", "THE"))
#
# dat <- reshape2::melt(dat, id="Word", variable.name="Article", value.name="freq")
#
# dat <- dat[order(dat$freq, dat$Word), ]
#
# ord <- aggregate(freq ~ Word, dat, sum)
#
# dat$word <- factor(dat$Word, levels=ord[order(ord[[2]]), 1])
# ggplot(dat, aes(x=freq, y=Word)) + geom_point()+ facet_grid(~Article)
#
## End(Not run)

## remove/extract page numbers
x <- c("I read p. 36 and then pp. 45-49", "it's on pp. 23-24;28")
rm_pages <- rm_(pattern="@pages", extract=TRUE)
rm_pages(x)
[[1]]
[1] "p. 36 " "pp. 45-49"
[[2]]
[1] "pp. 23-24;28"

rm_default(x, pattern = "@pages")
[1] "I read and then" "it's on"

rm_default(x, pattern = "@pages", extract=TRUE)
[[1]]
[1] "p. 36 " "pp. 45-49"
[[2]]
[1] "pp. 23-24;28"

rm_default(x, pattern = "@pages2", extract=TRUE)
[[1]]
[1] "36 " "45-49"
[[2]]
[1] "23-24;28"

## Validate pages
page_val <- validate("@pages2", FALSE)
page_val(c(66, "78-82", "hello world", TRUE, "44-45; 56"))
[1] TRUE TRUE FALSE FALSE TRUE

## Split on last occurrence
x <- c(
  "test@aol@fg.mm.com",
  "test@hotmail.com",
  "test@xyz@rr@lk.edu",
  "test@abc.xx@zz.vv.net"
)
strsplit(x, S("@last_occurrence", "\\."), perl=TRUE)
[[1]]
[1] "test@aol@fg.mm" "com"
[[2]]
[1] "test@hotmail" "com"
[[3]]
[1] "test@xyz@rr@lk" "edu"
[[4]]
[1] "test@abc.xx@zz.vv" "net"

strsplit(x, S("@last_occurrence", "@"), perl=TRUE)
[[1]]
[1] "test@aol" "fg.mm.com"
[[2]]
[1] "test" "hotmail.com"
[[3]]
[1] "test@xyz@rr" "lk.edu"
[[4]]
[1] "test@abc.xx" "zz.vv.net"

## True Word Boundaries
x <- "this is _not a word666 and this is not a word too."
## Standard regex word boundary
rm_default(x, pattern=bind("not a word"))
[1] "this is _not a word666 and this is too."

## Alphabetic only word boundaries
rm_default(x, pattern=S("@word_boundary", "not a word"))
[1] "this is _666 and this is too."

## Remove punctuation with negation
x <- c(
  "I, love them! Well I like them. Do you like_ them?",
  "Here are the punctuation characters: !"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~"
)
rm_default(x, pattern=S("@punctuation", ""))
[1] "I love them Well I like them Do you like them" "Here are the punctuation characters"

rm_default(x, pattern=S("@punctuation", ".?!"))
[1] "I love them! Well I like them. Do you like them?" "Here are the punctuation characters !"

## Remove all but first occurrence of something
x <- c(
  "12-3=4-5=678-9",
  "ABC-D=EF2-GHI-JK3=L-MN=",
  "9-87=65",
  "a - de=4fgh --= i5jkl",
  NA
)
rm_default(x, pattern = S("@except_first", "-"))
[1] "12-3=45=6789" "ABC-D=EF2GHIJK3=LMN=" "9-87=65" "a - de=4fgh = i5jkl" NA

rm_default(x, pattern = S("@except_first", "="))
[1] "12-3=4-5678-9" "ABC-D=EF2-GHI-JK3L-MN" "9-87=65" "a - de=4fgh -- i5jkl" NA