construct. regexr 1.0.0

Usage

construct(...)

Arguments

...: A series of comma-separated character strings (sub-expressions) that may optionally be named, commented (see ?`%:)%`), and indented.

Construct Human Readable Regular Expressions

Value

Returns a character vector of the class regexr. The attributes of the returned object retain the original name and comment properties.

Description

This function is used to construct human-readable regular expressions from sub-expressions. The user may provide additional meta information about each sub-expression. This meta information is an optional name and comment for the sub-expressions. This allows one to write regular expressions in a fashion similar to writing code; that is, the regular expression is written top to bottom, the syntax is broken up into manageable chunks, and the sub-expressions can be indented to give structural insight such as nested groups. Finally, sub-expressions can be commented to provide linguistic grounding for more complex sub-expressions.

Examples

## Minimal Example
minimal <- construct("a", "b", "c")
minimal

[1] "abc"

unglue(minimal)

[[1]]
[1] "a"

[[2]]
[1] "b"

[[3]]
[1] "c"


comments(minimal)

[[1]]
NULL

[[2]]
NULL

[[3]]
NULL


subs(minimal)

[[1]]
[1] "a"

[[2]]
[1] "b"

[[3]]
[1] "c"


test(minimal)

$regex
[1] TRUE

$subexpressions
[1] TRUE TRUE TRUE


summary(minimal)


 abc 
 === 

SUB-EXPR 1: a
NAME      : 
COMMENT   : 

SUB-EXPR 2: b
NAME      : 
COMMENT   : 

SUB-EXPR 3: c
NAME      : 
COMMENT   : 


## Example 1
m <- construct(
    space =   "\\s+"              %:)%  "I see",
    simp =    "(?<=(foo))",
    or =      "(;|:)\\s*"         %:)%  "comment on what this does",
    is_then = "[ia]s th[ae]n"
)

m

[1] "\\s+(?<=(foo))(;|:)\\s*[ia]s th[ae]n"

unglue(m)

$space
[1] "\\s+"

$simp
[1] "(?<=(foo))"

$or
[1] "(;|:)\\s*"

$is_then
[1] "[ia]s th[ae]n"


summary(m)


 \s+(?<=(foo))(;|:)\s*[ia]s th[ae]n 
 ================================== 

SUB-EXPR 1: \s+
NAME      : space
COMMENT   : "I see"

SUB-EXPR 2: (?<=(foo))
NAME      : simp
COMMENT   : 

SUB-EXPR 3: (;|:)\s*
NAME      : or
COMMENT   : "comment on what this does"

SUB-EXPR 4: [ia]s th[ae]n
NAME      : is_then
COMMENT   : 

subs(m)

$space
[1] "\\s+"

$simp
[1] "(?<=(foo))"

$or
[1] "(;|:)\\s*"

$is_then
[1] "[ia]s th[ae]n"


comments(m)

$space
[1] "I see"

$simp
NULL

$or
[1] "comment on what this does"

$is_then
NULL


subs(m)[4] <- "(FO{2})|(BAR)"
summary(m)


 \s+(?<=(foo))(;|:)\s*(FO{2})|(BAR) 
 ================================== 

SUB-EXPR 1: \s+
NAME      : space
COMMENT   : "I see"

SUB-EXPR 2: (?<=(foo))
NAME      : simp
COMMENT   : 

SUB-EXPR 3: (;|:)\s*
NAME      : or
COMMENT   : "comment on what this does"

SUB-EXPR 4: (FO{2})|(BAR)
NAME      : is_then
COMMENT   : 

test(m)

$regex
[1] TRUE

$subexpressions
  space    simp      or is_then 
   TRUE    TRUE    TRUE    TRUE 



subs(m)[5:7] <- c("(", "([A-Z]|(\\d{5})", ")")
test(m)

Warning message:
The concatenated regex is not valid

\s+(?<=(foo))(;|:)\s*(FO{2})|(BAR)(([A-Z]|(\d{5}))

Warning message:
The following regex sub-expressions are not valid in isolation:

(1) (
(2) ([A-Z]|(\d{5})
(3) )
$regex
[1] FALSE

$subexpressions
  space    simp      or is_then                         
   TRUE    TRUE    TRUE    TRUE   FALSE   FALSE   FALSE 




library(qdapRegex)
explain(m)

NODE                     EXPLANATION
--------------------------------------------------------------------------------
  \\s+                      whitespace (\n, \r, \t, \f, and " ") (1 or
                           more times (matching the most amount
                           possible))
--------------------------------------------------------------------------------
  (?<=                     look behind to see if there is:
--------------------------------------------------------------------------------
    (                        group and capture to \\1:
--------------------------------------------------------------------------------
      foo                      'foo'
--------------------------------------------------------------------------------
    )                        end of \\1
--------------------------------------------------------------------------------
  )                        end of look-behind
--------------------------------------------------------------------------------
  (                        group and capture to \\2:
--------------------------------------------------------------------------------
    ;                        ';'
--------------------------------------------------------------------------------
   |                        OR
--------------------------------------------------------------------------------
    :                        ':'
--------------------------------------------------------------------------------
  )                        end of \\2
--------------------------------------------------------------------------------
  \\s*                      whitespace (\n, \r, \t, \f, and " ") (0 or
                           more times (matching the most amount
                           possible))
--------------------------------------------------------------------------------
  (                        group and capture to \\3:
--------------------------------------------------------------------------------
    F                        'F'
--------------------------------------------------------------------------------
    O{2}                     'O' (2 times)
--------------------------------------------------------------------------------
  )                        end of \\3
--------------------------------------------------------------------------------
 |                        OR
--------------------------------------------------------------------------------
  (                        group and capture to \\4:
--------------------------------------------------------------------------------
    BAR                      'BAR'
--------------------------------------------------------------------------------
  )                        end of \\4
--------------------------------------------------------------------------------
  (                        group and capture to \\5:
--------------------------------------------------------------------------------
    (                        group and capture to \\6:
--------------------------------------------------------------------------------
      [A-Z]                    any character of: 'A' to 'Z'
--------------------------------------------------------------------------------
     |                        OR
--------------------------------------------------------------------------------
      (                        group and capture to \\7:
--------------------------------------------------------------------------------
        \\d{5}                    digits (0-9) (5 times)
--------------------------------------------------------------------------------
      )                        end of \\7
--------------------------------------------------------------------------------
    )                        end of \\6
--------------------------------------------------------------------------------
  )                        end of \\5



## Example 2 (Twitter Handle 2 ways)
## Bigger Sub-expressions
twitter <- construct(
  no_at_wrd = "(?<![@\\w])"            %:)%  "Ensure doesn't start with @ or a word",
  at =        "(@)"                    %:)%  "Capture starting with @ symbol",
  handle =    "(([a-z0-9_]{1,15})\\b)"  %:)%  "Any 15 letters, numbers, or underscores"
)

## Smaller Sub-expressions
twitter <- construct(
  no_at_wrd = "(?<![@\\w])"          %:)%  "Ensure doesn't start with @ or a word",
  at =        "(@)"                  %:)%  "Capture starting with @ symbol",

  s_gr1 =     "("                     %:)%  "GROUP 1 START",
      handle =    "([a-z0-9_]{1,15})"       %:)%  "Any 15 letters, numbers, or underscores",
      boundary =  "\\b",
  e_gr1 =     ")"                      %:)%"GROUP 1 END"
)

twitter

[1] "(?<![@\\w])(@)(([a-z0-9_]{1,15})\\b)"

unglue(twitter)

$no_at_wrd
[1] "(?<![@\\w])"

$at
[1] "(@)"

$s_gr1
[1] "("

$handle
[1] "([a-z0-9_]{1,15})"

$boundary
[1] "\\b"

$e_gr1
[1] ")"


comments(twitter)

$no_at_wrd
[1] "Ensure doesn't start with @ or a word"

$at
[1] "Capture starting with @ symbol"

$s_gr1
[1] "GROUP 1 START"

$handle
[1] "Any 15 letters, numbers, or underscores"

$boundary
NULL

$e_gr1
[1] "GROUP 1 END"


subs(twitter)

$no_at_wrd
[1] "(?<![@\\w])"

$at
[1] "(@)"

$s_gr1
[1] "("

$handle
[1] "([a-z0-9_]{1,15})"

$boundary
[1] "\\b"

$e_gr1
[1] ")"


summary(twitter)


 (?<![@\w])(@)(([a-z0-9_]{1,15})\b) 
 ================================== 

SUB-EXPR 1: (?<![@\w])
NAME      : no_at_wrd
COMMENT   : "Ensure doesn't start with @ or a word"

SUB-EXPR 2: (@)
NAME      : at
COMMENT   : "Capture starting with @ symbol"

SUB-EXPR 3: (
NAME      : s_gr1
COMMENT   : "GROUP 1 START"

SUB-EXPR 4: ([a-z0-9_]{1,15})
NAME      : handle
COMMENT   : "Any 15 letters, numbers, or underscores"

SUB-EXPR 5: \b
NAME      : boundary
COMMENT   : 

SUB-EXPR 6: )
NAME      : e_gr1
COMMENT   : "GROUP 1 END"

test(twitter)

Warning message:
The following regex sub-expressions are not valid in isolation:

(1) (
(2) )
$regex
[1] TRUE

$subexpressions
no_at_wrd        at     s_gr1    handle  boundary     e_gr1 
     TRUE      TRUE     FALSE      TRUE      TRUE     FALSE 


explain(twitter)

NODE                     EXPLANATION
--------------------------------------------------------------------------------
  (?<!                     look behind to see if there is not:
--------------------------------------------------------------------------------
    [@\\w]                    any character of: '@', word characters
                             (a-z, A-Z, 0-9, _)
--------------------------------------------------------------------------------
  )                        end of look-behind
--------------------------------------------------------------------------------
  (                        group and capture to \\1:
--------------------------------------------------------------------------------
    @                        '@'
--------------------------------------------------------------------------------
  )                        end of \\1
--------------------------------------------------------------------------------
  (                        group and capture to \\2:
--------------------------------------------------------------------------------
    (                        group and capture to \\3:
--------------------------------------------------------------------------------
      [a-z0-9_]{1,15}          any character of: 'a' to 'z', '0' to
                               '9', '_' (between 1 and 15 times
                               (matching the most amount possible))
--------------------------------------------------------------------------------
    )                        end of \\3
--------------------------------------------------------------------------------
    \\b                       the boundary between a word char (\\w)
                             and something that is not a word char
--------------------------------------------------------------------------------
  )                        end of \\2



x <- c("@hadley I like #rstats for #ggplot2 work.",
    "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats:
        http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @timelyportfolio",
    "Slides from great talk: @ramnath_vaidya: Interactive slides from Interactive Visualization
        presentation #user2014. http://ramnathv.github.io/user2014-rcharts/#1",
    "tyler.rinker@gamil.com is my email",
    "A non valid Twitter is @abcdefghijklmnopqrstuvwxyz"
)

library(qdapRegex)
rm_default(x, pattern = twitter, extract = TRUE)

[[1]]
[1] "@hadley"

[[2]]
[1] "@timelyportfolio"

[[3]]
[1] "@ramnath_vaidya"

[[4]]
[1] NA

[[5]]
[1] NA



## Example 3 (Modular Sub-expression Chunks)
combined <- construct(
    twitter = twitter               %:)%"Twitter regex created previously",
    or =      "|"                   %:)%"Join handle regex & hash tag regex",
    hash =    grab("@rm_hash")     %:)%"Twitter hash tag regex"
)

combined

[1] "(?<![@\\w])(@)(([a-z0-9_]{1,15})\\b)|(?<!/)((#)(\\w+))"

unglue(combined)

$twitter
[1] "(?<![@\\w])(@)(([a-z0-9_]{1,15})\\b)"
attr(,"subs")
attr(,"subs")$no_at_wrd
[1] "(?<![@\\w])"

attr(,"subs")$at
[1] "(@)"

attr(,"subs")$s_gr1
[1] "("

attr(,"subs")$handle
[1] "([a-z0-9_]{1,15})"

attr(,"subs")$boundary
[1] "\\b"

attr(,"subs")$e_gr1
[1] ")"

attr(,"comments")
attr(,"comments")$no_at_wrd
[1] "Ensure doesn't start with @ or a word"

attr(,"comments")$at
[1] "Capture starting with @ symbol"

attr(,"comments")$s_gr1
[1] "GROUP 1 START"

attr(,"comments")$handle
[1] "Any 15 letters, numbers, or underscores"

attr(,"comments")$boundary
NULL

attr(,"comments")$e_gr1
[1] "GROUP 1 END"


$or
[1] "|"

$hash
[1] "(?<!/)((#)(\\w+))"


comments(combined)

$twitter
[1] "Twitter regex created previously"

$or
[1] "Join handle regex & hash tag regex"

$hash
[1] "Twitter hash tag regex"


subs(combined)

$twitter
[1] "(?<![@\\w])(@)(([a-z0-9_]{1,15})\\b)"
attr(,"subs")
attr(,"subs")$no_at_wrd
[1] "(?<![@\\w])"

attr(,"subs")$at
[1] "(@)"

attr(,"subs")$s_gr1
[1] "("

attr(,"subs")$handle
[1] "([a-z0-9_]{1,15})"

attr(,"subs")$boundary
[1] "\\b"

attr(,"subs")$e_gr1
[1] ")"

attr(,"comments")
attr(,"comments")$no_at_wrd
[1] "Ensure doesn't start with @ or a word"

attr(,"comments")$at
[1] "Capture starting with @ symbol"

attr(,"comments")$s_gr1
[1] "GROUP 1 START"

attr(,"comments")$handle
[1] "Any 15 letters, numbers, or underscores"

attr(,"comments")$boundary
NULL

attr(,"comments")$e_gr1
[1] "GROUP 1 END"


$or
[1] "|"

$hash
[1] "(?<!/)((#)(\\w+))"


summary(combined)


 (?<![@\w])(@)(([a-z0-9_]{1,15})\b)|(?<!/)((#)(\w+)) 
 =================================================== 

SUB-EXPR 1: (?<![@\w])(@)(([a-z0-9_]{1,15})\b)
NAME      : twitter
COMMENT   : "Twitter regex created previously"

SUB-EXPR 2: |
NAME      : or
COMMENT   : "Join handle regex & hash tag regex"

SUB-EXPR 3: (?<!/)((#)(\w+))
NAME      : hash
COMMENT   : "Twitter hash tag regex"

test(combined)

$regex
[1] TRUE

$subexpressions
twitter      or    hash 
   TRUE    TRUE    TRUE 


explain(combined)

NODE                     EXPLANATION
--------------------------------------------------------------------------------
  (?<!                     look behind to see if there is not:
--------------------------------------------------------------------------------
    [@\\w]                    any character of: '@', word characters
                             (a-z, A-Z, 0-9, _)
--------------------------------------------------------------------------------
  )                        end of look-behind
--------------------------------------------------------------------------------
  (                        group and capture to \\1:
--------------------------------------------------------------------------------
    @                        '@'
--------------------------------------------------------------------------------
  )                        end of \\1
--------------------------------------------------------------------------------
  (                        group and capture to \\2:
--------------------------------------------------------------------------------
    (                        group and capture to \\3:
--------------------------------------------------------------------------------
      [a-z0-9_]{1,15}          any character of: 'a' to 'z', '0' to
                               '9', '_' (between 1 and 15 times
                               (matching the most amount possible))
--------------------------------------------------------------------------------
    )                        end of \\3
--------------------------------------------------------------------------------
    \\b                       the boundary between a word char (\\w)
                             and something that is not a word char
--------------------------------------------------------------------------------
  )                        end of \\2
--------------------------------------------------------------------------------
 |                        OR
--------------------------------------------------------------------------------
  (?<!                     look behind to see if there is not:
--------------------------------------------------------------------------------
    /                        '/'
--------------------------------------------------------------------------------
  )                        end of look-behind
--------------------------------------------------------------------------------
  (                        group and capture to \\4:
--------------------------------------------------------------------------------
    (                        group and capture to \\5:
--------------------------------------------------------------------------------
    )                        end of \\5
--------------------------------------------------------------------------------
  )                        end of \\4



## Different Structure (no names): Example from Martin Fowler:
## *Note: Fowler argues for improved choices in regex representation
## and names that make the regex functionality more evident, commenting
## only where needed. See:
## browseURL("http://martinfowler.com/bliki/ComposedRegex.html")

pattern <- construct(
    '@"^score',
    '\\s+',
    '(\\d+)'          %:)% 'points',
    '\\s+',
    'for',
    '\\s+',
    '(\\d+)'          %:)% 'number of nights',
    '\\s+',
    'night'           ,
    's?'              %:)% 'optional plural',
    '\\s+',
    'at',
    '\\s+',
    '(.*)'            %:)% 'hotel name',
    '";'
)

summary(pattern)


 @"^score\s+(\d+)\s+for\s+(\d+)\s+nights?\s+at\s+(.*)"; 
 ====================================================== 

SUB-EXPR 1: @"^score
NAME      : 
COMMENT   : 

SUB-EXPR 2: \s+
NAME      : 
COMMENT   : 

SUB-EXPR 3: (\d+)
NAME      : 
COMMENT   : "points"

SUB-EXPR 4: \s+
NAME      : 
COMMENT   : 

SUB-EXPR 5: for
NAME      : 
COMMENT   : 

SUB-EXPR 6: \s+
NAME      : 
COMMENT   : 

SUB-EXPR 7: (\d+)
NAME      : 
COMMENT   : "number of nights"

SUB-EXPR 8: \s+
NAME      : 
COMMENT   : 

SUB-EXPR 9: night
NAME      : 
COMMENT   : 

SUB-EXPR 10: s?
NAME       : 
COMMENT    : "optional plural"

SUB-EXPR 11: \s+
NAME       : 
COMMENT    : 

SUB-EXPR 12: at
NAME       : 
COMMENT    : 

SUB-EXPR 13: \s+
NAME       : 
COMMENT    : 

SUB-EXPR 14: (.*)
NAME       : 
COMMENT    : "hotel name"

SUB-EXPR 15: ";
NAME       : 
COMMENT    :