construct. regexr 0.0.2

Usage

construct(...)

Arguments

...: A series of comma-separated character strings (chunks) that may optionally be named, commented (see ?`%:)%`), and indented.

Construct Human Readable Regular Expressions

Value

Returns a character vector of the class regexr. The attributes of the returned object retain the original name and comment properties.

Description

This function is used to construct human-readable regular expressions from chunks. The user may provide additional meta-information about each chunk, namely an optional name and an optional comment. This allows one to write regular expressions in a fashion similar to writing code; that is, the regular expression is written top to bottom, the syntax is broken up into manageable chunks, the expression can be indented to give structural insight such as nested groups, and the chunks can be commented to provide linguistic grounding for more complex chunks.

Examples

## Minimal Example
minimal <- construct("a", "b", "c")
minimal

[1] "abc"

unglue(minimal)

[[1]]
[1] "a"

[[2]]
[1] "b"

[[3]]
[1] "c"


comments(minimal)

[[1]]
NULL

[[2]]
NULL

[[3]]
NULL


regex(minimal)

[[1]]
[1] "a"

[[2]]
[1] "b"

[[3]]
[1] "c"


test(minimal)

$regex
[1] TRUE

$chunks
[1] TRUE TRUE TRUE


summary(minimal)


abc
===

REGEX 1: a
NAME   : 
COMMENT: 

REGEX 2: b
NAME   : 
COMMENT: 

REGEX 3: c
NAME   : 
COMMENT: 


## Example 1
m <- construct(
    space =
        "\\s+"
            %:)%"I see",

    simp =
        "(?<=(foo))",

    or =
        "(;|:)\\s*"
            %:)%"comment on what this does",

    "[a]s th[atey]"
)

m

[1] "\\s+(?<=(foo))(;|:)\\s*[a]s th[atey]"

unglue(m)

$space
[1] "\\s+"

$simp
[1] "(?<=(foo))"

$or
[1] "(;|:)\\s*"

[[4]]
[1] "[a]s th[atey]"


summary(m)


\s+(?<=(foo))(;|:)\s*[a]s th[atey]
==================================

REGEX 1: \s+
NAME   : space
COMMENT: "I see"

REGEX 2: (?<=(foo))
NAME   : simp
COMMENT: 

REGEX 3: (;|:)\s*
NAME   : or
COMMENT: "comment on what this does"

REGEX 4: [a]s th[atey]
NAME   : 
COMMENT: 

regex(m)

$space
[1] "\\s+"

$simp
[1] "(?<=(foo))"

$or
[1] "(;|:)\\s*"

[[4]]
[1] "[a]s th[atey]"


comments(m)

$space
[1] "I see"

$simp
NULL

$or
[1] "comment on what this does"

[[4]]
NULL


regex(m)[4] <- "(F{O}2)|(BAR)"
summary(m)


\s+(?<=(foo))(;|:)\s*(F{O}2)|(BAR)
==================================

REGEX 1: \s+
NAME   : space
COMMENT: "I see"

REGEX 2: (?<=(foo))
NAME   : simp
COMMENT: 

REGEX 3: (;|:)\s*
NAME   : or
COMMENT: "comment on what this does"

REGEX 4: (F{O}2)|(BAR)
NAME   : 
COMMENT: 

test(m)

$regex
[1] TRUE

$chunks
space  simp    or       
 TRUE  TRUE  TRUE  TRUE 



regex(m)[5:7] <- c("(", "([A-Z]|(\\d{5})", ")")
test(m)

Warning message:
The concatenated regex is not valid

\s+(?<=(foo))(;|:)\s*(F{O}2)|(BAR)(([A-Z]|(\d{5}))

Warning message:
The following regex chunks are not valid in isolation:

(1) (
(2) ([A-Z]|(\d{5})
(3) )
$regex
[1] FALSE

$chunks
space  simp    or                         
 TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE 




library(qdapRegex)
explain(m)

NODE                     EXPLANATION
--------------------------------------------------------------------------------
  \\s                       whitespace (\n, \r, \t, \f, and " ")
--------------------------------------------------------------------------------
                           ' '
--------------------------------------------------------------------------------
  (?<=                     look behind to see if there is:
--------------------------------------------------------------------------------
    (                        group and capture to \\1:
--------------------------------------------------------------------------------
      foo                      'foo'
--------------------------------------------------------------------------------
    )                        end of \\1
--------------------------------------------------------------------------------
  )                        end of look-behind
--------------------------------------------------------------------------------
  (                        group and capture to \\2:
--------------------------------------------------------------------------------
    ;                        ';'
--------------------------------------------------------------------------------
   |                        OR
--------------------------------------------------------------------------------
    :                        ':'
--------------------------------------------------------------------------------
  )                        end of \\2
--------------------------------------------------------------------------------
  \\s*                      whitespace (\n, \r, \t, \f, and " ") (0 or
                           more times (matching the most amount
                           possible))
--------------------------------------------------------------------------------
  (                        group and capture to \\3:
--------------------------------------------------------------------------------
    F{O}2                    'F{O}2'
--------------------------------------------------------------------------------
  )                        end of \\3
--------------------------------------------------------------------------------
 |                        OR
--------------------------------------------------------------------------------
  (                        group and capture to \\4:
--------------------------------------------------------------------------------
    BAR                      'BAR'
--------------------------------------------------------------------------------
  )                        end of \\4
--------------------------------------------------------------------------------
  (                        group and capture to \\5:
--------------------------------------------------------------------------------
    (                        group and capture to \\6:
--------------------------------------------------------------------------------
      [A-Z]                    any character of: 'A' to 'Z'
--------------------------------------------------------------------------------
     |                        OR
--------------------------------------------------------------------------------
      (                        group and capture to \\7:
--------------------------------------------------------------------------------
        \\d{5}                    digits (0-9) (5 times)
--------------------------------------------------------------------------------
      )                        end of \\7
--------------------------------------------------------------------------------
    )                        end of \\6
--------------------------------------------------------------------------------
  )                        end of \\5



## Example 2 (Twitter Handle)
twitter <- construct(
        neg_lookbehing =
            "(?<![@\\w])"
                %:)%"Make sure the string doesn't start with @ or a word",
        at =
            "(@)"
                %:)%"capture starting with @ symbol",
        s_gr1 =
            "("
                %:)%"Opening parenthesis group 1",
            handle =
                "([a-z0-9_]{1,15})"
                %:)%"Mix of 15 letters, numbers, or underscores",
            boundary =
                "\\b",
        e_gr1 =
            ")"
                %:)%"Closing parenthesis group 1"
)

twitter

[1] "(?<![@\\w])(@)(([a-z0-9_]{1,15})\\b)"

unglue(twitter)

$neg_lookbehing
[1] "(?<![@\\w])"

$at
[1] "(@)"

$s_gr1
[1] "("

$handle
[1] "([a-z0-9_]{1,15})"

$boundary
[1] "\\b"

$e_gr1
[1] ")"


comments(twitter)

$neg_lookbehing
[1] "Make sure the string doesn't start with @ or a word"

$at
[1] "capture starting with @ symbol"

$s_gr1
[1] "Opening parenthesis group 1"

$handle
[1] "Mix of 15 letters, numbers, or underscores"

$boundary
NULL

$e_gr1
[1] "Closing parenthesis group 1"


regex(twitter)

$neg_lookbehing
[1] "(?<![@\\w])"

$at
[1] "(@)"

$s_gr1
[1] "("

$handle
[1] "([a-z0-9_]{1,15})"

$boundary
[1] "\\b"

$e_gr1
[1] ")"


summary(twitter)


(?<![@\w])(@)(([a-z0-9_]{1,15})\b)
==================================

REGEX 1: (?<![@\w])
NAME   : neg_lookbehing
COMMENT: "Make sure the string doesn't start with @ or a word"

REGEX 2: (@)
NAME   : at
COMMENT: "capture starting with @ symbol"

REGEX 3: (
NAME   : s_gr1
COMMENT: "Opening parenthesis group 1"

REGEX 4: ([a-z0-9_]{1,15})
NAME   : handle
COMMENT: "Mix of 15 letters, numbers, or underscores"

REGEX 5: \b
NAME   : boundary
COMMENT: 

REGEX 6: )
NAME   : e_gr1
COMMENT: "Closing parenthesis group 1"

test(twitter)

Warning message:
The following regex chunks are not valid in isolation:

(1) (
(2) )
$regex
[1] TRUE

$chunks
neg_lookbehing             at          s_gr1         handle       boundary          e_gr1 
          TRUE           TRUE          FALSE           TRUE           TRUE          FALSE 


explain(twitter)

NODE                     EXPLANATION
--------------------------------------------------------------------------------
  (?<!                     look behind to see if there is not:
--------------------------------------------------------------------------------
    [@\\w]                    any character of: '@', word characters
                             (a-z, A-Z, 0-9, _)
--------------------------------------------------------------------------------
  )                        end of look-behind
--------------------------------------------------------------------------------
  (                        group and capture to \\1:
--------------------------------------------------------------------------------
    @                        '@'
--------------------------------------------------------------------------------
  )                        end of \\1
--------------------------------------------------------------------------------
  (                        group and capture to \\2:
--------------------------------------------------------------------------------
    (                        group and capture to \\3:
--------------------------------------------------------------------------------
      [a-z0-9_]{1,15}          any character of: 'a' to 'z', '0' to
                               '9', '_' (between 1 and 15 times
                               (matching the most amount possible))
--------------------------------------------------------------------------------
    )                        end of \\3
--------------------------------------------------------------------------------
    \\b                       the boundary between a word char (\\w)
                             and something that is not a word char
--------------------------------------------------------------------------------
  )                        end of \\2



x <- c("@hadley I like #rstats for #ggplot2 work.",
    "Difference between #magrittr and #pipeR, both implement pipeline operators for #rstats:
        http://renkun.me/r/2014/07/26/difference-between-magrittr-and-pipeR.html @timelyportfolio",
    "Slides from great talk: @ramnath_vaidya: Interactive slides from Interactive Visualization
        presentation #user2014. http://ramnathv.github.io/user2014-rcharts/#1",
    "tyler.rinker@gamil.com is my email",
    "A non valid Twitter is @abcdefghijklmnopqrstuvwxyz"
)

library(qdapRegex)
rm_default(x, pattern = twitter, extract = TRUE)

[[1]]
[1] "@hadley"

[[2]]
[1] "@timelyportfolio"

[[3]]
[1] "@ramnath_vaidya"

[[4]]
[1] NA

[[5]]
[1] NA



## Example 3 (Modular Chunks)
combined <- construct(
    twitter =
        twitter
            %:)%"Twitter regex created previously",
    or =
        "|"
            %:)%"Join the twitter handle regex and a hash tag regex",
    hash =
        grab("@rm_hash")
            %:)%"Twitter hash tag regex"
)

combined

[1] "(?<![@\\w])(@)(([a-z0-9_]{1,15})\\b)|(?<!/)((#)(\\w+))"

unglue(combined)

$twitter
[1] "(?<![@\\w])(@)(([a-z0-9_]{1,15})\\b)"
attr(,"regex")
attr(,"regex")$neg_lookbehing
[1] "(?<![@\\w])"

attr(,"regex")$at
[1] "(@)"

attr(,"regex")$s_gr1
[1] "("

attr(,"regex")$handle
[1] "([a-z0-9_]{1,15})"

attr(,"regex")$boundary
[1] "\\b"

attr(,"regex")$e_gr1
[1] ")"

attr(,"comments")
attr(,"comments")$neg_lookbehing
[1] "Make sure the string doesn't start with @ or a word"

attr(,"comments")$at
[1] "capture starting with @ symbol"

attr(,"comments")$s_gr1
[1] "Opening parenthesis group 1"

attr(,"comments")$handle
[1] "Mix of 15 letters, numbers, or underscores"

attr(,"comments")$boundary
NULL

attr(,"comments")$e_gr1
[1] "Closing parenthesis group 1"


$or
[1] "|"

$hash
[1] "(?<!/)((#)(\\w+))"


comments(combined)

$twitter
[1] "Twitter regex created previously"

$or
[1] "Join the twitter handle regex and a hash tag regex"

$hash
[1] "Twitter hash tag regex"


regex(combined)

$twitter
[1] "(?<![@\\w])(@)(([a-z0-9_]{1,15})\\b)"
attr(,"regex")
attr(,"regex")$neg_lookbehing
[1] "(?<![@\\w])"

attr(,"regex")$at
[1] "(@)"

attr(,"regex")$s_gr1
[1] "("

attr(,"regex")$handle
[1] "([a-z0-9_]{1,15})"

attr(,"regex")$boundary
[1] "\\b"

attr(,"regex")$e_gr1
[1] ")"

attr(,"comments")
attr(,"comments")$neg_lookbehing
[1] "Make sure the string doesn't start with @ or a word"

attr(,"comments")$at
[1] "capture starting with @ symbol"

attr(,"comments")$s_gr1
[1] "Opening parenthesis group 1"

attr(,"comments")$handle
[1] "Mix of 15 letters, numbers, or underscores"

attr(,"comments")$boundary
NULL

attr(,"comments")$e_gr1
[1] "Closing parenthesis group 1"


$or
[1] "|"

$hash
[1] "(?<!/)((#)(\\w+))"


summary(combined)


(?<![@\w])(@)(([a-z0-9_]{1,15})\b)|(?<!/)((#)(\w+))
===================================================

REGEX 1: (?<![@\w])(@)(([a-z0-9_]{1,15})\b)
NAME   : twitter
COMMENT: "Twitter regex created previously"

REGEX 2: |
NAME   : or
COMMENT: "Join the twitter handle regex and a hash tag regex"

REGEX 3: (?<!/)((#)(\w+))
NAME   : hash
COMMENT: "Twitter hash tag regex"

test(combined)

$regex
[1] TRUE

$chunks
twitter      or    hash 
   TRUE    TRUE    TRUE 


explain(combined)

NODE                     EXPLANATION
--------------------------------------------------------------------------------
  (?<!                     look behind to see if there is not:
--------------------------------------------------------------------------------
    [@\\w]                    any character of: '@', word characters
                             (a-z, A-Z, 0-9, _)
--------------------------------------------------------------------------------
  )                        end of look-behind
--------------------------------------------------------------------------------
  (                        group and capture to \\1:
--------------------------------------------------------------------------------
    @                        '@'
--------------------------------------------------------------------------------
  )                        end of \\1
--------------------------------------------------------------------------------
  (                        group and capture to \\2:
--------------------------------------------------------------------------------
    (                        group and capture to \\3:
--------------------------------------------------------------------------------
      [a-z0-9_]{1,15}          any character of: 'a' to 'z', '0' to
                               '9', '_' (between 1 and 15 times
                               (matching the most amount possible))
--------------------------------------------------------------------------------
    )                        end of \\3
--------------------------------------------------------------------------------
    \\b                       the boundary between a word char (\\w)
                             and something that is not a word char
--------------------------------------------------------------------------------
  )                        end of \\2
--------------------------------------------------------------------------------
 |                        OR
--------------------------------------------------------------------------------
  (?<!                     look behind to see if there is not:
--------------------------------------------------------------------------------
    /                        '/'
--------------------------------------------------------------------------------
  )                        end of look-behind
--------------------------------------------------------------------------------
  (                        group and capture to \\4:
--------------------------------------------------------------------------------
    (                        group and capture to \\5:
--------------------------------------------------------------------------------
    )                        end of \\5
--------------------------------------------------------------------------------
  )                        end of \\4