#@ Quantifiers: {n,m} {n} {n,} + * ? ####
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@
#@ Quantifiers are symbols in the pattern that specify how many repetitions
#@ of a particular sub-pattern to look for. The quantifiers include
#@
#@ SOME_SUB_PATTERN{n,m} (where n and m are numbers)
#@ SOME_SUB_PATTERN{n} (where n is a number)
#@ SOME_SUB_PATTERN{n,} (where n is a number)
#@ SOME_SUB_PATTERN+
#@ SOME_SUB_PATTERN*
#@ SOME_SUB_PATTERN?
#@
#@ See below for an explanation of each type of quantifier.
#@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
18 18. Quantifiers and gsub with patterns
18.1 Quantifiers: {n,m} {n} {n,} + * ?
18.1.1 Quantifiers with {curly braces} eg. {3} {2,5} {3,}
# Quantifiers with {curly braces} eg. {3} {2,5} {3,} ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# SOME_PATTERN{3} three matches in a row
# SOME_PATTERN{3,6} between three and six matches in a row
# SOME_PATTERN{3,} at least 3 matches in a row
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# "[0-9]{4}" matches 4 digits in a row, same as "[0-9][0-9][0-9][0-9]"
#
# Note that this will also return those entries that have more than 4 digits
# in a row since these entries ALSO have 4 digits in a row (plus some extra
# digits)
grep("[0-9]{4}", addresses, value=TRUE) # 4 digits , same as "[0-9][0-9][0-9][0-9]"[1] "12345 Sesame Street" "5678 Park Place"
[3] "Three Main Street Apt 12343"
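# Note that if we use gsub, each non-overlapping run of 4 digits is replaced.
# So in a 5 digit number only the first 4 digits are substituted and the
# leftover digit remains.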
gsub("[0-9]{4}", "xxxx", addresses) # 4 digits , same as "[0-9][0-9][0-9][0-9]" [1] "xxxx5 Sesame Street" "One Micro$oft Way"
[3] "3 Olive St." "Two 1st Ave."
[5] "xxxx Park Place" "Forty Five 2nd Street"
[7] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"
[9] "Five Google Drive" "4\\2 Rechov Yafo"
[11] "Fifteen Watchamacallit Boulevard" "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b" "Two Main Street Apt 123c"
[15] "Three Main Street Apt xxxx3" "City Hall Lockport, NY"
grep("[0-9]{3}", addresses, value=TRUE) # 3 digits [1] "12345 Sesame Street" "5678 Park Place"
[3] "9 Main St. apt. 623" "Two Main Street Apt 123c"
[5] "Three Main Street Apt 12343"
# Note that if we use gsub, only the first 3 digits will be substituted.
gsub("[0-9]{3}", "xxxx", addresses) # 3 digits , same as "[0-9][0-9][0-9]" [1] "xxxx45 Sesame Street" "One Micro$oft Way"
[3] "3 Olive St." "Two 1st Ave."
[5] "xxxx8 Park Place" "Forty Five 2nd Street"
[7] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. xxxx"
[9] "Five Google Drive" "4\\2 Rechov Yafo"
[11] "Fifteen Watchamacallit Boulevard" "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b" "Two Main Street Apt xxxxc"
[15] "Three Main Street Apt xxxx43" "City Hall Lockport, NY"
grep("[0-9]{3}", addresses, value=TRUE) # 3 digits [1] "12345 Sesame Street" "5678 Park Place"
[3] "9 Main St. apt. 623" "Two Main Street Apt 123c"
[5] "Three Main Street Apt 12343"
grep("\\b[0-9]{3}\\b", addresses, value=TRUE) # 3 digits [1] "9 Main St. apt. 623"
# EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# match a word that is exactly 4 letters long
pattern =
paste0 ("^[a-zA-Z]{4}[^a-zA-Z]",
"|[^a-zA-Z][a-zA-Z]{4}[^a-zA-Z]",
"|[^a-zA-Z][a-zA-Z]{4}$",
"|^[a-zA-Z]{4}$")
pattern =
paste0 ("\\b[a-zA-Z]{4}\\b")
# use the same pattern for both addresses and fruit
grep(pattern, addresses, value=TRUE) [1] "5678 Park Place" "Forty Five 2nd Street"
[3] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"
[5] "Five Google Drive" "4\\2 Rechov Yafo"
[7] "One Main Street Apt 12b" "Two Main Street Apt 123c"
[9] "Three Main Street Apt 12343" "City Hall Lockport, NY"
grep(pattern, fruit, value=TRUE)[1] "star fruit" "pear" "prickly pear"
[4] "Beurre Hardy pear" "plum"
# Another way - using \b
pattern = "\\b[a-zA-Z]{4}\\b"
grep(pattern, addresses, value=TRUE) [1] "5678 Park Place" "Forty Five 2nd Street"
[3] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"
[5] "Five Google Drive" "4\\2 Rechov Yafo"
[7] "One Main Street Apt 12b" "Two Main Street Apt 123c"
[9] "Three Main Street Apt 12343" "City Hall Lockport, NY"
# Use a text editor (e.g. the one in RStudio) to see how this works ...
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Paste the following in the text editor to see how this pattern works.
#
# \b[a-zA-Z]{4}\b
#
# This will match any 4 character word. It will NOT match 3 or 5 character words.
# Note that in VS Code you should use only one backslash (i.e. \b ) but in R
# you would use two backslashes (i.e. \\b ) as explained above.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# match a word that is at least 4 letters long
# Use {4,} instead of {4}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Paste this into VSCode regex search.
# ^[a-zA-Z]{4,}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{4,}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{4,}$|^[a-zA-Z]{4,}$
# match a word that is 3, 4 or 5 letters long
# Use {3,5} instead of {4}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Paste this into VSCode regex search.
# ^[a-zA-Z]{3,5}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{3,5}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{3,5}$|^[a-zA-Z]{3,5}$
# ANOTHER EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# match a word that is exactly 7 letters long
pattern =
paste0 ("^[a-zA-Z]{7}[^a-zA-Z]",
"|[^a-zA-Z][a-zA-Z]{7}[^a-zA-Z]",
"|[^a-zA-Z][a-zA-Z]{7}$",
"|^[a-zA-Z]{7}$")
# use the same pattern for both addresses and fruit
grep(pattern, addresses, value=TRUE)[1] "Fifteen Watchamacallit Boulevard"
grep(pattern, fruit, value=TRUE)[1] "prickly pear" "kumquat"
# match any word that is between 4 and 7 letters long
pattern =
paste0 ("^[a-zA-Z]{4,7}[^a-zA-Z]",
"|[^a-zA-Z][a-zA-Z]{4,7}[^a-zA-Z]",
"|[^a-zA-Z][a-zA-Z]{4,7}$",
"|^[a-zA-Z]{4,7}$")
grep(pattern, addresses, value=TRUE) [1] "12345 Sesame Street" "One Micro$oft Way"
[3] "3 Olive St." "5678 Park Place"
[5] "Forty Five 2nd Street" "Ninety Nine Cone St. apartment 7"
[7] "9 Main St. apt. 623" "Five Google Drive"
[9] "4\\2 Rechov Yafo" "Fifteen Watchamacallit Boulevard"
[11] "One Main Street Apt 12b" "Two Main Street Apt 123c"
[13] "Three Main Street Apt 12343" "City Hall Lockport, NY"
grep(pattern, fruit, value=TRUE) # ... "apple" ... "peach" ... (5 letter long words too) [1] "apple" "N. American apple" "S. Korean Fig"
[4] "star fruit" "pear" "prickly pear"
[7] "Beurre Hardy pear" "cherry" "black cherry"
[10] "peach" "plum" "kumquat"
[13] "banana"
# match a word that is at least 7 letters long
pattern =
paste0 ("^[a-zA-Z]{7,}[^a-zA-Z]",
"|[^a-zA-Z][a-zA-Z]{7,}[^a-zA-Z]",
"|[^a-zA-Z][a-zA-Z]{7,}$",
"|^[a-zA-Z]{7,}$")
grep(pattern, addresses, value=TRUE)[1] "Ninety Nine Cone St. apartment 7" "Fifteen Watchamacallit Boulevard"
[3] "Nineteen Watchamacallit Boulevard" "City Hall Lockport, NY"
grep(pattern, fruit, value=TRUE)[1] "N. American apple" "prickly pear" "kumquat"
[4] "blueberry" "strawberry" "honeydew"
[7] "strawberries" "yumberry"
# Exactly 3 digits (see example in last section of exactly one digit)
pattern = paste0(
"^[0-9]{3}[^0-9]",
"|[^0-9][0-9]{3}[^0-9]",
"|[^0-9][0-9]{3}$",
"|^[0-9]{3}$"
)
grep(pattern, addresses, value=TRUE)[1] "9 Main St. apt. 623" "Two Main Street Apt 123c"
grep("[^aeiou]{5}", fruit, value=TRUE) # at least 5 non vowels in a row[1] "N. American apple" "prickly pear" "Beurre Hardy pear"
[4] "black cherry"
grep("[^aeiou]{6}", fruit, value=TRUE) # at least 6 non vowels in a row[1] "prickly pear"
grep("[^aeiou]{7}", fruit, value=TRUE) # at least 7 non vowels in a rowcharacter(0)
grep("^.[aeiou]{2}", fruit, value=TRUE) # vowels in the 2nd and 3rd positions[1] "pear" "Beurre Hardy pear" "peach"
grep("[aeiou]{2}.$", fruit, value=TRUE) # 2nd & 3rd to last characters are vowels[1] "star fruit" "pear" "prickly pear"
[4] "Beurre Hardy pear" "kumquat" "strawberries"
# QUESTION
# Search for fruit that are 4 or 6 letters long.
#
grep ("^[a-zA-Z]{4}$|^[a-zA-Z]{6}$", fruit, value=TRUE)[1] "pear" "cherry" "plum" "banana"
grep ("(^[a-zA-Z]{4}$)|(^[a-zA-Z]{6}$)", fruit, value=TRUE)[1] "pear" "cherry" "plum" "banana"
grep ("^(([a-zA-Z]{4})|([a-zA-Z]{6}))$", fruit, value=TRUE)[1] "pear" "cherry" "plum" "banana"
18.2 Quantifiers with * + and ?
# Quantifiers with * + and ? ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# PATTERN* is the same as PATTERN{0,} i.e. zero or more repetitions
#
# PATTERN+ is the same as PATTERN{1,} i.e. one or more repetitions
#
# PATTERN? is the same as PATTERN{0,1} i.e. zero or one repetition
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
grep("^[^aeiouAEIOU].*[^aeiouAEIOU]$", fruit, value=TRUE) # start and end with non-vowel [1] "S. Korean Fig" "fig" "star fruit"
[4] "pear" "prickly pear" "Beurre Hardy pear"
[7] "cherry" "black cherry" "peach"
[10] "plum" "kumquat" "blueberry"
[13] "strawberry" "honeydew" "strawberries"
[16] "yumberry"
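# match at least two spaces in the text (including just two spaces)
# NOTE: the second assignment to pattern below overrides the first one, so the
# grep calls that follow actually match text containing at least two e's.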
pattern = ".* .* .*"
pattern = ".*e.*e.*"
grep(pattern, fruit, value=TRUE) [1] "N. American apple" "Beurre Hardy pear" "blueberry"
[4] "honeydew" "strawberries"
spacesStuff = c("nospaces",
"this has three spaces",
"just two spaces",
"one space",
"two spaces",
"three spaces",
"",
" ",
" ",
" ")
spacesStuff [1] "nospaces" "this has three spaces" "just two spaces"
[4] "one space" "two spaces" "three spaces"
[7] "" " " " "
[10] " "
grep(pattern, spacesStuff, value=TRUE) [1] "this has three spaces" "one space" "three spaces"
# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Find solutions for the questions at the website.
# The website is free to use.
#
# http://play.inginf.units.it/
#
# 1. When you get to this page, scroll all the way to the bottom.
#
# 2. You will be prompted for your "regex skill level" and other basic signup
# info. It seems that you must fill in this info in order for the example
# questions to work correctly. However, the exact answers to these questions
# don't seem to matter (it seems that you get the same questions no
# matter what "skill level" you choose)
#
# 3. Press the "Next" button.
#
# 4. You will then be prompted with a list of regex metacharacters
# Some of these are a little challenging: http://play.inginf.units.it/#/
#
# You can find sample answers here: https://avicoder.me/2019/01/21/regex-fun/
# There could definitely be other valid answers.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ANSWERS THAT WE DID TOGETHER IN CLASS
# level 1
# \d+
# level 2
# [a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}
# ([a-zA-Z0-9]{2}:){5}[a-zA-Z0-9]{2}
# ([0-9a-zA-Z][0-9a-zA-Z]:){5}[0-9a-zA-Z][0-9a-zA-Z]
# level 3
#
# ftp://ftp[^.]*\.[a-zA-Z]+\.[a-zA-Z]+(\.[a-zA-Z]+)?/pub/FreeBSD/
# ftp://ftp[^.]*(\.[a-zA-Z]+)+/pub/FreeBSD/
# ftp://ftp\d*(\.[a-zA-Z]+)+/pub/FreeBSD/
# level 4
# \$[^$]+\$
18.3 gsub works with patterns
# gsub ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# gsub works with patterns
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
gsub("[aeiou]", "x", fruit) # replace all vowels with x's [1] "xpplx" "N. Amxrxcxn xpplx" "S. Kxrxxn Fxg"
[4] "fxg" "stxr frxxt" "pxxr"
[7] "prxckly pxxr" "Bxxrrx Hxrdy pxxr" "chxrry"
[10] "blxck chxrry" "pxxch" "plxm"
[13] "kxmqxxt" "bxnxnx" "blxxbxrry"
[16] "strxwbxrry" "hxnxydxw" "strxwbxrrxxs"
[19] "yxmbxrry"
gsub("[^aeiou]", "x", fruit) # replace all non-vowels with x's [1] "axxxe" "xxxxxexixaxxaxxxe" "xxxxoxeaxxxix"
[4] "xix" "xxaxxxxuix" "xeax"
[7] "xxixxxxxxeax" "xeuxxexxaxxxxxeax" "xxexxx"
[10] "xxaxxxxxexxx" "xeaxx" "xxux"
[13] "xuxxuax" "xaxaxa" "xxuexexxx"
[16] "xxxaxxexxx" "xoxexxex" "xxxaxxexxiex"
[19] "xuxxexxx"
gsub("[^aeiou]+", "x", fruit) # replace one or more non-vowels with a single x [1] "axe" "xexixaxaxe" "xoxeaxix" "xix" "xaxuix"
[6] "xeax" "xixeax" "xeuxexaxeax" "xex" "xaxex"
[11] "xeax" "xux" "xuxuax" "xaxaxa" "xuexex"
[16] "xaxex" "xoxexex" "xaxexiex" "xuxex"
gsub("[^aeiou]*", "x", fruit) [1] "xaxex" "xexixaxaxex" "xoxexaxix" "xix"
[5] "xaxuxix" "xexax" "xixexax" "xexuxexaxexax"
[9] "xex" "xaxex" "xexax" "xux"
[13] "xuxuxax" "xaxaxax" "xuxexex" "xaxex"
[17] "xoxexex" "xaxexixex" "xuxex"
gsub("[^aeiou]*", "x", "apple")[1] "xaxex"