18  18. Quantifiers and gsub with patterns

18.1 Quantifiers: {n,m} {n} {n,} + * ?

#@ Quantifiers: {n,m}  {n}  {n,}  +  *  ?  ####
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@
#@ Quantifiers are symbols in the pattern that identify how many repetitions
#@ to look for of a particular sub-pattern. The quantifiers include
#@
#@   SOME_SUB_PATTERN{n,m}    (where n and m are numbers)
#@   SOME_SUB_PATTERN{n}      (where n is a number)
#@   SOME_SUB_PATTERN{n,}      (where n is a number)
#@   SOME_SUB_PATTERN+
#@   SOME_SUB_PATTERN*
#@   SOME_SUB_PATTERN?
#@
#@ See below for an explanation of each type of quantifier.
#@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

18.1.1 Quantifiers with {curly braces} eg. {3} {2,5} {3,}

# Quantifiers with {curly braces}   eg. {3}   {2,5}   {3,}   ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 
#    SOME_PATTERN{3}       three matches in a row
#    SOME_PATTERN{3,6}     between three and six matches in a row
#    SOME_PATTERN{3,}      at least 3 matches in a row
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# "[0-9]{4}" matches 4 digits in a row, same as "[0-9][0-9][0-9][0-9]"
#
# Note that this will also return those entries that have more than 4 digits
# in a row since these entries ALSO have 4 digits in a row (plus some extra
# digits)

grep("[0-9]{4}", addresses, value=TRUE)  # 4 digits , same as "[0-9][0-9][0-9][0-9]"
[1] "12345 Sesame Street"         "5678 Park Place"            
[3] "Three Main Street Apt 12343"
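# Note that gsub replaces each non-overlapping run of 4 digits. In an entry
# with 5 digits in a row, only the first 4 digits are substituted and the
# leftover digit remains.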
gsub("[0-9]{4}", "xxxx", addresses)  # 4 digits , same as "[0-9][0-9][0-9][0-9]"
 [1] "xxxx5 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "xxxx Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt xxxx3"       "City Hall Lockport, NY"           
grep("[0-9]{3}", addresses, value=TRUE)  # 3 digits 
[1] "12345 Sesame Street"         "5678 Park Place"            
[3] "9 Main St. apt. 623"         "Two Main Street Apt 123c"   
[5] "Three Main Street Apt 12343"
# Note that gsub replaces each non-overlapping run of 3 digits. In an entry
# with 4 or 5 digits in a row, only the first 3 digits are substituted and the
# leftover digits remain. (The replacement text "xxxx" is 4 characters long
# even though the pattern matches only 3 digits.)
gsub("[0-9]{3}", "xxxx", addresses)  # 3 digits , same as "[0-9][0-9][0-9]"
 [1] "xxxx45 Sesame Street"              "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "xxxx8 Park Place"                  "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. xxxx"             
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt xxxxc"        
[15] "Three Main Street Apt xxxx43"      "City Hall Lockport, NY"           
grep("[0-9]{3}", addresses, value=TRUE)  # 3 digits 
[1] "12345 Sesame Street"         "5678 Park Place"            
[3] "9 Main St. apt. 623"         "Two Main Street Apt 123c"   
[5] "Three Main Street Apt 12343"
grep("\\b[0-9]{3}\\b", addresses, value=TRUE)  # exactly 3 digits with a word boundary (\\b) on each side
[1] "9 Main St. apt. 623"
# EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# match a word that is exactly 4 letters long

# Build the pattern from four alternatives, one for each place that a
# 4-letter word can appear in the text:
#   - at the very start, followed by a non-letter
#   - in the middle, with a non-letter on each side
#   - at the very end, preceded by a non-letter
#   - as the entire text
# ( [^a-zA-Z] means "one character that is not a letter" )
#
# A shorter way to write this, using \\b, is shown further below.
pattern = 
  paste0 ("^[a-zA-Z]{4}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{4}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{4}$",
          "|^[a-zA-Z]{4}$")

# use the same pattern for both addresses and fruit

grep(pattern, addresses, value=TRUE)
 [1] "5678 Park Place"                  "Forty Five 2nd Street"           
 [3] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"             
 [5] "Five Google Drive"                "4\\2 Rechov Yafo"                
 [7] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
 [9] "Three Main Street Apt 12343"      "City Hall Lockport, NY"          
grep(pattern, fruit, value=TRUE)
[1] "star fruit"        "pear"              "prickly pear"     
[4] "Beurre Hardy pear" "plum"             
# Another way - using \b
pattern = "\\b[a-zA-Z]{4}\\b"
grep(pattern, addresses, value=TRUE)
 [1] "5678 Park Place"                  "Forty Five 2nd Street"           
 [3] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"             
 [5] "Five Google Drive"                "4\\2 Rechov Yafo"                
 [7] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
 [9] "Three Main Street Apt 12343"      "City Hall Lockport, NY"          
# Use a text editor (e.g. the one in RStudio) to see how this works ...
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Paste the following in the text editor to see how this pattern works.
#
#  \b[a-zA-Z]{4}\b
#
# This will match any 4 character word. It will NOT match 3 or 5 character words.
# Note that in VS Code you should use only one backslash (i.e. \b ) but in R
# you would use two backslashes (i.e. \\b ) as explained above.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# match a word that is at least 4 letters long
# Use {4,} instead of {4}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Paste this into VSCode regex search.
#   ^[a-zA-Z]{4,}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{4,}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{4,}$|^[a-zA-Z]{4,}$
# match a word that is 3, 4 or 5 letters long
# Use {3,5} instead of {4}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Paste this into VSCode regex search.
#   ^[a-zA-Z]{3,5}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{3,5}[^a-zA-Z]|[^a-zA-Z][a-zA-Z]{3,5}$|^[a-zA-Z]{3,5}$
# ANOTHER EXAMPLE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# match a word that is exactly 7 letters long
pattern = 
  paste0 ("^[a-zA-Z]{7}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{7}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{7}$",
          "|^[a-zA-Z]{7}$")

# use the same pattern for both addresses and fruit
grep(pattern, addresses, value=TRUE)
[1] "Fifteen Watchamacallit Boulevard"
grep(pattern, fruit, value=TRUE)
[1] "prickly pear" "kumquat"     
# match any word that is between 4 and 7 letters long
pattern = 
  paste0 ("^[a-zA-Z]{4,7}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{4,7}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{4,7}$",
          "|^[a-zA-Z]{4,7}$")

grep(pattern, addresses, value=TRUE)
 [1] "12345 Sesame Street"              "One Micro$oft Way"               
 [3] "3 Olive St."                      "5678 Park Place"                 
 [5] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
 [7] "9 Main St. apt. 623"              "Five Google Drive"               
 [9] "4\\2 Rechov Yafo"                 "Fifteen Watchamacallit Boulevard"
[11] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
[13] "Three Main Street Apt 12343"      "City Hall Lockport, NY"          
grep(pattern, fruit, value=TRUE)      # ... "apple" ... "peach" ... (5 letter long words too)
 [1] "apple"             "N. American apple" "S. Korean Fig"    
 [4] "star fruit"        "pear"              "prickly pear"     
 [7] "Beurre Hardy pear" "cherry"            "black cherry"     
[10] "peach"             "plum"              "kumquat"          
[13] "banana"           
# match a word that is at least 7 letters long
pattern = 
  paste0 ("^[a-zA-Z]{7,}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{7,}[^a-zA-Z]",
          "|[^a-zA-Z][a-zA-Z]{7,}$",
          "|^[a-zA-Z]{7,}$")

grep(pattern, addresses, value=TRUE)
[1] "Ninety Nine Cone St. apartment 7"  "Fifteen Watchamacallit Boulevard" 
[3] "Nineteen Watchamacallit Boulevard" "City Hall Lockport, NY"           
grep(pattern, fruit, value=TRUE)
[1] "N. American apple" "prickly pear"      "kumquat"          
[4] "blueberry"         "strawberry"        "honeydew"         
[7] "strawberries"      "yumberry"         
# Exactly 3 digits (see example in last section of exactly one digit)
pattern = paste0(
  "^[0-9]{3}[^0-9]",
  "|[^0-9][0-9]{3}[^0-9]",
  "|[^0-9][0-9]{3}$",
  "|^[0-9]{3}$"
)
grep(pattern, addresses, value=TRUE)
[1] "9 Main St. apt. 623"      "Two Main Street Apt 123c"
# Note: [^aeiou] matches ANY character that is not a lowercase vowel. That
# includes spaces, punctuation and capital letters (even capital vowels).
grep("[^aeiou]{5}", fruit, value=TRUE)  # at least 5 non vowels in a row
[1] "N. American apple" "prickly pear"      "Beurre Hardy pear"
[4] "black cherry"     
grep("[^aeiou]{6}", fruit, value=TRUE)  # at least 6 non vowels in a row
[1] "prickly pear"
grep("[^aeiou]{7}", fruit, value=TRUE)  # at least 7 non vowels in a row
character(0)
grep("^.[aeiou]{2}", fruit, value=TRUE)  # vowels in the 2nd and 3rd positions
[1] "pear"              "Beurre Hardy pear" "peach"            
grep("[aeiou]{2}.$", fruit, value=TRUE)  # 2nd & 3rd to last characters are vowels
[1] "star fruit"        "pear"              "prickly pear"     
[4] "Beurre Hardy pear" "kumquat"           "strawberries"     
# QUESTION
# Search for fruit that are 4 or 6 letters long.
#

grep ("^[a-zA-Z]{4}$|^[a-zA-Z]{6}$", fruit, value=TRUE)
[1] "pear"   "cherry" "plum"   "banana"
grep ("(^[a-zA-Z]{4}$)|(^[a-zA-Z]{6}$)", fruit, value=TRUE)
[1] "pear"   "cherry" "plum"   "banana"
grep ("^(([a-zA-Z]{4})|([a-zA-Z]{6}))$", fruit, value=TRUE)
[1] "pear"   "cherry" "plum"   "banana"

18.2 Quantifiers with * + and ?

# Quantifiers with    *   +  and  ?  ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#      PATTERN* is the same as PATTERN{0,}  i.e. zero or more repetitions
# 
#      PATTERN+ is the same as PATTERN{1,}  i.e. one or more repetitions
#
#      PATTERN? is the same as PATTERN{0,1}  i.e. zero or one repetition
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


grep("^[^aeiouAEIOU].*[^aeiouAEIOU]$", fruit, value=TRUE) # start and end with non-vowel 
 [1] "S. Korean Fig"     "fig"               "star fruit"       
 [4] "pear"              "prickly pear"      "Beurre Hardy pear"
 [7] "cherry"            "black cherry"      "peach"            
[10] "plum"              "kumquat"           "blueberry"        
[13] "strawberry"        "honeydew"          "strawberries"     
[16] "yumberry"         
# match at least two spaces in the text (including just two spaces)
pattern = ".* .* .*"       

# match at least two e's in the text
# NOTE: this assignment overwrites the two-spaces pattern above, so the
# grep calls that follow use the two-e's pattern.
pattern = ".*e.*e.*"       
grep(pattern, fruit, value=TRUE) 
[1] "N. American apple" "Beurre Hardy pear" "blueberry"        
[4] "honeydew"          "strawberries"     
spacesStuff = c("nospaces",
          "this has three spaces",
          "just two spaces", 
          "one space", 
          "two  spaces", 
          "three   spaces", 
          "",
          " ", 
          "  ", 
          "   ")
spacesStuff
 [1] "nospaces"              "this has three spaces" "just two spaces"      
 [4] "one space"             "two  spaces"           "three   spaces"       
 [7] ""                      " "                     "  "                   
[10] "   "                  
# pattern is still ".*e.*e.*" at this point, so this call returns the entries
# that contain at least two e's.
grep(pattern, spacesStuff, value=TRUE) 
[1] "this has three spaces" "one space"             "three   spaces"       
# Now use the two-spaces pattern. It returns every entry that contains at
# least two spaces; the spaces do not need to be next to each other.
grep(".* .* .*", spacesStuff, value=TRUE) 
[1] "this has three spaces" "just two spaces"       "two  spaces"          
[4] "three   spaces"        "  "                    "   "                  
# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Find solutions for the questions at the website.
# The website is free to use.
#
#         http://play.inginf.units.it/
#
# 1. When you get to this page, scroll all the way to the bottom.
#
# 2. You will be prompted for your "regex skill level" and other basic signup
#    info. It seems that you must fill in this info in order for the example
#    questions to work correctly. However, the exact answers to these questions
#    don't seem to matter (it seems that you get the same questions no 
#    matter what "skill level" you choose)
# 
# 3. Press the "Next" button.
#
# 4. You will then be prompted with a list of regex metacharacters 
# Some of these are a little challenging: http://play.inginf.units.it/#/
#
# You can find sample answers here: https://avicoder.me/2019/01/21/regex-fun/
# There could definitely be other valid answers.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWERS THAT WE DID TOGETHER IN CLASS

# level 1
#   \d+


# level 2
#   [a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}:[a-zA-Z0-9]{2}
#   ([a-zA-Z0-9]{2}:){5}[a-zA-Z0-9]{2}
#   ([0-9a-zA-Z][0-9a-zA-Z]:){5}[0-9a-zA-Z][0-9a-zA-Z]


# level 3
#
#   ftp://ftp[^.]*\.[a-zA-Z]+\.[a-zA-Z]+(\.[a-zA-Z]+)?/pub/FreeBSD/
#   ftp://ftp[^.]*(\.[a-zA-Z]+)+/pub/FreeBSD/
#   ftp://ftp\d*(\.[a-zA-Z]+)+/pub/FreeBSD/


# level 4
#   \$[^$]+\$

18.3 gsub works with patterns

# gsub ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# gsub works with patterns
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

gsub("[aeiou]", "x", fruit)   # replace all vowels with x's
 [1] "xpplx"             "N. Amxrxcxn xpplx" "S. Kxrxxn Fxg"    
 [4] "fxg"               "stxr frxxt"        "pxxr"             
 [7] "prxckly pxxr"      "Bxxrrx Hxrdy pxxr" "chxrry"           
[10] "blxck chxrry"      "pxxch"             "plxm"             
[13] "kxmqxxt"           "bxnxnx"            "blxxbxrry"        
[16] "strxwbxrry"        "hxnxydxw"          "strxwbxrrxxs"     
[19] "yxmbxrry"         
gsub("[^aeiou]", "x", fruit)   # replace all non-vowels with x's
 [1] "axxxe"             "xxxxxexixaxxaxxxe" "xxxxoxeaxxxix"    
 [4] "xix"               "xxaxxxxuix"        "xeax"             
 [7] "xxixxxxxxeax"      "xeuxxexxaxxxxxeax" "xxexxx"           
[10] "xxaxxxxxexxx"      "xeaxx"             "xxux"             
[13] "xuxxuax"           "xaxaxa"            "xxuexexxx"        
[16] "xxxaxxexxx"        "xoxexxex"          "xxxaxxexxiex"     
[19] "xuxxexxx"         
gsub("[^aeiou]+", "x", fruit)   # replace one or more non-vowels with a single x
 [1] "axe"         "xexixaxaxe"  "xoxeaxix"    "xix"         "xaxuix"     
 [6] "xeax"        "xixeax"      "xeuxexaxeax" "xex"         "xaxex"      
[11] "xeax"        "xux"         "xuxuax"      "xaxaxa"      "xuexex"     
[16] "xaxex"       "xoxexex"     "xaxexiex"    "xuxex"      
# replace zero or more non-vowels with a single x. Because * also matches the
# empty string, an x is inserted even in places where there are no non-vowels
# (e.g. before the leading "a" and after the final "e" of "apple" -> "xaxex").
gsub("[^aeiou]*", "x", fruit)
 [1] "xaxex"         "xexixaxaxex"   "xoxexaxix"     "xix"          
 [5] "xaxuxix"       "xexax"         "xixexax"       "xexuxexaxexax"
 [9] "xex"           "xaxex"         "xexax"         "xux"          
[13] "xuxuxax"       "xaxaxax"       "xuxexex"       "xaxex"        
[17] "xoxexex"       "xaxexixex"     "xuxex"        
gsub("[^aeiou]*", "x", "apple")
[1] "xaxex"