19  19. Backreferences (\1, \2, …)

19.1 backreferences \1 \2 etc

# BACKREFERENCES  ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Parenthesized expressions in a regex can be referred "back" to 
# with \1, \2 ... 
# (remember in R you need two backslashes - i.e. \\1, \\2, ...)
#
# The original regex standard only allowed for up to nine
# backreferences, i.e. \1 \2 \3 ... \9
# It did not allow for \10. Some environments have ways to
# allow you to reference \10 and beyond, but I personally
# don't know how to do that in R ... I guess you could
# research that if you need to, but it usually doesn't
# come up. If it becomes an issue, there is almost always
# a simple way to work around the situation
# using loops and other coding approaches.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



# Find fruits that contain 3 consecutive letters in the pattern xyx
# (e.g. "ana"): capture one lowercase letter, allow any one lowercase
# letter after it, then require the captured letter again via \\1.
grep(
  pattern = "([a-z])[a-z]\\1",
  x = fruit,
  value = TRUE
)
[1] "banana"    "blueberry"
# Pattern xyxy: capture two lowercase letters in separate groups, then
# require both of them again, in the same order, via \\1 and \\2.
grep(
  pattern = "([a-z])([a-z])\\1\\2",
  x = fruit,
  value = TRUE
)
[1] "banana"
# Same xyxy idea, but with a single group that captures a two-letter
# pair, which must then be repeated immediately via \\1.
grep(
  pattern = "([a-z][a-z])\\1",
  x = fruit,
  value = TRUE
)
[1] "banana"
# Replace every xyx sequence of lowercase letters with yxy:
# group 1 is the outer letter, group 2 is the middle letter.
gsub(
  pattern = "([a-z])([a-z])\\1",
  replacement = "\\2\\1\\2",
  x = fruit
)
 [1] "apple"             "N. American apple" "S. Korean Fig"    
 [4] "fig"               "star fruit"        "pear"             
 [7] "prickly pear"      "Beurre Hardy pear" "cherry"           
[10] "black cherry"      "peach"             "plum"             
[13] "kumquat"           "bnanna"            "blubebrry"        
[16] "strawberry"        "honeydew"          "strawberries"     
[19] "yumberry"         
# Reverse the first 3 characters: each of the three leading characters
# is captured in its own group and the groups are emitted in reverse.
gsub(
  pattern = "^(.)(.)(.)",
  replacement = "\\3\\2\\1",
  x = fruit
)
 [1] "ppale"             " .NAmerican apple" " .SKorean Fig"    
 [4] "gif"               "atsr fruit"        "aepr"             
 [7] "irpckly pear"      "ueBrre Hardy pear" "ehcrry"           
[10] "albck cherry"      "aepch"             "ulpm"             
[13] "mukquat"           "nabana"            "ulbeberry"        
[16] "rtsawberry"        "noheydew"          "rtsawberries"     
[19] "muyberry"         
# QUESTION
# Write a command to swap the first character and last character of
# each fruit

# Group 1 = first character, group 2 = everything in between,
# group 3 = last character; emit them as 3, 2, 1 to swap the ends.
gsub(
  pattern = "^(.)(.*)(.)$",
  replacement = "\\3\\2\\1",
  x = fruit
)
 [1] "eppla"             "e. American applN" "g. Korean FiS"    
 [4] "gif"               "ttar fruis"        "reap"             
 [7] "rrickly peap"      "reurre Hardy peaB" "yherrc"           
[10] "ylack cherrb"      "heacp"             "mlup"             
[13] "tumquak"           "aananb"            "ylueberrb"        
[16] "ytrawberrs"        "woneydeh"          "strawberries"     
[19] "yumberry"         
# Same swap as before, with hyphens placed between the three groups
# so that it is visible which part of each string each group matched.
gsub(
  pattern = "^(.)(.*)(.)$",
  replacement = "\\3-\\2-\\1",
  x = fruit
)
 [1] "e-ppl-a"             "e-. American appl-N" "g-. Korean Fi-S"    
 [4] "g-i-f"               "t-tar frui-s"        "r-ea-p"             
 [7] "r-rickly pea-p"      "r-eurre Hardy pea-B" "y-herr-c"           
[10] "y-lack cherr-b"      "h-eac-p"             "m-lu-p"             
[13] "t-umqua-k"           "a-anan-b"            "y-lueberr-b"        
[16] "y-trawberr-s"        "w-oneyde-h"          "s-trawberrie-s"     
[19] "y-umberr-y"         
# QUESTION 
# Find fruits that start and end with the same letter
#

# Capture the first character, allow anything in between, and require
# the captured character again (\\1) right at the end of the string.
grep(
  pattern = "^(.).*\\1$",
  x = fruit,
  value = TRUE
)
[1] "strawberries" "yumberry"    
# Sample data for the next exercise: multi-word entries, one with
# trailing spaces and one with a leading space.
shoppingList <- c(
  "35 yumberry pops",
  "four strawberries         ",
  " five apples",
  "six yumberry and strawberries pops"
)
shoppingList
[1] "35 yumberry pops"                   "four strawberries         "        
[3] " five apples"                       "six yumberry and strawberries pops"
# QUESTION
# Use sub or gsub to replace words that start and end with the same letter
# with the first letter then "XXXX" then the last letter of the word

# First attempt: word boundary, one captured character, anything,
# the captured character again, word boundary.
# Because . matches any character (spaces included), a match is not
# confined to a single word.
gsub(
  pattern = "\\b(.).*\\1\\b",
  replacement = "\\1XXXX\\1",
  x = shoppingList
)
[1] "35 yumberry pops"           "four strawberries         "
[3] " XXXX apples"               "sXXXXs"                    
# Second attempt: make the * UN-greedy by following it with a ?
# (note that the printed result is the same as for the greedy version).
gsub(
  pattern = "\\b(.).*?\\1\\b",
  replacement = "\\1XXXX\\1",
  x = shoppingList
)
[1] "35 yumberry pops"           "four strawberries         "
[3] " XXXX apples"               "sXXXXs"                    
# Third attempt: allow only lowercase letters ([a-z]) both for the
# captured first letter and for the characters in between, so a match
# cannot contain spaces or digits and stays inside one word.
gsub(
  pattern = "\\b([a-z])[a-z]*?\\1\\b",
  replacement = "\\1XXXX\\1",
  x = shoppingList
)
[1] "35 yXXXXy pops"             "four sXXXXs         "      
[3] " five apples"               "six yXXXXy and sXXXXs pops"