15 15. ^Anchors$, the dot, and [character classes]

15.1 “anchors” ^ and $

# ^ and $    ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Regular Expressions use special characters to control what is matched.
# These characters are called "meta-characters".
#
#   ^  "matches" the start of the character value
#   $  "matches" the end of the character value
#
#   The [square brackets] described above are also "meta characters" in
#   regular expressions. (We will describe those in more detail next)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grep(pattern="^a", x=fruit, value=TRUE)  # find fruits that START with an "a"

[1] "apple"

grep("a$", fruit, value=TRUE)  # find fruits that END with an "a"

[1] "banana"

# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Write a command using grep to display all the fruits that start
# with a c or an s. Make your search case insensitive.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#         ANSWERS ####
grep("^[csCS]", fruit, value=TRUE)

[1] "S. Korean Fig" "star fruit"    "cherry"        "strawberry"   
[5] "strawberries"

grep("^[cs]", fruit, value=TRUE, ignore.case = TRUE)

[1] "S. Korean Fig" "star fruit"    "cherry"        "strawberry"   
[5] "strawberries"

grep("^[cs]", tolower(fruit), value=TRUE)

[1] "s. korean fig" "star fruit"    "cherry"        "strawberry"   
[5] "strawberries"

15.2 periods

# . (i.e. a period) ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# a period (ie  .  ) "matches" any single character.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grep("^.a", fruit, value=TRUE)  # 2nd letter is a

[1] "banana"

grep("^..a", fruit, value=TRUE)  # 3rd letter is a

[1] "star fruit"   "pear"         "black cherry" "peach"

grep("a.$", fruit, value=TRUE)  # 2nd to last letter is an a

[1] "pear"              "prickly pear"      "Beurre Hardy pear"
[4] "kumquat"

grep("....", fruit, value=TRUE) # all fruit that are AT LEAST 4 characters long

 [1] "apple"             "N. American apple" "S. Korean Fig"    
 [4] "star fruit"        "pear"              "prickly pear"     
 [7] "Beurre Hardy pear" "cherry"            "black cherry"     
[10] "peach"             "plum"              "kumquat"          
[13] "banana"            "blueberry"         "strawberry"       
[16] "honeydew"          "strawberries"      "yumberry"

grep("^....$", fruit, value=TRUE) # all fruit that are EXACTLY 4 characters long

[1] "pear" "plum"

15.3 “character classes” eg. [abc]

# [abc] matches a SINGLE "a", "b" or "c".    ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# characters in [square brackets] match a single copy of
# any of those characters, e.g.
#  [abc]    matches exactly one of a,b or c
#
# DEFINITION: 
# The [square brackets] with the characters in them are often referred to as 
# "character classes" or "character sets"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

15.3.1 Examples

grep("[qbxz]", fruit, value=TRUE)  # find fruit that contain q,b,x or z

[1] "black cherry" "kumquat"      "banana"       "blueberry"    "strawberry"  
[6] "strawberries" "yumberry"

grep("[aeiou][aeiou]", fruit, value=TRUE)  # two vowels in a row

[1] "S. Korean Fig"     "star fruit"        "pear"             
[4] "prickly pear"      "Beurre Hardy pear" "peach"            
[7] "kumquat"           "blueberry"         "strawberries"

grep("^.[aeiou][aeiou]", fruit, value=TRUE)  # vowels in the 2nd and 3rd positions

[1] "pear"              "Beurre Hardy pear" "peach"

grep("[aeiou][aeiou].$", fruit, value=TRUE)  # 2nd & 3rd to last characters are vowels

[1] "star fruit"        "pear"              "prickly pear"     
[4] "Beurre Hardy pear" "kumquat"           "strawberries"

grep("[aeiou][aeiou][aeiou]", fruit, value=TRUE)  # 3 vowels in a row

character(0)

# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# replace the FIRST vowel that appears in any fruit with the letter "X"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

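# ANSWER
sub("[aeiouAEIOU]", "X", fruit)  # sub() replaces only the FIRST match in each value

 [1] "Xpple"             "N. Xmerican apple" "S. KXrean Fig"    
 [4] "fXg"               "stXr fruit"        "pXar"             
 [7] "prXckly pear"      "BXurre Hardy pear" "chXrry"           
[10] "blXck cherry"      "pXach"             "plXm"             
[13] "kXmquat"           "bXnana"            "blXeberry"        
[16] "strXwberry"        "hXneydew"          "strXwberries"     
[19] "yXmberry"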

# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# replace ALL vowels that appear in any fruit with the letter "X"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER
gsub("[aeiouAEIOU]", "X", fruit)

 [1] "XpplX"             "N. XmXrXcXn XpplX" "S. KXrXXn FXg"    
 [4] "fXg"               "stXr frXXt"        "pXXr"             
 [7] "prXckly pXXr"      "BXXrrX HXrdy pXXr" "chXrry"           
[10] "blXck chXrry"      "pXXch"             "plXm"             
[13] "kXmqXXt"           "bXnXnX"            "blXXbXrry"        
[16] "strXwbXrry"        "hXnXydXw"          "strXwbXrrXXs"     
[19] "yXmbXrry"

# QUESTION
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# remove ALL of the vowels that appear in any fruit
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ANSWER
gsub("[aeiouAEIOU]", "", fruit)

 [1] "ppl"         "N. mrcn ppl" "S. Krn Fg"   "fg"          "str frt"    
 [6] "pr"          "prckly pr"   "Brr Hrdy pr" "chrry"       "blck chrry" 
[11] "pch"         "plm"         "kmqt"        "bnn"         "blbrry"     
[16] "strwbrry"    "hnydw"       "strwbrrs"    "ymbrry"

15.3.2 dashes, e.g. [a-d]

# Specify ranges with dash, e.g. [a-d] or [0-3] or [a-d0-3], etc ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Specify ranges with dash, e.g. [a-d] is same as [abcd], [0-3] is same as [0123] ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

15.3.3 Examples

grep("[j-m]", fruit, value=TRUE)  # fruits that contain any of j,k,l,m

[1] "apple"             "N. American apple" "prickly pear"     
[4] "black cherry"      "plum"              "kumquat"          
[7] "blueberry"         "yumberry"

addresses  # this was defined above

 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt 12343"       "City Hall Lockport, NY"

grep("[0-9]", addresses, value=TRUE)  # contains a digit

 [1] "12345 Sesame Street"              "3 Olive St."                     
 [3] "Two 1st Ave."                     "5678 Park Place"                 
 [5] "Forty Five 2nd Street"            "Ninety Nine Cone St. apartment 7"
 [7] "9 Main St. apt. 623"              "4\\2 Rechov Yafo"                
 [9] "One Main Street Apt 12b"          "Two Main Street Apt 123c"        
[11] "Three Main Street Apt 12343"

grep("[6-9]", addresses, value=TRUE)  # contains 6,7,8 or 9

[1] "5678 Park Place"                  "Ninety Nine Cone St. apartment 7"
[3] "9 Main St. apt. 623"

grep("[0-9][0-9]", addresses, value=TRUE)  # contains a number with at least 2 digits

[1] "12345 Sesame Street"         "5678 Park Place"            
[3] "9 Main St. apt. 623"         "One Main Street Apt 12b"    
[5] "Two Main Street Apt 123c"    "Three Main Street Apt 12343"

grep("[0-9][0-9][0-9][0-9]", addresses, value=TRUE)  # contains a # with at least 4 digits

[1] "12345 Sesame Street"         "5678 Park Place"            
[3] "Three Main Street Apt 12343"

grep("^[6-9]", addresses, value=TRUE)  # contains 6,7,8 or 9 as the first character

[1] "9 Main St. apt. 623"

grep("^.[6-9]", addresses, value=TRUE) # 6,7,8 or 9 is second character

[1] "5678 Park Place"

grep("[0-9]$", addresses, value=TRUE) # last character is a digit

[1] "Ninety Nine Cone St. apartment 7" "9 Main St. apt. 623"             
[3] "Three Main Street Apt 12343"

grep("[0-9][0-9][0-9]$", addresses, value=TRUE) # ends with at least 3 digits

[1] "9 Main St. apt. 623"         "Three Main Street Apt 12343"

grep("[0-9][0-9][0-9][0-9]$", addresses, value=TRUE) # ends with at least 4 digits

[1] "Three Main Street Apt 12343"

15.3.4 combine multiple ranges in one [brackets]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# You can combine multiple ranges and values in a single [brackets]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grep("[1-3x-z]$", addresses, value=TRUE) # ends with 1,2,3,x,y or z

[1] "One Micro$oft Way"           "9 Main St. apt. 623"        
[3] "Three Main Street Apt 12343"

grep("[of-hq]", fruit, value=TRUE)  # searches for any of o,f,g,h,q

[1] "S. Korean Fig" "fig"           "star fruit"    "cherry"       
[5] "black cherry"  "peach"         "kumquat"       "honeydew"

# REMEMBER THE [BRACKETS]!!!
grep("of-hq", fruit, value=TRUE)  # searches for exactly :  "of-hq"

character(0)

15.3.5 [^abc]

# [^abc] matches a single character that is NOT a,b or c ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# When ^ is the FIRST character in the [^brackets] it means to match a 
# single character that is NOT one of the characters in the brackets
#
# [^abc]  - i.e. a single character that is NOT a,b or c
# [^0-9]  - a non-digit
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

grep("^[^0-9]", addresses, value=TRUE)  # does NOT start with a digit

 [1] "One Micro$oft Way"                 "Two 1st Ave."                     
 [3] "Forty Five 2nd Street"             "Ninety Nine Cone St. apartment 7" 
 [5] "Five Google Drive"                 "Fifteen Watchamacallit Boulevard" 
 [7] "Nineteen Watchamacallit Boulevard" "One Main Street Apt 12b"          
 [9] "Two Main Street Apt 123c"          "Three Main Street Apt 12343"      
[11] "City Hall Lockport, NY"

grep("[^0-9]$", addresses, value=TRUE)  # does NOT end with a digit

 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
 [9] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[11] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[13] "City Hall Lockport, NY"

grep("[^0-9]", addresses, value=TRUE)  # contains at least one NON-digit character (every address here does)

 [1] "12345 Sesame Street"               "One Micro$oft Way"                
 [3] "3 Olive St."                       "Two 1st Ave."                     
 [5] "5678 Park Place"                   "Forty Five 2nd Street"            
 [7] "Ninety Nine Cone St. apartment 7"  "9 Main St. apt. 623"              
 [9] "Five Google Drive"                 "4\\2 Rechov Yafo"                 
[11] "Fifteen Watchamacallit Boulevard"  "Nineteen Watchamacallit Boulevard"
[13] "One Main Street Apt 12b"           "Two Main Street Apt 123c"         
[15] "Three Main Street Apt 12343"       "City Hall Lockport, NY"

# Contains 5 non-vowels in a row (notice that space counts as a non-vowel)
grep("[^aeiou][^aeiou][^aeiou][^aeiou][^aeiou]", fruit, value=TRUE, ignore.case=TRUE)

[1] "prickly pear"      "Beurre Hardy pear" "black cherry"

15.3.6 metachars lose special meaning in brackets eg. [.$*]

# meta characters in [brackets] other than ^ - and ] lose their special meaning ####
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Most meta characters inside of [brackets] are treated like any other character.
# They do NOT have any special meaning in the brackets. Therefore you can use
# them without any problem inside a character class. For example [.$]
# matches either a period or a dollar sign (see examples below).
#
# The only exceptions are ^ - and ] which DO have a special meaning inside of
# the [square brackets] - see more info below.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stuff = c("This is a period: .", "apple", "45 oranges", "$2", "This is an open bracket: [")
stuff

[1] "This is a period: ."        "apple"                     
[3] "45 oranges"                 "$2"                        
[5] "This is an open bracket: ["

# EXAMPLE
#
# Match a period, left-square-bracket, or a dollar sign
# You do NOT need backslashes inside of the [brackets]
grep("[.[$]", stuff, value=TRUE)

[1] "This is a period: ."        "$2"                        
[3] "This is an open bracket: ["

# ANOTHER EXAMPLE
#
# The following matches either a period or one of the digits 1 through 9.
# You do NOT need to use a backslash before the period.
grep("[.1-9]", stuff, value=TRUE)

[1] "This is a period: ." "45 oranges"          "$2"

# The backslash will not hurt (but it isn't necessary inside the character class)
# (below we will explain why there are TWO backslashes - for now you can leave
# off both of the backslashes)
grep("[\\.1-9]", stuff, value=TRUE)

[1] "This is a period: ." "45 oranges"          "$2"

15.3.7 Special cases: ^ - ]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Special cases:    ^    -    ]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The following characters have to be addressed in a special way inside
# of a character class. 
#
# ^   As we saw above, if the first character in the brackets is ^ the regex will
#     look for characters that are NOT in the brackets. If ^ appears anywhere else
#     inside the brackets it has no special meaning.
#
# -   As we saw above, [a-d] is the same as [abcd]. Therefore the - has a special
#     meaning inside of a character class. If you want to actually search
#     for a -, it must be the first, e.g. [-abc] or last character, eg. [abc-]
#     in the class.
#
# ]   has a special meaning - i.e. to end the character class. Therefore if 
#     you want to search for an actual "]", the "]" should be specified
#     as the very FIRST character in the class, e.g. []abc]
#-------------------------------------------------------------------

# Examples:

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# The "." inside of [brackets] simply means an actual period.
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

stuff = c("...", "def", "...bbbzzzbz.bzz...z.b", "^^^")
stuff

[1] "..."                   "def"                   "...bbbzzzbz.bzz...z.b"
[4] "^^^"

grep ("[.x]", stuff, value=TRUE)   # contains an actual period or an "x"

[1] "..."                   "...bbbzzzbz.bzz...z.b"

15.3.8 matching ^ inside [square brackets]

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
# The "^" inside of [brackets] has a different meaning if it is in the 
# first position or if it is anywhere else. For example:
#
# [^abc]   matches anything that is NOT an "a","b" or "c"
#
# [a^bc]   
# [abc^]   both of these examples match one "a","b","c" or "^" character
# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

caretStuff = c("^^^", "hello", "???")
caretStuff

[1] "^^^"   "hello" "???"

grep ("[^a-z]", caretStuff, value=TRUE)   # "^^^" "???"

[1] "^^^" "???"

grep ("[a-z^]", caretStuff, value=TRUE)   # "^^^" "hello"

[1] "^^^"   "hello"

grep ("[^^]", caretStuff, value=TRUE)     # "hello" "???"

[1] "hello" "???"

# find all entries that have any symbol that is not a ".", "b" or "z"

stuff = c("...", "def", "...bbbzzzbz.bzz...z.b", "^^^")
stuff

[1] "..."                   "def"                   "...bbbzzzbz.bzz...z.b"
[4] "^^^"

grep ("[^.bz]", stuff, value=TRUE)

[1] "def" "^^^"

#grep("[a-z.]", c())

15.3.9 Matching a - inside [square brackets]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Matching a dash (i.e. - ) inside a character class
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# As we saw earlier, inside of [brackets] a dash has a special meaning (to indicate a range).
# To actually match a dash as one of the characters place the dash as either the 
# first or last character in the brackets.


dashStuff = c("---", "hello", "xxx")
dashStuff

[1] "---"   "hello" "xxx"

grep ("[-xyz]", dashStuff, value=TRUE)   # "---" "xxx"

[1] "---" "xxx"

grep ("[xyz-]", dashStuff, value=TRUE)   # "---" "xxx"   (same thing)

[1] "---" "xxx"

grep ("[a-f]", dashStuff, value=TRUE)   # "hello"

[1] "hello"

grep ("[-a-f]", dashStuff, value=TRUE)   # "---" "hello"

[1] "---"   "hello"

15.3.10 Matching a ] inside [square brackets]

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To match a closing-square-bracket "]" inside a character class you must
# specify the ] as the very FIRST symbol in the character class.
#
# NOTE - there are no special rules for matching an open-square-bracket, "["
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stuff = c("]", "apple", "zzz" )
stuff

[1] "]"     "apple" "zzz"

grep("]", stuff, value=TRUE)    #  "]"

[1] "]"

# The pattern "[]a]" matches a single character that is either "]" or "a".
# It finds "]" and also "apple" (since "apple" contains an "a").
#
# This works since "]" is placed as the very first character in the 
# [brackets] so it is simply one of the characters that is searched for.

grep("[]a]", stuff, value=TRUE) #  "]"  "apple"

[1] "]"     "apple"

# This is VERY different for the pattern "[a]]"
#
# The following example shows what happens if you put the "]" in any
# position other than the first. The pattern "[a]]" is broken down as follows:
#
#   [a]   This is the single character "a". Note that the first "]" closes
#         the character class.
#
#   ]     This does NOT signify the end of the character class, but is 
#         rather just a regular character that must be part of the text to be 
#         matched.
#
# Therefore [a]] is looking for the EXACT text "a]" somewhere in the text
# being searched.

grep("[a]]", stuff, value=TRUE) # No matches - looking for "a]" in the text

character(0)