24  24. Practice - Searching and Cleaning Data (with regex in R)

24.1 Introduction - about the data

In this assignment, you’ll practice using regular expressions to identify and clean messy data. The dataset below contains employee expense records with various inconsistencies in formatting. Some of the inconsistencies are described below the data.

24.1.1 Employee expenses data

Click on the link below to see the full dataset. The first few rows of data are shown below the link.

library(tibble)
expenses = tribble(
  ~employee,           ~date,          ~amount,          ~category,         ~comments,
  "Sue Smith",         "1/19/2023",    "59.99",         "food",            "lunch with client",
  "schwartz, joe",     "01/19/2023",   "$27.00",        "office supplies", "paper for printer",
  "mike david harris", "2023-01-19",   "25",            "Office Stuff",    NA,
  "Dr. A. Davis",      "19/1/2023",    "five hundred",  "FOOD",            "NA",
  "Dr Jones",          "1/19/23",      "1,234.56",      "office suppl.",   "chairs",
  "S. Jones Jr",       "19/01/23",     "1000",          "Office supplies", "desk",
  "Conway, Ella Sr.",  "Jan 19, 2023", "$35.23",        "LUNCH",           "---",
  "Brown, Tom",        "2/15/2023",    "42.50",         "food",            "dinner meeting",
  "  Williams, Kate  ","02/15/2023",   "$89.99",        "office supplies", "toner cartridge",
  "robert james lee",  "2023/02/15",   "75",            "Office Stuff",    NA,
  "mike green jr.",  "2023/02/15",     "500",           "OFFICE SUPPLIES", NA,
  "Dr. B. Wilson",     "15/2/2023",    "two hundred",   "FOOD",            "NA",
  "Dr Smith",          "2/15/23",      "2,345.67",      "office suppl.",   "monitor",
  "R. Brown Jr",       "March 2, 2023","750",           "Office supplies", "filing cabinet",
  "Martinez, Ana Sr.", "Feb 15, 2023", "$48.75",        "LUNCH",           "---",
  "Johnson,Mary",      "3/22/2023",    "63.45",         "food",            "team lunch",
  "Cooper,  Sam",       "03/22/2023",   "$156.00",      "office supplies", "office decoration",
  "peter michael wu",  "2023-03-22",   "50",            "OFFICE STUFF",    NA,
  "Dr. C. Taylor",     "2/3/2023",     "one thousand",  "FOOD",            "NA",
  "Drake Wilson",        "3/14/24",      "500",         "Office supplies", "printer",
  "Dr Chen",           "3/2/23",       "3,456.78",      "office suppl.",   "laptop",
  "T. Wilson Jr",      "Jun 27 2024",  "500",           "Office supplies", "printer",
  "Rodriguez, Eva Sr.","Mar 22, 2023", "$92.45",        "LUNCH",           "---",
    "Pat Williams",      "2/20/2023",    "150",           "travel",          "train ticket",
  "Chen, Wei",         "20/02/2023",   "$89.50",        "Travel",          "hotel",
  "jennifer ann martinez", "2023-02-21", "42",          "FOOD",            NA,
  "Ms. B. Kim",        "21/2/2023",    "two hundred",  "office supplies", "stapler",
  "J. Thompson",      "2/21/23",      "2,500.00",      "TRAVEL",          "flight",
  "R. Patel Ph.D",     "21/2/23",      "75",            "office suppl.",   "notebooks",
  "Garcia, Maria Jr.", "Feb 22, 2023", "$120.00",       "food",            "team dinner",
  "Dr. Evans",         "3/01/2023",    "95.99",         "Office Stuff",    "supplies",
  "Thompson III",      "3/05/2023",    "200",           "travel",          "conference",
  "Brown, Tom",        "2/15/2023",    "42.50",         "food",            "dinner meeting",
  "  Williams, Kate  ","02/15/2023",   "$89.99",        "office supplies", "toner cartridge",
  "robert james lee",  "2023/02/15",   "75",            "Office Stuff",    NA,
  "mike green jr.",  "2023/02/15",     "500",           "OFFICE SUPPLIES",    NA,
  "Dr. B. Wilson",     "15/2/2023",    "two hundred",   "FOOD",            "NA",
  "Dr Smith",          "2/15/23",      "2,345.67",      "office suppl.",   "monitor",
  "R. Brown Jr",       "March 2, 2023","750",           "Office supplies", "filing cabinet",
  "Martinez, Ana Sr.", "Feb 15, 2023", "$48.75",        "LUNCH",           "---",
  "Johnson,Mary",      "3/22/2023",    "63.45",         "food",            "team lunch",
  "Cooper,  Sam",       "03/22/2023",   "$156.00",      "office supplies", "office decoration",
  "peter michael wu",  "2023-03-22",   "50",            "Office Stuff",    NA,
  "Dr. C. Taylor",     "2/3/2023",     "one thousand",  "FOOD",            "NA",
  "Drake Wilson",        "3/14/24",      "500",         "Office supplies", "printer",
  "Dr Chen",           "3/2/23",       "3,456.78",      "office suppl.",   "laptop",
  "T. Wilson Jr",      "Jun 27 2024",  "500",           "Office supplies", "printer",
  "Rodriguez, Eva Sr.","Mar 22, 2023", "$92.45",        "LUNCH",           "---",

)
expenses = as.data.frame(expenses)
numrows = 10
cat("There are a total of ", nrow(expenses), " rows. The following is 
    just the first ", numrows, " rows.", sep="")
There are a total of 48 rows. The following is 
    just the first 10 rows.
head(expenses, 10)
             employee         date       amount        category          comments
1           Sue Smith    1/19/2023        59.99            food lunch with client
2       schwartz, joe   01/19/2023       $27.00 office supplies paper for printer
3   mike david harris   2023-01-19           25    Office Stuff              <NA>
4        Dr. A. Davis    19/1/2023 five hundred            FOOD                NA
5            Dr Jones      1/19/23     1,234.56   office suppl.            chairs
6         S. Jones Jr     19/01/23         1000 Office supplies              desk
7    Conway, Ella Sr. Jan 19, 2023       $35.23           LUNCH               ---
8          Brown, Tom    2/15/2023        42.50            food    dinner meeting
9    Williams, Kate     02/15/2023       $89.99 office supplies   toner cartridge
10   robert james lee   2023/02/15           75    Office Stuff              <NA>
  • Inconsistencies with the employee column entries

    • some names are in “first last” (e.g. “Sue Smith”) format and some names are in “last, first” format (e.g. “Brown, Tom”)
    • some names are capitalized, some aren’t
    • some names have middle include middle names, some don’t
    • some names include titles, some don’t
    • some names include a title, e.g. Dr. , while some don’t
    • some of the titles contain a period after the title, some don’t
    • some names include a suffix (e.g. Jr), some don’t
    • some names are missing first names
    • some names use periods (e.g. J. Thompson), some don’t
    • possibly other inconsistencies …
  • Inconsistencies with date column

    • some dates are in mm/dd/yyyy format (USA style)
    • some dates are in dd/mm/yyyy format (European style)
    • some dates are in yyyy-mm-dd format (standard for many computer systems)
    • some dates use 4 digit years, some use 2 digit years
    • some dates have a leading zero for single digit days or months
    • possibly other inconsistencies …
  • Inconsistencies in the amount column

    • some values have $ signs, some don’t,
    • some have commas (e.g. 1,234.56) some don’t
    • some are missing the cents (e.g. “75”)
    • some are written in words (e.g. “five hundred”)
    • possibly other inconsistencies …
  • Inconsistencies in the category column

    • the same categories are written differently (e.g. “food” vs “FOOD”, “office supplies” vs “Office suppl.” vs “OFFICE STUFF”)
    • possibly other inconsistencies …
  • Inconsistencies in the comments column

    • blanks are expressed differently: NA vs “NA” vs “—”.

      Note that in this column, other differences are more forgivable since comments are by their very nature written by people differently and are not expected to be in a standard form. However, there should be a standard way to express that there are no comments for a given row.

24.2 Questions - PART I (writing code)

Your task is to write R code using regular expressions to identify different patterns in the data.

You may use any of the regex functions we covered in class: - grep(): Find patterns and return matching indices or values - grepl(): Find patterns and return TRUE/FALSE for each element - sub(): Replace the first occurrence of a pattern - gsub(): Replace all occurrences of a pattern - strsplit(): Split strings based on a pattern

For each question: 1. First try to solve it yourself 2. If you get stuck, try to break down the pattern you’re looking for into smaller pieces 3. Test your solution with the sample data to make sure it catches all cases

24.2.1 Name Format Questions

Question 1a

Write R code using regex to identify all names that are in “last, first” format (e.g., “Schwartz, Joe”, including those with suffixes like “Sr.” or “Jr.”). Make sure that you allow for any number of spaces between the comma and the first name (including if there aren’t any spaces).

Given the data above, your output should show the following:

schwartz, joe
Conway, Ella Sr.
Brown, Tom
Martinez, Ana Sr.
Johnson,Mary
Cooper,  Sam
Rodriguez, Eva Sr.
Chen, Wei
Garcia, Maria Jr.
Brown, Tom
Martinez, Ana Sr.
Johnson,Mary
Cooper,  Sam
Rodriguez, Eva Sr.
answer = 
 grep("^[a-zA-Z]+,.+$", expenses$employee, value=TRUE)
cat(answer, sep="\n")
schwartz, joe
Conway, Ella Sr.
Brown, Tom
Martinez, Ana Sr.
Johnson,Mary
Cooper,  Sam
Rodriguez, Eva Sr.
Chen, Wei
Garcia, Maria Jr.
Brown, Tom
Martinez, Ana Sr.
Johnson,Mary
Cooper,  Sam
Rodriguez, Eva Sr.

Question 1b

Modify your answer to show only names that are strictly in
last, first form (without a suffix)

Given the data above, your output should show the following:

schwartz, joe
Brown, Tom
Johnson,Mary
Cooper,  Sam
Chen, Wei
Brown, Tom
Johnson,Mary
Cooper,  Sam
answer = 
 grep("^[a-zA-Z]+, *[a-zA-Z]+$", expenses$employee, value=TRUE)
cat(answer, sep="\n")
schwartz, joe
Brown, Tom
Johnson,Mary
Cooper,  Sam
Chen, Wei
Brown, Tom
Johnson,Mary
Cooper,  Sam

Question 2

Write R code using regex to display only names that begin with “Dr” (with or without a period). Make sure that you don’t display names (e.g. “Drake Wilson”) that begin with Dr but who aren’t doctors.

Given the data above, your output should show the following:

Dr. A. Davis
Dr Jones
Dr. B. Wilson
Dr Smith
Dr. C. Taylor
Dr Chen
Dr. Evans
Dr. B. Wilson
Dr Smith
Dr. C. Taylor
Dr Chen
answer = 
  grep("^Dr[. ]", expenses$employee, value=TRUE)
cat(answer, sep="\n")
Dr. A. Davis
Dr Jones
Dr. B. Wilson
Dr Smith
Dr. C. Taylor
Dr Chen
Dr. Evans
Dr. B. Wilson
Dr Smith
Dr. C. Taylor
Dr Chen

Question 3

Write R code using regex to identify all names that are completely lowercase (no capitalization).

Given the data above, your output should show the following:

schwartz, joe
mike david harris
robert james lee
mike green jr.
peter michael wu
jennifer ann martinez
robert james lee
mike green jr.
peter michael wu
answer = 
 grep("^[a-z][a-z. ,]*$", trimws(expenses$employee), value=TRUE)
cat(answer, sep="\n")
schwartz, joe
mike david harris
robert james lee
mike green jr.
peter michael wu
jennifer ann martinez
robert james lee
mike green jr.
peter michael wu

Question 4

Write R code using regex to identify all names that include a middle name or middle initial.

Given the data above, your output should show the following:

mike david harris
Dr. A. Davis
S. Jones Jr
Conway, Ella Sr.
robert james lee
mike green jr.
Dr. B. Wilson
R. Brown Jr
Martinez, Ana Sr.
peter michael wu
Dr. C. Taylor
T. Wilson Jr
Rodriguez, Eva Sr.
jennifer ann martinez
Ms. B. Kim
R. Patel Ph.D
Garcia, Maria Jr.
robert james lee
mike green jr.
Dr. B. Wilson
R. Brown Jr
Martinez, Ana Sr.
peter michael wu
Dr. C. Taylor
T. Wilson Jr
Rodriguez, Eva Sr.
answer = 
 grep("( [A-Za-z]\\. |([^ ]+ ){2,}[^ ]+)", expenses$employee, value=TRUE)
cat(answer, sep="\n")
mike david harris
Dr. A. Davis
S. Jones Jr
Conway, Ella Sr.
robert james lee
mike green jr.
Dr. B. Wilson
R. Brown Jr
Martinez, Ana Sr.
peter michael wu
Dr. C. Taylor
T. Wilson Jr
Rodriguez, Eva Sr.
jennifer ann martinez
Ms. B. Kim
R. Patel Ph.D
Garcia, Maria Jr.
robert james lee
mike green jr.
Dr. B. Wilson
R. Brown Jr
Martinez, Ana Sr.
peter michael wu
Dr. C. Taylor
T. Wilson Jr
Rodriguez, Eva Sr.

Question 5

Write R code using regex to identify all names that have a suffix (Sr. or Jr., with or without the period).

Given the data above, your output should show the following:

S. Jones Jr
Conway, Ella Sr.
R. Brown Jr
Martinez, Ana Sr.
T. Wilson Jr
Rodriguez, Eva Sr.
Garcia, Maria Jr.
R. Brown Jr
Martinez, Ana Sr.
T. Wilson Jr
Rodriguez, Eva Sr.
answer = 
 grep("(Jr\\.?|Sr\\.?) *$", trimws(expenses$employee), value=TRUE)
cat(answer, sep="\n")
S. Jones Jr
Conway, Ella Sr.
R. Brown Jr
Martinez, Ana Sr.
T. Wilson Jr
Rodriguez, Eva Sr.
Garcia, Maria Jr.
R. Brown Jr
Martinez, Ana Sr.
T. Wilson Jr
Rodriguez, Eva Sr.

24.2.2 Date Format Questions

Question 6

Write R code using regex to identify dates in the MM/DD/YYYY format (with or without leading zeros), ensuring valid months (1-12) and days (1-31).

Given the data above, your output should show the following:

1/19/2023
01/19/2023
1/19/23
2/15/2023
02/15/2023
2/15/23
3/22/2023
03/22/2023
2/3/2023
3/14/24
3/2/23
2/20/2023
2/21/23
3/01/2023
3/05/2023
2/15/2023
02/15/2023
2/15/23
3/22/2023
03/22/2023
2/3/2023
3/14/24
3/2/23
answer = 
 grep("^(0?[1-9]|1[0-2])/(0?[1-9]|[12][0-9]|3[01])/([0-9]{4}|[0-9]{2})$", expenses$date, value=TRUE)
cat(answer, sep="\n")
1/19/2023
01/19/2023
1/19/23
2/15/2023
02/15/2023
2/15/23
3/22/2023
03/22/2023
2/3/2023
3/14/24
3/2/23
2/20/2023
2/21/23
3/01/2023
3/05/2023
2/15/2023
02/15/2023
2/15/23
3/22/2023
03/22/2023
2/3/2023
3/14/24
3/2/23

Question 7

Write R code using regex to identify dates in the European format (DD/MM/YYYY or DD/MM/YY), ensuring valid days and months.

Given the data above, your output should show the following:

19/1/2023
19/01/23
15/2/2023
2/3/2023
3/2/23
20/02/2023
21/2/2023
21/2/23
3/01/2023
3/05/2023
15/2/2023
2/3/2023
3/2/23
answer = 
 grep("^(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[0-2])/([0-9]{4}|[0-9]{2})$", expenses$date, value=TRUE)
cat(answer, sep="\n")
19/1/2023
19/01/23
15/2/2023
2/3/2023
3/2/23
20/02/2023
21/2/2023
21/2/23
3/01/2023
3/05/2023
15/2/2023
2/3/2023
3/2/23

Question 8

Write R code using regex to identify dates in ISO format (YYYY-MM-DD).

Given the data above, your output should show the following:

2023-01-19
2023-03-22
2023-02-21
2023-03-22
answer = 
 grep("^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$", expenses$date, value=TRUE)
cat(answer, sep="\n")
2023-01-19
2023-03-22
2023-02-21
2023-03-22

Question 9

Write R code using regex to identify dates that use written month names (e.g., “Jan 19, 2023”, “Feb 15, 2023”).

Given the data above, your output should show the following:

Jan 19, 2023
March 2, 2023
Feb 15, 2023
Jun 27 2024
Mar 22, 2023
Feb 22, 2023
March 2, 2023
Feb 15, 2023
Jun 27 2024
Mar 22, 2023
answer = 
 grep("(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* [0-9]{1,2},? [0-9]{4}", expenses$date, value=TRUE)
cat(answer, sep="\n")
Jan 19, 2023
March 2, 2023
Feb 15, 2023
Jun 27 2024
Mar 22, 2023
Feb 22, 2023
March 2, 2023
Feb 15, 2023
Jun 27 2024
Mar 22, 2023

24.2.3 Amount Format Questions

Question 10

Write R code using regex to identify amounts that include a dollar sign (e.g., “$27.00”, “$156.00”).

Given the data above, your output should show the following:

$27.00
$35.23
$89.99
$48.75
$156.00
$92.45
$89.50
$120.00
$89.99
$48.75
$156.00
$92.45
answer = 
  grep("^\\$", expenses$amount, value=TRUE)
cat(answer, sep="\n")
$27.00
$35.23
$89.99
$48.75
$156.00
$92.45
$89.50
$120.00
$89.99
$48.75
$156.00
$92.45

Question 11

Write R code using regex to identify amounts that are written out in words (e.g., “five hundred”, “one thousand”).

Given the data above, your output should show the following:

five hundred
two hundred
one thousand
two hundred
two hundred
one thousand
answer = 
  grep("^[a-z ]+$", expenses$amount, value=TRUE)
cat(answer, sep="\n")
five hundred
two hundred
one thousand
two hundred
two hundred
one thousand

Question 12

Write R code using regex to identify amounts that include thousand separators (commas) (e.g., “1,234.56”, “2,345.67”).

Given the data above, your output should show the following:

1,234.56
2,345.67
3,456.78
2,500.00
2,345.67
3,456.78
answer = 
  grep(",[0-9]{3}", expenses$amount, value=TRUE)
cat(answer, sep="\n")
1,234.56
2,345.67
3,456.78
2,500.00
2,345.67
3,456.78

Question 13

Write R code using regex to identify amounts that are whole numbers without decimal places (e.g., “25”, “1000”).

Given the data above, your output should show the following:

25
1000
75
500
750
50
500
500
150
42
75
200
75
500
750
50
500
500
answer = 
  grep("^[0-9]+$", expenses$amount, value=TRUE)
cat(answer, sep="\n")
25
1000
75
500
750
50
500
500
150
42
75
200
75
500
750
50
500
500

24.2.4 Category Format Questions

Question 14

Write R code using regex to identify categories that are entirely uppercase (e.g., “FOOD”, “LUNCH”).

Given the data above, your output should show the following:

FOOD
LUNCH
OFFICE SUPPLIES
FOOD
LUNCH
OFFICE STUFF
FOOD
LUNCH
FOOD
TRAVEL
OFFICE SUPPLIES
FOOD
LUNCH
FOOD
LUNCH
answer = 
  grep("^[A-Z ]+$", expenses$category, value=TRUE)
cat(answer, sep="\n")
FOOD
LUNCH
OFFICE SUPPLIES
FOOD
LUNCH
OFFICE STUFF
FOOD
LUNCH
FOOD
TRAVEL
OFFICE SUPPLIES
FOOD
LUNCH
FOOD
LUNCH

Question 15

Write R code using regex to identify all variations of “office supplies” (including abbreviated forms and different capitalizations).

Given the data above, your output should show the following:

office supplies
Office Stuff
office suppl.
Office supplies
office supplies
Office Stuff
OFFICE SUPPLIES
office suppl.
Office supplies
office supplies
OFFICE STUFF
Office supplies
office suppl.
Office supplies
office supplies
office suppl.
Office Stuff
office supplies
Office Stuff
OFFICE SUPPLIES
office suppl.
Office supplies
office supplies
Office Stuff
Office supplies
office suppl.
Office supplies
answer = 
  grep("office (suppl|stuff|supplies)", expenses$category, ignore.case=TRUE, value=TRUE)
cat(answer, sep="\n")
office supplies
Office Stuff
office suppl.
Office supplies
office supplies
Office Stuff
OFFICE SUPPLIES
office suppl.
Office supplies
office supplies
OFFICE STUFF
Office supplies
office suppl.
Office supplies
office supplies
office suppl.
Office Stuff
office supplies
Office Stuff
OFFICE SUPPLIES
office suppl.
Office supplies
office supplies
Office Stuff
Office supplies
office suppl.
Office supplies

24.2.5 Comment Format Questions

Question 16

Write R code using regex to identify comments that are either NA, “NA”, or consist entirely of dashes (e.g “—” and “—–”).

Given the data above, your output should show the following:

NA
NA
---
NA
NA
NA
---
NA
NA
---
NA
NA
NA
NA
---
NA
NA
---
#cmt = as.character(expenses$comments)
#idx = is.na(expenses$comments) | grepl("^NA$", cmt) | grepl("^[-]+$", cmt)
tf = is.na(expenses$comments) | 
     grepl("^NA$", expenses$comments) | 
     grepl("^[-]+$", expenses$comments)
answer = expenses$comments[tf]
cat(answer, sep="\n")
NA
NA
---
NA
NA
NA
---
NA
NA
---
NA
NA
NA
NA
---
NA
NA
---

Question 17

Write R code using regex to identify comments that describe meetings (containing words like “lunch”, “dinner”, “client”, “team”).

Given the data above, your output should show the following:

lunch with client
dinner meeting
team lunch
team dinner
dinner meeting
team lunch
answer = 
  grep("(lunch|dinner|client|team)", expenses$comments, ignore.case=TRUE, value=TRUE)
cat(answer, sep="\n")
lunch with client
dinner meeting
team lunch
team dinner
dinner meeting
team lunch

24.2.6 Challenge Questions

Question 18

Write R code using regex to extract just the last name from any name format in the employee column. Your solution should work for all name formats in the dataset.

Given the data above, your output should show the following:

Smith
schwartz
harris
Davis
Jones
Jones
Conway
Brown
Williams
lee
green
Wilson
Smith
Brown
Martinez
Johnson
Cooper
wu
Taylor
Wilson
Chen
Wilson
Rodriguez
Williams
Chen
martinez
Kim
Thompson
Patel
Garcia
Evans
Thompson
Brown
Williams
lee
green
Wilson
Smith
Brown
Martinez
Johnson
Cooper
wu
Taylor
Wilson
Chen
Wilson
Rodriguez
name = trimws(expenses$employee)
# Strip trailing suffixes (Jr, Sr, Ph.D, III, II, IV) so "last word" is the surname (case-insensitive)
suffix_pat = "\\s*(Jr\\.?|Sr\\.?|Ph\\.?D\\.?|III|IV|II)\\s*$"
name_clean = trimws(sub(suffix_pat, "", name, ignore.case = TRUE))
last_name = ifelse(grepl("^[^,]+,", name),
  trimws(sub("^([^,]+),.*", "\\1", name)),
  sub(".*\\s+([^ ]+)$", "\\1", name_clean))
# Remove any suffix from last_name (e.g. "Smith Jr" in comma format)
answer = trimws(sub(suffix_pat, "", last_name, ignore.case = TRUE))
cat(answer, sep="\n")
Smith
schwartz
harris
Davis
Jones
Jones
Conway
Brown
Williams
lee
green
Wilson
Smith
Brown
Martinez
Johnson
Cooper
wu
Taylor
Wilson
Chen
Wilson
Rodriguez
Williams
Chen
martinez
Kim
Thompson
Patel
Garcia
Evans
Thompson
Brown
Williams
lee
green
Wilson
Smith
Brown
Martinez
Johnson
Cooper
wu
Taylor
Wilson
Chen
Wilson
Rodriguez

Question 19

Write R code using regex to convert all dates to YYYY-MM-DD format, regardless of their original format. Your solution should handle all date formats in the dataset.

Given the data above, your output should show the following:

2023-01-19
2023-01-19
NA
NA
23-01-19
NA
NA
2023-02-15
2023-02-15
NA
NA
NA
23-02-15
NA
NA
2023-03-22
2023-03-22
NA
2023-02-03
24-03-14
23-03-02
NA
NA
2023-02-20
NA
NA
NA
23-02-21
NA
NA
2023-03-01
2023-03-05
2023-02-15
2023-02-15
NA
NA
NA
23-02-15
NA
NA
2023-03-22
2023-03-22
NA
2023-02-03
24-03-14
23-03-02
NA
NA
answer = format(
  as.Date(expenses$date,
    tryFormats = c("%Y-%m-%d", "%m/%d/%Y", "%m/%d/%y", "%d/%m/%Y", "%d/%m/%y",
      "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y", "%Y/%m/%d")),
      "%Y-%m-%d")
cat(answer, sep="\n")
2023-01-19
2023-01-19
NA
NA
23-01-19
NA
NA
2023-02-15
2023-02-15
NA
NA
NA
23-02-15
NA
NA
2023-03-22
2023-03-22
NA
2023-02-03
24-03-14
23-03-02
NA
NA
2023-02-20
NA
NA
NA
23-02-21
NA
NA
2023-03-01
2023-03-05
2023-02-15
2023-02-15
NA
NA
NA
23-02-15
NA
NA
2023-03-22
2023-03-22
NA
2023-02-03
24-03-14
23-03-02
NA
NA

Question 20

Write R code using regex to standardize all amounts to a decimal number format (e.g., “1234.56”), removing dollar signs, commas, and converting written amounts to numbers.

Given the data above, your output should show the following:

  59.99
  27.00
  25.00
 500.00
1234.56
1000.00
  35.23
  42.50
  89.99
  75.00
 500.00
 200.00
2345.67
 750.00
  48.75
  63.45
 156.00
  50.00
1000.00
 500.00
3456.78
 500.00
  92.45
 150.00
  89.50
  42.00
 200.00
2500.00
  75.00
 120.00
  95.99
 200.00
  42.50
  89.99
  75.00
 500.00
 200.00
2345.67
 750.00
  48.75
  63.45
 156.00
  50.00
1000.00
 500.00
3456.78
 500.00
  92.45
amt = gsub("[$,]", "", expenses$amount)
amt = gsub("one thousand", "1000", amt, ignore.case=TRUE)
amt = gsub("five hundred", "500", amt, ignore.case=TRUE)
amt = gsub("two hundred", "200", amt, ignore.case=TRUE)
answer = format(round(as.numeric(amt), 2), nsmall=2)
cat(answer, sep="\n")
  59.99
  27.00
  25.00
 500.00
1234.56
1000.00
  35.23
  42.50
  89.99
  75.00
 500.00
 200.00
2345.67
 750.00
  48.75
  63.45
 156.00
  50.00
1000.00
 500.00
3456.78
 500.00
  92.45
 150.00
  89.50
  42.00
 200.00
2500.00
  75.00
 120.00
  95.99
 200.00
  42.50
  89.99
  75.00
 500.00
 200.00
2345.67
 750.00
  48.75
  63.45
 156.00
  50.00
1000.00
 500.00
3456.78
 500.00
  92.45

24.3 Questions - PART II (project)

Write a function named cleanup that takes the dataframe as an argument and returns a cleaned dataframe. Each column should have a consistent format for all columns. Do not lose any data in the process.

Split the employee column into the following fields (and remove the original single employee column):

  • employeeFullName — a paste of all the name components below (title, first, middle, last, suffix)
  • employeeTitle — e.g. Dr, Ms (no period)
  • employeeFirst — first name or initial
  • employeeMiddle — middle name or initial (NA if none)
  • employeeLast — last name or initial
  • employeeSuffix — e.g. Sr, Jr (no period; NA if none)

Leave out periods from all name parts (e.g. J. becomes J, Dr. becomes Dr).
All other columns (date, amount, category, comments) should be cleaned to a consistent structure as before.

Given the data above, your output should show the following:

cleanup(expenses)
        employeeFullName employeeTitle employeeFirst employeeMiddle employeeLast employeeSuffix       date  amount        category          comments
1              Sue Smith          <NA>           Sue           <NA>        Smith           <NA> 2023-01-19   59.99            food lunch with client
2           Joe Schwartz          <NA>           Joe           <NA>     Schwartz           <NA> 2023-01-19   27.00 office supplies paper for printer
3      Mike David Harris          <NA>          Mike          David       Harris           <NA> 2023-01-19   25.00 office supplies              <NA>
4             Dr A Davis            Dr             A           <NA>        Davis           <NA> 2023-01-19  500.00            food              <NA>
5               Dr Jones            Dr          <NA>           <NA>        Jones           <NA> 2023-01-19 1234.56 office supplies            chairs
6             S Jones Jr          <NA>             S           <NA>        Jones             Jr 2023-01-19 1000.00 office supplies              desk
7         Ella Conway Sr          <NA>          Ella           <NA>       Conway             Sr 2023-01-19   35.23            food              <NA>
8              Tom Brown          <NA>           Tom           <NA>        Brown           <NA> 2023-02-15   42.50            food    dinner meeting
9          Kate Williams          <NA>          Kate           <NA>     Williams           <NA> 2023-02-15   89.99 office supplies   toner cartridge
10      Robert James Lee          <NA>        Robert          James          Lee           <NA> 2023-02-15   75.00 office supplies              <NA>
11         Mike Green Jr          <NA>          Mike           <NA>        Green             Jr 2023-02-15  500.00 office supplies              <NA>
12           Dr B Wilson            Dr             B           <NA>       Wilson           <NA> 2023-02-15  200.00            food              <NA>
13              Dr Smith            Dr          <NA>           <NA>        Smith           <NA> 2023-02-15 2345.67 office supplies           monitor
14            R Brown Jr          <NA>             R           <NA>        Brown             Jr 2023-03-02  750.00 office supplies    filing cabinet
15       Ana Martinez Sr          <NA>           Ana           <NA>     Martinez             Sr 2023-02-15   48.75            food              <NA>
16          Mary Johnson          <NA>          Mary           <NA>      Johnson           <NA> 2023-03-22   63.45            food        team lunch
17            Sam Cooper          <NA>           Sam           <NA>       Cooper           <NA> 2023-03-22  156.00 office supplies office decoration
18      Peter Michael Wu          <NA>         Peter        Michael           Wu           <NA> 2023-03-22   50.00 office supplies              <NA>
19           Dr C Taylor            Dr             C           <NA>       Taylor           <NA> 2023-02-03 1000.00            food              <NA>
20          Drake Wilson          <NA>         Drake           <NA>       Wilson           <NA> 2024-03-14  500.00 office supplies           printer
21               Dr Chen            Dr          <NA>           <NA>         Chen           <NA> 2023-03-02 3456.78 office supplies            laptop
22           T Wilson Jr          <NA>             T           <NA>       Wilson             Jr       <NA>  500.00 office supplies           printer
23      Eva Rodriguez Sr          <NA>           Eva           <NA>    Rodriguez             Sr 2023-03-22   92.45            food              <NA>
24          Pat Williams          <NA>           Pat           <NA>     Williams           <NA> 2023-02-20  150.00          travel      train ticket
25              Wei Chen          <NA>           Wei           <NA>         Chen           <NA> 2023-02-20   89.50          travel             hotel
26 Jennifer Ann Martinez          <NA>      Jennifer            Ann     Martinez           <NA> 2023-02-21   42.00            food              <NA>
27              Ms B Kim            Ms             B           <NA>          Kim           <NA> 2023-02-21  200.00 office supplies           stapler
28            J Thompson          <NA>             J           <NA>     Thompson           <NA> 2023-02-21 2500.00          travel            flight
29               R Patel          <NA>             R           <NA>        Patel           <NA> 2023-02-21   75.00 office supplies         notebooks
30       Maria Garcia Jr          <NA>         Maria           <NA>       Garcia             Jr 2023-02-22  120.00            food       team dinner
31              Dr Evans            Dr          <NA>           <NA>        Evans           <NA> 2023-03-01   95.99 office supplies          supplies
32          Thompson III          <NA>          <NA>           <NA>     Thompson            III 2023-03-05  200.00          travel        conference
33             Tom Brown          <NA>           Tom           <NA>        Brown           <NA> 2023-02-15   42.50            food    dinner meeting
34         Kate Williams          <NA>          Kate           <NA>     Williams           <NA> 2023-02-15   89.99 office supplies   toner cartridge
35      Robert James Lee          <NA>        Robert          James          Lee           <NA> 2023-02-15   75.00 office supplies              <NA>
36         Mike Green Jr          <NA>          Mike           <NA>        Green             Jr 2023-02-15  500.00 office supplies              <NA>
37           Dr B Wilson            Dr             B           <NA>       Wilson           <NA> 2023-02-15  200.00            food              <NA>
38              Dr Smith            Dr          <NA>           <NA>        Smith           <NA> 2023-02-15 2345.67 office supplies           monitor
39            R Brown Jr          <NA>             R           <NA>        Brown             Jr 2023-03-02  750.00 office supplies    filing cabinet
40       Ana Martinez Sr          <NA>           Ana           <NA>     Martinez             Sr 2023-02-15   48.75            food              <NA>
41          Mary Johnson          <NA>          Mary           <NA>      Johnson           <NA> 2023-03-22   63.45            food        team lunch
42            Sam Cooper          <NA>           Sam           <NA>       Cooper           <NA> 2023-03-22  156.00 office supplies office decoration
43      Peter Michael Wu          <NA>         Peter        Michael           Wu           <NA> 2023-03-22   50.00 office supplies              <NA>
44           Dr C Taylor            Dr             C           <NA>       Taylor           <NA> 2023-02-03 1000.00            food              <NA>
45          Drake Wilson          <NA>         Drake           <NA>       Wilson           <NA> 2024-03-14  500.00 office supplies           printer
46               Dr Chen            Dr          <NA>           <NA>         Chen           <NA> 2023-03-02 3456.78 office supplies            laptop
47           T Wilson Jr          <NA>             T           <NA>       Wilson             Jr       <NA>  500.00 office supplies           printer
48      Eva Rodriguez Sr          <NA>           Eva           <NA>    Rodriguez             Sr 2023-03-22   92.45            food              <NA>
# =============================================================================
# STRING HELPERS
# =============================================================================

# strip_periods(text)
#   Arguments:  text  = character vector (e.g. "Dr.", "J. Smith")
#   Returns:    character vector of same length with periods removed and
#               leading/trailing spaces trimmed.
#   Example:    strip_periods("  J. ")  ->  "J"
strip_periods <- function(text) {
  gsub("\\.", "", trimws(text))
}

# to_sentence_case(text)
#   Arguments:  text = single string (e.g. "mike david", "O'BRIEN")
#   Returns:    string with first letter of each word uppercase, rest lowercase (Sentence case).
#               Returns NA unchanged if text is NA; empty string returned as-is.
#   Example:    to_sentence_case("mike david harris")  ->  "Mike David Harris"
to_sentence_case <- function(text) {
  if (is.na(text) || nchar(trimws(text)) == 0) return(text)
  words <- strsplit(trimws(text), "\\s+")[[1]]
  capped <- paste0(toupper(substring(words, 1, 1)), tolower(substring(words, 2)))
  paste(capped, collapse = " ")
}

# =============================================================================
# NAME PARSING — split into small functions by format
# =============================================================================

# has_suffix(namePart)
#   Arguments:  namePart = single string, no leading/trailing title (e.g. "Jones Jr")
#   Returns:    TRUE if name ends with a suffix like Jr, Sr, III, II, IV.
#   Example:    has_suffix("Jones Jr")  ->  TRUE
# Suffix pattern: one of Jr/Sr/III/II/IV at end. Require at least one space before suffix
# so that "Thompson III" correctly captures "III" (R's regex can otherwise match "II" inside "III").
suffix_pattern <- "(Jr\\.?|Sr\\.?|III|II|IV)"
suffix_at_end_regex <- paste0("\\s+", suffix_pattern, "\\s*$")
has_suffix <- function(namePart) {
  grepl(suffix_at_end_regex, namePart, ignore.case = TRUE)
}

# extract_suffix(namePart)
#   Arguments:  namePart = string that may end with a suffix
#   Returns:    list of two: suffix (string or NA), and nameWithSuffixRemoved (string).
#   Example:    extract_suffix("Conway, Ella Sr.")  ->  list(suffix = "Sr", nameWithSuffixRemoved = "Conway, Ella")
#   Uses two groups (name)(suffix) so the suffix is captured correctly; space before suffix required.
extract_suffix <- function(namePart) {
  if (!has_suffix(namePart)) {
    return(list(suffix = NA, nameWithSuffixRemoved = trimws(namePart)))
  }
  # Two groups: everything before the suffix, and the suffix. Requiring \s+ before suffix fixes III vs II.
  nameWithoutSuffix <- sub(paste0("^(.*)\\s+", suffix_pattern, "\\s*$"), "\\1", namePart, ignore.case = TRUE)
  suffix <- sub(paste0("^.*\\s+", suffix_pattern, "\\s*$"), "\\1", namePart, ignore.case = TRUE)
  list(suffix = strip_periods(suffix), nameWithSuffixRemoved = trimws(nameWithoutSuffix))
}

# normalize_suffix(suffix)
#   Arguments:  suffix = string (e.g. "jr", "SR", "iii") or NA
#   Returns:    correctly cased suffix: "Jr", "Sr", "II", "III", "IV", or NA if not recognized.
#   Example:    normalize_suffix("jr")  ->  "Jr"
normalize_suffix <- function(suffix) {
  if (is.na(suffix) || trimws(suffix) == "") return(NA)
  s <- tolower(trimws(suffix))
  if (s %in% c("jr", "jr.")) return("Jr")
  if (s %in% c("sr", "sr.")) return("Sr")
  if (s == "ii") return("II")
  if (s == "iii") return("III")
  if (s == "iv") return("IV")
  suffix
}

# has_trailing_title(namePart)
#   Arguments:  namePart = string that might end with "Ph.D" or "PhD"
#   Returns:    TRUE if it ends with that.
trailing_phd_regex <- paste0(
  "\\s+",           # space before Ph.D
  "Ph\\.?D\\.?",    # Ph.D or PhD (periods optional)
  "\\s*$"           # optional spaces at end
)
has_trailing_title <- function(namePart) {
  grepl(trailing_phd_regex, namePart, ignore.case = TRUE)
}

# extract_trailing_phd(namePart)
#   Returns:  list(titlePart = "PhD" or NA, nameWithTitleRemoved = string).
extract_trailing_phd <- function(namePart) {
  if (!has_trailing_title(namePart)) {
    return(list(titlePart = NA, nameWithTitleRemoved = trimws(namePart)))
  }
  tit <- sub(paste0(".*", trailing_phd_regex), "\\1", namePart, ignore.case = TRUE)
  nameWithout <- sub(trailing_phd_regex, "", namePart, ignore.case = TRUE)
  list(titlePart = strip_periods(tit), nameWithTitleRemoved = trimws(nameWithout))
}

# has_leading_title(namePart)
#   TRUE if name starts with Dr, Mr, Ms, Mrs (with optional period).
leading_title_regex <- paste0(
  "^(Dr\\.?|Mr\\.?|Ms\\.?|Mrs\\.?)",  # title at start
  "\\s+"                               # at least one space after
)
has_leading_title <- function(namePart) {
  grepl(leading_title_regex, namePart, ignore.case = TRUE)
}

# extract_leading_title(namePart)
#   Returns:  list(titlePart = "Dr" or NA, nameWithTitleRemoved = string).
extract_leading_title <- function(namePart) {
  if (!has_leading_title(namePart)) {
    return(list(titlePart = NA, nameWithTitleRemoved = trimws(namePart)))
  }
  tit <- sub(paste0(leading_title_regex, ".*"), "\\1", namePart, ignore.case = TRUE)
  nameWithout <- sub(leading_title_regex, "", namePart, ignore.case = TRUE)
  list(titlePart = strip_periods(tit), nameWithTitleRemoved = trimws(nameWithout))
}

# parse_name_last_comma_first(namePart)
#   For names like "Last, First" or "Last, First Middle".
#   Arguments:  namePart = string containing exactly one comma
#   Returns:    list(first =, middle =, last =). Periods stripped from each.
parse_name_last_comma_first <- function(namePart) {
  parts <- strsplit(namePart, ",\\s*")[[1]]
  last <- strip_periods(trimws(parts[1]))
  firstAndMiddle <- strsplit(trimws(parts[2]), "\\s+")[[1]]
  first <- strip_periods(firstAndMiddle[1])
  middle <- if (length(firstAndMiddle) >= 2) strip_periods(firstAndMiddle[2]) else NA
  list(first = first, middle = middle, last = last)
}

# parse_name_first_last(namePart)
#   For names like "First Last", "First Middle Last", or just "Last".
#   Arguments:  namePart = string with no comma
#   Returns:    list(first =, middle =, last =).
parse_name_first_last <- function(namePart) {
  words <- strsplit(trimws(namePart), "\\s+")[[1]]
  if (length(words) == 1) {
    list(first = NA, middle = NA, last = strip_periods(words[1]))
  } else if (length(words) == 2) {
    list(first = strip_periods(words[1]), middle = NA, last = strip_periods(words[2]))
  } else {
    first <- strip_periods(words[1])
    middle <- strip_periods(words[2])
    last <- paste(strip_periods(words[3:length(words)]), collapse = " ")
    list(first = first, middle = middle, last = last)
  }
}

# parse_one_employee(employeeName)
#   Arguments:  employeeName = one raw employee string (e.g. "Dr. A. Davis", "Schwartz, Joe")
#   Returns:    list with employeeTitle, employeeFirst, employeeMiddle, employeeLast,
#               employeeSuffix, employeeFullName. All name parts without periods.
#   Example:    parse_one_employee("Conway, Ella Sr.")  ->  first=Ella, last=Conway, suffix=Sr, title=NA
parse_one_employee <- function(employeeName) {
  raw <- trimws(employeeName)
  outSuffix <- extract_suffix(raw)
  raw <- outSuffix$nameWithSuffixRemoved
  outPhd <- extract_trailing_phd(raw)
  title <- outPhd$titlePart
  raw <- outPhd$nameWithTitleRemoved
  outLead <- extract_leading_title(raw)
  if (!is.na(outLead$titlePart)) {
    title <- if (is.na(title) || title == "") outLead$titlePart else paste0(outLead$titlePart, " ", title)
  }
  raw <- outLead$nameWithTitleRemoved
  if (is.na(title) || title == "") title <- NA

  if (length(strsplit(raw, ",\\s*")[[1]]) >= 2) {
    parsed <- parse_name_last_comma_first(raw)
  } else {
    parsed <- parse_name_first_last(raw)
  }
  first <- to_sentence_case(parsed$first)
  middle <- to_sentence_case(parsed$middle)
  last <- to_sentence_case(parsed$last)
  suffix <- normalize_suffix(outSuffix$suffix)

  fullParts <- c(title, first, middle, last, suffix)
  fullParts <- fullParts[!is.na(fullParts) & fullParts != ""]
  employeeFullName <- paste(fullParts, collapse = " ")

  list(employeeTitle = title, employeeFirst = first, employeeMiddle = middle,
       employeeLast = last, employeeSuffix = suffix, employeeFullName = employeeFullName)
}

# parse_employee_column(employeeNames)
#   Arguments:  employeeNames = character vector of raw employee strings
#   Returns:    data frame with columns employeeFullName, employeeTitle, employeeFirst,
#               employeeMiddle, employeeLast, employeeSuffix.
parse_employee_column <- function(employeeNames) {
  rows <- lapply(employeeNames, parse_one_employee)
  out <- as.data.frame(do.call(rbind, lapply(rows, unlist)), stringsAsFactors = FALSE)
  out <- out[c("employeeFullName", "employeeTitle", "employeeFirst", "employeeMiddle", "employeeLast", "employeeSuffix")]
  out
}

# =============================================================================
# DATE PARSING
# =============================================================================

# is_iso_date(dateStr)
#   Returns TRUE if string is already yyyy-mm-dd (e.g. "2023-01-19").
iso_date_regex <- paste0(
  "^[0-9]{4}",      # 4 digits at start (year)
  "-",
  "[0-9]{1,2}",     # 1 or 2 digits (month)
  "-",
  "[0-9]{1,2}$"     # 1 or 2 digits at end (day)
)
is_iso_date <- function(dateStr) {
  grepl(iso_date_regex, trimws(dateStr))
}

# is_month_name_date(dateStr)
#   Returns TRUE for strings like "Jan 19, 2023" or "January 19, 2023".
month_name_date_regex <- paste0(
  "^[A-Za-z]+",     # month name (letters)
  "\\s+",
  "[0-9]{1,2}",     # day
  ",\\s*",
  "[0-9]{2,4}$"     # 2- or 4-digit year
)
is_month_name_date <- function(dateStr) {
  grepl(month_name_date_regex, trimws(dateStr))
}

# parse_one_date(dateStr)
#   Arguments:  dateStr = one date string in any of the supported formats
#   Returns:    string "yyyy-mm-dd" or NA if unparseable.
#   Example:    parse_one_date("1/19/2023")  ->  "2023-01-19"
parse_one_date <- function(dateStr) {
  s <- trimws(dateStr)
  if (is_iso_date(s)) return(s)
  if (is_month_name_date(s)) {
    d <- as.Date(s, format = "%b %d, %Y")
    if (is.na(d)) d <- as.Date(s, format = "%B %d, %Y")
    return(format(d, "%Y-%m-%d"))
  }
  date_parts <- strsplit(s, "[/-]")[[1]]
  if (length(date_parts) != 3) return(NA)
  n1 <- as.integer(date_parts[1])
  n2 <- as.integer(date_parts[2])
  n3 <- as.integer(date_parts[3])
  len1 <- nchar(date_parts[1])
  len3 <- nchar(date_parts[3])
  if (len1 == 4) {
    y <- n1; m <- n2; d <- n3
  } else if (len3 == 4) {
    y <- n3
    if (n1 > 12) { d <- n1; m <- n2 } else if (n2 > 12) { m <- n1; d <- n2 } else { m <- n1; d <- n2 }
  } else {
    y <- 2000 + (n3 - 2000) %% 100
    if (n1 > 12) { d <- n1; m <- n2 } else if (n2 > 12) { m <- n1; d <- n2 } else { m <- n1; d <- n2 }
  }
  sprintf("%04d-%02d-%02d", y, m, d)
}

# parse_dates(dateStrings)
#   Arguments:  dateStrings = character vector of date strings
#   Returns:    character vector of same length, each element "yyyy-mm-dd" or NA.
parse_dates <- function(dateStrings) {
  sapply(trimws(dateStrings), parse_one_date)
}

# =============================================================================
# AMOUNT, CATEGORY, COMMENTS
# =============================================================================

# word_to_number(word)
#   Arguments:  word = string with a number in words (e.g. "five hundred", "twenty-one",
#               "one thousand two hundred thirty-four"). Hyphens and "and" are allowed.
#   Returns:    numeric value, or NA if the string is not a recognized number phrase.
#   Examples:   word_to_number("five hundred") -> 500; word_to_number("twenty-one") -> 21;
#               word_to_number("one thousand two hundred thirty four") -> 1234.
#   Supports:   zero–nine, ten–nineteen, twenty–ninety, hundred, thousand, million.
word_to_number <- function(word) {
  w <- tolower(trimws(word))
  w <- gsub("-", " ", w)
  w <- gsub("\\band\\b", " ", w)
  words <- strsplit(w, "\\s+")[[1]]
  words <- words[words != ""]
  if (length(words) == 0) return(NA)

  ones <- c(zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, six = 6, seven = 7, eight = 8, nine = 9)
  teens <- c(ten = 10, eleven = 11, twelve = 12, thirteen = 13, fourteen = 14, fifteen = 15,
             sixteen = 16, seventeen = 17, eighteen = 18, nineteen = 19)
  tens <- c(twenty = 20, thirty = 30, forty = 40, fifty = 50, sixty = 60, seventy = 70, eighty = 80, ninety = 90)

  total <- 0
  current <- 0
  i <- 1
  while (i <= length(words)) {
    x <- words[i]
    if (x %in% names(ones)) {
      current <- current + ones[x]
      i <- i + 1
    } else if (x %in% names(teens)) {
      current <- current + teens[x]
      i <- i + 1
    } else if (x %in% names(tens)) {
      current <- current + tens[x]
      i <- i + 1
    } else if (x == "hundred") {
      if (current == 0) current <- 1
      current <- current * 100
      i <- i + 1
    } else if (x == "thousand") {
      if (current == 0) current <- 1
      total <- total + current * 1000
      current <- 0
      i <- i + 1
    } else if (x == "million") {
      if (current == 0) current <- 1
      total <- total + current * 1000000
      current <- 0
      i <- i + 1
    } else {
      return(NA)
    }
  }
  total + current
}

# clean_amounts(amountStrings)
#   Arguments:  amountStrings = character vector (e.g. "$27.00", "1,234.56", "five hundred")
#   Returns:    numeric vector. Removes $ and commas; converts word amounts when possible.
clean_amounts <- function(amountStrings) {
  x <- trimws(amountStrings)
  result <- numeric(length(x))
  for (i in seq_along(x)) {
    s <- gsub("[$,]", "", x[i])
    s <- trimws(s)
    if (grepl("^[0-9.]+$", s)) {
      result[i] <- as.numeric(s)
    } else {
      result[i] <- word_to_number(x[i])
    }
  }
  result
}

# clean_categories(categoryStrings)
#   Arguments:  categoryStrings = character vector (e.g. "FOOD", "office suppl.")
#   Returns:    character vector, lowercase, with abbreviations normalized
#               (e.g. "office suppl." and "Office Stuff" -> "office supplies", "LUNCH" -> "food").
office_suppl_regex <- paste0(
  "\\boffice\\s+suppl\\.?\\s*$"   # "office suppl" or "office suppl." at end of string
)
office_stuff_regex <- "\\boffice\\s+stuff\\s*$"
lunch_regex <- "\\blunch\\s*$"
clean_categories <- function(categoryStrings) {
  x <- tolower(trimws(categoryStrings))
  x <- sub(office_suppl_regex, "office supplies", x)
  x <- sub(office_stuff_regex, "office supplies", x)
  x <- sub(lunch_regex, "food", x)
  x
}

# clean_comments(commentStrings)
#   Arguments:  commentStrings = character vector; can contain placeholders like "---", "NA"
#   Returns:    character vector; placeholders and empty strings replaced with NA.
placeholder_regex <- "^(---|NA|n/a)$"
clean_comments <- function(commentStrings) {
  x <- trimws(as.character(commentStrings))
  x[grepl(placeholder_regex, x, ignore.case = TRUE)] <- NA
  x[x == ""] <- NA
  x
}

# =============================================================================
# MAIN CLEANUP
# =============================================================================

# cleanUp(expensesDF)
#   Arguments:  expensesDF = data frame with columns employee, date, amount, category, comments
#   Returns:    data frame with employee replaced by employeeFullName, employeeTitle,
#               employeeFirst, employeeMiddle, employeeLast, employeeSuffix; and with
#               date, amount, category, comments cleaned to consistent formats.
#   Example:    cleanUp(expenses)  ->  wide table with cleaned columns
cleanup <- function(expensesDF) {
  df <- as.data.frame(expensesDF, stringsAsFactors = FALSE)
  emp <- parse_employee_column(df$employee)
  df$employee <- NULL
  df <- cbind(emp, df)
  df$date     <- parse_dates(df$date)
  df$amount   <- clean_amounts(df$amount)
  df$category <- clean_categories(df$category)
  df$comments <- clean_comments(df$comments)
  df
}

cleanup(expenses)
        employeeFullName employeeTitle employeeFirst employeeMiddle employeeLast employeeSuffix       date  amount        category          comments
1              Sue Smith          <NA>           Sue           <NA>        Smith           <NA> 2023-01-19   59.99            food lunch with client
2           Joe Schwartz          <NA>           Joe           <NA>     Schwartz           <NA> 2023-01-19   27.00 office supplies paper for printer
3      Mike David Harris          <NA>          Mike          David       Harris           <NA> 2023-01-19   25.00 office supplies              <NA>
4             Dr A Davis            Dr             A           <NA>        Davis           <NA> 2023-01-19  500.00            food              <NA>
5               Dr Jones            Dr          <NA>           <NA>        Jones           <NA> 2023-01-19 1234.56 office supplies            chairs
6             S Jones Jr          <NA>             S           <NA>        Jones             Jr 2023-01-19 1000.00 office supplies              desk
7         Ella Conway Sr          <NA>          Ella           <NA>       Conway             Sr 2023-01-19   35.23            food              <NA>
8              Tom Brown          <NA>           Tom           <NA>        Brown           <NA> 2023-02-15   42.50            food    dinner meeting
9          Kate Williams          <NA>          Kate           <NA>     Williams           <NA> 2023-02-15   89.99 office supplies   toner cartridge
10      Robert James Lee          <NA>        Robert          James          Lee           <NA> 2023-02-15   75.00 office supplies              <NA>
11         Mike Green Jr          <NA>          Mike           <NA>        Green             Jr 2023-02-15  500.00 office supplies              <NA>
12           Dr B Wilson            Dr             B           <NA>       Wilson           <NA> 2023-02-15  200.00            food              <NA>
13              Dr Smith            Dr          <NA>           <NA>        Smith           <NA> 2023-02-15 2345.67 office supplies           monitor
14            R Brown Jr          <NA>             R           <NA>        Brown             Jr 2023-03-02  750.00 office supplies    filing cabinet
15       Ana Martinez Sr          <NA>           Ana           <NA>     Martinez             Sr 2023-02-15   48.75            food              <NA>
16          Mary Johnson          <NA>          Mary           <NA>      Johnson           <NA> 2023-03-22   63.45            food        team lunch
17            Sam Cooper          <NA>           Sam           <NA>       Cooper           <NA> 2023-03-22  156.00 office supplies office decoration
18      Peter Michael Wu          <NA>         Peter        Michael           Wu           <NA> 2023-03-22   50.00 office supplies              <NA>
19           Dr C Taylor            Dr             C           <NA>       Taylor           <NA> 2023-02-03 1000.00            food              <NA>
20          Drake Wilson          <NA>         Drake           <NA>       Wilson           <NA> 2024-03-14  500.00 office supplies           printer
21               Dr Chen            Dr          <NA>           <NA>         Chen           <NA> 2023-03-02 3456.78 office supplies            laptop
22           T Wilson Jr          <NA>             T           <NA>       Wilson             Jr       <NA>  500.00 office supplies           printer
23      Eva Rodriguez Sr          <NA>           Eva           <NA>    Rodriguez             Sr 2023-03-22   92.45            food              <NA>
24          Pat Williams          <NA>           Pat           <NA>     Williams           <NA> 2023-02-20  150.00          travel      train ticket
25              Wei Chen          <NA>           Wei           <NA>         Chen           <NA> 2023-02-20   89.50          travel             hotel
26 Jennifer Ann Martinez          <NA>      Jennifer            Ann     Martinez           <NA> 2023-02-21   42.00            food              <NA>
27              Ms B Kim            Ms             B           <NA>          Kim           <NA> 2023-02-21  200.00 office supplies           stapler
28            J Thompson          <NA>             J           <NA>     Thompson           <NA> 2023-02-21 2500.00          travel            flight
29               R Patel          <NA>             R           <NA>        Patel           <NA> 2023-02-21   75.00 office supplies         notebooks
30       Maria Garcia Jr          <NA>         Maria           <NA>       Garcia             Jr 2023-02-22  120.00            food       team dinner
31              Dr Evans            Dr          <NA>           <NA>        Evans           <NA> 2023-03-01   95.99 office supplies          supplies
32          Thompson III          <NA>          <NA>           <NA>     Thompson            III 2023-03-05  200.00          travel        conference
33             Tom Brown          <NA>           Tom           <NA>        Brown           <NA> 2023-02-15   42.50            food    dinner meeting
34         Kate Williams          <NA>          Kate           <NA>     Williams           <NA> 2023-02-15   89.99 office supplies   toner cartridge
35      Robert James Lee          <NA>        Robert          James          Lee           <NA> 2023-02-15   75.00 office supplies              <NA>
36         Mike Green Jr          <NA>          Mike           <NA>        Green             Jr 2023-02-15  500.00 office supplies              <NA>
37           Dr B Wilson            Dr             B           <NA>       Wilson           <NA> 2023-02-15  200.00            food              <NA>
38              Dr Smith            Dr          <NA>           <NA>        Smith           <NA> 2023-02-15 2345.67 office supplies           monitor
39            R Brown Jr          <NA>             R           <NA>        Brown             Jr 2023-03-02  750.00 office supplies    filing cabinet
40       Ana Martinez Sr          <NA>           Ana           <NA>     Martinez             Sr 2023-02-15   48.75            food              <NA>
41          Mary Johnson          <NA>          Mary           <NA>      Johnson           <NA> 2023-03-22   63.45            food        team lunch
42            Sam Cooper          <NA>           Sam           <NA>       Cooper           <NA> 2023-03-22  156.00 office supplies office decoration
43      Peter Michael Wu          <NA>         Peter        Michael           Wu           <NA> 2023-03-22   50.00 office supplies              <NA>
44           Dr C Taylor            Dr             C           <NA>       Taylor           <NA> 2023-02-03 1000.00            food              <NA>
45          Drake Wilson          <NA>         Drake           <NA>       Wilson           <NA> 2024-03-14  500.00 office supplies           printer
46               Dr Chen            Dr          <NA>           <NA>         Chen           <NA> 2023-03-02 3456.78 office supplies            laptop
47           T Wilson Jr          <NA>             T           <NA>       Wilson             Jr       <NA>  500.00 office supplies           printer
48      Eva Rodriguez Sr          <NA>           Eva           <NA>    Rodriguez             Sr 2023-03-22   92.45            food              <NA>