35 35. dataframes

# Remove all variables
# (This clears every object that ls() reports in the current workspace, so
#  nothing defined earlier in the session carries over into the examples below.
#  Objects whose names begin with a "." are not listed by ls() and are
#  therefore NOT removed. Be careful: this also deletes your own variables.)
rm(list=ls())

########################################################################
# dataframes
#
# The contents of this file assume that you are familiar with the 
# following topics
#
#   - lists
#   - factors
#   - attributes and attr
#
# A dataframe allows you to work with multiple parallel vectors
# that are arranged in a grid.
########################################################################

35.1 Example of a data.frame

# Example of a dataframe
#
# You can create a dataframe with the data.frame function
# (NOTICE the "." in the name data.frame. Don't forget to type it.
#  In R, a period is simply a regular character that can be used
#  in the name of a variable or function. It is often used to separate
#  words such as: a.long.variable.name = 100)

# Build the gradebook. Each named argument becomes one column of the
# dataframe and every column has the same length (8 students).
gradebook = data.frame(
  student = c("joe", "sue", "sam", "anne", "bob", "carla", "dana", "david"),
  test1   = c(70, 80, 90, 75, 85, 95, 100, 60),
  test2   = c(81, 77, 88, 87, 91, 92, 99, 73),
  # year is an ORDERED factor. "ju" is a valid level even though no
  # student in this gradebook is a junior.
  year    = factor(c("fr", "fr", "so", "so", "fr", "se", "so", "so"),
                   levels = c("fr", "so", "ju", "se"), ordered = TRUE),
  honors  = c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE),
  # keep the student names as a character column (not a factor)
  stringsAsFactors = FALSE
)
                       
gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    90    88   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

# The data.frame function takes a series of vectors as arguments. The vectors
# must all be the same length. (remember that a factor is a vector too).
#
# The vectors become the columns of the dataframe.
# The names of the arguments become the names of the columns in the dataframe.
#
# There are other arguments to the data.frame function that you may 
# be interested in exploring when you get more adept at using dataframes.
#
# For now, the only other argument we will look at is stringsAsFactors. 
# We will discuss stringsAsFactors in more detail later. For now, we will simply set 
#    stringsAsFactors=FALSE
# (as in the code above). Later, we will explain what stringsAsFactors=FALSE does and what 
# it means if you set stringsAsFactors=TRUE or leave out stringsAsFactors
# entirely.

# View the help page by typing: 
#
#    ?data.frame

gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    90    88   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

35.2 Anatomy of a data.frame (it’s a list of parallel vectors … )

#-----------------------------------------------------------------------
# "Under the covers", a dataframe is actually a list of vectors.
# All of the vectors in the list must have the same length.
# R arranges the data into rows (horizontal) and columns (vertical).
# The vectors are the columns.
#
# R arranges the vectors as the columns and displays them next to 
# each other to make the dataframe appear as a "grid" with rows and columns.
# Because each column is actually a vector, each column in the dataframe
# must be a single class (e.g. all data in a single column must be "numeric", "logical",
# "character", "factor", etc.). There is NO such requirement for the rows
# of a dataframe.
#
# A dataframe looks different than a simple list and has a few added
# features (which we'll explore later below). This is because R recognizes
# that the list should be treated as a dataframe because the class attribute of
# the list is set to "data.frame". This is done by the data.frame function
# which is used to create the dataframe. There are also a couple of other 
# attributes that are attached to the list. (see below)
#
# The following attributes are attached to every dataframe:
#
#      attribute name    attribute value
#      --------------    ---------------
#      class             "data.frame"
#
#      names             character vector with names of the columns
#                        note that a plain list can also have a names attribute
#                        with names of the entries in the list.
#
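#      row.names         a vector with the names of the rows.
#                        By default the row names are simply the numbers 1, 2, 3, ...
#                        (stored as integers; the row.names and rownames functions
#                        return them as a character vector).
#                        You can change the row names to anything you like
#                        (If you recall, we also did this with matrices.)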
# 
# You can access these attributes by using the following functions
# (see details in the code below)
#
#    attr(SOME_DATAFRAME, ATTRIBUTE_NAME)
#    attributes(SOME_DATAFRAME)
#    names(SOME_DATAFRAME)
#    colnames(SOME_DATAFRAME)
#    rownames(SOME_DATAFRAME)
#    row.names(SOME_DATAFRAME)
#------------------------------------------------------------------------------

mode(gradebook) # "list"

[1] "list"

class(gradebook) # "data.frame"

[1] "data.frame"

# There are a few attributes on the list that tell R how to display
# and use the dataframe. 

attributes(gradebook)  # names   class   row.names

$names
[1] "student" "test1"   "test2"   "year"    "honors" 

$class
[1] "data.frame"

$row.names
[1] 1 2 3 4 5 6 7 8

# The class attribute
# Any of the following commands will display the contents of the "class" attribute.

class(gradebook)            # "data.frame"

[1] "data.frame"

attr(gradebook, "class")    # same thing

[1] "data.frame"

attributes(gradebook)$class # same thing

[1] "data.frame"

# The names attribute - contains the names of the columns
# Any of the following commands will display the contents of the "names" attribute.

names(gradebook)            # "student" "test1"   "test2"   "year"    "honors"

[1] "student" "test1"   "test2"   "year"    "honors"

colnames(gradebook)         # same thing

[1] "student" "test1"   "test2"   "year"    "honors"

attr(gradebook, "names")    # same thing

[1] "student" "test1"   "test2"   "year"    "honors"

attributes(gradebook)$names # same thing

[1] "student" "test1"   "test2"   "year"    "honors"

# The row.names attribute - contains the names of the rows
# Any of the following commands will display the contents of the "row.names" attribute.
#
# NOTE that there is both a "row.names" and a "rownames" function.
# They return the same value. If you're curious about why both exist and which
# is preferable to use (ie. row.names) see the link below. 
#
# ALSO NOTE, that while there is a row.names function, there is no
# col.names function, only colnames.
#
# https://stackoverflow.com/questions/38466276/why-is-row-names-preferred-over-rownames/39179031

row.names(gradebook)         # (as a character vector) - "1" "2" "3" "4" "5" "6" "7" "8"

[1] "1" "2" "3" "4" "5" "6" "7" "8"

rownames(gradebook)          # same thing (as a character vector)

[1] "1" "2" "3" "4" "5" "6" "7" "8"

attr(gradebook, "row.names")    # actual value of the row.names attribute (by default these are integers)

[1] 1 2 3 4 5 6 7 8

attributes(gradebook)$row.names # same thing

[1] 1 2 3 4 5 6 7 8

35.3 Some useful functions: nrow, ncol, head, tail, class, length, etc.

#-----------------------------------------------------------------------------
# Other functions you can use with dataframes
#-----------------------------------------------------------------------------

gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    90    88   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

nrow(gradebook)   # number of rows

[1] 8

ncol(gradebook)   # number of columns

[1] 5

head(gradebook, 2)  # show just the first 2 rows (or any other number)

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE

tail(gradebook, 2)  # show just the last 2 rows (or any other number)

  student test1 test2 year honors
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

35.4 unclass(SOME_OBJECT)

The unclass function takes any object and returns a new version of the object that doesn’t contain the class attribute. (If the original object never had a class attribute then the unclass function doesn’t really do anything special)

You can see that the dataframe is actually a list by removing the class attribute.

This will stop the list from being treated as a “data.frame”. When it is displayed it will look just like a plain list.

gradebook         # displayed in rows and columns

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    90    88   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

class(gradebook)  # "data.frame"

[1] "data.frame"

# either of the following lines will do the same thing ..
gradebook = unclass(gradebook)  # remove the class attribute
attr(gradebook, "class") = NULL # this does the same thing

attributes(gradebook)  # "class" is gone!

$names
[1] "student" "test1"   "test2"   "year"    "honors" 

$row.names
[1] 1 2 3 4 5 6 7 8

# Now you can see that the gradebook is no longer a dataframe
gradebook        # displayed as a regular "list"

$student
[1] "joe"   "sue"   "sam"   "anne"  "bob"   "carla" "dana"  "david"

$test1
[1]  70  80  90  75  85  95 100  60

$test2
[1] 81 77 88 87 91 92 99 73

$year
[1] fr fr so so fr se so so
Levels: fr < so < ju < se

$honors
[1] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE

attr(,"row.names")
[1] 1 2 3 4 5 6 7 8

class(gradebook) # "list"

[1] "list"

# Let's put back the class attribute and we'll see that it once again
# is a dataframe
class(gradebook) = "data.frame"
gradebook        # once again it is a dataframe

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    90    88   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

class(gradebook) # "data.frame"

[1] "data.frame"

35.5 The “list” features of a dataframe.

A dataframe is a “list” of parallel vectors (as noted above).

Since a dataframe is a “list”, any technique that works with lists also works with dataframes. If you understand how to use lists then you already understand how to use many of the features of dataframes since a dataframe IS A LIST. The following topics do not introduce any new concepts. The following topics simply show how to apply your knowledge of manipulating lists directly to dataframes.

35.5.1 length(SOME_DATAFRAME) ncol(SOME_DATAFRAME)

length(gradebook)    # the number of columns - same as ncol(gradebook)

[1] 5

ncol(gradebook)      # same thing

[1] 5

# There is also an nrow function - but this is only useful for dataframes (and
# matrices). For a regular list nrow returns NULL. See description of nrow above.

35.5.2 unlist(SOME_DATAFRAME) - retrieve all values into a named vector

vec = unlist(gradebook)  # put the entire contents of the dataframe in a single named vector
vec

student1 student2 student3 student4 student5 student6 student7 student8 
   "joe"    "sue"    "sam"   "anne"    "bob"  "carla"   "dana"  "david" 
  test11   test12   test13   test14   test15   test16   test17   test18 
    "70"     "80"     "90"     "75"     "85"     "95"    "100"     "60" 
  test21   test22   test23   test24   test25   test26   test27   test28 
    "81"     "77"     "88"     "87"     "91"     "92"     "99"     "73" 
   year1    year2    year3    year4    year5    year6    year7    year8 
     "1"      "1"      "2"      "2"      "1"      "4"      "2"      "2" 
 honors1  honors2  honors3  honors4  honors5  honors6  honors7  honors8 
 "FALSE"  "FALSE"  "FALSE"  "FALSE"  "FALSE"   "TRUE"   "TRUE"  "FALSE"

#. . . . . . . . . . . . . 
# using the unlisted data
#. . . . . . . . . . . . . 
mode(vec)   # "character" - the exact mode will depend on the implicit conversion rules

[1] "character"

class(vec)  # "character"

[1] "character"

names(vec)  # just the names of the named vector

 [1] "student1" "student2" "student3" "student4" "student5" "student6"
 [7] "student7" "student8" "test11"   "test12"   "test13"   "test14"  
[13] "test15"   "test16"   "test17"   "test18"   "test21"   "test22"  
[19] "test23"   "test24"   "test25"   "test26"   "test27"   "test28"  
[25] "year1"    "year2"    "year3"    "year4"    "year5"    "year6"   
[31] "year7"    "year8"    "honors1"  "honors2"  "honors3"  "honors4" 
[37] "honors5"  "honors6"  "honors7"  "honors8"

names(vec) = NULL # get rid of the names

vec               # just the data without the names

 [1] "joe"   "sue"   "sam"   "anne"  "bob"   "carla" "dana"  "david" "70"   
[10] "80"    "90"    "75"    "85"    "95"    "100"   "60"    "81"    "77"   
[19] "88"    "87"    "91"    "92"    "99"    "73"    "1"     "1"     "2"    
[28] "2"     "1"     "4"     "2"     "2"     "FALSE" "FALSE" "FALSE" "FALSE"
[37] "FALSE" "TRUE"  "TRUE"  "FALSE"

35.5.3 Select columns from a dataframe

#............................................................
# Retrieve specific columns with [single-bracket] notation.
# This will return a smaller dataframe (ie. list) with just those columns.
#
#     ** This all works because a dataframe IS A LIST **
#............................................................

# single brackets (with one vector inside the [brackets])
# will return just the columns that you request.
# You can use any of the methods to request the columns that you can use 
# with a named list, i.e. a vector that contains
#   - position numbers
#   - negative position numbers
#   - TRUE FALSE values
#   - names of items in the list (i.e. the column names)

gradebook[1]  # a dataframe that contains just the 1st column

  student
1     joe
2     sue
3     sam
4    anne
5     bob
6   carla
7    dana
8   david

gradebook[c(1,3)]   # items 1 and 3 from list - i.e. 1st and 3rd columns

  student test2
1     joe    81
2     sue    77
3     sam    88
4    anne    87
5     bob    91
6   carla    92
7    dana    99
8   david    73

gradebook[c(-2,-4,-5)]  # everything EXCEPT for columns 2,4,5 - i.e. same result

  student test2
1     joe    81
2     sue    77
3     sam    88
4    anne    87
5     bob    91
6   carla    92
7    dana    99
8   david    73

gradebook[c(TRUE,FALSE,TRUE,FALSE,FALSE)] # same result

  student test2
1     joe    81
2     sue    77
3     sam    88
4    anne    87
5     bob    91
6   carla    92
7    dana    99
8   david    73

gradebook[c("student","test2")] # items named "student" and "test2" from the list

  student test2
1     joe    81
2     sue    77
3     sam    88
4    anne    87
5     bob    91
6   carla    92
7    dana    99
8   david    73

# The recycling rule also works for indexing with logical vectors
gradebook[c(TRUE,FALSE)]  # every other column starting with the 1st

  student test2 honors
1     joe    81  FALSE
2     sue    77  FALSE
3     sam    88  FALSE
4    anne    87  FALSE
5     bob    91  FALSE
6   carla    92   TRUE
7    dana    99   TRUE
8   david    73  FALSE

35.5.4 $ sign notation: someDataFrame$someColumn returns a vector

#.............................................................................
# Retrieve specific columns with $dollar-sign-notation.
# This returns a VECTOR (i.e. the actual contents of what's in the list)
#
#     ** This all works because a dataframe IS A LIST **
#.............................................................................

gradebook$student        # "joe"   "sue"   "sam"   "anne"  "bob"   "carla" "dana"  "david"

[1] "joe"   "sue"   "sam"   "anne"  "bob"   "carla" "dana"  "david"

class(gradebook$student) # "character"   ( NOT "data.frame" )

[1] "character"

gradebook$test1        # 70  80  90  75  85  95 100  60

[1]  70  80  90  75  85  95 100  60

class(gradebook$test1) # "numeric"   ( NOT "data.frame" )

[1] "numeric"

35.5.5 someDataFrame[[1]]: returns a vector

#.............................................................................
# Retrieve specific columns with [[double-bracket]] notation.
# Same as using $dollar-sign-notation.
# This returns a VECTOR (i.e. the actual contents of what's in the list)
#
#     ** This all works because a dataframe IS A LIST **
#.............................................................................

gradebook[[1]] # Just 1st column AS A VECTOR, (same as gradebook$student) - "joe" "sue" etc ...

[1] "joe"   "sue"   "sam"   "anne"  "bob"   "carla" "dana"  "david"

gradebook[[2]] # Just 2nd column AS A VECTOR, (same as gradebook$test1) - 70 80 90 etc ...

[1]  70  80  90  75  85  95 100  60

35.5.6 Using lapply with a dataframe

You can use lapply with a dataframe just as you’d use lapply with a simple list.

lapply will apply a function to each column of the gradebook (i.e. to each item in the list).
lapply returns a list of the results of running the function on each different column.
see examples below

** This all works because a dataframe IS A LIST **

lapply(gradebook, mode)  # a list of the mode of each column

$student
[1] "character"

$test1
[1] "numeric"

$test2
[1] "numeric"

$year
[1] "numeric"

$honors
[1] "logical"

lapply(gradebook, class)  # a list of the class of each column (notice that year is a factor)

$student
[1] "character"

$test1
[1] "numeric"

$test2
[1] "numeric"

$year
[1] "ordered" "factor" 

$honors
[1] "logical"

lapply(gradebook, max)    # a list of the max value from each column

$student
[1] "sue"

$test1
[1] 100

$test2
[1] 99

$year
[1] se
Levels: fr < so < ju < se

$honors
[1] 1

lapply(gradebook, summary) # a list with the results of the summary function for each column

$student
   Length     Class      Mode 
        8 character character 

$test1
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  60.00   73.75   82.50   81.88   91.25  100.00 

$test2
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  73.00   80.00   87.50   86.00   91.25   99.00 

$year
fr so ju se 
 3  4  0  1 

$honors
   Mode   FALSE    TRUE 
logical       6       2

# You can also call summary directly on the dataframe
#
# Remember that summary is a generic function that has different versions (i.e. methods)
# for different classes of data. 

summary(gradebook)             # This automatically calls summary.data.frame(gradebook)

   student              test1            test2       year     honors       
 Length:8           Min.   : 60.00   Min.   :73.00   fr:3   Mode :logical  
 Class :character   1st Qu.: 73.75   1st Qu.:80.00   so:4   FALSE:6        
 Mode  :character   Median : 82.50   Median :87.50   ju:0   TRUE :2        
                    Mean   : 81.88   Mean   :86.00   se:1                  
                    3rd Qu.: 91.25   3rd Qu.:91.25                         
                    Max.   :100.00   Max.   :99.00

summary.data.frame(gradebook)  # REVIEW - same thing - this is not necessary - just call summary(gradebook)

   student              test1            test2       year     honors       
 Length:8           Min.   : 60.00   Min.   :73.00   fr:3   Mode :logical  
 Class :character   1st Qu.: 73.75   1st Qu.:80.00   so:4   FALSE:6        
 Mode  :character   Median : 82.50   Median :87.50   ju:0   TRUE :2        
                    Mean   : 81.88   Mean   :86.00   se:1                  
                    3rd Qu.: 91.25   3rd Qu.:91.25                         
                    Max.   :100.00   Max.   :99.00

35.5.7 A more complex example of using lapply

The mean function will not work for character or factor columns, only numeric columns. The following examples show how to get the mean of each test.

# Get a copy of the gradebook with just the test columns.
# (%in% builds a TRUE/FALSE vector that is TRUE only for the columns
#  whose names appear in the vector of wanted names)
gradebook_justTests = gradebook[ colnames(gradebook) %in% c("test1", "test2") ]
gradebook_justTests

  test1 test2
1    70    81
2    80    77
3    90    88
4    75    87
5    85    91
6    95    92
7   100    99
8    60    73

# another way that assumes you know the positions of the columns
gradebook_justTests = gradebook[c(2,3)]
gradebook_justTests

  test1 test2
1    70    81
2    80    77
3    90    88
4    75    87
5    85    91
6    95    92
7   100    99
8    60    73

# Now get the mean of just those columns
lapply ( gradebook_justTests, mean)

$test1
[1] 81.875

$test2
[1] 86

# or all in one shot
lapply(gradebook[c(2,3)], mean)

$test1
[1] 81.875

$test2
[1] 86

35.5.8 Nested calls to lapply (one lapply inside another lapply)

We can also do the following to automatically figure out which columns are numeric and which aren’t.

The following uses lapply to generate a logical vector with FALSE for each non-numeric column and TRUE for each numeric column.

unlist(lapply(gradebook, is.numeric))   # is.numeric is applied to each column in turn

student   test1   test2    year  honors 
  FALSE    TRUE    TRUE   FALSE   FALSE

We can now use that technique directly inside of the [brackets] to apply the mean function to only those columns that are numeric.

# inner lapply: TRUE/FALSE for each column (is it numeric?)
# outer lapply: mean of each column that was selected by the inner lapply
lapply(gradebook[ unlist(lapply(gradebook, is.numeric)) ],
       mean)

$test1
[1] 81.875

$test2
[1] 86

35.5.9 REMEMBER - You can also use custom functions with lapply.

# Return the largest two values in a vector (in ascending order).
#
# ARGUMENTS
#   vec - any vector that sort() can order (e.g. numeric, character, factor)
#
# RETURNS
#   a vector with the two largest values of vec. If vec contains fewer than
#   two non-NA values then all of the non-NA values are returned.
#
# NOTE: sort() drops NA values, so the sorted vector can be SHORTER than vec.
# Therefore we take the last two values with tail() instead of indexing
# with length(vec), which would reach past the end of the sorted vector
# (and return NA) whenever vec contains NA values.
largestTwo = function(vec){
  tail(sort(vec), 2)
}
lapply(gradebook[c(2,3)], largestTwo)   # highest two grades on both tests

$test1
[1]  95 100

$test2
[1] 92 99

# REMEMBER - You can also do it with an anonymous function.
# (tail takes the last two values of the sorted column, so this still works
#  when sort drops NA values from a column)
lapply(gradebook[c(2,3)], function(col) tail(sort(col), 2) )

$test1
[1]  95 100

$test2
[1] 92 99

35.5.10 Removing columns from a dataframe (same as removing items from a list)

#............................................................
# Remove columns from a dataframe
#............................................................
# You can remove columns from a dataframe by setting the column value to NULL
# (just as you can remove an item from a list by setting the value to NULL) by
# using $dollar-sign-notation 
# or    [single-bracket-notation]
# or    [[double-bracket-notation]]
#
#     ** This all works because a dataframe IS A LIST **
#............................................................

gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    90    88   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

# any of the methods to refer to columns works

gradebook[[4]]= NULL # remove the 4th column (i.e. the year)
gradebook

  student test1 test2 honors
1     joe    70    81  FALSE
2     sue    80    77  FALSE
3     sam    90    88  FALSE
4    anne    75    87  FALSE
5     bob    85    91  FALSE
6   carla    95    92   TRUE
7    dana   100    99   TRUE
8   david    60    73  FALSE

gradebook$honors = NULL   # remove the honors column
gradebook

  student test1 test2
1     joe    70    81
2     sue    80    77
3     sam    90    88
4    anne    75    87
5     bob    85    91
6   carla    95    92
7    dana   100    99
8   david    60    73

gradebook[c(2,3)] = NULL  # remove the 2nd and 3rd columns
gradebook

  student
1     joe
2     sue
3     sam
4    anne
5     bob
6   carla
7    dana
8   david

35.5.11 Adding new columns to an already existing dataframe (same as adding vectors to a list)

# let's recreate the gradebook
# Each named argument becomes one column of the dataframe and every
# column has the same length (8 students).
gradebook = data.frame(
  student = c("joe", "sue", "sam", "anne", "bob", "carla", "dana", "david"),
  test1   = c(70, 80, 90, 75, 85, 95, 100, 60),
  test2   = c(81, 77, 88, 87, 91, 92, 99, 73),
  # year is an ORDERED factor. "ju" is a valid level even though no
  # student in this gradebook is a junior.
  year    = factor(c("fr", "fr", "so", "so", "fr", "se", "so", "so"),
                   levels = c("fr", "so", "ju", "se"), ordered = TRUE),
  honors  = c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE),
  # keep the student names as a character column (not a factor)
  stringsAsFactors = FALSE
)

#............................................................
# Add columns to a dataframe
#............................................................
# You can add columns to a dataframe by
# using $dollar-sign-notation 
# or    [single-bracket-notation]
# or    [[double-bracket-notation]]
#
#     ** This all works because a dataframe IS A LIST **
#............................................................

gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    90    88   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

ncol(gradebook)

[1] 5

# Add test3 as c(70,80,90,60,70,80,90,100)
# using $dollar-sign-notation

gradebook$test3 = c(70,80,90,60,70,80,90,100)
ncol(gradebook)  # we added a column

[1] 6

gradebook        # the new column is there and it is named "test3"

  student test1 test2 year honors test3
1     joe    70    81   fr  FALSE    70
2     sue    80    77   fr  FALSE    80
3     sam    90    88   so  FALSE    90
4    anne    75    87   so  FALSE    60
5     bob    85    91   fr  FALSE    70
6   carla    95    92   se   TRUE    80
7    dana   100    99   so   TRUE    90
8   david    60    73   so  FALSE   100

# Add test4
# using double bracket notation

gradebook[[7]] = c(74,84,94,64,74,84,94,99)
ncol(gradebook)  # we added a column

[1] 7

gradebook        # name of new column is "V7" - not exactly what we want

  student test1 test2 year honors test3 V7
1     joe    70    81   fr  FALSE    70 74
2     sue    80    77   fr  FALSE    80 84
3     sam    90    88   so  FALSE    90 94
4    anne    75    87   so  FALSE    60 64
5     bob    85    91   fr  FALSE    70 74
6   carla    95    92   se   TRUE    80 84
7    dana   100    99   so   TRUE    90 94
8   david    60    73   so  FALSE   100 99

names(gradebook)[7] = "test4"  # change the name to test4
gradebook

  student test1 test2 year honors test3 test4
1     joe    70    81   fr  FALSE    70    74
2     sue    80    77   fr  FALSE    80    84
3     sam    90    88   so  FALSE    90    94
4    anne    75    87   so  FALSE    60    64
5     bob    85    91   fr  FALSE    70    74
6   carla    95    92   se   TRUE    80    84
7    dana   100    99   so   TRUE    90    94
8   david    60    73   so  FALSE   100    99

# Add test5
# using single bracket notation

gradebook[8] = c(75, 85,95,65,75,85,95,98)
ncol(gradebook)

[1] 8

gradebook

  student test1 test2 year honors test3 test4 V8
1     joe    70    81   fr  FALSE    70    74 75
2     sue    80    77   fr  FALSE    80    84 85
3     sam    90    88   so  FALSE    90    94 95
4    anne    75    87   so  FALSE    60    64 65
5     bob    85    91   fr  FALSE    70    74 75
6   carla    95    92   se   TRUE    80    84 85
7    dana   100    99   so   TRUE    90    94 95
8   david    60    73   so  FALSE   100    99 98

# change the name of the last column
names(gradebook)[ncol(gradebook)] = "test5" # change the name of the last column
gradebook

  student test1 test2 year honors test3 test4 test5
1     joe    70    81   fr  FALSE    70    74    75
2     sue    80    77   fr  FALSE    80    84    85
3     sam    90    88   so  FALSE    90    94    95
4    anne    75    87   so  FALSE    60    64    65
5     bob    85    91   fr  FALSE    70    74    75
6   carla    95    92   se   TRUE    80    84    85
7    dana   100    99   so   TRUE    90    94    95
8   david    60    73   so  FALSE   100    99    98

35.6 Replace columns with other columns (same as replacing items in a list with other items)

#............................................................
# Replace columns in a dataframe
#
# You can replace a column in a dataframe with a different column (ie. vector)
# by using $dollar-sign-notation
# or using [single-bracket-notation]
# or using [[double-bracket-notation]]
#
#     ** This all works because a dataframe IS A LIST **
#............................................................

# replace the test5 column with lastName
#
# you can use [single-bracket-notation]
# or [[double-bracket-notation]]
# or $dollar-sign-notation

gradebook

  student test1 test2 year honors test3 test4 test5
1     joe    70    81   fr  FALSE    70    74    75
2     sue    80    77   fr  FALSE    80    84    85
3     sam    90    88   so  FALSE    90    94    95
4    anne    75    87   so  FALSE    60    64    65
5     bob    85    91   fr  FALSE    70    74    75
6   carla    95    92   se   TRUE    80    84    85
7    dana   100    99   so   TRUE    90    94    95
8   david    60    73   so  FALSE   100    99    98

ncol(gradebook)

[1] 8

gradebook[8] = c("schwartz", "rosen", "aames", "chill", "jones", "fox", "katz", "cohen")

# The following alternatives would have accomplished the same thing as 
# the line of code above.
#
#    # [[double-brackets]]
#    gradebook[[8]] = c("schwartz", "rosen", "aames", "chill", "jones", "fox", "katz", "cohen")
#
#    # $dollar-sign-notation
#    gradebook$test5 = c("schwartz", "rosen", "aames", "chill", "jones", "fox", "katz", "cohen")

gradebook  # the column name was not changed. It is still "test5"

  student test1 test2 year honors test3 test4    test5
1     joe    70    81   fr  FALSE    70    74 schwartz
2     sue    80    77   fr  FALSE    80    84    rosen
3     sam    90    88   so  FALSE    90    94    aames
4    anne    75    87   so  FALSE    60    64    chill
5     bob    85    91   fr  FALSE    70    74    jones
6   carla    95    92   se   TRUE    80    84      fox
7    dana   100    99   so   TRUE    90    94     katz
8   david    60    73   so  FALSE   100    99    cohen

names(gradebook)[8] = "lastName" # change the name of the 8th column
gradebook

  student test1 test2 year honors test3 test4 lastName
1     joe    70    81   fr  FALSE    70    74 schwartz
2     sue    80    77   fr  FALSE    80    84    rosen
3     sam    90    88   so  FALSE    90    94    aames
4    anne    75    87   so  FALSE    60    64    chill
5     bob    85    91   fr  FALSE    70    74    jones
6   carla    95    92   se   TRUE    80    84      fox
7    dana   100    99   so   TRUE    90    94     katz
8   david    60    73   so  FALSE   100    99    cohen

35.7 Rearrange the order of the columns (same as rearranging the items in a list)

You can rearrange the order of columns in a dataframe by using [single-bracket-notation].

This all works because a dataframe IS A LIST

gradebook

  student test1 test2 year honors test3 test4 lastName
1     joe    70    81   fr  FALSE    70    74 schwartz
2     sue    80    77   fr  FALSE    80    84    rosen
3     sam    90    88   so  FALSE    90    94    aames
4    anne    75    87   so  FALSE    60    64    chill
5     bob    85    91   fr  FALSE    70    74    jones
6   carla    95    92   se   TRUE    80    84      fox
7    dana   100    99   so   TRUE    90    94     katz
8   david    60    73   so  FALSE   100    99    cohen

ncol(gradebook)

[1] 8

# Rearrange the gradebook so the student (first name) and lastName columns are
# grouped together and all tests are grouped together.
#
# Either of the following will work

gradebook = gradebook[  c(1,8,2,3,6,7,4,5)   ]
gradebook

  student lastName test1 test2 test3 test4 year honors
1     joe schwartz    70    81    70    74   fr  FALSE
2     sue    rosen    80    77    80    84   fr  FALSE
3     sam    aames    90    88    90    94   so  FALSE
4    anne    chill    75    87    60    64   so  FALSE
5     bob    jones    85    91    70    74   fr  FALSE
6   carla      fox    95    92    80    84   se   TRUE
7    dana     katz   100    99    90    94   so   TRUE
8   david    cohen    60    73   100    99   so  FALSE

# Reorder the columns again - this time using column names instead of column numbers
gradebook = gradebook[  c("student", "lastName", "year", "honors", "test1", "test2", "test3", "test4")   ]
gradebook

  student lastName year honors test1 test2 test3 test4
1     joe schwartz   fr  FALSE    70    81    70    74
2     sue    rosen   fr  FALSE    80    77    80    84
3     sam    aames   so  FALSE    90    88    90    94
4    anne    chill   so  FALSE    75    87    60    64
5     bob    jones   fr  FALSE    85    91    70    74
6   carla      fox   se   TRUE    95    92    80    84
7    dana     katz   so   TRUE   100    99    90    94
8   david    cohen   so  FALSE    60    73   100    99

35.8 row, column notation, i.e. SOME_DATAFRAME [ WHICH_ROWS_VECTOR , WHICH_COLUMNS_VECTOR ]

###########################################################################.
# Dataframes vs matrices
#
# Dataframes and matrices are different types of objects. 
# A matrix is actually a vector while a dataframe is actually a list.
# Therefore a matrix is limited to a single mode of data (e.g. numeric,
# logical or character). However, a dataframe can have columns of 
# different modes.
#
# However, dataframes and matrices are similar in that they both arrange
# their data in rows and columns. Therefore the syntax for manipulating
# the data by specifying specific rows and columns is basically the 
# same syntax for dataframes as for matrices.
# If you understand how to access data from specific rows/columns in
# a matrix, the same techniques are available for dataframes.
#########################################################################.

35.8.1 Access data in specific rows and columns (same syntax as for matrices)

rm(list=ls() )   # start over
# Rebuild the gradebook from scratch: one row per student, one column per
# vector passed to data.frame. year is an ordered factor (fr < so < ju < se);
# "ju" is a valid level even though no student currently has that value.
gradebook = data.frame(
  student = c("joe", "sue", "sam", "anne", "bob", "carla", "dana", "david"),
  test1   = c(70, 80, 90, 75, 85, 95, 100, 60),
  test2   = c(81, 77, 88, 87, 91, 92, 99, 73),
  year    = factor(c("fr", "fr", "so", "so", "fr", "se", "so", "so"),
                   levels = c("fr", "so", "ju", "se"), ordered = TRUE),
  honors  = c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE),
  stringsAsFactors = FALSE
)

gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    90    88   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

# If you specify TWO vectors in [single-brackets], then
# the 1st vector indicates the ROWS you want and
# the 2nd vector indicates the COLUMNS you want.
# Examples:

gradebook [  c(1,2) , c(1,2,3)]  # rows: 1,2   columns: 1,2,3

  student test1 test2
1     joe    70    81
2     sue    80    77

gradebook [ c(TRUE,FALSE) , c(-2,-3)]  # rows: every other, cols: all except 2 and 3

  student year honors
1     joe   fr  FALSE
3     sam   so  FALSE
5     bob   fr  FALSE
7    dana   so   TRUE

gradebook [ c(-2,-3) , c("student", "year")] # rows: all except 2 and 3; columns: student, year

  student year
1     joe   fr
4    anne   so
5     bob   fr
6   carla   se
7    dana   so
8   david   so

# Use column names to indicate columns. 
gradebook [ 1:3 , c("student","honors")]  # rows 1,2,3, "student" and "honors" cols

  student honors
1     joe  FALSE
2     sue  FALSE
3     sam  FALSE

# If the rows are NOT specified but the comma (,) is present it implies ALL rows
gradebook [    ,    c(1,2)]  # rows: all , columns: 1,2

  student test1
1     joe    70
2     sue    80
3     sam    90
4    anne    75
5     bob    85
6   carla    95
7    dana   100
8   david    60

gradebook [ c(1,2) ]   # same as above BECAUSE no comma means only specify columns

  student test1
1     joe    70
2     sue    80
3     sam    90
4    anne    75
5     bob    85
6   carla    95
7    dana   100
8   david    60

# If the columns are NOT specified but the comma (,) is present it implies ALL columns
gradebook [ c(1,2)   ,    ]  # rows: 1,2  columns: all

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE

35.8.2 row.names

#.............................................................................
# row names can have actual values instead of just numbers
#.............................................................................

# Rows can also have names that are actual values instead of just numbers.
# For example the following version of the dataframe uses the student names
# as the row names. This is not necessarily recommended ... but it is possible.

# Same grades as gradebook, except the student names are supplied through the
# row.names argument, so they label the rows instead of forming a column.
gradebookWithRownames = data.frame(
  test1  = c(70, 80, 90, 75, 85, 95, 100, 60),
  test2  = c(81, 77, 88, 87, 91, 92, 99, 73),
  year   = factor(c("fr", "fr", "so", "so", "fr", "se", "so", "so"),
                  levels = c("fr", "so", "ju", "se"), ordered = TRUE),
  honors = c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE),
  row.names = c("joe", "sue", "sam", "anne", "bob", "carla", "dana", "david"),
  stringsAsFactors = FALSE
)

gradebookWithRownames # in this version the student names are the row names and are not an actual column of data

      test1 test2 year honors
joe      70    81   fr  FALSE
sue      80    77   fr  FALSE
sam      90    88   so  FALSE
anne     75    87   so  FALSE
bob      85    91   fr  FALSE
carla    95    92   se   TRUE
dana    100    99   so   TRUE
david    60    73   so  FALSE

ncol(gradebookWithRownames) # only 4 columns - student names are no longer a column

[1] 4

gradebook # in this version the student names are a separate column

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    90    88   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

ncol(gradebook) # 5 columns - student names ARE a column of data

[1] 5

# you can use the row names to access data too

gradebookWithRownames[c("joe","sam") , ]  # just rows for joe and sam, all columns

    test1 test2 year honors
joe    70    81   fr  FALSE
sam    90    88   so  FALSE

gradebookWithRownames[c(1,2) , ]   # same idea, but using row numbers (rows 1 and 2) instead of row names

    test1 test2 year honors
joe    70    81   fr  FALSE
sue    80    77   fr  FALSE

gradebookWithRownames[c("joe","sam") , c("test2","year")]  # rows: joe, sam    columns: test2, year

    test2 year
joe    81   fr
sam    88   so

# You may use different indexing methods for the rows and for the cols

gradebookWithRownames[c("joe","sam") , c(2,3)]  # rows: joe, sam    columns: 2,3

    test2 year
joe    81   fr
sam    88   so

# Both rownames and row.names functions work to get a vector with the names of the rows.
rownames(gradebookWithRownames)

[1] "joe"   "sue"   "sam"   "anne"  "bob"   "carla" "dana"  "david"

row.names(gradebookWithRownames)

[1] "joe"   "sue"   "sam"   "anne"  "bob"   "carla" "dana"  "david"

35.8.3 Data from a SINGLE ROW is returned as a data.frame BUT data from a SINGLE COLUMN is returned as a VECTOR!

Data from a SINGLE ROW is returned as a data.frame.
Data from a SINGLE COLUMN is returned as a VECTOR!

This should not be surprising. Remember that the values in a single row might be of different types (i.e. modes). Since a vector may only contain one mode of data, selecting a single row from a dataframe returns a dataframe. This is true even if all the columns in the dataframe happen to have the same type. A single column, on the other hand, always contains just one mode of data, so it can be returned as a vector.

NOTE: Recall that matrices are different. Since a matrix IS a vector, returning data from just one row of a matrix results in a VECTOR by default (unless drop=FALSE is specified).

# Data from a single row is returned as a dataframe.
#

gradebook[ 2 ,   ]   # one row - result is a data.frame

  student test1 test2 year honors
2     sue    80    77   fr  FALSE

gradebook[ 2 ,  c(2,3) ]   # one row - result is a data.frame

  test1 test2
2    80    77

gradebook[ 2 ,  c("test1", "test2") ]   # same thing

  test1 test2
2    80    77

# Data from a single column is returned as a VECTOR!

gradebook[   , 2 ]   # one column - result is a vector

[1]  70  80  90  75  85  95 100  60

gradebook[   , 2 , drop=FALSE]   # one column - result is data.frame

gradebook[   , "test2" ]   # same thing

[1] 81 77 88 87 91 92 99 73

gradebook[   , "test2" , drop=FALSE]   # one column - result is data.frame

gradebook[   , c(2,3) ]   # two columns - result is a data.frame

  test1 test2
1    70    81
2    80    77
3    90    88
4    75    87
5    85    91
6    95    92
7   100    99
8    60    73

gradebook[   , c("test1", "test2") ]   # same thing

  test1 test2
1    70    81
2    80    77
3    90    88
4    75    87
5    85    91
6    95    92
7   100    99
8    60    73

gradebook[  gradebook$test1 >= 90 , 2 ]   # Data from a single column - VECTOR!

[1]  90  95 100

# Show the year for the students who got 90 or above on test1
gradebook[  gradebook$test1 >= 90 , 4 ]

[1] so se so
Levels: fr < so < ju < se

# another way
gradebook[  gradebook$test1 >= 90 , "year" ]

[1] so se so
Levels: fr < so < ju < se

35.9 — Practice —

Answer the following questions by referring to this data:

rm(list=ls() )   # start over
# Practice data: one row per student, one column per vector passed to
# data.frame. year is an ordered factor (fr < so < ju < se).
gradebook = data.frame(
  student = c("joe", "sue", "sam", "anne", "bob", "carla", "dana", "david"),
  test1   = c(70, 80, 70, 75, 85, 95, 100, 60),
  test2   = c(81, 77, 60, 87, 91, 92, 99, 73),
  year    = factor(c("fr", "fr", "so", "so", "fr", "se", "so", "so"),
                   levels = c("fr", "so", "ju", "se"), ordered = TRUE),
  honors  = c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE),
  stringsAsFactors = FALSE
)

gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    70    60   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

35.9.1 Question 1

Show the average grade on test1

click for answer

mean(gradebook[ , "test1"])

[1] 79.375

mean(gradebook[ , 2])

[1] 79.375

mean(gradebook[[2]])

[1] 79.375

mean(gradebook$test1)

[1] 79.375

35.9.2 Question 2

Show just the test1 grades for the sophomores, as a vector

click for answer

gradebook [   gradebook$year == "so"   ,      "test1" ]

[1]  70  75 100  60

35.9.3 Question 3

Show the average grade that sophomores got on test1

click for answer

mean ( gradebook$test1[ gradebook$year == "so"    ] )

[1] 76.25

mean ( gradebook [   gradebook$year == "so"   ,      "test1" ] )

[1] 76.25

35.9.4 Question 4 (part A)

Show the rows for the students who scored above average on test1

click for answer

gradebook [ gradebook$test1 >   mean(gradebook$test1)            ,     ]

  student test1 test2 year honors
2     sue    80    77   fr  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE

35.9.5 Question 4 (part B)

Show the names for the students who scored above average on test1

click for answer

# one way
gradebook [ gradebook$test1 >   mean(gradebook$test1) , "student"]

[1] "sue"   "bob"   "carla" "dana"

# Another way
gradebook$student [ gradebook$test1 >   mean(gradebook$test1) ]

[1] "sue"   "bob"   "carla" "dana"

35.9.6 Question 5

Show just the student names and test1 grades for students who scored above average on test1

click for answer

gradebook [ gradebook$test1 >   mean(gradebook$test1)  ,  c("student", "test1")   ]

  student test1
2     sue    80
5     bob    85
6   carla    95
7    dana   100

35.9.7 Question 6

Show the rows for the students who scored above average on test1 and on test2

click for answer

gradebook [ gradebook$test1 >   mean(gradebook$test1)  &
              gradebook$test2 >   mean(gradebook$test2) 
            ,    ]

  student test1 test2 year honors
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE

35.9.8 Question 7

Show the rows for just the freshmen and sophomores who scored above average on test1 and on test2

click for answer

# ONE ANSWER - using or (|)
gradebook [ gradebook$test1 >   mean(gradebook$test1)  &
              gradebook$test2 >   mean(gradebook$test2) &
              (gradebook$year == "fr" | gradebook$year == "so")
            ,    ]

  student test1 test2 year honors
5     bob    85    91   fr  FALSE
7    dana   100    99   so   TRUE

# ANOTHER ANSWER - using %in%
gradebook [ gradebook$test1 >   mean(gradebook$test1)  &
              gradebook$test2 >   mean(gradebook$test2) &
              gradebook$year %in% c("fr","so")
            ,    ]

  student test1 test2 year honors
5     bob    85    91   fr  FALSE
7    dana   100    99   so   TRUE

36 More practice questions

36.0.1 Question 8

Show the complete rows for “sue” and “bob”. Write the code so that you do NOT need to know in which position the desired students appear.

click for answer

# One answer
gradebook [ gradebook$student %in% c("sue", "bob") ,  ]   # don't forget the comma

  student test1 test2 year honors
2     sue    80    77   fr  FALSE
5     bob    85    91   fr  FALSE

# Another answer:
gradebook[gradebook$student=="sue"|gradebook$student=="bob" ,  ]   # don't forget the comma

  student test1 test2 year honors
2     sue    80    77   fr  FALSE
5     bob    85    91   fr  FALSE

36.0.2 QUESTION 9 (part A)

Show just carla’s grade on test1. (Write the code in a way that you do NOT need to know which row).

click for answer

gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    70    60   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

gradebook[ gradebook$student == "carla" , "test1"]

[1] 95

36.0.3 QUESTION 9 (part B)

Add 1 point to carla’s grade on test1. (Write the code in a way that you do NOT need to know which row contains carla’s data).

click for answer

gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    70    60   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    95    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

# The same row filter and column name appear on both sides of the assignment:
# the right side reads carla's current test1 grade and adds 1 to it,
# the left side identifies the cell in which the new value is stored.
gradebook[ gradebook$student == "carla" , "test1"] = 
  gradebook[ gradebook$student == "carla" , "test1"] + 1

gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    70    60   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    96    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

36.0.4 QUESTION 10

Add 2 points to the test1 grades for all freshmen (year == “fr”)

click for answer

gradebook

  student test1 test2 year honors
1     joe    70    81   fr  FALSE
2     sue    80    77   fr  FALSE
3     sam    70    60   so  FALSE
4    anne    75    87   so  FALSE
5     bob    85    91   fr  FALSE
6   carla    96    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

# gradebook$year == "fr" is a logical vector that selects every freshman row,
# so the addition is applied to all of the freshmen's test1 grades at once.
gradebook [ gradebook$year == "fr", "test1" ] = 
  gradebook [ gradebook$year == "fr", "test1" ] + 2

gradebook

  student test1 test2 year honors
1     joe    72    81   fr  FALSE
2     sue    82    77   fr  FALSE
3     sam    70    60   so  FALSE
4    anne    75    87   so  FALSE
5     bob    87    91   fr  FALSE
6   carla    96    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

36.0.5 QUESTION 11 (part A)

PART A - Display the complete rows for all sophomores who scored at least 5 points below average on test1 and on test2

click for answer

gradebook[  gradebook$year == "so" &
            gradebook$test1 <= mean(gradebook$test1) - 5 &
            gradebook$test2 <= mean(gradebook$test2) - 5 
            , ]

  student test1 test2 year honors
3     sam    70    60   so  FALSE
8   david    60    73   so  FALSE

36.0.6 QUESTION 11 (part B)

Display JUST the test1 and test2 grades of those students.

click for answer

gradebook[  gradebook$year == "so" &
              gradebook$test1 <= mean(gradebook$test1) - 5 &
              gradebook$test2 <= mean(gradebook$test2) - 5 
            , c("test1", "test2")]

  test1 test2
3    70    60
8    60    73

36.0.7 QUESTION 11 (part C)

Add 2 points to the test1 and test2 grades of those students.

click for answer

gradebook

  student test1 test2 year honors
1     joe    72    81   fr  FALSE
2     sue    82    77   fr  FALSE
3     sam    70    60   so  FALSE
4    anne    75    87   so  FALSE
5     bob    87    91   fr  FALSE
6   carla    96    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    60    73   so  FALSE

# Build the row filter ONCE and store it in a variable instead of typing the
# same three conditions on both sides of the assignment. This guarantees that
# the rows being read and the rows being updated are exactly the same rows,
# and the averages are calculated from the grades as they were BEFORE any
# points are added.
lowSophomores = gradebook$year == "so" &
                gradebook$test1 <= mean(gradebook$test1) - 5 &
                gradebook$test2 <= mean(gradebook$test2) - 5

# Add 2 points to both test columns for just those rows.
gradebook[ lowSophomores , c("test1", "test2")] =
  2 + gradebook[ lowSophomores , c("test1", "test2")]
  
  
gradebook

  student test1 test2 year honors
1     joe    72    81   fr  FALSE
2     sue    82    77   fr  FALSE
3     sam    72    62   so  FALSE
4    anne    75    87   so  FALSE
5     bob    87    91   fr  FALSE
6   carla    96    92   se   TRUE
7    dana   100    99   so   TRUE
8   david    62    75   so  FALSE